In [1]:
import plotly.express as px
import pandas as pd
import random

# Generate demo data: one row per year from 2000 through 2022.
# A dedicated, seeded generator makes the figure identical on every
# "Restart & Run All" without touching the global `random` state.
demo_rng = random.Random(42)
years = list(range(2000, 2023))
publications = [demo_rng.randint(400, 800) for _ in years]
keywords = [f"Keyword_{year}" for year in years]
numbers = [demo_rng.randint(1, 10) for _ in years]

data = {'Year': years, 'Publications': publications, 'Keywords': keywords, 'Numbers': numbers}

df = pd.DataFrame(data)

# Bubble chart: marker size and colour both encode the publication count.
fig = px.scatter(df,
                 x='Year',
                 y='Publications',
                 size='Publications',
                 color='Publications',
                 hover_name='Keywords',
                 hover_data=['Keywords', 'Numbers'],  # extra columns shown in the hover box
                 size_max=20,
                 title='Publication Numbers of Ciliopathy Related Genes (2000-2022)',
                 labels={"Publications": "Publication Numbers", "Year": "Publication Years"},
                 # Append "_r" to the scale name (e.g. "Geyser_r") to reverse it.
                 # A colour list such as px.colors.sequential.Viridis also works here.
                 color_continuous_scale="Geyser",
                 )

fig.update_layout(
    # Other options tried here: template="simple_white",
    # font=dict(family="Arial", size=14).
    template="plotly_white",
    width=1000,  # figure width in pixels
    height=600,  # figure height in pixels
)

# Draw every marker as a diamond instead of the default circle.
fig.update_traces(marker_symbol='diamond')

# Render the figure.
fig.show()

marker_symbol 参数可以设置不同的标记形状,主要有以下几种:

'circle' - 默认的圆形标记
'square' - 正方形标记
'diamond' - 菱形标记
'cross' - 十字标记
'x' - X形标记
'triangle-up' - 向上三角形标记
'triangle-down' - 向下三角形标记
'triangle-left' - 左向三角形标记
'triangle-right' - 右向三角形标记
'pentagon' - 五边形标记
'hexagon' - 六边形标记
'hexagon2' - 六边形标记的另一种样式
'octagon' - 八边形标记
'star' - 星形标记
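'line-ew' - 水平短线标记（注意：'line' 本身不是有效的 marker_symbol）
'line-ns' - 垂直短线标记
注意：'dash' 和 'dot' 是线型（line dash）的取值，不是有效的 marker_symbol；带中心圆点的标记请使用 '-dot' 后缀（如 'circle-dot'）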
'circle-open' - 空心圆标记
'circle-dot' - 带中心圆点的圆形标记
'square-open' - 空心方块标记
'square-dot' - 带圆点的方块标记

In [2]:
import plotly.express as px
import pandas as pd
import numpy as np

# Load the per-keyword, per-year counts from CSV.
df = pd.read_csv('Mt_keywords_counts_year.csv')

# Cast the Count column to float.
df['Count'] = df['Count'].astype(float)

# Jittered copies of Count and Year, used only as plot coordinates.
# A seeded generator keeps the jitter (and therefore the figure)
# identical across runs.
jitter_rng = np.random.default_rng(42)
df['Counts'] = df['Count'] + jitter_rng.uniform(-0.5, 0.5, size=len(df))
df['Years'] = df['Year'] + jitter_rng.uniform(-0.3, 0.3, size=len(df))

fig = px.scatter(df,
                 x='Years',
                 y='Counts',
                 size='Count',
                 color='Count',
                 hover_name='Keyword',  # keyword is the hover box title
                 # Show the real Year/Count values and hide the jittered
                 # coordinates; otherwise the hover box lists two "Years="
                 # entries, because 'Year' is relabelled to "Years" below.
                 hover_data={'Year': True, 'Count': True, 'Years': False, 'Counts': False},
                 size_max=20,
                 title='',
                 labels={"Count": "Numbers", "Year": "Years"},
                 color_continuous_scale="Rdbu_r",
                )

fig.update_layout(
    template="plotly_white",
    width=1000,
    height=600,
    # Explicit axis titles: the plotted columns are the jittered ones.
    xaxis=dict(title=dict(text='Years', font=dict(size=18))),
    yaxis=dict(title=dict(text='Number', font=dict(size=18)),
    )
)


fig.show()

In [3]:
import pandas as pd
from collections import Counter
import json
import re

In [4]:
# Read the paper table from CSV. read_csv already returns a new DataFrame,
# so no extra .copy() is needed.
data = pd.read_csv("./Mt_paper_all.csv")

In [5]:
# Drop every row of `data` whose Keywords value is missing.
# The result is rebound to `data` rather than mutating the frame in place.
data = data.dropna(subset=['Keywords'])
data

Unnamed: 0,PMID,Author,Title,Abstract,Keywords,Journal,Institution,Country,DOI,Publication Year
0,25025273,"['Singh, Bijender']",Myceliophthora thermophila syn. Sporotrichum t...,Myceliophthora thermophila syn. Sporotrichum t...,"Biomolecules, Myceliophthora thermophila, Spor...",Critical reviews in biotechnology,"a Laboratory of Bioprocess Technology, Departm...",India,10.3109/07388551.2014.923985,2016
1,32640074,"['Dos Santos Gomes, A C', 'Casciatori, F P', '...",Growth kinetics of Myceliophthora thermophila ...,AIMS: This work aimed to estimate the growth o...,"N-acetylglucosamine, enzymes, growth kinetics,...",Journal of applied microbiology,"Instituto de Biociencias, Letras e Ciencias Ex...",Brazil,10.1111/jam.14774,2021
2,24995002,"['Karnaouri, Anthi', 'Topakas, Evangelos', 'An...",Genomic insights into the fungal lignocellulol...,The microbial conversion of solid cellulosic b...,"CAZy, Myceliophthora thermophila, biofuels, li...",Frontiers in microbiology,"Biotechnology Laboratory, Department of Synthe...",Sweden,10.3389/fmicb.2014.00281,2014
3,35450635,"['Sun, Peicheng', 'de Munnik, Melanie', 'van B...",Extending the diversity of Myceliophthora ther...,Lytic polysaccharide monooxygenases (LPMOs) pl...,"Active site segment, LPMOs, Lignocellulose, Ma...",Carbohydrate polymers,"Laboratory of Food Chemistry, Wageningen Unive...",the Netherlands. Electronic address: peicheng....,10.1016/j.carbpol.2022.119373,2022
4,31534479,"['Dos Santos Gomes, Ana Carolina', 'Falkoski, ...",Myceliophthora thermophila Xyr1 is predominant...,BACKGROUND: Myceliophthora thermophila is a th...,"Cellulose degradation, Myceliophthora thermoph...",Biotechnology for biofuels,"1Fungal Physiology, Westerdijk Fungal Biodiver...",The Netherlands. ISNI: 0000000120346234. GRID:...,10.1186/s13068-019-1556-y,2019
...,...,...,...,...,...,...,...,...,...,...
245,24128582,"['Aljawish, Abdulhadi', 'Chevalot, Isabelle', ...",Laccase-catalysed oxidation of ferulic acid an...,The enzymatic oxidation of ferulic acid (FA) a...,"Antioxidant, Cytotoxicity, Dye, Enzymatic oxid...",Food chemistry,Laboratoire d'Ingenierie des Biomolecules (LIB...,France,10.1016/j.foodchem.2013.07.119,2014
253,30960948,"['Su, Jing', 'Shim, Euijin', 'Noro, Jennifer',...",Conductive Cotton by In Situ Laccase-Polymeriz...,Conductive cotton fabrics were obtained via in...,"1-hydroxybenzotriazol (HBT), coatings, conduct...",Polymers,International Joint Research Laboratory for Te...,China. jingsu@ceb.uminho.pt,10.3390/polym10091023,2018
257,33807631,"['Contato, Alex Graca', 'de Oliveira, Tassio B...",Prospection of Fungal Lignocellulolytic Enzyme...,The lignocellulosic biomass comprises three ma...,"Hymenaea courbaril, Tamarindus indica, biopros...",Microorganisms,"Departamento de Bioquimica e Imunologia, Facul...",Brazil,10.3390/microorganisms9030533,2021
263,36866191,"['Bampidis, Vasileios', 'Azimonti, Giovanna', ...",Safety and efficacy of a feed additive consist...,Following a request from the European Commissi...,"Natupulse(R), digestibility enhancers, efficac...",EFSA journal. European Food Safety Authority,?,,10.2903/j.efsa.2023.7873,2023


In [None]:
# Keep only the Keywords and Publication Year columns of `data`.
kept_columns = ['Keywords', 'Publication Year']
data_filter = data[kept_columns]

# Write the reduced table to CSV without the index column.
data_filter.to_csv("./Mt_paper_filter.csv", index=False)

In [None]:
import pandas as pd
from collections import defaultdict
import re

# Decide whether a single raw keyword token should be counted.
def is_valid_keyword(keyword):
    """Return True when ``keyword`` is usable as a keyword.

    The token is stripped of surrounding whitespace first, because tokens
    produced by ``str.split(',')`` keep the space that follows each comma.
    A token is valid when, after stripping, it is longer than two characters
    and consists only of ASCII letters and whitespace.

    Args:
        keyword: raw token text (a ``str``).

    Returns:
        bool: True if the token passes both checks, otherwise False.
    """
    cleaned = keyword.strip()
    # NOTE(review): the pattern rejects any token containing digits, hyphens
    # or other punctuation, not only purely numeric tokens - confirm intended.
    return len(cleaned) > 2 and re.fullmatch(r'[A-Za-z\s]+', cleaned) is not None

# Nested tally: keyword -> publication year -> number of occurrences.
keywords_yearly_counts = defaultdict(lambda: defaultdict(int))

# Walk the two columns in lockstep, one row of data_filter per iteration.
for pub_year, raw_field in zip(data_filter['Publication Year'], data_filter['Keywords']):
    # Split the comma-separated field, keep the tokens accepted by
    # is_valid_keyword, then trim whitespace and capitalize each one.
    cleaned_terms = [
        token.strip().capitalize()
        for token in raw_field.split(',')
        if is_valid_keyword(token)
    ]

    for term in cleaned_terms:
        keywords_yearly_counts[term][pub_year] += 1

# Flatten the nested tally into one record per (keyword, year) pair.
records = []
for term, per_year in keywords_yearly_counts.items():
    for pub_year, total in per_year.items():
        records.append({'Keyword': term, 'Year': pub_year, 'Count': total})
keywords_yearly_df = pd.DataFrame(records)

# Order by keyword, then by year, and write the table to CSV.
keywords_yearly_df = keywords_yearly_df.sort_values(by=["Keyword", "Year"])
keywords_yearly_df.to_csv('Mt_keywords_counts_year.csv', index=False)