In [30]:
import pandas as pd
from collections import Counter
import json
import re

# 读取Mt文献数据
data = pd.read_excel("./Mt_paper_all.xlsx")
data.head()

Unnamed: 0,PMID,Author,Title,Abstract,Keywords,Journal,Institution,Country,DOI,Publication Year
0,25025273,"['Singh, Bijender']",Myceliophthora thermophila syn. Sporotrichum t...,Myceliophthora thermophila syn. Sporotrichum t...,"Biomolecules, Myceliophthora thermophila, Spor...",Critical reviews in biotechnology,"a Laboratory of Bioprocess Technology, Departm...",India,10.3109/07388551.2014.923985,2016
1,32640074,"['Dos Santos Gomes, A C', 'Casciatori, F P', '...",Growth kinetics of Myceliophthora thermophila ...,AIMS: This work aimed to estimate the growth o...,"N-acetylglucosamine, enzymes, growth kinetics,...",Journal of applied microbiology,"Instituto de Biociencias, Letras e Ciencias Ex...",Brazil,10.1111/jam.14774,2021
2,24995002,"['Karnaouri, Anthi', 'Topakas, Evangelos', 'An...",Genomic insights into the fungal lignocellulol...,The microbial conversion of solid cellulosic b...,"CAZy, Myceliophthora thermophila, biofuels, li...",Frontiers in microbiology,"Biotechnology Laboratory, Department of Synthe...",Sweden,10.3389/fmicb.2014.00281,2014
3,35450635,"['Sun, Peicheng', 'de Munnik, Melanie', 'van B...",Extending the diversity of Myceliophthora ther...,Lytic polysaccharide monooxygenases (LPMOs) pl...,"Active site segment, LPMOs, Lignocellulose, Ma...",Carbohydrate polymers,"Laboratory of Food Chemistry, Wageningen Unive...",the Netherlands. Electronic address: peicheng....,10.1016/j.carbpol.2022.119373,2022
4,31534479,"['Dos Santos Gomes, Ana Carolina', 'Falkoski, ...",Myceliophthora thermophila Xyr1 is predominant...,BACKGROUND: Myceliophthora thermophila is a th...,"Cellulose degradation, Myceliophthora thermoph...",Biotechnology for biofuels,"1Fungal Physiology, Westerdijk Fungal Biodiver...",The Netherlands. ISNI: 0000000120346234. GRID:...,10.1186/s13068-019-1556-y,2019


In [31]:
# 如果Keywords列中为空，则删除该行
data_filter = data.dropna(subset=['Keywords']).copy()

# 保存data_filter
data_filter.to_excel("./Mt_paper_filter.xlsx", index=False)

In [35]:
# 取data的Keywords这一列，并转为str类型
keywords = data_filter["Keywords"].astype(str).copy()

# 将keywords整合到一个列表中
keywords_list = []

for i in keywords:
    keywords_list.extend(i.split(","))

# 将keywords_list中的空格去掉
keywords_list = [i.strip() for i in keywords_list]

# 将keywords_list每一个值的首字母改为大写
keywords_list = [i.capitalize() for i in keywords_list]

keywords_list

['Biomolecules',
 'Myceliophthora thermophila',
 'Sporotrichum thermophile',
 'Thermophilic mould',
 'Biotechnological applications',
 'Hydrolytic enzymes',
 'N-acetylglucosamine',
 'Enzymes',
 'Growth kinetics',
 'Logistic model',
 'Solid-state cultivation',
 'Cazy',
 'Myceliophthora thermophila',
 'Biofuels',
 'Lignocellulolytic enzymes',
 'Plant biomass',
 'Active site segment',
 'Lpmos',
 'Lignocellulose',
 'Mass spectrometric fragmentation',
 'Oxidative cleavage',
 'Reduction',
 'Xyloglucan',
 'Cellulose degradation',
 'Myceliophthora thermophila',
 'Pentose catabolism',
 'Xylan degradation',
 'Xylanolytic regulator',
 'Crispr-cas9',
 'Cellulases',
 'Mtalp1',
 'Myceliophthora thermophila',
 'Protease',
 'Myceliophthora thermophila',
 'Genome-scale metabolic model',
 'Thermophilic fungi',
 'Transcriptomics',
 'Alkalistable',
 'Cellulose saccharification',
 'Extremozyme',
 'Recombinant cellobiohydrolase',
 'Solvent tolerance',
 'Thermostable cellulose',
 'Biomass',
 'Cellulose degra

In [36]:
keywords_counts = Counter(keywords_list)

# 将keywords_counts转换为DataFrame
keywords_counts = pd.DataFrame.from_dict(keywords_counts, orient="index").reset_index()
keywords_counts.columns = ["keywords", "counts"]

# 将keywords_counts按照counts进行降序排列
keywords_counts = keywords_counts.sort_values(by="counts", ascending=False)
keywords_counts.to_csv("./Mt_keywords_counts.csv", index=False)

以下是存疑内容：

In [37]:
# json_data = keywords_counts.rename(columns={"counts": "value", "keywords": "name"}).to_dict("records")

# 遍历数据框中的每一行，将DataFrame转换为所需的json格式
json_data = []
for _, row in keywords_counts.iterrows():
    data = {"value": row["counts"], "name": row["keywords"]}
    json_data.append(data)

# json_data = [
# {"value": row["counts"],
# "name": row["keywords"]}
# for _, row in keywords_counts.iterrows()
# ]

json_string = json.dumps(json_data, indent=4, ensure_ascii=False)

# 将json_string写入json文件
with open("./Mt_keywords_counts.json", "w", encoding="utf-8") as f:
    f.write(json_string)