In [26]:
import pandas as pd
import numpy as np
from functools import reduce

from Bio import Medline
'''
=========================================
助记符描述
---------------------------------------
AB摘要
CI版权信息
AD附属机构
IRAD调查员隶属关系
AID物品标识符
AU作者
FAU完整作者
CN公司作者
DCOM完成日期
DA创建日期
LR上次修订日期
DEP电子出版日期
DP出版日期
EDAT Entrez日期
GS基因符号
GN一般说明
GR授予编号
IR研究者姓名
FIR调查员全名
是ISSN
IP问题
TA期刊标题缩写
JT期刊标题
LA语言
LID位置标识符
MID手稿标识符
MHDA MeSH日期
MH MeSH术语
JID NLM唯一ID
RF参考编号
OAB其他摘要
OCI其他版权信息
OID其他ID
OT其他期限
OTO其他期限所有者
自有所有者
PG寻呼
PS个人姓名作为主题
FPS全名作为主题
PL出版地点
PHST发布历史状态
PST发布状态
PT出版物类型
PUBM发布模型
PMC PubMed中央标识符
PMID PubMed唯一标识符
RN注册号/EC号
NM物质名称
SI辅助源ID
SO来源
SFM太空飞行任务
STAT状态
SB子集
TI标题
TT翻译标题
VI卷
CON评论
CIN注释
EIN勘误表
EFR勘误表
CRI已更正并于重新发布
CRF从
PRIN部分收回
PROF部分收回
RPI于重新发布
RPF从重新发布
RIN缩回
ROF收回
UIN更新
UOF更新
SPIN总结
ORI中的原始报告
========================================='''

In [27]:
df = pd.read_csv('./csv-Neurospora-set.csv', sep=',').copy()

# 删除空值
df = df.dropna(subset=['PMID'])
# 删除重复值
df = df.drop_duplicates(subset=['PMID'])
df = df.reset_index(drop=True)
# 如果df的DOI列为空，则删除该列
df = df.dropna(subset=['DOI'])

print("文献数量：", df.shape[0])
df.head()

文献数量： 6892


Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI
0,35858815,Glucuronoyl esterase facilitates biomass degra...,"Wang R, Arioka M.",J Gen Appl Microbiol. 2023 Mar 6;68(6):278-286...,Wang R,J Gen Appl Microbiol,2023,2022/7/20,,,10.2323/jgam.2022.06.002
1,36481248,Optimized fluorescent proteins for 4-color and...,"Wang Z, Bartholomai BM, Loros JJ, Dunlap JC.",Fungal Genet Biol. 2023 Jan;164:103763. doi: 1...,Wang Z,Fungal Genet Biol,2023,2022/12/8,PMC10501358,NIHMS1927181,10.1016/j.fgb.2022.103763
2,37220856,Domains required for the interaction of the ce...,"Wang B, Dunlap JC.",J Biol Chem. 2023 Jul;299(7):104850. doi: 10.1...,Wang B,J Biol Chem,2023,2023/5/23,PMC10320511,,10.1016/j.jbc.2023.104850
3,37313736,Permissiveness and competition within and betw...,"Mela AP, Glass NL.",Genetics. 2023 Aug 9;224(4):iyad112. doi: 10.1...,Mela AP,Genetics,2023,2023/6/14,PMC10411585,,10.1093/genetics/iyad112
4,36744948,Developing a Temperature-Inducible Transcripti...,"Tabilo-Agurto C, Del Rio-Pinilla V, Eltit-Vill...",mBio. 2023 Feb 28;14(1):e0329122. doi: 10.1128...,Tabilo-Agurto C,mBio,2023,2023/2/6,PMC9973361,,10.1128/mbio.03291-22


In [28]:
from Bio import Medline
from Bio import Entrez
import string
import time
from functools import reduce


Entrez.email = "dongjiacheng963@gmail.com"
# Entrez.api_key = "6fcb01c2989c35aa1da3d9d0118abedfc409"


def get_keywords(PMID_list):
    """Thank to Dr. Mao Zhitao for his help in this function."""
    
    PMID_info = []
    for i in PMID_list:
        # 解释：db：数据库，id：文献id，rettype：返回的文献格式，retmode：返回的文献格式  #获取PMID为pmid的论文，返回medline格式的文本
        handle = Entrez.efetch(db="pubmed", id=i, rettype="medline", retmode="text")
        records = Medline.parse(handle)
        records = list(records)

        for record in records:
            title = record.get("TI", "?")  # 标题  # 解释get函数：获取字典中的值，如果没有则返回？
            abstract = record.get("AB", "?")  # 摘要
            keywords = record.get("OT", "?") # 关键词
            Journal_abbrev = record.get("TA", "?") # 期刊缩写
            Journal = record.get("JT", "?")
            country = record.get("AD", "?")[0].split(", ")[-1].strip(string.punctuation)
            pmid = record.get("PMID", "?")
            Author = record.get("FAU", "?")
            doi = record.get("SO", "?")
            date = record.get("DEP", "?")
            year = record.get("DP", "?")
            institution = record.get("AD", "?")[0]

            PMID_info.append(
                {
                    "PMID": pmid,
                    "DOI": doi,
                    "Author": Author,
                    "Title": title,
                    "Abstract": abstract,
                    "Keywords": keywords,
                    "Journal_abbrev": Journal_abbrev,
                    "Journal": Journal,
                    "Institution": institution,
                    "Country": country,
                    "Date": date,
                    "Year": year,
                }
            )
    return PMID_info


def optimized_fetch(PMID_list):
    PMID_info = []
    start_time = time.time()

    for i, pmid in enumerate(PMID_list):
        if i % 3 == 0 and i != 0:
            elapsed_time = time.time() - start_time
            if elapsed_time < 1:
                time.sleep(1 - elapsed_time)
            start_time = time.time()

        info = get_keywords([pmid])
        PMID_info.extend(info)

    return PMID_info

In [None]:
# # 设置Pubmed API访问频率
# request_interval = 5  # 间隔时间

# # 按每10篇文章，获取文献信息
# list_PMID_info = [] 
# list_interval = np.arange(0, 10000, 10)
# for i in list_interval:
#     list_PMID = df['PMID'].tolist()[i:i+10]
#     PMID_info = get_keywords(list_PMID)
#     list_PMID_info.append(PMID_info)
#     time.sleep(request_interval)

# # 合并列表2
# list_info = reduce(lambda x,y: x+y, list_PMID_info)
# df_info = pd.DataFrame(list_info)
# # df_info['DOI'] = df_info['DOI'].str.extract(r'(10\.\d{4,9}\/[-._;()/:A-Za-z0-9]+)')
# # df_info['DOI'] = df_info['DOI'].str.rstrip('.')
# df_info

In [None]:
pmid_list = df['PMID'].tolist() 
# 使用 optimized_fetch 函数获取文献信息
list_PMID_info = optimized_fetch(pmid_list)
df_info = pd.DataFrame(list_PMID_info)
df_info

In [31]:
# 保存为csv文件
df_info.to_csv('./df_info_输出示例.csv', index=False, sep=',', encoding='utf_8_sig')

In [32]:
# df_info的PMID列有重复值，需要删除
df_info = df_info.drop_duplicates(subset=['PMID'])
df_info = df_info.reset_index(drop=True)
df_info

Unnamed: 0,PMID,DOI,Author,Title,Abstract,Keywords,Journal_abbrev,Journal,Institution,Country,Date,Year
0,35858815,10.2323/jgam.2022.06.002,"[Wang, Ruijie, Arioka, Manabu]",Glucuronoyl esterase facilitates biomass degra...,Glucuronoyl esterase (GE) is a promising agent...,"[Neurospora crassa, cellulase, glucuronoyl est...",J Gen Appl Microbiol,The Journal of general and applied microbiology,"Department of Biotechnology, The University of...",The University of Tokyo,20220721,2023 Mar 6
1,36481248,10.1016/j.fgb.2022.103763,"[Wang, Ziyan, Bartholomai, Bradley M, Loros, J...",Optimized fluorescent proteins for 4-color and...,Fungal cells are quite unique among life in th...,?,Fungal Genet Biol,Fungal genetics and biology : FG & B,"Geisel School of Medicine at Dartmouth, Depart...",USA,20221205,2023 Jan
2,37220856,10.1016/j.jbc.2023.104850,"[Wang, Bin, Dunlap, Jay C]",Domains required for the interaction of the ce...,In the negative feedback loop composing the Ne...,"[FFC, FRH, FRQ, Neurospora, WC-1, WC-2, WCC, a...",J Biol Chem,The Journal of biological chemistry,"Department of Molecular and Systems Biology, G...",USA. Electronic address: Bin.Wang@Dartmouth.edu,20230521,2023 Jul
3,37313736,10.1093/genetics/iyad112,"[Mela, Alexander P, Glass, N Louise]",Permissiveness and competition within and betw...,A multinucleate syncytium is a common growth f...,"[Neurospora crassa, allorecognition, cell fusi...",Genetics,Genetics,"The Plant and Microbial Biology Department, Un...",USA,?,2023 Aug 9
4,36744948,10.1128/mbio.03291-22,"[Tabilo-Agurto, Cyndi, Del Rio-Pinilla, Veroni...",Developing a Temperature-Inducible Transcripti...,"Heat shock protein (HSP)-encoding genes (hsp),...","[HSP, Neurospora, Neurospora crassa, heat shoc...",mBio,mBio,Departamento de Genetica Molecular y Microbiol...,Chile,20230206,2023 Feb 28
...,...,...,...,...,...,...,...,...,...,...,...,...
318,30986238,10.1371/journal.pone.0214546,"[Shim, Euijin, Su, Jing, Noro, Jennifer, Teixe...",Conductive bacterial cellulose by in situ lacc...,Conductive and colored bacterial cellulose (BC...,?,PLoS One,PloS one,"Department of Clothing and Textiles, Sookmyung...",South Korea,20190415,2019
319,17932916,10.1002/prot.21699,"[Zumarraga, Miren, Camarero, Susana, Shleev, S...",Altering the laccase functionality by in vivo ...,The generation of diversity for directed prote...,?,Proteins,Proteins,"Instituto de Catalisis y Petroleoquimica, CSIC...",Spain,?,2008 Apr
320,8597577,10.1016/0167-4838(95)00210-3,"[Xu, F, Shin, W, Brown, S H, Wahleithner, J A,...",A study of a series of recombinant fungal lacc...,A series of fungal laccases (Polyporus pinsitu...,?,Biochim Biophys Acta,Biochimica et biophysica acta,"Novo Nordisk Biotech, Davis, CA 95616 USA.",CA 95616 USA,?,1996 Feb 8
321,12817956,10.1021/ic026099n,"[Palmer, Amy E, Szilagyi, Robert K, Cherry, Jo...",Spectroscopic characterization of the Leu513Hi...,"A variety of spectroscopic techniques, combine...",?,Inorg Chem,Inorganic chemistry,"Department of Chemistry, Stanford University, ...",USA,?,2003 Jun 30


In [33]:
# 保存为csv文件
df_info.to_csv('./df_info_输出示例-2.csv', index=False, sep=',', encoding='utf_8_sig')

In [28]:
# 根据df_info['Journal']的信息，绘制柱状图
import plotly.express as px
import plotly.graph_objects as go

df_info_gro_jour = df_info.groupby('Journal').count().sort_values(by='PMID', ascending=False).iloc[:50, :]
fig = px.scatter(df_info_gro_jour, x=df_info_gro_jour.index, y=df_info_gro_jour['PMID'], title='Journals', labels={'x':'Journal', 'y':'Count'},)
fig.update_layout(height=1000, xaxis_tickangle=45, template='plotly', font_size=20)
fig.show()