In [1]:
from Bio import Entrez
import pandas as pd
import re

# E-mail address used by Entrez for its requests; this is required.
Entrez.email = "3575318018@qq.com"

# Prefix for the names of the saved tables, to keep results for different species apart.
# key_name = "An"
# name = "Aspergillus niger"
# key_name = "Tr"
# name = "Trichoderma reesei"
key_name = "Nc"
name = "Neurospora crassa"

# Search term and database to query.
search_term = name + " AND GSE[Entry Type]"
database = "gds"

# Search with Entrez and collect the list of matching IDs.
handle1 = Entrez.esearch(db=database, term=search_term, retstart=0, retmax=200)
try:
    record = Entrez.read(handle1)
finally:
    # Always release the HTTP connection, even when parsing fails.
    handle1.close()
id_list = record["IdList"]

# retmax caps the number of returned IDs; make silent truncation visible.
total_hits = int(record.get("Count", len(id_list)))
if total_hits > len(id_list):
    print("Warning: {} matching entries, only the first {} were retrieved "
          "(raise retmax to get all of them).".format(total_hits, len(id_list)))

# Fetch the detailed summary of every ID found.
if id_list:
    handle2 = Entrez.esummary(db=database, id=",".join(id_list))
    try:
        records = list(Entrez.read(handle2))
    finally:
        handle2.close()
else:
    # Nothing matched: skip the esummary request, which would get an empty id.
    records = []

In [2]:
def get_gse_info(records, prefix=None):
    """Collect the key information of each GSE data set found in ``records``.

    For every record the accession, title, summary, sample count and the
    first PubMed ID (``"N/A"`` when the record has none) are gathered into
    one table row.

    Args:
        records: iterable of mapping-like records. Each one is read through
            the keys ``'title'``, ``'summary'``, ``'Accession'`` and
            ``'n_samples'``; ``'PubMedIds'`` is optional.
        prefix: leading part of the output file names. When ``None`` (the
            default) the module-level ``key_name`` is used.

    Returns:
        pandas.DataFrame with the columns ``GSE Series``, ``Title``,
        ``Summary``, ``Samples`` and ``PMID``. It is empty, but still has
        these columns, when ``records`` is empty.

    Side effects:
        Writes ``<prefix>_GSE_Info_All.csv`` (the whole table) and
        ``<prefix>_All_GSE.txt`` (one accession per line, LF line endings)
        into the current working directory.
    """
    columns = ['GSE Series', 'Title', 'Summary', 'Samples', 'PMID']
    if prefix is None:
        prefix = key_name

    gse_list = []
    for record in records:
        pmid = "N/A"
        if "PubMedIds" in record and len(record["PubMedIds"]) > 0:
            # Only the first PubMed ID is kept; extract its digits.
            match = re.search(r'\d+', str(record["PubMedIds"][0]))
            if match:
                pmid = match.group(0)

        gse_list.append({
            'GSE Series': record['Accession'],
            'Title': record["title"],
            'Summary': record["summary"],
            'Samples': record['n_samples'],
            'PMID': pmid,
        })

    # Passing the columns explicitly keeps the schema for an empty result, so
    # the column selections below do not raise KeyError.
    df_gse = pd.DataFrame(gse_list, columns=columns)
    # "All": every GSE data set found for this species.
    df_gse.to_csv(prefix + '_' + 'GSE_Info_All.csv', index=False, columns=columns)

    # Save the GSE accessions to a text file. newline='\n' disables newline
    # translation so the file really ends its lines with LF on every platform.
    with open(prefix + '_' + 'All_GSE.txt', 'w', encoding='utf-8', newline='\n') as f:
        for accession in df_gse['GSE Series']:
            f.write(accession + '\n')

    return df_gse


def get_gsm_and_sample_name(records, prefix=None):
    """List every sample of every record together with its series accession.

    Args:
        records: iterable of mapping-like records. Each one is read through
            the keys ``'Accession'`` and ``'Samples'``; every entry of
            ``'Samples'`` is read through ``'Accession'`` and ``'Title'``.
        prefix: leading part of the output file name. When ``None`` (the
            default) the module-level ``key_name`` is used.

    Returns:
        pandas.DataFrame with the columns ``GEO Series`` (the record's
        accession), ``Accession`` (the sample's accession) and ``Title``
        (the sample's title), one row per sample. It is empty, but still has
        these columns, when there are no samples.

    Side effects:
        Writes the table to ``<prefix>_GSM_ID.csv`` in the current working
        directory.
    """
    columns = ['GEO Series', 'Accession', 'Title']
    if prefix is None:
        prefix = key_name

    sample_list = [
        {'GEO Series': record['Accession'],
         'Accession': sample['Accession'],
         'Title': sample['Title']}
        for record in records
        for sample in record['Samples']
    ]

    # Passing the columns explicitly keeps the schema for an empty result, so
    # the column selection in to_csv does not raise KeyError.
    df = pd.DataFrame(sample_list, columns=columns)
    df.to_csv(prefix + '_' + 'GSM_ID.csv', index=False, columns=columns)

    return df

In [3]:
# Build the GSE summary table (this also writes the CSV and TXT files) and show it as the cell output.
get_gse_info(records)

Unnamed: 0,GSE Series,Title,Summary,Samples,PMID
0,GSE237909,Cellular communication and fusion regulates ce...,"In this study, we characterized Aoadv-1, Aoso,...",18,
1,GSE232935,Histone deacetylation and cytosine methylation...,This SuperSeries is composed of the SubSeries ...,31,
2,GSE232934,Histone deacetylation and cytosine methylation...,Chromosomes must correctly fold in eukaryotic ...,12,
3,GSE232933,Histone deacetylation and cytosine methylation...,Chromosomes must correctly fold in eukaryotic ...,19,
4,GSE220169,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,12,37083494
...,...,...,...,...,...
153,GSE14909,Molecular characterization of a cryptochrome i...,Cryptochromes were identified in plants and an...,12,20305004
154,GSE13977,Expression of lignin-degrading enzymes in soil...,Microarrays have become established tools for ...,68,
155,GSE12893,Characterization of the Ku70 homologue HdfA de...,Targeting an engineered DNA fragment to a spec...,11,19269344
156,GSE8932,Genome-wide characterization of light-inducibl...,"To better understand the roles of WC-1, WC-2 a...",135,19262566


In [4]:
# Build the per-sample GSM table (this also writes the GSM_ID CSV file) and show it as the cell output.
get_gsm_and_sample_name(records)

Unnamed: 0,GEO Series,Accession,Title
0,GSE237909,GSM7656285,C_36_1
1,GSE237909,GSM7656271,A_0_2
2,GSE237909,GSM7656279,A_36_1
3,GSE237909,GSM7656282,B_36_1
4,GSE237909,GSM7656283,B_36_2
...,...,...,...
5848,GSE12690,GSM318587,wt strain N150 MeDIP rep 1
5849,GSE12690,GSM318604,double argonaute mutant strain N3406 MeDIP
5850,GSE12690,GSM318578,wt H3K9me3 rep 1
5851,GSE12690,GSM318582,HP1-FLAG ChIP-chip


In [None]:
# Scratch cell: commented-out snippets for inspecting `records` by hand.
# for record in records:
#     Samples_name = record['Samples']
#    #  for i in Samples_name:
#       #  Samples_name_single = Samples_name['Accession']
#       #  Samples_name_title = Samples_name['Title']

#     print("Samples_name: {}".format(Samples_name))

# print("Title: {}".format(title))
# print("Summary: {}".format(summary))
# print("Pmid: {}".format(pmid))
# print("GSE: {}".format(Gse_id))
# print("Samples: {}".format(samples_num))

# for record in records:
#     samples = record['Samples']
#     for sample in samples:
#         accession = sample['Accession']
#         title = sample['Title']
#         print("Accession: {}, Title: {}".format(accession, title))