### 解析GSE.soft文件

In [111]:
import re
import pandas as pd
import os

def extract_information(file_path):
    """从GSE的soft文件中提取提交单位与国家
    """

    information = []

    with open(file_path) as file:
        content = file.read()

    # 使用正则表达式进行匹配提取信息
    accession_pattern = r'!Series_geo_accession = (\w+)'
    accessions = re.findall(accession_pattern, content)
    information.append(accessions[0])

    institute_pattern = r'!Series_contact_institute = (.*)'
    institutes = re.findall(institute_pattern, content)
    information.append(institutes[0])

    country_pattern = r'!Series_contact_country = (.*)'
    countries = re.findall(country_pattern, content)
    information.append(countries[0])

    organism_pattern = r'!Series_platform_organism = (.*)'
    organism = re.findall(organism_pattern, content)
    information.append(organism[0])

    return information

单个测试

In [94]:
# location = './Mt/GSE_soft/GSE110062_GSE.soft'
# extract_information(location)

### 解析下载的GSE，只保留研究核心菌种是目标菌种的GSE


In [112]:
path = './Nc/GSE_soft/'
key_name = 'Nc'
results = []

In [113]:
for filename in os.listdir(path):
    result = extract_information(os.path.join(path, filename))
    results.append(result)

df = pd.DataFrame(results, columns=['GEO Series ID', 'Institute', 'Country', 'Organism'])
display(df)

Unnamed: 0,GEO Series ID,Institute,Country,Organism
0,GSE195441,Fujian normal university,China,Neurospora crassa
1,GSE55429,Stanford University,USA,Dictyostelium discoideum
2,GSE201901,Geisel School of Medicine at Dartmouth,USA,Neurospora crassa
3,GSE101412,Yale University,USA,Neurospora crassa
4,GSE113321,"Tianjin Institute of Industrial Biotechnology,...",China,Neurospora crassa
...,...,...,...,...
89,GSE52153,"University of California, Berkeley",USA,Neurospora crassa
90,GSE150287,UT Southwestern Medical Center,USA,Neurospora crassa
91,GSE220169,"Institute of Microbiology, Chinese Academy of ...",China,Neurospora crassa OR74A
92,GSE73838,"Tianjin Institute of Industrial Biotechnology,...",China,Neurospora crassa


In [114]:
df.to_csv(key_name + '_soft.csv', index=False )

### 读取菌种在NCBI上有相应文献的所有的GSE

In [117]:
df1 = pd.read_csv(key_name + '_GSE_Info_All.csv').copy()

df1.rename(columns={'Title': 'Study Title'}, inplace=True)

# df1取GEO Series ID列，Samples列，PMID列
df1 = df1[['GSE Series', 'Samples', 'PMID', 'Study Title']]

# PMID列处理为整数
df1['PMID'] = df1['PMID'].fillna(0)
df1['PMID'] = df1['PMID'].astype(int)

df1

Unnamed: 0,GSE Series,Samples,PMID,Study Title
0,GSE237909,18,0,Cellular communication and fusion regulates ce...
1,GSE232935,31,0,Histone deacetylation and cytosine methylation...
2,GSE232934,12,0,Histone deacetylation and cytosine methylation...
3,GSE232933,19,0,Histone deacetylation and cytosine methylation...
4,GSE220169,12,37083494,The nutrient-sensing GCN2 signaling pathway is...
...,...,...,...,...
153,GSE14909,12,20305004,Molecular characterization of a cryptochrome i...
154,GSE13977,68,0,Expression of lignin-degrading enzymes in soil...
155,GSE12893,11,19269344,Characterization of the Ku70 homologue HdfA de...
156,GSE8932,135,19262566,Genome-wide characterization of light-inducibl...


In [118]:
df2 = pd.read_csv(key_name + '_GSE_Info_Pub.csv').copy()

# PMID列处理为整数
df2['PMID'] = df2['PMID'].astype(int)

df2

Unnamed: 0,PMID,Title,Abstract,Keywords,Journal,Country,Year
0,37083494,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,"['CPC-3/CPC-1 pathway', 'N. crassa', 'Neurospo...",eLife,China,2023 Apr 21
1,36603054,Nutritional compensation of the circadian cloc...,Compensation is a defining principle of a true...,?,PLoS biology,United States of America,2023 Jan
2,36577368,A circadian clock translational control mechan...,Phosphorylation of Neurospora crassa eukaryoti...,"['CP: Molecular biology', 'CPC-3', 'GCN2', 'Ne...",Cell reports,USA,2022 Dec 27
3,35913159,Transcriptional Regulation by the Velvet Prote...,Asexual reproduction in fungi facilitates the ...,"['RNA-seq', 'conidiation', 'light', 'transcrip...",mBio,Spain,2022 Aug 30
4,35638725,Secondary Metabolism Gene Clusters Exhibit Inc...,Secondary metabolite clusters (SMCs) encode th...,"['Neurospora crassa', 'asexual development', '...",mSystems,USA,2022 Jun 28
...,...,...,...,...,...,...,...
104,20395474,Genome-wide evolutionary analysis of eukaryoti...,Eukaryotic cytosine methylation represses tran...,?,"Science (New York, N.Y.)",USA,2010 May 14
105,20305004,Genetic and molecular characterization of a cr...,"In plants and animals, cryptochromes function ...",?,Eukaryotic cell,USA,2010 May
106,19269344,Construction of an hdfA Penicillium chrysogenu...,The homologous recombination mechanism for DNA...,?,Fungal genetics and biology : FG & B,The Netherlands,2009 May
107,19262566,Genome-wide analysis of light-inducible respon...,White collar-1 (WC-1) and white collar-2 (WC-2...,?,The EMBO journal,USA,2009 Apr 22


In [119]:
df3 = pd.read_csv(key_name + '_soft.csv').copy()

# GEO Series ID列更名为GSE Series
df3 = df3.rename(columns={'GEO Series ID': 'GSE Series'})

df3

Unnamed: 0,GSE Series,Institute,Country,Organism
0,GSE195441,Fujian normal university,China,Neurospora crassa
1,GSE55429,Stanford University,USA,Dictyostelium discoideum
2,GSE201901,Geisel School of Medicine at Dartmouth,USA,Neurospora crassa
3,GSE101412,Yale University,USA,Neurospora crassa
4,GSE113321,"Tianjin Institute of Industrial Biotechnology,...",China,Neurospora crassa
...,...,...,...,...
89,GSE52153,"University of California, Berkeley",USA,Neurospora crassa
90,GSE150287,UT Southwestern Medical Center,USA,Neurospora crassa
91,GSE220169,"Institute of Microbiology, Chinese Academy of ...",China,Neurospora crassa OR74A
92,GSE73838,"Tianjin Institute of Industrial Biotechnology,...",China,Neurospora crassa


In [120]:
# df1与df2合并，依据PMID列, 以df1为主
df4 = pd.merge(df1, df2, on='PMID', how='left').copy()

# df去掉Country列
df4 = df4.drop(columns=['Country'])
df4 = df4.drop(columns=['Keywords'])

# df4中每一列的空值替换为空
df4 = df4.fillna('')

# df4中的Abstract列删除BACKGROUND:
df4['Abstract'] = df4['Abstract'].str.replace('BACKGROUND:', '')

# df4中的Yead列显示的是年份+月份+日期，只保留年份
df4['Year'] = df4['Year'].str[:4]

df4

Unnamed: 0,GSE Series,Samples,PMID,Study Title,Title,Abstract,Journal,Year
0,GSE237909,18,0,Cellular communication and fusion regulates ce...,,,,
1,GSE232935,31,0,Histone deacetylation and cytosine methylation...,,,,
2,GSE232934,12,0,Histone deacetylation and cytosine methylation...,,,,
3,GSE232933,19,0,Histone deacetylation and cytosine methylation...,,,,
4,GSE220169,12,37083494,The nutrient-sensing GCN2 signaling pathway is...,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,eLife,2023
...,...,...,...,...,...,...,...,...
153,GSE14909,12,20305004,Molecular characterization of a cryptochrome i...,Genetic and molecular characterization of a cr...,"In plants and animals, cryptochromes function ...",Eukaryotic cell,2010
154,GSE13977,68,0,Expression of lignin-degrading enzymes in soil...,,,,
155,GSE12893,11,19269344,Characterization of the Ku70 homologue HdfA de...,Construction of an hdfA Penicillium chrysogenu...,The homologous recombination mechanism for DNA...,Fungal genetics and biology : FG & B,2009
156,GSE8932,135,19262566,Genome-wide characterization of light-inducibl...,Genome-wide analysis of light-inducible respon...,White collar-1 (WC-1) and white collar-2 (WC-2...,The EMBO journal,2009


In [121]:
# df4与df3合并，依据GEO Series列, 以df4为主
df5 = pd.merge(df4, df3, on='GSE Series', how='left')
df5

Unnamed: 0,GSE Series,Samples,PMID,Study Title,Title,Abstract,Journal,Year,Institute,Country,Organism
0,GSE237909,18,0,Cellular communication and fusion regulates ce...,,,,,Yunnan University,China,Orbilia oligospora
1,GSE232935,31,0,Histone deacetylation and cytosine methylation...,,,,,,,
2,GSE232934,12,0,Histone deacetylation and cytosine methylation...,,,,,,,
3,GSE232933,19,0,Histone deacetylation and cytosine methylation...,,,,,,,
4,GSE220169,12,37083494,The nutrient-sensing GCN2 signaling pathway is...,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,eLife,2023,"Institute of Microbiology, Chinese Academy of ...",China,Neurospora crassa OR74A
...,...,...,...,...,...,...,...,...,...,...,...
153,GSE14909,12,20305004,Molecular characterization of a cryptochrome i...,Genetic and molecular characterization of a cr...,"In plants and animals, cryptochromes function ...",Eukaryotic cell,2010,,,
154,GSE13977,68,0,Expression of lignin-degrading enzymes in soil...,,,,,,,
155,GSE12893,11,19269344,Characterization of the Ku70 homologue HdfA de...,Construction of an hdfA Penicillium chrysogenu...,The homologous recombination mechanism for DNA...,Fungal genetics and biology : FG & B,2009,,,
156,GSE8932,135,19262566,Genome-wide characterization of light-inducibl...,Genome-wide analysis of light-inducible respon...,White collar-1 (WC-1) and white collar-2 (WC-2...,The EMBO journal,2009,,,


In [122]:
# df5列顺序调整
df5 = df5[['GSE Series', 
           'Samples', 
           'Study Title', 
           'Title',
           'Abstract',
           'Organism',
           'Country',
           'Institute', 
           'Journal',
           'Year',
           'PMID' 
           ]]

# GSE Series列更名为Series Accession
df5 = df5.rename(columns={'GSE Series': 'Series Accession'})

df5

Unnamed: 0,Series Accession,Samples,Study Title,Title,Abstract,Organism,Country,Institute,Journal,Year,PMID
0,GSE237909,18,Cellular communication and fusion regulates ce...,,,Orbilia oligospora,China,Yunnan University,,,0
1,GSE232935,31,Histone deacetylation and cytosine methylation...,,,,,,,,0
2,GSE232934,12,Histone deacetylation and cytosine methylation...,,,,,,,,0
3,GSE232933,19,Histone deacetylation and cytosine methylation...,,,,,,,,0
4,GSE220169,12,The nutrient-sensing GCN2 signaling pathway is...,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,Neurospora crassa OR74A,China,"Institute of Microbiology, Chinese Academy of ...",eLife,2023,37083494
...,...,...,...,...,...,...,...,...,...,...,...
153,GSE14909,12,Molecular characterization of a cryptochrome i...,Genetic and molecular characterization of a cr...,"In plants and animals, cryptochromes function ...",,,,Eukaryotic cell,2010,20305004
154,GSE13977,68,Expression of lignin-degrading enzymes in soil...,,,,,,,,0
155,GSE12893,11,Characterization of the Ku70 homologue HdfA de...,Construction of an hdfA Penicillium chrysogenu...,The homologous recombination mechanism for DNA...,,,,Fungal genetics and biology : FG & B,2009,19269344
156,GSE8932,135,Genome-wide characterization of light-inducibl...,Genome-wide analysis of light-inducible respon...,White collar-1 (WC-1) and white collar-2 (WC-2...,,,,The EMBO journal,2009,19262566


只保留Soft文件对应的数据集（具体那些GSE需要先去NCBI上看看）

In [123]:
# 删除df中Organism为NaN的行
df6 = df5.dropna(subset=['Organism']) # 代表不是我们挑的数据集
df6

Unnamed: 0,Series Accession,Samples,Study Title,Title,Abstract,Organism,Country,Institute,Journal,Year,PMID
0,GSE237909,18,Cellular communication and fusion regulates ce...,,,Orbilia oligospora,China,Yunnan University,,,0
4,GSE220169,12,The nutrient-sensing GCN2 signaling pathway is...,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,Neurospora crassa OR74A,China,"Institute of Microbiology, Chinese Academy of ...",eLife,2023,37083494
5,GSE199259,5,Orphan elements are clustered with allorecogni...,,,Neurospora crassa,USA,Yale University,,,0
6,GSE225621,12,Effect of upf1 knockout on gene expression of ...,,,Neurospora crassa OR74A,China,China Agricultural University,,,0
7,GSE201901,4,A role for gene expression and mRNA stability ...,Nutritional compensation of the circadian cloc...,Compensation is a defining principle of a true...,Neurospora crassa,USA,Geisel School of Medicine at Dartmouth,PLoS biology,2023,36603054
...,...,...,...,...,...,...,...,...,...,...,...
124,GSE50205,1,Identification of the transcriptional profile ...,,,Neurospora crassa,USA,UC Berkeley,,,0
129,GSE45406,112,Genome Wide Association identifies Novel Loci ...,,,Neurospora crassa,USA,UC Berkeley,,,0
133,GSE35227,26,Conserved and Essential Transcription Factors ...,Conserved and essential transcription factors ...,Rational engineering of filamentous fungi for ...,Neurospora crassa OR74A,USA,UC Berkeley,Proceedings of the National Academy of Science...,2012,22532664
135,GSE36719,13,Induction of lignocellulose-degrading enzymes ...,Induction of lignocellulose-degrading enzymes ...,Neurospora crassa colonizes burnt grasslands i...,Neurospora crassa OR74A,USA,UC Berkeley,Proceedings of the National Academy of Science...,2012,22474347


In [124]:
# 如果df6中PMID列的值为0，则删除该行
df7 = df6[df6['PMID'] != 0] # 代表没对应文献
df7

Unnamed: 0,Series Accession,Samples,Study Title,Title,Abstract,Organism,Country,Institute,Journal,Year,PMID
4,GSE220169,12,The nutrient-sensing GCN2 signaling pathway is...,The nutrient-sensing GCN2 signaling pathway is...,Circadian clocks are evolved to adapt to the d...,Neurospora crassa OR74A,China,"Institute of Microbiology, Chinese Academy of ...",eLife,2023,37083494
7,GSE201901,4,A role for gene expression and mRNA stability ...,Nutritional compensation of the circadian cloc...,Compensation is a defining principle of a true...,Neurospora crassa,USA,Geisel School of Medicine at Dartmouth,PLoS biology,2023,36603054
14,GSE181566,160,Circadian Clock-Controlled Translation of Spec...,A circadian clock translational control mechan...,Phosphorylation of Neurospora crassa eukaryoti...,Neurospora crassa,USA,Texas A&M University,Cell reports,2022,36577368
16,GSE181564,80,Circadian Clock-Controlled Translation of Spec...,A circadian clock translational control mechan...,Phosphorylation of Neurospora crassa eukaryoti...,Neurospora crassa,USA,Texas A&M University,Cell reports,2022,36577368
17,GSE180332,24,Transcriptional regulation by the Velvet compl...,Transcriptional Regulation by the Velvet Prote...,Asexual reproduction in fungi facilitates the ...,Neurospora crassa,Spain,University of Seville,mBio,2022,35913159
...,...,...,...,...,...,...,...,...,...,...,...
122,GSE44100,22,Inducer Free Cellulase Secretion in Neurospora...,Analysis of a conserved cellulase transcriptio...,Cellulose is recalcitrant to deconstruction to...,Aspergillus nidulans FGSC A4,USA,UC Berkeley,MicrobiologyOpen,2013,23766336
123,GSE50446,2,Transcriptional profiling of Neurospora crassa...,Early colony establishment in Neurospora crass...,Vegetative fusion is essential for the develop...,Neurospora crassa OR74A,USA,UC Berkeley,Genetics,2013,24037267
133,GSE35227,26,Conserved and Essential Transcription Factors ...,Conserved and essential transcription factors ...,Rational engineering of filamentous fungi for ...,Neurospora crassa OR74A,USA,UC Berkeley,Proceedings of the National Academy of Science...,2012,22532664
135,GSE36719,13,Induction of lignocellulose-degrading enzymes ...,Induction of lignocellulose-degrading enzymes ...,Neurospora crassa colonizes burnt grasslands i...,Neurospora crassa OR74A,USA,UC Berkeley,Proceedings of the National Academy of Science...,2012,22474347


In [125]:
# 重置索引
df6.reset_index(drop=True, inplace=True)

In [126]:
# df5保存为csv
df6.to_csv(key_name + '_dataset.csv', index=False)