In [None]:
import re, json, requests
import pandas as pd

# Samples

In [None]:
acc = pd.read_csv('../data/glycine_max_sra.csv')
acc = acc.loc[acc['assay_type'] == 'RNA-Seq']
acc = acc.set_index('biosample')
acc

In [None]:
acc.loc['SAMN30889879']

In [None]:
attr = pd.DataFrame.from_records(acc['jattr'].apply(eval), index = acc.index)
attr

In [None]:
attr.isna().mean().sort_values()

In [8]:
acc.to_csv('helper/acc.csv')
attr.to_csv('helper/attr.csv')

In [9]:
def get_biosample(biosamp_id):
    api_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=biosample&id={biosamp_id}&rettype=full&retmode=text"
    res = requests.get(api_url)
    return res.text

def metadata_json(metadata):
    res = {}
    res['title'] = metadata.split('\n')[0].replace('1: ', '')
    identifiers = re.search(r'Identifiers: (.+)', metadata)
    if identifiers:
        res['Identifiers'] = {k.strip(): v.strip() for k, v in 
                                [item.split(':') if ':' in item else (item, "")
                                    for item in identifiers.group(1).split(';')]}
    organism = re.search(r'Organism: (.+)', metadata)
    if organism:
        res['Organism'] = organism.group(1).strip()
    attributes = re.findall(r'/(\w+)="([^"]+)"', metadata)
    if attributes:
        res['Attributes'] = {k: v for k, v in attributes}
    accession = re.search(r'Accession: (\w+)', metadata)
    if accession:
        res['Accession'] = accession.group(1)
    id_match = re.search(r'ID: (\d+)', metadata)
    if id_match:
        res['ID'] = id_match.group(1)
    
    return json.dumps(res)

In [10]:
try:
    biosamp = pd.read_csv('helper/biosamp.csv', index_col=0)
except:
    biosamp = pd.DataFrame(acc['biosample'])
    biosamp['metadata'] = acc['biosample'].apply(get_biosample)
    biosamp['jmeta'] = biosamp['metadata'].apply(metadata_json)
    biosamp = biosamp.set_index('biosample')
    biosamp = pd.DataFrame(biosamp)
    biosamp.to_csv('helper/biosamp.csv')

In [11]:
jmeta = pd.DataFrame.from_records(biosamp['metadata'].apply(metadata_json).apply(json.loads), index = biosamp.index)
jmeta

  jmeta = pd.DataFrame.from_records(biosamp['metadata'].apply(metadata_json).apply(json.loads), index = biosamp.index)


Unnamed: 0_level_0,title,Identifiers,Organism,Attributes,Accession,ID
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SAMD00235537,Oryza sativa Japonica Group NG8334,"{'BioSample': 'SAMN00235537', 'GSS': 'LIBGSS_0...",Oryza sativa Japonica Group,"{'strain': 'NG8334', 'cultivar': 'Nipponbare'}",SAMN00235537,235537
SAMN30889879,Plant sample from Glycine max,"{'BioSample': 'SAMN30889879', 'Sample name': '...",Glycine max,"{'isolate': 'B sufficient', 'cultivar': 'missi...",SAMN30889879,30889879
SAMN09513405,Plant sample from Glycine max,"{'BioSample': 'SAMN09513405', 'Sample name': '...",Glycine max,"{'isolate': 'P. sojae 1.S.1.1', 'cultivar': '1...",SAMN09513405,9513405
SAMN09078292,mRNA gmax PI641156 NC-Raleigh,"{'BioSample': 'SAMN09078292', 'SRA': 'SRS32627...",Glycine max,"{'cultivar': 'PI641156 NC-Raleigh', 'tissue': ...",SAMN09078292,9078292
SAMN36760700,Plant sample from Glycine max,"{'BioSample': 'SAMN36760700', 'Sample name': '...",Glycine max,{'isolate': 'R89-2 line repeat 1 in CSSL popul...,SAMN36760700,36760700
...,...,...,...,...,...,...
SAMN25827181,12dpi-2C-Replicate2,"{'BioSample': 'SAMN25827181', 'Sample name': '...",Glycine max,"{'cultivar': 'Williams 82', 'age': '12 days po...",SAMN25827181,25827181
SAMN20971793,Mock-inoculated Williams rep 2 [WM-2a],"{'BioSample': 'SAMN20971793', 'SRA': 'SRS99190...",Glycine max,{'cultivar': 'Soybean cultivar Williams'},SAMN20971793,20971793
SAMN41726398,Plant sample from Glycine max,"{'BioSample': 'SAMN41726398', 'Sample name': '...",Glycine max,"{'isolate': 'HPHO_2', 'tissue': 'seed'}",SAMN41726398,41726398
SAMN07202264,LG92-1255_RNA-seq,"{'BioSample': 'SAMN07202264', 'SRA': 'SRS22599...",Glycine max,"{'tissue': 'leaves', 'cultivar': 'LG92-1255'}",SAMN07202264,7202264


In [12]:
jattr = pd.DataFrame.from_records(jmeta['Attributes'].dropna(), index = jmeta['Attributes'].dropna().index)
jident = pd.DataFrame.from_records(jmeta['Identifiers'].dropna(), index = jmeta['Identifiers'].dropna().index).drop('BioSample', axis=1)

  jattr = pd.DataFrame.from_records(jmeta['Attributes'].dropna(), index = jmeta['Attributes'].dropna().index)
  jident = pd.DataFrame.from_records(jmeta['Identifiers'].dropna(), index = jmeta['Identifiers'].dropna().index).drop('BioSample', axis=1)


In [13]:
biometa = pd.DataFrame(jmeta['title']).join(jattr, how='left', rsuffix='_jattr').join(jident, how='left', rsuffix='_jident')

In [14]:
biometa.to_csv('helper/biometa.csv')

# Unified

In [15]:
acc = pd.read_csv('helper/acc.csv')
attr = pd.read_csv('helper/attr.csv')
biometa = pd.read_csv('helper/biometa.csv')

  attr = pd.read_csv('helper/attr.csv')
  biometa = pd.read_csv('helper/biometa.csv')


In [16]:
acc = acc.drop_duplicates(keep='first', subset='biosample').set_index('biosample')
attr = attr.drop_duplicates(keep='first', subset='biosample').set_index('biosample')
biometa = biometa.drop_duplicates(keep='first', subset='biosample').set_index('biosample')

In [17]:
joined = acc.join(attr, how='outer', rsuffix='_attr').join(jmeta, how='outer', rsuffix='_jmeta')

In [18]:
# joined = pd.concat([acc, attr, jmeta], axis=1)
joined.to_csv('joined.csv')
joined

Unnamed: 0_level_0,acc,assay_type,center_name,consent,experiment,sample_name,instrument,librarylayout,libraryselection,librarysource,...,er_gfp_sam,subsrc_note_sam_s_dpl392,authors_sam,env_biome_sam,title,Identifiers,Organism,Attributes,Accession,ID
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMD00025071,DRR029571,RNA-Seq,NUGSS,public,DRX026629,DRS020767,Illumina Genome Analyzer IIx,SINGLE,RANDOM,TRANSCRIPTOMIC,...,,,,,Error: uid 25071 cannot get document summary,,,,,
SAMD00025072,DRR029572,RNA-Seq,NUGSS,public,DRX026630,DRS020768,Illumina Genome Analyzer IIx,SINGLE,RANDOM,TRANSCRIPTOMIC,...,,,,,Error: uid 25072 cannot get document summary,,,,,
SAMD00025073,DRR029573,RNA-Seq,NUGSS,public,DRX026631,DRS020769,Illumina Genome Analyzer IIx,SINGLE,RANDOM,TRANSCRIPTOMIC,...,,,,,Error: uid 25073 cannot get document summary,,,,,
SAMD00025074,DRR029574,RNA-Seq,NUGSS,public,DRX026632,DRS020770,Illumina Genome Analyzer IIx,SINGLE,RANDOM,TRANSCRIPTOMIC,...,,,,,Error: uid 25074 cannot get document summary,,,,,
SAMD00025075,DRR029575,RNA-Seq,NUGSS,public,DRX026633,DRS020771,Illumina Genome Analyzer IIx,SINGLE,RANDOM,TRANSCRIPTOMIC,...,,,,,Error: uid 25075 cannot get document summary,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN41829221,SRR29409034,RNA-Seq,KANSAS STATE UNIVERSITY,public,SRX24922515,Soybean308,NextSeq 500,SINGLE,cDNA,TRANSCRIPTOMIC,...,,,,,Plant sample from Glycine max,"{'BioSample': 'SAMN41829221', 'Sample name': '...",Glycine max,"{'cultivar': 'Pharaoh', 'tissue': 'Stem'}",SAMN41829221,41829221
SAMN41829222,SRR29409033,RNA-Seq,KANSAS STATE UNIVERSITY,public,SRX24922516,Soybean309,NextSeq 500,SINGLE,cDNA,TRANSCRIPTOMIC,...,,,,,Plant sample from Glycine max,"{'BioSample': 'SAMN41829222', 'Sample name': '...",Glycine max,"{'cultivar': 'Pharaoh', 'tissue': 'Stem'}",SAMN41829222,41829222
SAMN41829223,SRR29409032,RNA-Seq,KANSAS STATE UNIVERSITY,public,SRX24922517,Soybean310,NextSeq 500,SINGLE,cDNA,TRANSCRIPTOMIC,...,,,,,Plant sample from Glycine max,"{'BioSample': 'SAMN41829223', 'Sample name': '...",Glycine max,"{'cultivar': 'Pharaoh', 'tissue': 'Stem'}",SAMN41829223,41829223
SAMN41829224,SRR29409031,RNA-Seq,KANSAS STATE UNIVERSITY,public,SRX24922518,Soybean311,NextSeq 500,SINGLE,cDNA,TRANSCRIPTOMIC,...,,,,,Plant sample from Glycine max,"{'BioSample': 'SAMN41829224', 'Sample name': '...",Glycine max,"{'cultivar': 'Dt974290', 'tissue': 'Stem'}",SAMN41829224,41829224


# Bioprojects

In [19]:
def get_bioproject(bioproj_id):
    api_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=bioproject&id={bioproj_id}"
    res = requests.get(api_url)
    return res.text

In [20]:
venancio = pd.read_json('venancio.json')
venancio.columns = ['bioproject', 'n', 'part', 'title', 'abstract', 'doi', 'pmid']
venancio

Unnamed: 0,bioproject,n,part,title,abstract,doi,pmid
0,PRJNA360609,8,seedling: 8,Transgenic and conventional cultivar compariso...,Soybean is one of the main sources of oil worl...,10.1105/tpc.18.00662,31227558
1,PRJNA369113,36,leaf: 36,Circadian transcriptome of soybean unifoliolat...,The goal of this study is to obtain the circad...,10.1073/pnas.1708508116,31676549
2,PRJNA369414,21,embryo: 15 | cotyledon: 6,Genome-Wide Reinforcement of DNA Methylation O...,Tissue culture is a major method for plant reg...,,
3,PRJNA369483,6,seed: 6,Characterizing seed weight related genes throu...,Cultivated soybean has domesticated in China f...,http://dx.doi.org/10.1007/s00299-017-2165-5,28653111
4,PRJNA372408,1,shoot: 1,Gene expression analysis of Glycine max willia...,Transcriptome sequencing as part of the JGI Fl...,,
...,...,...,...,...,...,...,...
489,PRJNA827520,6,leaf: 6,Glycine max Transcriptome or Gene expression,Glycine max treate with water or GZM to test t...,,
490,PRJNA827666,6,root: 6,Study on improvement of water and nutrient upt...,Application and/or natural Silicates (Si) is c...,,
491,PRJNA832118,24,leaf: 24,Glycine max Transcriptome or Gene expression,Transgenic GmFT3a soybean,,
492,PRJNA833532,12,root: 12,Glycine max Raw sequence reads,soybean aluminum stress,,


In [21]:
projects = pd.DataFrame({'bioproject': joined.bioproject.unique()})
projects

Unnamed: 0,bioproject
0,PRJDB3474
1,PRJDB3582
2,PRJDB7011
3,PRJDB7219
4,PRJDB7775
...,...
785,PRJNA1119938
786,PRJNA1120606
787,PRJNA1121049
788,PRJNA1121077


In [22]:
projects = projects.join(venancio.set_index('bioproject'), rsuffix='_venancio')
projects

Unnamed: 0,bioproject,n,part,title,abstract,doi,pmid
0,PRJDB3474,,,,,,
1,PRJDB3582,,,,,,
2,PRJDB7011,,,,,,
3,PRJDB7219,,,,,,
4,PRJDB7775,,,,,,
...,...,...,...,...,...,...,...
785,PRJNA1119938,,,,,,
786,PRJNA1120606,,,,,,
787,PRJNA1121049,,,,,,
788,PRJNA1121077,,,,,,


In [23]:
projects['xml'] = projects['bioproject'].apply(get_bioproject)

KeyboardInterrupt: 

In [None]:
projects