In [1]:
import itertools

import pandas as pd
pd.options.display.max_columns = 50

sra_run_table = pd.read_csv("../metadata/brawand2011_sra_run_table.csv")
print(sra_run_table.shape)
sra_run_table.head()

(149, 32)


Unnamed: 0,Run,Assay Type,AvgSpotLen,BioProject,BioSample,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession,Instrument,Library Name,LibraryLayout,LibrarySelection,LibrarySource,MBases,MBytes,Organism,Platform,ReleaseDate,sample_acc,Sample Name,source_name,SRA Study,tissue_source,gender,rin,Age,Extraction,sex
0,SRR306710,RNA-Seq,76,PRJNA143627,SAMN00632119,GEO,public,sra,"gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX081869,GSM752557,Illumina Genome Analyzer IIx,GSM752557: gga br F 1,SINGLE,cDNA,TRANSCRIPTOMIC,1480,1134,Gallus gallus,ILLUMINA,2011-10-14T00:00:00Z,SRS213961,GSM752557,Brain,SRP007412,"Facility of Linkoping University, Sweden",female,9.4,"~1 year\, adult",Rneasy Lipid,
1,SRR306711,RNA-Seq,76,PRJNA143627,SAMN00632120,GEO,public,sra,"gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX081870,GSM752558,Illumina Genome Analyzer IIx,GSM752558: gga br M 1,SINGLE,cDNA,TRANSCRIPTOMIC,1334,990,Gallus gallus,ILLUMINA,2011-10-14T00:00:00Z,SRS213962,GSM752558,Brain,SRP007412,"Facility of Linkoping University, Sweden",male,9.1,"~1 year\, adult",Standard,
2,SRR306712,RNA-Seq,76,PRJNA143627,SAMN00632121,GEO,public,sra,"gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX081871,GSM752559,Illumina Genome Analyzer IIx,GSM752559: gga cb F 1,SINGLE,cDNA,TRANSCRIPTOMIC,1795,1339,Gallus gallus,ILLUMINA,2011-10-14T00:00:00Z,SRS213963,GSM752559,Cerebellum,SRP007412,"Facility of Linkoping University, Sweden",female,9.6,"~1 year\, adult",Standard,
3,SRR306713,RNA-Seq,76,PRJNA143627,SAMN00632122,GEO,public,sra,"gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX081872,GSM752560,Illumina Genome Analyzer IIx,GSM752560: gga cb M 1,SINGLE,cDNA,TRANSCRIPTOMIC,1670,1282,Gallus gallus,ILLUMINA,2011-10-14T00:00:00Z,SRS213964,GSM752560,Cerebellum,SRP007412,"Facility of Linkoping University, Sweden",male,9.2,"~1 year\, adult",Rneasy Lipid,
4,SRR306714,RNA-Seq,76,PRJNA143627,SAMN00632123,GEO,public,sra,"gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX081873,GSM752561,Illumina Genome Analyzer IIx,GSM752561: gga ht F 1,SINGLE,cDNA,TRANSCRIPTOMIC,1748,1302,Gallus gallus,ILLUMINA,2011-10-14T00:00:00Z,SRS213965,GSM752561,Heart,SRP007412,"Facility of Linkoping University, Sweden",female,9.0,"~1 year\, adult",Standard,


## Check on metadata entries for duplicates or things that need cleaning

In [11]:
sra_run_table.Organism.value_counts()

Homo sapiens                21
Mus musculus                20
Ornithorhynchus anatinus    18
Pan troglodytes             15
Monodelphis domestica       15
Gallus gallus               14
Macaca mulatta              14
Pan paniscus                12
Gorilla gorilla             11
Pongo pygmaeus               9
Name: Organism, dtype: int64

In [4]:
sra_run_table.source_name.value_counts()

Liver                        26
Heart                        25
Kidney                       23
Cerebellum                   21
Brain                        17
Testis                       17
Brain\, prefrontal cortex    17
Brain\, frontal cortex        2
Brain\, temporal lobe         1
Name: source_name, dtype: int64

In [10]:
sra_run_table['tissue'] = sra_run_table['source_name'].str.split('\\').str[0]
sra_run_table.tissue.value_counts()

Brain         37
Liver         26
Heart         25
Kidney        23
Cerebellum    21
Testis        17
Name: tissue, dtype: int64

In [5]:
sra_run_table.sex.value_counts()

Female (note: this sample was originally provided as a "male" sample to us and therefore labeled this way in the Brawand et al. paper and original GEO submission; however, detailed data analyses carried out in the meantime clearly show that this samp    1
Name: sex, dtype: int64

In [6]:
sra_run_table.gender.value_counts()

male      95
female    53
Name: gender, dtype: int64

In [9]:
sra_run_table['sex_cleaned'] = sra_run_table.apply(lambda x: x['gender'] if pd.isnull(x['sex']) else 'female', axis=1)
sra_run_table.sex_cleaned.value_counts()

male      95
female    54
Name: sex_cleaned, dtype: int64

In [None]:
sra_run_table

In [2]:
def sanitize_name(name):
    stripped = name.strip()
    no_spaces = stripped.replace(' ', '_')
    no_colons = no_spaces.replace(':', '')
    return no_colons

sra_run_table['sanitized_name'] = sra_run_table['Library Name'].map(sanitize_name)
sra_run_table['sanitized_name'].sample(5)

125    GSM752690_ppa_ts_M_1
19     GSM752572_oan_cb_M_1
77     GSM752641_mml_lv_M_1
124    GSM752689_ppa_lv_M_1
128    GSM752693_hsa_br_M_1
Name: sanitized_name, dtype: object

In [3]:
sra_run_table['sanitized_name_with_run'] = sra_run_table['Run'] + "_" + sra_run_table['sanitized_name']
sra_run_table['sanitized_name_with_run'].sample(5)

128    SRR306840_GSM752693_hsa_br_M_1
96     SRR306808_GSM752661_ggo_lv_F_1
54     SRR306765_GSM752619_mmu_cb_M_2
126    SRR306838_GSM752691_hsa_br_F_1
58     SRR306769_GSM752623_mmu_kd_F_1
Name: sanitized_name_with_run, dtype: object

In [13]:
sra_run_table.to_csv("../metadata/brawand2011_metadata.csv", index=False)