In [71]:
import pandas as pd
from argparse import ArgumentParser
import os
import glob

In [109]:
tissue_df = pd.read_csv('tissue_metadata.tsv', sep='\t')
age_df = pd.read_csv('age_metadata.tsv', sep='\t')
sex_df = pd.read_csv('sex_metadata.tsv', sep='\t')
rep_df = pd.read_csv('rep_metadata.tsv', sep='\t')


tissue_df.set_index('short', inplace=True)
age_df.set_index('short', inplace=True)
sex_df.set_index('short', inplace=True)
rep_df.set_index('short', inplace=True)


lab = 'ali-mortazavi'
award = 'UM1HG009443'

In [142]:
lib_meta = pd.read_csv('adrenal_metadata.tsv', sep='\t')
lib_meta['# predicted nuclei'] = lib_meta['# predicted nuclei'].astype('int')

In [146]:
d = 'test'+'/'
ext = r'{}*.fastq.gz'.format(d)

biosamp_sub = pd.DataFrame()
exp_sub = pd.DataFrame() # only one for each pair of reps!
lib_sub = pd.DataFrame()
rep_sub = pd.DataFrame()
file_sub = pd.DataFrame()


print(ext)
for f in glob.glob(ext):
    tissue, age, sex, rep = get_metadata(f)
    sample_id = get_sample_id(f)
    biosamp_sub = add_biosamp(tissue, age, sex, rep, sample_id, biosamp_sub,
                             tissue_df, sex_df, age_df, rep_df, lib_meta)
    exp_sub = add_exp(tissue, age, sex, rep, exp_sub,
                             tissue_df, sex_df, age_df, rep_df)
    lib_sub = add_lib(tissue, age, sex, rep, sample_id, lib_sub,
                             tissue_df, sex_df, age_df, rep_df, lib_meta)

test/*.fastq.gz


In [147]:
# drop unnecessary columns
drop = ['tissue_desc', 'age_desc', 'rep_desc']
biosamp_sub.drop(drop, axis=1, inplace=True)

# for exp, also drop duplicate aliases 
exp_sub.drop_duplicates(inplace=True)

In [148]:
biosamp_sub

Unnamed: 0,biosample_ontology,organism,subcellular_fraction_term_name,donor,source,sex,model_organism_age,model_organism_age_units,rep,date_obtained,description,aliases,lab,award
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,male,18-20,month,2,7/8/21,B6Cast F1 P18-20mo male adrenal 8,ali-mortazavi:biosample_P18-20mo_male_adrenal_2,ali-mortazavi,UM1HG009443
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,1,7/8/21,B6Cast F1 P10 female adrenal 7,ali-mortazavi:biosample_P10_female_adrenal_1,ali-mortazavi,UM1HG009443
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,2,7/8/21,B6Cast F1 P10 female adrenal 8,ali-mortazavi:biosample_P10_female_adrenal_2,ali-mortazavi,UM1HG009443


In [149]:
exp_sub

Unnamed: 0,aliases,biosample_ontology,description,assay_term_name,lab,award
0,ali-mortazavi:exp_P18-20mo_male_adrenal,/biosample-types/tissue_UBERON_0002369/,Short-read Split-seq B6Cast F1 P18-20mo male a...,single-cell RNA sequencing assay,ali-mortazavi,UM1HG009443
0,ali-mortazavi:exp_P10_female_adrenal,/biosample-types/tissue_UBERON_0002369/,Short-read Split-seq B6Cast F1 P10 female adrenal,single-cell RNA sequencing assay,ali-mortazavi,UM1HG009443


In [150]:
lib_sub

Unnamed: 0,tissue_desc,biosample_ontology,organism,subcellular_fraction_term_name,donor,source,sex,model_organism_age,model_organism_age_units,age_desc,...,aliases,biosample,nucleic_acid_term_name,documents,construction_method,strand_specificity,nucleic_acid_starting_quantity,nucleic_acid_starting_quantity_units,lab,award
0,adrenal,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,male,18-20,month,P18-20mo,...,ali-mortazavi:library_P18-20mo_male_adrenal_2,ali-mortazavi:biosample_P18-20mo_male_adrenal_2,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,unstranded,3125,cells,ali-mortazavi,UM1HG009443
0,adrenal,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,P10,...,ali-mortazavi:library_P10_female_adrenal_1,ali-mortazavi:biosample_P10_female_adrenal_1,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,unstranded,1562,cells,ali-mortazavi,UM1HG009443
0,adrenal,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,P10,...,ali-mortazavi:library_P10_female_adrenal_2,ali-mortazavi:biosample_P10_female_adrenal_2,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,unstranded,1562,cells,ali-mortazavi,UM1HG009443


In [127]:
def get_metadata(f):
    tissue, age, sex, rep = f.split('_')
    rep = rep.split('.')[0]
    tissue = tissue.split('/')[-1]
    return tissue, age, sex, rep

def get_sample_id(f):
    s = f.split('/')[-1]
    s = s.split('.')[0]
    return s

In [145]:
def add_lib(tissue, age, sex, rep, sample_id, lib_sub,
                tissue_df, sex_df, age_df, rep_df, lib_meta):
    
    temp = tissue_df.loc[tissue]
    temp = pd.concat([temp, sex_df.loc[sex]], axis=0).to_frame()
    temp = pd.concat([temp, age_df.loc[age]], axis=0)
    temp = pd.concat([temp, rep_df.loc[int(rep)]], axis=0)

    temp = temp.transpose()

    biosamp_alias = 'ali-mortazavi:biosample_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
    alias = 'ali-mortazavi:library_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])

    temp['aliases'] = alias
    temp['biosample'] = biosamp_alias
    temp['nucleic_acid_term_name'] = 'RNA'
    temp['documents'] = 'ali-mortazavi:Split-seq_computational_protocol_v1.0,ali-mortazavi:split-seq-v1'
    temp['construction_method'] = 'Parse Single Cell Whole Transcriptome Kit'
    temp['strand_specificity'] = 'unstranded'
    temp['nucleic_acid_starting_quantity'] = lib_meta.loc[lib_meta['Sample ID'] == sample_id, '# predicted nuclei'].values[0]
    temp['nucleic_acid_starting_quantity_units'] = 'cells'
    
    temp['lab'] = 'ali-mortazavi'
    temp['award'] = 'UM1HG009443'
    
    cols = ['aliases', 'biosample', 'nucleic_acid_term_name', 
            'documents', 'construction_method', 'nucleic_acid_starting_quantity',
            '']
    temp = 

    lib_sub = pd.concat([lib_sub, temp], axis=0)
    return lib_sub

In [120]:
def add_exp(tissue, age, sex, rep, exp_sub,
                tissue_df, sex_df, age_df, rep_df):
    temp = tissue_df.loc[tissue]
    temp = pd.concat([temp, sex_df.loc[sex]], axis=0).to_frame()
    temp = pd.concat([temp, age_df.loc[age]], axis=0)
    temp = temp.transpose()
    
    desc = 'Short-read Split-seq B6Cast F1 {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                          temp.tissue_desc.values[0])
    alias = 'ali-mortazavi:exp_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0])

    temp['assay_term_name'] = 'single-cell RNA sequencing assay'
    temp['description'] = desc
    temp['aliases'] = alias
    temp['lab'] = 'ali-mortazavi'
    temp['award'] = 'UM1HG009443'
    
    # retain only necessary columns
    cols = ['aliases', 'biosample_ontology', 'description', 
            'assay_term_name', 'lab', 'award']
    temp = temp[cols]
    
    exp_sub = pd.concat([exp_sub, temp], axis=0)
    return exp_sub

In [138]:
def add_biosamp(tissue, age, sex, rep, sample_id, biosamp_sub,
                tissue_df, sex_df, age_df, rep_df, lib_meta):
    temp = tissue_df.loc[tissue]
    temp = pd.concat([temp, sex_df.loc[sex]], axis=0).to_frame()
    temp = pd.concat([temp, age_df.loc[age]], axis=0)
    temp = pd.concat([temp, rep_df.loc[int(rep)]], axis=0)

    temp = temp.transpose()

    desc = 'B6Cast F1 {} {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                          temp.tissue_desc.values[0], temp.rep_desc.values[0])
    alias = 'ali-mortazavi:biosample_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
    temp['date_obtained'] = lib_meta.loc[lib_meta['Sample ID'] == sample_id, 'Date shipped'].values[0]
    temp['description'] = desc
    temp['aliases'] = alias
    temp['lab'] = 'ali-mortazavi'
    temp['award'] = 'UM1HG009443'

    biosamp_sub = pd.concat([biosamp_sub, temp], axis=0)
    return biosamp_sub

In [94]:
f

'test/A_10_F_2.fastq.gz'

In [95]:
tissue, age, sex, rep = get_metadata(f)

In [96]:
print(tissue)
print(age)
print(sex)
print(rep)

A
10
F
2


In [97]:
tissue_df.head()

Unnamed: 0_level_0,tissue_desc,biosample_ontology,organism,subcellular_fraction_term_name,donor,source
short,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,adrenal,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/
G,gastrocnemius,/biosample-types/tissue_UBERON_0001388/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/
C,cortex,/biosample-types/tissue_NTR_0000646/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/
H,hippocampus,/biosample-types/tissue_UBERON_0002305/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/


In [103]:
temp = tissue_df.loc[tissue]
temp = pd.concat([temp, sex_df.loc[sex]], axis=0).to_frame()
temp = pd.concat([temp, age_df.loc[age]], axis=0)
temp = pd.concat([temp, rep_df.loc[int(rep)]], axis=0)

temp = temp.transpose()

desc = 'B6Cast F1 {} {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                      temp.tissue_desc.values[0], temp.rep_desc.values[0])
alias = 'ali-mortazavi:biosample_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                     temp.tissue_desc.values[0], temp.rep.values[0])

temp['description'] = desc
temp['aliases'] = alias

In [104]:
temp

Unnamed: 0,tissue_desc,biosample_ontology,organism,subcellular_fraction_term_name,donor,source,sex,model_organism_age,model_organism_age_units,age_desc,rep_desc,rep,description,aliases
0,adrenal,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,P10,8,2,B6Cast F1 P10 female adrenal 8,ali-mortazavi:biosample_P10_female_adrenal_2
