In [71]:
import pandas as pd
from argparse import ArgumentParser
import os
import glob

In [109]:
tissue_df = pd.read_csv('tissue_metadata.tsv', sep='\t')
age_df = pd.read_csv('age_metadata.tsv', sep='\t')
sex_df = pd.read_csv('sex_metadata.tsv', sep='\t')
rep_df = pd.read_csv('rep_metadata.tsv', sep='\t')


tissue_df.set_index('short', inplace=True)
age_df.set_index('short', inplace=True)
sex_df.set_index('short', inplace=True)
rep_df.set_index('short', inplace=True)


lab = 'ali-mortazavi'
award = 'UM1HG009443'

In [142]:
lib_meta = pd.read_csv('adrenal_metadata.tsv', sep='\t')
lib_meta['# predicted nuclei'] = lib_meta['# predicted nuclei'].astype('int')

In [211]:
long = False
opref = 'test/adrenal'

In [212]:
d = 'test'+'/'
ext = r'{}*.fastq.gz'.format(d)

biosamp_sub = pd.DataFrame()
exp_sub = pd.DataFrame() # only one for each pair of reps!
lib_sub = pd.DataFrame()
rep_sub = pd.DataFrame()
file_sub = pd.DataFrame()


print(ext)
for f in glob.glob(ext):
    tissue, age, sex, rep = get_metadata(f)
    sample_id = get_sample_id(f)
    print(sample_id)
    
    # assemble metadata for this sample
    temp = tissue_df.loc[tissue]
    temp = pd.concat([temp, sex_df.loc[sex]], axis=0).to_frame()
    temp = pd.concat([temp, age_df.loc[age]], axis=0)
    temp = pd.concat([temp, rep_df.loc[int(rep)]], axis=0)
    temp = temp.transpose()
    
    # add biosample
    biosample = temp.copy(deep=True)
    desc = 'B6Cast F1 {} {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                          temp.tissue_desc.values[0], temp.rep_desc.values[0])
    biosamp_alias = 'ali-mortazavi:biosample_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
    biosample['date_obtained'] = lib_meta.loc[lib_meta['Sample ID'] == sample_id, 'Date shipped'].values[0]
    biosample['description'] = desc
    biosample['aliases'] = biosamp_alias
    biosample['lab'] = lab
    biosample['award'] = award

    biosamp_sub = pd.concat([biosamp_sub, biosample], axis=0)
    
    # add experiment
    exp = temp.copy(deep=True)
    if long == False:
        desc = 'Short-read Split-seq B6Cast F1 {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                              temp.tissue_desc.values[0])
        exp_alias = 'ali-mortazavi:exp_sr_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                     temp.tissue_desc.values[0])
    else:
        desc = 'Long-read Split-seq B6Cast F1 {} {} {}'.format(temp.age_desc.values[0], temp.sex.values[0], 
                                              temp.tissue_desc.values[0])
        exp_alias = 'ali-mortazavi:exp_lr_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                             temp.tissue_desc.values[0])


    exp['assay_term_name'] = 'single-cell RNA sequencing assay'
    exp['description'] = desc
    exp['aliases'] = exp_alias
    exp['lab'] = lab
    exp['award'] = award
    
    cols = ['aliases', 'biosample_ontology', 'description', 
            'assay_term_name', 'lab', 'award']
    exp = exp[cols]
    
    exp_sub = pd.concat([exp_sub, exp], axis=0)
    
    # add library
    lib = temp.copy(deep=True)
    
    if long == False:
        lib_alias = 'ali-mortazavi:library_sr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                             temp.tissue_desc.values[0], temp.rep.values[0])
        lib['documents'] = 'ali-mortazavi:Split-seq_computational_protocol_v1.0,ali-mortazavi:split-seq-v1'
        lib['strand_specificity'] = 'unstranded'        
    else: 
        lib_alias = 'ali-mortazavi:library_lr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                     temp.tissue_desc.values[0], temp.rep.values[0])
        lib['documents'] = 'ali-mortazavi:LR-Split-seq_computational_protocol_v1.0,ali-mortazavi:split-seq-v1,ali-mortazavi:pacbio-split-seq-v1'
        lib['strand_specificity'] = 'forward'
        
    lib['aliases'] = lib_alias
    lib['biosample'] = biosamp_alias
    lib['nucleic_acid_term_name'] = 'RNA'
    lib['construction_method'] = 'Parse Single Cell Whole Transcriptome Kit'
    lib['nucleic_acid_starting_quantity'] = lib_meta.loc[lib_meta['Sample ID'] == sample_id, '# predicted nuclei'].values[0]
    lib['nucleic_acid_starting_quantity_units'] = 'cells'
    
    lib['lab'] = lab
    lib['award'] = award
    
    cols = ['aliases', 'biosample', 'nucleic_acid_term_name', 
            'documents', 'construction_method', 'nucleic_acid_starting_quantity',
            'nucleic_acid_starting_quantity_units', 'strand_specificity',
            'lab', 'award']
    lib = lib[cols]
    
    lib_sub = pd.concat([lib_sub, lib], axis=0)
    
    # add replicate
    rep = temp.copy(deep=True)
    
    if long == False:
        rep_alias = 'ali-mortazavi:rep_sr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
    else:
        rep_alias = 'ali-mortazavi:rep_lr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])

    rep['aliases'] = rep_alias
    rep['library'] = lib_alias
    rep['experiment'] = exp_alias
    rep['biological_replicate_number'] = temp.rep.values[0]
    rep['technical_replicate_number'] = 1
                                                    
    cols = ['aliases', 'library', 'experiment', 'biological_replicate_number', 'technical_replicate_number']
    rep = rep[cols]

    rep_sub = pd.concat([rep_sub, rep], axis=0)
    
    # add file
    file = temp.copy(deep=True)
    
    if long == False:
        file_alias = 'ali-mortazavi:fastq_sr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
        file['read_length'] = 115
        file['run_type'] = 'single-ended'
        file['platform'] = 'encode:NextSeq2000'
        
    else:
        file_alias = 'ali-mortazavi:fastq_lr_{}_{}_{}_{}'.format(temp.age_desc.values[0], temp.sex.values[0],
                                                         temp.tissue_desc.values[0], temp.rep.values[0])
        file['platform'] = 'encode:PacBio_sequel_II'
    
    file['aliases'] = file_alias
    file['dataset'] = exp_alias
    file['submitted_file_name'] = os.path.abspath(f)
    file['replicate'] = rep_alias
    file['file_format'] = 'fastq'
    file['output_type'] = 'reads'
    
    cols = ['aliases', 'dataset', 'submitted_file_name', 
            'replicate', 'file_format', 'output_type', 'platform']
    if long == False:
        cols.append('read_length')
        cols.append('run_type')
    file = file[cols]
    file_sub = pd.concat([file_sub, file], axis=0)
    
    
# drop unnecessary columns
drop = ['tissue_desc', 'age_desc', 'rep_desc']
biosamp_sub.drop(drop, axis=1, inplace=True)

# for exp, also drop duplicate aliases 
exp_sub.drop_duplicates(inplace=True)

# save each table
if not long:
    opref = '{}_sr'.format(opref)
else:
    opref = '{}_lr'.format(opref)    
    
# biosample
fname = opref+'_biosample.tsv'
biosample.to_csv(fname, index=False, sep='\t')

# experiment
fname = opref+'_experiment.tsv'
exp.to_csv(fname, index=False, sep='\t')

# library
fname = opref+'_library.tsv'
lib.to_csv(fname, index=False, sep='\t')

# replicate
fname = opref+'_rep.tsv'
rep.to_csv(fname, index=False, sep='\t')

# file
fname = opref+'_file.tsv'
file.to_csv(fname, index=False, sep='\t')

test/*.fastq.gz
A_18m_M_2
A_10_F_1
A_10_F_2


In [204]:
biosamp_sub

Unnamed: 0,biosample_ontology,organism,subcellular_fraction_term_name,donor,source,sex,model_organism_age,model_organism_age_units,rep,date_obtained,description,aliases,lab,award
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,male,18-20,month,2,7/8/21,B6Cast F1 P18-20mo male adrenal 8,ali-mortazavi:biosample_P18-20mo_male_adrenal_2,ali-mortazavi,UM1HG009443
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,1,7/8/21,B6Cast F1 P10 female adrenal 7,ali-mortazavi:biosample_P10_female_adrenal_1,ali-mortazavi,UM1HG009443
0,/biosample-types/tissue_UBERON_0002369/,/organisms/mouse/,nucleus,encode:Castaneus,/sources/jackson-labs/,female,10,day,2,7/8/21,B6Cast F1 P10 female adrenal 8,ali-mortazavi:biosample_P10_female_adrenal_2,ali-mortazavi,UM1HG009443


In [205]:
exp_sub

Unnamed: 0,aliases,biosample_ontology,description,assay_term_name,lab,award
0,ali-mortazavi:exp_sr_P18-20mo_male_adrenal,/biosample-types/tissue_UBERON_0002369/,Short-read Split-seq B6Cast F1 P18-20mo male a...,single-cell RNA sequencing assay,ali-mortazavi,UM1HG009443
0,ali-mortazavi:exp_sr_P10_female_adrenal,/biosample-types/tissue_UBERON_0002369/,Short-read Split-seq B6Cast F1 P10 female adrenal,single-cell RNA sequencing assay,ali-mortazavi,UM1HG009443


In [206]:
lib_sub

Unnamed: 0,aliases,biosample,nucleic_acid_term_name,documents,construction_method,nucleic_acid_starting_quantity,nucleic_acid_starting_quantity_units,strand_specificity,lab,award
0,ali-mortazavi:library_sr_P18-20mo_male_adrenal_2,ali-mortazavi:biosample_P18-20mo_male_adrenal_2,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,3125,cells,unstranded,ali-mortazavi,UM1HG009443
0,ali-mortazavi:library_sr_P10_female_adrenal_1,ali-mortazavi:biosample_P10_female_adrenal_1,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,1562,cells,unstranded,ali-mortazavi,UM1HG009443
0,ali-mortazavi:library_sr_P10_female_adrenal_2,ali-mortazavi:biosample_P10_female_adrenal_2,RNA,ali-mortazavi:Split-seq_computational_protocol...,Parse Single Cell Whole Transcriptome Kit,1562,cells,unstranded,ali-mortazavi,UM1HG009443


In [207]:
rep_sub

Unnamed: 0,aliases,library,experiment,biological_replicate_number,technical_replicate_number
0,ali-mortazavi:rep_sr_P18-20mo_male_adrenal_2,ali-mortazavi:library_sr_P18-20mo_male_adrenal_2,ali-mortazavi:exp_sr_P18-20mo_male_adrenal,2,1
0,ali-mortazavi:rep_sr_P10_female_adrenal_1,ali-mortazavi:library_sr_P10_female_adrenal_1,ali-mortazavi:exp_sr_P10_female_adrenal,1,1
0,ali-mortazavi:rep_sr_P10_female_adrenal_2,ali-mortazavi:library_sr_P10_female_adrenal_2,ali-mortazavi:exp_sr_P10_female_adrenal,2,1


In [208]:
file_sub

Unnamed: 0,aliases,dataset,submitted_file_name,replicate,file_format,output_type,platform,read_length,run_type
0,ali-mortazavi:fastq_sr_P18-20mo_male_adrenal_2,ali-mortazavi:exp_sr_P18-20mo_male_adrenal,/Users/fairliereese/Documents/programming/mort...,ali-mortazavi:rep_sr_P18-20mo_male_adrenal_2,fastq,reads,encode:NextSeq2000,115,single-ended
0,ali-mortazavi:fastq_sr_P10_female_adrenal_1,ali-mortazavi:exp_sr_P10_female_adrenal,/Users/fairliereese/Documents/programming/mort...,ali-mortazavi:rep_sr_P10_female_adrenal_1,fastq,reads,encode:NextSeq2000,115,single-ended
0,ali-mortazavi:fastq_sr_P10_female_adrenal_2,ali-mortazavi:exp_sr_P10_female_adrenal,/Users/fairliereese/Documents/programming/mort...,ali-mortazavi:rep_sr_P10_female_adrenal_2,fastq,reads,encode:NextSeq2000,115,single-ended


In [198]:
def get_metadata(f):
    tissue, age, sex, rep = f.split('_')
    rep = rep.split('.')[0]
    tissue = tissue.split('/')[-1]
    return tissue, age, sex, rep

def get_sample_id(f):
    s = f.split('/')[-1]
    s = s.split('.')[0]
    return s

In [216]:
import sys

In [217]:
sys.argv[0]

'/Users/fairliereese/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py'