In [None]:
import os
import pandas as pd
import numpy as np
import datetime

In [None]:
# Root folder for all data read and written by this notebook,
# expressed relative to the notebook's working directory.
out_folder = os.path.join("..", "data")

In [None]:
# Per-sample pipeline results live under <out_folder>/pipeline_out/results.
pipeline_out_dir = os.path.join(out_folder, "pipeline_out")
results_dir = os.path.join(pipeline_out_dir, "results")

# Destination of the merged metadata table written at the end of the notebook.
metadata_file = os.path.join(out_folder, "complete_metadata.csv")

In [None]:
# Load the lab annotation sheet and key it by a biosample name assembled as
# <DONOR>_<VISIT>_<TISSUE>_ATAC_R1.
lab_annotations_file = os.path.join("..", "meta", "lab_annotations.csv")
lab_annotations = pd.read_csv(lab_annotations_file)
lab_annotations['ORIGINAL_BIOSAMPLE'] = (
    lab_annotations['DONOR']
    + '_' + lab_annotations['VISIT']
    + '_' + lab_annotations['TISSUE']
    + '_ATAC_R1'
)
lab_annotations = lab_annotations.set_index('ORIGINAL_BIOSAMPLE')

# Prefix every remaining column with "LAB:" to mark where it came from.
lab_annotations.columns = [
    "LAB:{}".format(column_name) for column_name in lab_annotations.columns
]


In [None]:
# Load the demultiplexing metrics and prefix every column with "DEMUX:".
metrics_file = os.path.join("..", "meta", "demux_metrics.csv")
demux_metrics = pd.read_csv(metrics_file)
demux_metrics.columns = [
    "DEMUX:{}".format(column_name) for column_name in demux_metrics.columns
]

# Build a lane identifier as <RUN>_<FLOWCELL>_<LANE>; LANE is cast to str so it
# can be concatenated. The separate LANE column is dropped afterwards.
lane_as_text = demux_metrics['DEMUX:LANE'].astype(str)
demux_metrics['DEMUX:LANE_ID'] = (
    demux_metrics['DEMUX:RUN'] + "_" + demux_metrics['DEMUX:FLOWCELL'] + "_" + lane_as_text
)
demux_metrics = demux_metrics.drop(columns=['DEMUX:LANE'])

In [None]:
# Attach the LAB:CQ and LAB:BATCH annotations to the demux rows by matching
# DEMUX:BIOSAMPLE against the lab-annotation index. The outer join keeps rows
# that are present on only one side.
lab_columns = lab_annotations[['LAB:CQ', 'LAB:BATCH']]
demux_metrics = demux_metrics.join(lab_columns, on='DEMUX:BIOSAMPLE', how='outer')

In [None]:
# Index the table by (demux name, lane id); drop=False keeps both as columns too.
lane_index_columns = ['DEMUX:DEMUX_NAME', 'DEMUX:LANE_ID']
metadata = demux_metrics.set_index(keys=lane_index_columns, drop=False)

In [None]:
# Split each DEMUX:BIOSAMPLE name on '_' and store its first three fields
# (positions 0, 1, 2) as separate SAMPLE:* columns.
biosample_field_positions = (
    ('SAMPLE:DONOR', 0),
    ('SAMPLE:VISIT', 1),
    ('SAMPLE:TISSUE', 2),
)
for column_name, position in biosample_field_positions:
    # `position` is bound as a default argument so each lambda keeps its own value.
    metadata[column_name] = metadata['DEMUX:BIOSAMPLE'].apply(
        lambda name, position=position: name.split('_')[position]
    )

In [None]:
def join_fun(x):
    """Collapse a group of values into a single cell value.

    Parameters
    ----------
    x : pandas.Series
        The values of one column within one group.

    Returns
    -------
    str or scalar
        If the group holds more than one distinct value, all values (including
        repeats, in their original order) converted to ``str`` and joined with
        ``";"``. If every value is the same, that single value as a scalar.
        ``numpy.nan`` for an empty group.
    """
    distinct_values = x.unique()
    if len(distinct_values) > 1:
        return ";".join(x.astype(str))
    if len(distinct_values) == 1:
        # Return the scalar itself rather than a 1-element array, so the
        # result does not rely on pandas unwrapping length-1 arrays.
        return distinct_values[0]
    return np.nan
# Per-column aggregation spec: every column is collapsed with join_fun,
# except DEMUX:PF_READS, which is summed.
aggr_fun = {column_name: join_fun for column_name in metadata}
aggr_fun['DEMUX:PF_READS'] = 'sum'

In [None]:
# Collapse the per-lane rows into one row per biosample using the
# per-column aggregation spec.
rows_by_biosample = metadata.groupby('DEMUX:BIOSAMPLE')
metadata = rows_by_biosample.agg(aggr_fun)

In [None]:
# Display the column labels of the aggregated table as a quick sanity check.
metadata.columns

In [None]:
# Load the pipeline statistics summary (first column is the row index),
# discard columns that are not carried into the metadata table, and
# rename the rest to upper-case with a "RUN:" prefix.
pipeline_data_file = os.path.join("..", "data", "pipeline_out", "pipeline_stats_summary.tsv")
unused_run_columns = ['organism', 'toggle', 'Time', 'Success']
pipeline_data = pd.read_csv(pipeline_data_file, sep='\t', index_col=0)
pipeline_data = pipeline_data.drop(columns=unused_run_columns)
pipeline_data.columns = [
    "RUN:{}".format(column_name.upper()) for column_name in pipeline_data.columns
]

In [None]:
# Add the RUN:* pipeline statistics to each biosample row, matching on the
# index of both tables (left join: every metadata row is kept).
metadata = metadata.join(pipeline_data, how='left')

In [None]:
# Display the column labels again to confirm the RUN:* columns were added.
metadata.columns

In [None]:
# Read the parsed regulatory-build table (indexed by feature ID, with the
# chromosome column forced to str) and keep the IDs of the rows whose
# type is 'promoter'.
regulatory_build_csv = os.path.join(
    "..",
    "references",
    "homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20190329.parsed.csv",
)
regulatory_features = pd.read_csv(
    regulatory_build_csv,
    dtype={'chrom': str},
    index_col='ID',
)
is_promoter = regulatory_features['type'] == 'promoter'
promoters = list(regulatory_features.loc[is_promoter].index)

In [None]:
def get_promoter_frips(sample, results_root=None, promoter_ids=None, sample_metadata=None):
    """Return the share of a sample's filtered mapped reads that fall in promoters.

    Reads ``<results_root>/<sample>/<sample>_oracle.quantification.bed``, sums
    the sample's count column over the rows whose ID is in ``promoter_ids``,
    and divides by the sample's ``RUN:FILTERED_MAPPED_READS`` value.

    Parameters
    ----------
    sample : str
        Sample name; used for the results sub-folder, the file name, and the
        row label looked up in ``sample_metadata``.
    results_root : str, optional
        Folder containing one sub-folder per sample. Defaults to the
        notebook-level ``results_dir``.
    promoter_ids : list, optional
        Feature IDs to sum over. Defaults to the notebook-level ``promoters``.
    sample_metadata : pandas.DataFrame, optional
        Table indexed by sample name with a ``RUN:FILTERED_MAPPED_READS``
        column. Defaults to the notebook-level ``metadata``.

    Returns
    -------
    float
        Promoter read count divided by filtered mapped reads; ``nan`` when the
        denominator is zero or missing.

    Raises
    ------
    KeyError
        If a promoter ID is absent from the quantification file or the sample
        is absent from ``sample_metadata``.
    """
    # Fall back to the notebook globals so the one-argument call used with
    # Pool.map keeps working.
    if results_root is None:
        results_root = results_dir
    if promoter_ids is None:
        promoter_ids = promoters
    if sample_metadata is None:
        sample_metadata = metadata

    quantification_file = os.path.join(
        results_root, sample, "{}_oracle.quantification.bed".format(sample)
    )
    # The file has no header row; only the ID column and the count column
    # (named after the sample) are loaded.
    reg_reads = pd.read_csv(
        quantification_file,
        sep='\t',
        names=['CHR', 'START', 'END', 'ID', 'NA', 'NA2', sample, 'NA3', 'NA4', 'NA5'],
        usecols=['ID', sample],
        index_col='ID',
    )

    # Select the count column explicitly so .sum() yields a scalar; calling
    # float() on a one-element Series is deprecated in recent pandas.
    promoter_reads = float(reg_reads.loc[promoter_ids, sample].sum())
    filtered_mapped_reads = float(sample_metadata.loc[sample, 'RUN:FILTERED_MAPPED_READS'])

    if filtered_mapped_reads == 0:
        # Avoid ZeroDivisionError for samples with no filtered mapped reads.
        return float('nan')
    return promoter_reads / filtered_mapped_reads

In [None]:
import multiprocessing

# Pool of 8 worker processes used to compute the per-sample promoter
# fractions in parallel.
# NOTE(review): the workers presumably need get_promoter_frips and the
# globals it reads to exist before the pool is created - confirm for the
# start method in use.
a_pool = multiprocessing.Pool(processes=8)

In [None]:
# Compute the promoter fraction for every sample in the metadata index,
# in index order, using the worker pool.
try:
    promoter_frips = a_pool.map(get_promoter_frips, list(metadata.index))
finally:
    # Release the worker processes once the work is done (or has failed) so
    # they do not linger for the lifetime of the kernel. Re-running this cell
    # requires re-creating the pool first.
    a_pool.close()
    a_pool.join()

In [None]:
# Store the per-sample promoter fractions as a new QC column; the list is in
# the same order as the metadata index it was computed from.
metadata = metadata.assign(**{'QC:PROMOTER_FRIP': promoter_frips})

In [None]:
# Write the merged table to disk; the row index is not written.
metadata.to_csv(path_or_buf=metadata_file, index=False)