In [1]:
## overall scripts to run
# qsub Scripts/A08_compile_final_metadata.sub


In [2]:
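# # as before, this step can be submitted as a single script or run interactively
# # for interactive mode, only the working directory needs to be specified below;
# # then comment out the "%%bash / cat" lines and run the python code in-notebook
# # (the snippet below additionally needs `import os` and `import pandas as pd`)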

# envvar_needed = ['dir_proj']
# try:
#     os.environ['metadat_well']
# except KeyError:
#     envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None
#                ).set_axis(['varname', 'varpath'], axis = 1
#                ).query('varname in @envvar_needed')
#     for index, row in envspec.iterrows():
#         os.environ[row["varname"]] = row["varpath"]
# os.chdir(os.environ['dir_proj'])

In [3]:
%%bash
cat > ../Scripts/A08_compile_final_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A08_compile_final.$JOB_ID
#$ -j y
#$ -N A08_compile_metadata
#$ -l h_rt=0:30:00,h_data=4G
#$ -hold_jid A05_compile_DNA,A07_compile_RNA



# log which host picked up the job and when it started
echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--



# run metadat scripts --------------------------------------------------------

# - recommend running these interactively in python/Jupyter to check outputs
# - have a distinct RNA/DNA script available in case one modality finishes first
#   & the analyst wants a quick look at the data
# - scripts run one after another: DNA, then RNA, then the DNA + RNA join

for step in Scripts/A08a_final_metadat_DNA.py \
            Scripts/A08b_final_metadat_RNA.py \
            Scripts/A08c_metadata_RNADNA.py
do
    python $step
done




echo -e "\n\n'A08_compile_all' completed.\n\n"


# log host and finish time
echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

## A08a. DNA

In [4]:
%%bash
cat > ../Scripts/A08a_final_metadat_DNA.py

# A08a_final_metadat_DNA.py ====================================================
# assumes no changes to script output names from A05/A07

# setup ------------------------------------------------------------------------

import pandas as pd
import glob


# load tables ------------------------------------------------------------------

def read_tbl_wrapper(filepath, prefix = ""):
    """Load a tab-separated table, using its first column as the row index.

    Parameters
    ----------
    filepath : path or file-like object handed to pandas.read_csv
    prefix : string prepended to every column label
        (the default "" leaves the labels unchanged)

    Returns
    -------
    pandas.DataFrame with the prefixed column labels
    """
    table = pd.read_csv(filepath, sep = "\t", index_col = 0)
    return table.add_prefix(prefix)

# (file, column prefix) for each input table, listed in the order in which
# the tables are read and in which their columns appear in the merged output
# SE1trim, SE2trim = singletons from trimming
# SE1unmap, SE2unmap = singletons from PE-mapping
dna_tables = [
    # trimming
    ("Metadata/A05a_trimming.tsv", "Premap_"),
    # mapping rates
    ("Metadata/A05b_DNA_maprate_PE.tsv", "PE_"),
    ("Metadata/A05b_DNA_maprate_SE1trim.tsv", "SE1t_"),
    ("Metadata/A05b_DNA_maprate_SE2trim.tsv", "SE2t_"),
    ("Metadata/A05b_DNA_maprate_SE1unmap.tsv", "SE1u_"),
    ("Metadata/A05b_DNA_maprate_SE2unmap.tsv", "SE2u_"),
    # picard
    ("Metadata/A05c_DNA_picard_PE.tsv", "PE_"),
    ("Metadata/A05c_DNA_picard_SE.tsv", "SE_"),
    # global mC fractions
    ("Metadata/A05d_DNA_global_mCfracs.tsv", ""),
    # samstats
    ("Metadata/A05e_DNA_samstats_PE.tsv", "PE_"),
    ("Metadata/A05e_DNA_samstats_SE.tsv", "SE_"),
    # coverage: chrX/chrY table first, then percent-1x table
    ("Metadata/A05f_DNA_cov_chrXdivY.tsv", ""),
    ("Metadata/A05f_DNA_cov_percent1x.tsv", ""),
]


# merge on rowname index (wellprefix) ------------------------------------------

metadata_DNA = pd.concat(
    [read_tbl_wrapper(tsv_path, col_prefix) for tsv_path, col_prefix in dna_tables],
    axis = 1)

# force every column to numeric; values that cannot be parsed become NaN
metadata_DNA = metadata_DNA.apply(pd.to_numeric, errors = "coerce")



# few 'combined' PE & SE metadata stats ----------------------------------------
# some attempts at combined/weighted metrics

# notes: (i) most output above reports paired-end read-pairs/fragments versus reads, hence *2
#        (ii) reads that are attempted to be remapped in "stage-two" SE mapping
#            are subsets of "PE_TotalReadsIn"; thus can't simply sum(total mapped) / sum(reads in)

# estimated total mapping rate (at read level) ----------------------------------------

# read-level input: every PE pair counts as two reads, plus both sets of
# trimming singletons
reads_in = (
    metadata_DNA['PE_TotalReadPairsIn'] * 2
    + metadata_DNA['SE1t_TotalReadsIn']
    + metadata_DNA['SE2t_TotalReadsIn']
)
metadata_DNA['Combined_TotalReadsIn'] = reads_in

# sum every column whose name contains "UniqueMapped", after doubling the
# PE pair count so that it is expressed in reads like the other columns
unique_mapped = metadata_DNA.filter(regex = "UniqueMapped", axis = 1).copy()
unique_mapped['PE_UniqueMappedPairs'] = metadata_DNA['PE_UniqueMappedPairs'] * 2
metadata_DNA['Combined_UniqueMappedReads'] = unique_mapped.sum(axis = 1)

# uniquely mapped reads as a fraction of reads in (not multiplied by 100)
metadata_DNA['Combined_ReadMappingRate'] = (
    metadata_DNA['Combined_UniqueMappedReads'] / metadata_DNA['Combined_TotalReadsIn']
)


# percent of dedupe reads that pass filtering ----------------------------------------

# reads remaining after duplicate removal: the input count scaled by the
# non-duplicate fraction (PE input is counted in pairs, hence *2 for reads)
# NOTE(review): assumes the *_picard_perc_dupe columns hold a 0-1 fraction,
#               as the (1 - perc_dupe) term already implies -- confirm upstream
pe_dedup_reads = \
    2*metadata_DNA['PE_picard_npairsin']*(1 - metadata_DNA['PE_picard_perc_dupe'])
se_dedup_reads = \
    metadata_DNA['SE_picard_nreadsin']*(1 - metadata_DNA['SE_picard_perc_dupe'])

# per-mode fraction of deduplicated reads that pass filtering
metadata_DNA['PE_Filtered_Rate'] = \
    metadata_DNA['PE_FilteredSeqCount'] / pe_dedup_reads
metadata_DNA['SE_Filtered_Rate'] = \
    metadata_DNA['SE_FilteredSeqCount'] / se_dedup_reads

# total filtered reads across PE + SE
metadata_DNA['Combined_Filtered_ReadCount'] = \
    metadata_DNA['PE_FilteredSeqCount'] + metadata_DNA['SE_FilteredSeqCount']

# combined fraction: the denominator is deduplicated reads for BOTH modes.
# (the SE term previously used the raw SE_picard_nreadsin without the
#  duplicate adjustment, which inflated the denominator relative to the
#  per-mode rates above)
metadata_DNA['Combined_Filtered_ReadPercent'] = \
    metadata_DNA['Combined_Filtered_ReadCount'] / (pe_dedup_reads + se_dedup_reads)


# final total # fragments --------------------------------------------------------------
# (whereas Combined_Filtered_ReadCount above in terms of reads)

# the PE filtered count is halved here so that a pair counts as one fragment;
# the SE filtered count is added as-is
pe_fragments = metadata_DNA['PE_FilteredSeqCount'] / 2
metadata_DNA['Combined_Filtered_FragmentCount'] = \
    pe_fragments + metadata_DNA['SE_FilteredSeqCount']


# final DNA metadata -----------------------------------------------------------

# tab-separated output, row index written as the first column
metadata_DNA.to_csv("Metadata/A08a_metadata_DNA.tsv", sep = "\t")

## A08b. compile RNA metadata

In [5]:
%%bash
cat > ../Scripts/A08b_final_metadat_RNA.py

# A08b_final_metadat_RNA.py ====================================================
# assumes no changes to script output names from A06/A07

# setup ------------------------------------------------------------------------

import pandas as pd
import os


# load tables ------------------------------------------------------------------

def read_tbl_wrapper(filepath, prefix = ""):
    """Load a tab-separated table, using its first column as the row index.

    Parameters
    ----------
    filepath : path or file-like object handed to pandas.read_csv
    prefix : string prepended to every column label
        (the default "" leaves the labels unchanged)

    Returns
    -------
    pandas.DataFrame with the prefixed column labels
    """
    table = pd.read_csv(filepath, sep = "\t", index_col = 0)
    return table.add_prefix(prefix)

# trimming table: use the A07a file when it exists, otherwise fall back to A05a
trim_path = "Metadata/A07a_trimming.tsv"
if not os.path.exists(trim_path):
    trim_path = "Metadata/A05a_trimming.tsv"

# (file, column prefix) for each input table, listed in the order in which
# the tables are read and in which their columns appear in the merged output
rna_tables = [
    (trim_path, "Premap_"),
    # mapping rates
    ("Metadata/A07b_RNA_maprate_PE.tsv", "PE_"),
    ("Metadata/A07b_RNA_maprate_SE1.tsv", "SE1_"),
    ("Metadata/A07b_RNA_maprate_SE2.tsv", "SE2_"),
    # A07c picard tables
    ("Metadata/A07c_RNA_picard_PE.tsv", "PE_"),
    ("Metadata/A07c_RNA_picard_SE1.tsv", "SE1_"),
    ("Metadata/A07c_RNA_picard_SE2.tsv", "SE2_"),
    # featureCounts summaries
    ("Metadata/A07d_RNA_featcounts_gene.tsv", "Gene_"),
    ("Metadata/A07d_RNA_featcounts_exon.tsv", "Exon_"),
    # samstats
    ("Metadata/A07e_RNA_samstats_PE.tsv", "PE_"),
    ("Metadata/A07e_RNA_samstats_SE1.tsv", "SE1_"),
    ("Metadata/A07e_RNA_samstats_SE2.tsv", "SE2_"),
    # A07f picard tables
    ("Metadata/A07f_RNA_picard_PE.tsv", "PE_"),
    ("Metadata/A07f_RNA_picard_SE1.tsv", "SE1_"),
    ("Metadata/A07f_RNA_picard_SE2.tsv", "SE2_"),
]

# column-wise merge on the row index
metadata_rna = pd.concat(
    [read_tbl_wrapper(tsv_path, col_prefix) for tsv_path, col_prefix in rna_tables],
    axis = 1)

# force every column to numeric; values that cannot be parsed become NaN
metadata_rna = metadata_rna.apply(pd.to_numeric, errors='coerce')



# few 'combined' PE & SE metadata stats ----------------------------------------
# some attempts at combined/weighted metrics

# notes: (i) most output above reports paired-end read-pairs/fragments versus reads, hence *2
#       (ii) reads that are attempted to be remapped in "stage-two" SE mapping
#            are subsets of "PE_TotalReadsIn"; thus can't simply sum(total mapped) / sum(reads in)
#     & (iii) "PE-mapping singletons" can also be generated where R1 assigned DNA while R2 assigned RNA,
#           or with STAR, R1 can map while R2 doesn't (e.g., -f 0x0048) and these are retained in PE.bam;
#           thus # based solely on the mapper's output miscount mapping singletons as PE instead of SE
#           & the single-end mapping & filtering rate interpretations are slightly off versus bismark
#           [may write QC scripts based on the _annotations files to make this more accurate in the future]


# estimated total mapping rate (at read level) ---------------------------------

# total reads in approximate because uses the rounded value of % reads mapped
# (& other caveats above); in practice better to use "Combined_TotalReadsIn" from mC calc in A07b

# PE contribution: input count scaled by the uniquely-mapped percentage
# (converted from percent to fraction), then doubled to express it in reads
pe_mapped_fraction = metadata_rna['PE_PercentReadsUniqueMapped']/100
pe_reads_in = metadata_rna['PE_NumReadsIn'] * pe_mapped_fraction * 2

metadata_rna['Combined_TotReadsIn'] = (
    pe_reads_in
    + metadata_rna['SE1_NumReadsIn']
    + metadata_rna['SE2_NumReadsIn']
)

# uniquely mapped reads: PE count doubled, SE1/SE2 counts added as-is
metadata_rna['Combined_NumReadsUniqueMapped'] = (
    metadata_rna['PE_NumReadsUniqueMapped']*2
    + metadata_rna['SE1_NumReadsUniqueMapped']
    + metadata_rna['SE2_NumReadsUniqueMapped']
)

# ratio of the two totals above (a fraction, not multiplied by 100)
metadata_rna['Combined_MappingRate'] = (
    metadata_rna['Combined_NumReadsUniqueMapped'] / metadata_rna['Combined_TotReadsIn']
)

# percent of mapped reads that pass filtering ----------------------------------
# note: SE1_TotalReadsFiltered should = SE1_FilteredSeqCount

# filtered counts per mapping mode, reused for the rates and the totals below
pe_filtered = metadata_rna['PE_FilteredSeqCount']
se1_filtered = metadata_rna['SE1_FilteredSeqCount']
se2_filtered = metadata_rna['SE2_FilteredSeqCount']

# per-mode rate: filtered count divided by that mode's uniquely mapped count
metadata_rna['PE_Filtered_Rate'] = pe_filtered / metadata_rna['PE_NumReadsUniqueMapped']
metadata_rna['SE1_Filtered_Rate'] = se1_filtered / metadata_rna['SE1_NumReadsUniqueMapped']
metadata_rna['SE2_Filtered_Rate'] = se2_filtered / metadata_rna['SE2_NumReadsUniqueMapped']

# read-level total doubles the PE count; fragment-level total uses it as-is
metadata_rna['Combined_Filtered_ReadCount'] = pe_filtered*2 + se1_filtered + se2_filtered
metadata_rna['Combined_Filtered_FragmentCount'] = pe_filtered + se1_filtered + se2_filtered

# final assigned reads/fragments -----------------------------------------------

# adds, in this order:
#   Combined_AssignedReads_Exon, Combined_AssignedFragments_Exon,
#   Combined_AssignedReads_Gene, Combined_AssignedFragments_Gene
# read-level totals double the PE count; fragment-level totals use it as-is
for feature in ("Exon", "Gene"):
    pe_assigned = metadata_rna[feature + '_PE_Assigned']
    se1_assigned = metadata_rna[feature + '_SE1_Assigned']
    se2_assigned = metadata_rna[feature + '_SE2_Assigned']

    metadata_rna['Combined_AssignedReads_' + feature] = \
        pe_assigned*2 + se1_assigned + se2_assigned
    metadata_rna['Combined_AssignedFragments_' + feature] = \
        pe_assigned + se1_assigned + se2_assigned


# final RNA metadata -----------------------------------------------------------

# tab-separated output, row index written as the first column
metadata_rna.to_csv("Metadata/A08b_metadata_RNA.tsv", sep = "\t")

## A08c. join DNA + RNA metadata

In [6]:
%%bash
cat > ../Scripts/A08c_metadata_RNADNA.py

# A08c_metadata_RNADNA.py: compile A08a & A08b
# this final metadata file is what's typically used for QC

# setup ------------------------------------------------------------------------

import glob
import pandas as pd



# read and merge ---------------------------------------------------------------

def read_tbl_wrapper(filepath, prefix = ""):
    """Load a tab-separated table, using its first column as the row index.

    Parameters
    ----------
    filepath : path or file-like object handed to pandas.read_csv
    prefix : string prepended to every column label
        (the default "" leaves the labels unchanged)

    Returns
    -------
    pandas.DataFrame with the prefixed column labels
    """
    table = pd.read_csv(filepath, sep = "\t", index_col = 0)
    return table.add_prefix(prefix)

# load the per-modality tables, tagging each column with its modality
metadata_DNA = read_tbl_wrapper("Metadata/A08a_metadata_DNA.tsv", "DNA_")
metadata_RNA = read_tbl_wrapper("Metadata/A08b_metadata_RNA.tsv", "RNA_")

# drop every RNA column whose name contains "Premap" before joining, so the
# joined table keeps only the DNA copy of those columns
rna_premap_cols = metadata_RNA.columns[metadata_RNA.columns.str.contains("Premap")]
metadata_RNA_nopremap = metadata_RNA.drop(rna_premap_cols, axis = 1)

# column-wise join on the row index
metadata_joined = pd.concat([metadata_DNA, metadata_RNA_nopremap], axis = 1)

# calculate few joint library size metrics
# filtered library sizes per modality, at read and fragment level
dna_reads = metadata_joined["DNA_Combined_Filtered_ReadCount"]
rna_reads = metadata_joined['RNA_Combined_Filtered_ReadCount']
dna_fragments = metadata_joined["DNA_Combined_Filtered_FragmentCount"]
rna_fragments = metadata_joined['RNA_Combined_Filtered_FragmentCount']

# joint totals and the DNA share of each (shares are fractions, not x100)
metadata_joined['Joint_TotalReadCount'] = dna_reads + rna_reads
metadata_joined['Joint_PercentRead_DNA'] = \
    dna_reads / metadata_joined['Joint_TotalReadCount']

metadata_joined['Joint_TotalFragmentCount'] = dna_fragments + rna_fragments
metadata_joined['Joint_PercentFragment_DNA'] = \
    dna_fragments / metadata_joined['Joint_TotalFragmentCount']

# RNA mapping rate, excluding DNA
# (filtered DNA reads are subtracted from the RNA input total)
rna_reads_in_excl_dna = metadata_joined['RNA_Combined_TotReadsIn'] - dna_reads
metadata_joined['RNA_DNAAdj_ReadMappingRate'] = rna_reads / rna_reads_in_excl_dna

# DNA mapping rate, excluding RNA
# (filtered RNA reads are subtracted from the DNA input total)
dna_reads_in_excl_rna = metadata_joined['DNA_Combined_TotalReadsIn'] - rna_reads
metadata_joined['DNA_RNAAdj_ReadMappingRate'] = dna_reads / dna_reads_in_excl_rna



# final DNA + RNA metadata -----------------------------------------------------

# tab-separated output, row index written as the first column
metadata_joined.to_csv("Metadata/A08c_metadata_RNADNA.tsv", sep = "\t")

