In [1]:
# # A07_compile_final_metadata overall cmds ===========================================

# qsub Scripts/A07a_parse_metadata.sub # ‡
# qsub Scripts/A07b_compile_metadata.sub # ‡

# # ‡ fast enough to run interactively

# # for interactive mode, just need to specify the working dir & two environment variables;
# # the python code from the A07a*/A07b scripts can then be run directly instead of submitting with qsub
# import os
# os.chdir('../') # move to $dir_proj
# os.environ['metadat_well'] = "Metadata/A01c_well_filepath.csv"
# os.environ['ref_chromsizes'] = "/u/project/cluo/chliu/Genomes/IGVF_hg38_pluslambda/chromsizes.tsv"

In [2]:
%%bash
cat > ../Scripts/A07a_parse_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A07a_parse_metadata.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=2:00:00,h_data=8G
#$ -N A07a_parse_metadata
#$ -t 1-7

# A07a_parse_metadata.sub
# array job: each task (1-7) runs exactly one A07a* helper script,
# chosen by $SGE_TASK_ID in the case statement below.
# stdout and stderr are merged (-j y) into one log per task under sublogs/

echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment line of the parameter file as an environment variable
export $(cat snm3C_parameters.env | grep -v '^#' | xargs) # <--



# run each helper script (A07a*) ------------------------------------------------

# note: in practice these can each be submitted interactively/as its own task,
# as some of these scripts should be much lower resource than others
# however, this -t 1-7 job parallelization is just for tidyness

echo "metadata script # $SGE_TASK_ID running:"

case $SGE_TASK_ID in

  1)
    echo "python Scripts/A07a1_trimming.py"
    python Scripts/A07a1_trimming.py
    ;;

  2)
    echo "python Scripts/A07a2_mapping_rate.py"
    python Scripts/A07a2_mapping_rate.py
    ;;

  3)
    echo "python Scripts/A07a3_dedupe.py"
    python Scripts/A07a3_dedupe.py
    ;;

  4)
    echo "python Scripts/A07a4_global_mC_fracs.py"
    python Scripts/A07a4_global_mC_fracs.py
    ;;

  5)
    echo "python Scripts/A07a5_samtools_stats.py"
    python Scripts/A07a5_samtools_stats.py
    ;;

  6)
    echo "python Scripts/A07a6_coverage.py"
    python Scripts/A07a6_coverage.py
    ;;

  7)
    echo "python Scripts/A07a7_contacts.py"
    python Scripts/A07a7_contacts.py
    ;;

  *)
    # any other task ID has no script assigned; say so in the log
    # rather than finishing silently as if something had been run
    echo "WARNING: no metadata script is assigned to task ID '$SGE_TASK_ID'; nothing was run." >&2
    ;;
esac





echo "completed 'A07a_parse_metadata.'"



echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)


In [3]:
%%bash
cat > ../Scripts/A07b_compile_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A07b_compile_metadata.$JOB_ID
#$ -j y
#$ -N A07b_compile_metadata
#$ -l h_rt=0:30:00,h_data=4G
#$ -hold_jid A07a_parse_metadata

# A07b_compile_metadata.sub
# single job, held (-hold_jid) until the A07a_parse_metadata job has left the queue;
# it then runs the python script that joins the A07a* tables

echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

# export every non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--



# run metadat scripts --------------------------------------------------------

python Scripts/A07b_compile_metadata.py



echo -e "\n\n'A07b_compile_metadata' completed.\n\n"



echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)


In [4]:
%%bash
cat > ../Scripts/A07a1_trimming.py

# A07a1_trimming.py ============================================================

# setup ------------------------------------------------------------------------

import re
import pandas as pd
import glob
import os

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
# per-well metadata table (CSV); provides the 'wellprefix' and filepath columns used below
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_fastp_report(filepath):
    """
    Summarize one fastp JSON report as a flat dict of trimming metrics.

    Parameters
    ----------
    filepath : str
        path to a fastp .json report containing a 'summary' section with
        'before_filtering' and 'after_filtering' sub-sections.

    Returns
    -------
    dict with keys
        'nreads_pretrim'      total reads before filtering
        'percreads_passtrim'  after/before total reads (a fraction; NaN if no reads in)
        'q20_pretrim'         q20_rate before filtering
        'q20_posttrim'        q20_rate after filtering
        'r1_len', 'r2_len'    mean read 1 / read 2 length after filtering
        'gc_perc'             gc_content after filtering

    Raises
    ------
    KeyError if an expected field is absent from the report.
    """
    import json  # local import so this function does not depend on the file-level imports

    # the report is one nested JSON object, so read it as plain dicts
    # rather than coercing it into a DataFrame
    with open(filepath) as report:
        summary = json.load(report)['summary']
    before = summary['before_filtering']
    after = summary['after_filtering']

    nreads_pretrim = before['total_reads']
    # an empty input would otherwise raise ZeroDivisionError
    if nreads_pretrim:
        frac_passtrim = after['total_reads'] / nreads_pretrim
    else:
        frac_passtrim = float('nan')

    dict_out = {
        'nreads_pretrim' : nreads_pretrim,
        'percreads_passtrim' : frac_passtrim,
        # the q20_* columns report the Q20 rate, matching their names
        'q20_pretrim' : before['q20_rate'],
        'q20_posttrim' : after['q20_rate'],
        'r1_len' : after['read1_mean_length'],
        'r2_len' : after['read2_mean_length'],
        'gc_perc' : after['gc_content']}
    return(dict_out)



# gather metadata --------------------------------------------------------------

# fastp report path for every well, and whether each report is actually on disk
filelist = metadata_well['A03a_json_fastp']
boolean_fileexists = list(map(os.path.exists, filelist))

# parse only the reports that exist; rows are labelled with the matching wellprefix
wells_found = metadata_well['wellprefix'][boolean_fileexists]
list_fastp = [parse_fastp_report(path) for path in filelist[boolean_fileexists]]
df_fastp = pd.DataFrame(list_fastp, index=wells_found)

# report how many of the expected files were missing
frac_missing = 1 - sum(boolean_fileexists) / len(boolean_fileexists)
print(f"number of target files: {len(filelist)}")
print("fraction files missing: ")
print(round(frac_missing, 3))

# column QC: missing values per column
na_per_column = df_fastp.isna().sum()
print("Number of NAs per column:")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = df_fastp.index.duplicated().sum()
print(ndupe)



# final export (tab-delimited, wellprefix as first column)
df_fastp.to_csv("Metadata/A07a1_trimming.tsv", sep='\t')
print("done.\n\n")

In [5]:
%%bash
cat > ../Scripts/A07a2_mapping_rate.py

# A07a2_mapping_rate.py ========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import re
import pandas as pd

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
# per-well metadata table (CSV); provides 'wellprefix' and the A04a_bismarktxt_* filepath columns
metadata_well = pd.read_csv(filepath_wellmetadat)
    
def parse_bismark_report(filepath):

    """
    Collect the mapping summary metrics from one bismark report (.txt).

    adapted from YAP @ https://github.com/lhqing/cemba_data, extended so both
    paired-end (PE) and single-end (SE) reports are recognised.
    PE metrics generally count fragments, SE metrics count reads.

    Only "label: value" lines (exactly one colon) whose label is listed in
    term_dict are kept. Values are returned as strings: the text before the
    first tab, with any '%' stripped.

    Returns a dict {tidy metric name: value string}.
    """

    # report label --> tidy output name
    term_dict = {
        'Sequence pairs analysed in total': 'TotalReadPairsIn',
        'Sequences analysed in total': 'TotalReadsIn',
        'Number of paired-end alignments with a unique best hit': 'UniqueMappedPairs',
        'Number of alignments with a unique best hit from the different alignments': 'UniqueMappedReads',
        'Mapping efficiency': 'MappingRate',
        # # other potential metrics, not usually used
        # 'Sequence pairs with no alignments under any condition': 'UnmappedPairs',
        # 'Sequences with no alignments under any condition': 'UnmappedReads',
        # 'Sequences did not map uniquely': 'AmbigReads',
        # 'Sequence pairs did not map uniquely': 'AmbigPairs',
        # 'CT/GA/CT': 'ReadsOT',
        # 'GA/CT/CT': 'ReadsOB',
        # 'GA/CT/GA': 'ReadsCTOT',
        # 'CT/GA/GA': 'ReadsCTOB',
        # 'CT/CT': 'ReadsOT',
        # 'CT/GA': 'ReadsOB',
        # 'GA/CT': 'ReadsCTOT',
        # 'GA/GA': 'ReadsCTOB',
        # 'Total number of C\'s analysed': 'TotalC',
        # 'C methylated in CpG context': 'BismarkmCGRate',
        # 'C methylated in CHG context': 'BismarkmCHGRate',
        # 'C methylated in CHH context': 'BismarkmCHHRate',
        # 'C methylated in unknown context (CN or CHN)' : 'BismarkmCNCHNRate',
        # 'C methylated in Unknown context (CN or CHN)' : 'BismarkmCNCHNRate'
        }

    report_dict = {}
    with open(filepath) as report:
        for line in report:
            fields = line.split(':')
            # keep only lines of the form "label: value"
            if len(fields) != 2:
                continue
            label, value = fields
            if label in term_dict:
                report_dict[term_dict[label]] = value.strip().split('\t')[0].strip('%')

    return(report_dict)



# prep file lists --------------------------------------------------------------

# extract bismark logs that exist
# one filepath column per bismark log type, identified by the "A04a_bismarktxt_" prefix
target_filepath_columns = metadata_well.columns[
    metadata_well.columns.str.contains("A04a_bismarktxt_")]
n_logs_per_well = len(target_filepath_columns)

# flatten row-wise: all logs of the first well, then all logs of the second well, ...
filelist = metadata_well.loc[:, target_filepath_columns].values.tolist()
filelist = [f for sublist in filelist for f in sublist]

# a blank path is read by pandas as NaN (a float), which os.path.exists() rejects
# with a TypeError; count any non-string entry as a missing file instead
boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]

# print percent files missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))

# well list
# repeat each wellprefix once per bismark log column so that welllist lines up
# element-by-element with filelist (no assumption about how many log types exist)
welllist = []
for w in metadata_well.wellprefix:
    welllist.extend([w] * n_logs_per_well)

# loop through files
print("reading bismark logs...")
list_bismarkmap = [parse_bismark_report(f) for f in pd.Series(filelist)[boolean_fileexists]]

# one row per parsed log, indexed by the well it belongs to
df_mapping_byalign = pd.DataFrame(list_bismarkmap).apply(pd.to_numeric)
df_mapping_byalign.index=pd.Series(welllist)[boolean_fileexists]

# pairing metadata by alignment type
# the alignment label is the column name minus its prefix, repeated for every well
# in the same order as filelist

print("pairing metadata with split/alignment type...")
alignment_labels = target_filepath_columns.str.replace("A04a_bismarktxt_", "").to_list()
df_mapping_byalign['Alignment'] = pd.Series(
    alignment_labels * metadata_well.shape[0])[boolean_fileexists].to_list()

# alignments of the reads before splitting; every other label counts as post-split
alignnames_presplit = ["R1p", "R2p", "R1trims", "R2trims"]

# can optionally save this detailed breakdown by alignment source
# df_mapping_byalign.to_csv("Metadata/A07a2_mappingrate_detailed.tsv", sep='\t')



# final metadata out -----------------------------------------------------------
# join the TAURUS pre-split (full R1 and R2 mapping), then post-split

print("now grouping by wellprefix...")

# one row per well in the metadata table; wells without parsed logs stay NA
df_final = pd.DataFrame(index = metadata_well.wellprefix)

# the unnamed index becomes a column called 'index' after reset_index()
df_presplit = \
    df_mapping_byalign[df_mapping_byalign['Alignment'].isin(alignnames_presplit)
                      ].reset_index().groupby('index').agg('sum')
df_final['NumReadsIn'] = df_presplit.TotalReadsIn
df_final['UniqueMappedReads_PreSplit'] = df_presplit.UniqueMappedReads
df_final['MappingRate_PreSplit'] = df_presplit.UniqueMappedReads/df_presplit.TotalReadsIn

df_postsplit = \
    df_mapping_byalign[~df_mapping_byalign['Alignment'].isin(alignnames_presplit)
                      ].reset_index().groupby('index').agg('sum')

df_final['NumReadsIn_PostSplit'] = df_postsplit.TotalReadsIn
df_final['UniqueMappedReads_PostSplit'] = df_postsplit.UniqueMappedReads
df_final['MappingRate_PostSplit'] =  df_postsplit.UniqueMappedReads/df_postsplit.TotalReadsIn

# final combined mapping rates
# Nmappre + Nmappost/Nsplits; Nsplits = (Ninpost/Nunmapped) ~ 2-3 per read

print("doing final summary by well...")

df_final['Alignments_Total_SplitAdj'] = \
    df_final.UniqueMappedReads_PreSplit + \
    df_final.UniqueMappedReads_PostSplit / \
    (df_final.NumReadsIn_PostSplit/(df_final.MappingRate_PreSplit*df_final.NumReadsIn))
df_final['MappingRate_Total'] = \
    df_final.Alignments_Total_SplitAdj / df_final.NumReadsIn

# column QC
print("Number of NAs per column:")
print(df_final.isna().sum().to_string())

print("\nNumber of duplicated wells:")
ndupe=df_final.index.duplicated().sum()
print(ndupe)



# final export
df_final.to_csv("Metadata/A07a2_mappingrate.tsv", sep='\t')
print("done.\n\n")

In [6]:
%%bash
cat > ../Scripts/A07a3_dedupe.py

# A07a3_dedupe.py ==============================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd
import numpy as np

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
# per-well metadata table (CSV); provides 'wellprefix' and the 'A04a_log_picard' filepath column
metadata_well = pd.read_csv(filepath_wellmetadat)

# length-3 array of pd.NA placeholder values
nulltable = np.array([pd.NA, pd.NA, pd.NA]) 

def parse_dedupe(filepath):
    """
    Read the duplication metrics from one picard log.

    The file is read as a tab-delimited table, skipping '#' comment lines,
    and only its first data row is used.

    Parameters
    ----------
    filepath : str
        path to the picard duplication metrics/log file.

    Returns
    -------
    pd.Series labelled 'UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED',
    'PERCENT_DUPLICATION'. If the file cannot be read or lacks these columns,
    a Series with the same labels filled with NaN is returned, so that parsed
    and unparsed wells land in the same named columns.
    """
    target_cols = ['UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED', 'PERCENT_DUPLICATION']
    try:
        data_dedupe = pd.read_csv(filepath, delimiter = "\t",
                                  comment = "#", nrows = 1)[target_cols].transpose()[0]
        return(data_dedupe)
    except Exception:
        # best-effort: a bad file yields a row of NAs rather than stopping the script;
        # Exception (not a bare except) leaves KeyboardInterrupt/SystemExit alone
        print("could not parse picard log (values set to NA):")
        print(filepath)
        return(pd.Series(float("nan"), index = target_cols))

# picard column name --> tidy output column name
tidy_name_dict = {
    'UNPAIRED_READS_EXAMINED' : 'picard_nreadsin',
    'READ_PAIRS_EXAMINED' : 'picard_npairsin',
    'PERCENT_DUPLICATION' : 'picard_perc_dupe',
}



# gather metadata --------------------------------------------------------------

# picard log path for every well, and whether each log is actually on disk
filelist = metadata_well['A04a_log_picard']
boolean_fileexists = list(map(os.path.exists, filelist))

# parse only the logs that exist; rows are labelled with the matching wellprefix
wells_found = metadata_well['wellprefix'][boolean_fileexists]
list_picard = [parse_dedupe(path) for path in filelist[boolean_fileexists]]

# tidy the column names, then discard the read-pair count
df_picard = pd.DataFrame(list_picard, index = wells_found)
df_picard = df_picard.rename(columns = tidy_name_dict)
df_picard = df_picard.drop(columns = "picard_npairsin")


# report how many of the expected files were missing
frac_missing = 1 - sum(boolean_fileexists) / len(boolean_fileexists)
print(f"number of target files: {len(filelist)}")
print("fraction files missing: ")
print(round(frac_missing, 3))

# column QC: missing values per column
na_per_column = df_picard.isna().sum()
print("Number of NAs per column:")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = df_picard.index.duplicated().sum()
print(ndupe)



# final export (tab-delimited, wellprefix as first column)
df_picard.to_csv("Metadata/A07a3_dedupe.tsv", sep='\t')
print("done.\n\n")

In [7]:
%%bash
cat > ../Scripts/A07a4_global_mC_fracs.py

# A07a4_global_mC_fracs.py =====================================================

# setup ------------------------------------------------------------------------

import glob
import pandas as pd
import os

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)

attempt_dedupe = True # <-- attempt index repair?

# gather metadata --------------------------------------------------------------

# one A04d table is expected per unique batch number in the well metadata
filelist=[ "Metadata/A04d_mCfrac_" + str(batch_num) + ".tsv"
           for batch_num in pd.unique(metadata_well['batchnum']) ]
boolean_fileexists = [os.path.exists(f) for f in filelist]

# read the per-batch tables that exist and stack them into one table
list_mCfracs = [ pd.read_csv(f, delimiter = "\t") for f in pd.Series(filelist)[boolean_fileexists]]
df_mCfracs = pd.concat(list_mCfracs)
df_mCfracs = df_mCfracs.rename(columns = {"Well" : "wellprefix"})



# print number files, and how many of them were missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))

# column QC
print("Number of NAs per column:")
print(df_mCfracs.isna().sum().to_string())

print("\nNumber of duplicated wells:")
ndupe=df_mCfracs.wellprefix.duplicated().sum()
print(ndupe)



# check for dupe wells ---------------------------------------------------------
# the A04d scripts can generate duplicated wells due to the "append" option
# or early terminated jobs which may result in NAs
# thus we sort by number of CHs observed in desc order (NAs are placed last)
# and keep only the first row of each well, i.e. the one with the most CHs
if attempt_dedupe and ndupe != 0:
    print("attempting to dedupe...")
    df_mCfracs = df_mCfracs.sort_values('CH', ascending = False)
    df_mCfracs = df_mCfracs[~df_mCfracs.wellprefix.duplicated(keep = 'first')]



# final export
# wellprefix stays a regular column here, so the row index is not written out
df_mCfracs = df_mCfracs.sort_values('wellprefix').reset_index(drop=True)
df_mCfracs.to_csv("Metadata/A07a4_global_mC_fracs.tsv", sep='\t', index = False)
print("done.\n\n")

In [8]:
%%bash
cat > ../Scripts/A07a5_samtools_stats.py

# A07a5_samtools_stats.py ======================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
# per-well metadata table (CSV); provides 'wellprefix' and the 'A04c_txt_samstats' filepath column
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_samstats(filepath):
    """
    Collect selected summary numbers from one samtools stats text file.

    Only "label: value" lines (exactly one colon) whose label is listed in
    term_dict are kept. Each value is returned as a string: the text before
    the first tab following the colon.

    Returns a dict {tidy metric name: value string}.
    """

    # report label --> tidy output name
    term_dict = {
        'raw total sequences': 'FilteredSeqCount',
        'bases mapped' : 'BasesMapped',
        'error rate': 'ErrorRate'
        }

    report_dict = {}
    with open(filepath) as report:
        for line in report:
            # anything other than exactly one colon is not a "label: value" line
            if line.count(':') != 1:
                continue
            label, _, value = line.partition(':')
            tidy_name = term_dict.get(label)
            if tidy_name is not None:
                report_dict[tidy_name] = value.strip().split('\t')[0]

    return(report_dict)



# gather metadata --------------------------------------------------------------

# samtools stats path for every well (metadata_well was loaded in the setup section),
# and whether each file is actually on disk
filelist = metadata_well['A04c_txt_samstats']
boolean_fileexists = [os.path.exists(f) for f in filelist]

# parse only the files that exist; rows are labelled with the matching wellprefix
list_samstats = [parse_samstats(f) for f in filelist[boolean_fileexists]]
df_samstats = pd.DataFrame(list_samstats,
                        index=metadata_well['wellprefix'][boolean_fileexists])



# print fraction of files missing (the value is a 0-1 fraction, not a percentage)
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))

# column QC
print("Number of NAs per column:")
print(df_samstats.isna().sum().to_string())

print("\nNumber of duplicated wells:")
ndupe=df_samstats.index.duplicated().sum()
print(ndupe)



# final export
df_samstats.to_csv("Metadata/A07a5_samstats.tsv", sep='\t')
print("done.\n\n")

In [9]:
%%bash
cat > ../Scripts/A07a6_coverage.py

# A07a6_coverage.py ============================================================
# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd
import numpy as np

# per-well metadata table; its location comes from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)



# extract autosomal ------------------------------------------------------------

# candidate autosome names chr1 ... chr98 (more than any reference should need)
target_chroms = [f"chr{i}" for i in range(1, 99)]

# chromosome sizes table: no header, chromosome name in the first column (used as index)
autosomal_chroms = pd.read_csv(
    os.environ['ref_chromsizes'], sep = "\t", header = None, index_col = 0)

# keep only the candidates that are present in the reference
is_autosome = autosomal_chroms.index.isin(target_chroms)
autosomal_chroms = autosomal_chroms[is_autosome]

# column-wise sum of the sizes (a Series, one entry per remaining column)
total_autosomal_bases = autosomal_chroms.sum()

# from here on target_chroms holds just the autosomes found in the reference
target_chroms = autosomal_chroms.index



# gather metadata: base-lvl unique coverage levels -----------------------------

print("processing autosomal num sites with at least 1-fold coverage.")
print("if any filenames printed below, potentially corrupt files:")
def parse_coverage_unique(filepath):
    """
    Fraction of autosomal bases covered, from one per-chromosome site-count file.

    The file is read as whitespace-delimited with no header: the first column
    holds the number of sites, the second the chromosome name. Counts of the
    chromosomes in the module-level `target_chroms` are summed and divided by
    the module-level `total_autosomal_bases`.

    Returns the fraction as a float, or NaN (after printing the filepath) if
    the file cannot be read or a target chromosome is absent from it.
    """
    try:
        # raw string: "\s" in a normal string literal is an invalid escape sequence
        percent_coverage = \
            pd.read_csv(filepath, delimiter = r"\s+", header = None, index_col=1)
        percent_coverage = (percent_coverage.loc[target_chroms, 0].sum() / total_autosomal_bases).to_list()[0]
    except Exception:
        # best-effort: flag the file and carry on with a missing value;
        # Exception (not a bare except) leaves KeyboardInterrupt/SystemExit alone
        print("potentially corrupt file:")
        print(filepath)
        percent_coverage = np.nan
    return(percent_coverage)

# site-count file path for every well, and whether each file is actually on disk
filelist = metadata_well['A04c_txt_covnsites']
boolean_fileexists = list(map(os.path.exists, filelist))

# parse only the files that exist; rows are labelled with the matching wellprefix
wells_found = metadata_well['wellprefix'][boolean_fileexists]
list_unique = [parse_coverage_unique(path) for path in filelist[boolean_fileexists]]
df_unique = pd.DataFrame(list_unique, index = wells_found)
df_unique.columns = ["CoveragePerc1x"]



# report how many of the expected files were missing
frac_missing = 1 - sum(boolean_fileexists) / len(boolean_fileexists)
print(f"number of target files: {len(filelist)}")
print("fraction files missing: ")
print(round(frac_missing, 3))

# column QC: missing values per column
na_per_column = df_unique.isna().sum()
print("Number of NAs per column:")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = df_unique.index.duplicated().sum()
print(ndupe)



# export (tab-delimited, wellprefix as first column)
df_unique.to_csv("Metadata/A07a6_DNA_cov_percent1x.tsv", sep='\t')

print("done.\n\n")



# total coverage levels for chrX/chrY ------------------------------------------

print("processing total coverage levels per chrom.")
print("if any filenames printed below, potentially corrupt files:")
def parse_coverage_total(filepath):
    """
    Ratio of chrX to chrY total coverage, from one per-chromosome coverage file.

    The file is read as whitespace-delimited with no header: chromosome name in
    the first column (used as index), coverage in the next column.

    Returns
    -------
    float
        chrX / chrY coverage;
        inf when chrX is present but chrY is absent from the file;
        NaN (after printing the filepath) if the file cannot be read or
        chrX is absent.
    """
    try:
        # raw string: "\s" in a normal string literal is an invalid escape sequence
        total_cov_by_chr = pd.read_csv(filepath, delimiter = r"\s+", header = None, index_col=0)
        has_chrX = any(total_cov_by_chr.index=="chrX")
        has_chrY = any(total_cov_by_chr.index=="chrY")
        if has_chrX and (not has_chrY):
            # numpy is imported as np; the unqualified module name raised a NameError
            # that was swallowed below, turning this case into NaN
            coverage_XdivY = np.inf
        else:
            coverage_XdivY = total_cov_by_chr.loc['chrX', ] / total_cov_by_chr.loc['chrY', ]
            coverage_XdivY = coverage_XdivY.tolist()[0]
    except Exception:
        # best-effort: flag the file and carry on with a missing value;
        # Exception (not a bare except) leaves KeyboardInterrupt/SystemExit alone
        print(filepath)
        coverage_XdivY = np.nan
    return(coverage_XdivY)

# total-coverage file path for every well, and whether each file is actually on disk
filelist = metadata_well['A04c_txt_covtot']
boolean_fileexists = list(map(os.path.exists, filelist))

# parse only the files that exist; rows are labelled with the matching wellprefix
wells_found = metadata_well['wellprefix'][boolean_fileexists]
list_total = [parse_coverage_total(path) for path in filelist[boolean_fileexists]]
df_total = pd.DataFrame(list_total, index = wells_found)
df_total.columns = ["CoverageXdivY"]



# report how many of the expected files were missing
frac_missing = 1 - sum(boolean_fileexists) / len(boolean_fileexists)
print(f"number of target files: {len(filelist)}")
print("fraction files missing: ")
print(round(frac_missing, 3))

# column QC: missing values per column
na_per_column = df_total.isna().sum()
print("Number of NAs per column:")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = df_total.index.duplicated().sum()
print(ndupe)



# final export (tab-delimited, wellprefix as first column)
df_total.to_csv("Metadata/A07a6_DNA_cov_chrXdivY.tsv", sep='\t')
print("done.\n\n")

In [10]:
%%bash
cat > ../Scripts/A07a7_contacts.py

# A07a7_contacts.py ============================================================
# setup ------------------------------------------------------------------------

import os
import pandas as pd

# location of the per-well metadata table, taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
# per-well metadata table (CSV); provides 'wellprefix' and the 'A06a_3c_metadat' filepath column
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_pairs(filepath, prefix = ""):
    """
    Load one tab-delimited contact metadata table as a DataFrame.

    `prefix` is accepted but not used.
    """
    contact_table = pd.read_csv(filepath, sep = "\t")
    return contact_table



# gather metadata --------------------------------------------------------------

# contact metadata path for every well, and whether each file is actually on disk
filelist = metadata_well['A06a_3c_metadat']
boolean_fileexists = list(map(os.path.exists, filelist))

# read only the tables that exist and stack them;
# the stacked rows are then labelled with the wellprefix of each existing file
wells_found = metadata_well['wellprefix'][boolean_fileexists]
list_contacts = [parse_pairs(path) for path in filelist[boolean_fileexists]]
df_contacts = pd.concat(list_contacts)
df_contacts = df_contacts.set_index(wells_found)



# report how many of the expected files were missing
frac_missing = 1 - sum(boolean_fileexists) / len(boolean_fileexists)
print(f"number of target files: {len(filelist)}")
print("fraction files missing: ")
print(round(frac_missing, 3))

# column QC: missing values per column
na_per_column = df_contacts.isna().sum()
print("Number of NAs per column:")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = df_contacts.index.duplicated().sum()
print(ndupe)



# final export (tab-delimited, wellprefix as first column)
df_contacts.to_csv("Metadata/A07a7_contact_metadat.tsv", sep='\t')
print("done.\n\n")

In [11]:
%%bash
cat > ../Scripts/A07b_compile_metadata.py

# A07b_compile_metadata.py =====================================================
# assumes no changes to script output names from A07a*s

# setup ------------------------------------------------------------------------

import pandas as pd
    


# load tables ------------------------------------------------------------------
# basic aggregation based on wellprefix (should be index of each A07a* output)

def read_tbl_wrapper(filepath, prefix = ""):
    """
    Load one tab-delimited table, using its first column as the index.

    Every column name gets `prefix` prepended (no change for the default "").
    """
    table = pd.read_csv(filepath, sep = "\t", index_col = 0)
    return table.add_prefix(prefix)

# outputs of the A07a* scripts, in the order their columns appear in the compiled table
target_files = [
    "Metadata/A07a1_trimming.tsv",
    "Metadata/A07a2_mappingrate.tsv",
    "Metadata/A07a3_dedupe.tsv",
    "Metadata/A07a4_global_mC_fracs.tsv",
    "Metadata/A07a5_samstats.tsv",
    "Metadata/A07a6_DNA_cov_chrXdivY.tsv",
    "Metadata/A07a6_DNA_cov_percent1x.tsv",
    "Metadata/A07a7_contact_metadat.tsv",
]

# Note: joining all tables may yield
# "InvalidIndexError: Reindexing only valid with uniquely valued Index objects"
# which indicates that there's a metadata file containing duplicated wells
# this usually is a problem with script A07a4
tables_by_well = list(map(read_tbl_wrapper, target_files))
metadata_mC = pd.concat(tables_by_well, axis = 1)



# column QC: missing values per column
na_per_column = metadata_mC.isna().sum()
print("Number of NAs per column:")
print("(note, these may vary from A07a* scripts b/c those skip missing files)")
print(na_per_column.to_string())

# wells that appear more than once in the index
print("\nNumber of duplicated wells:")
ndupe = metadata_mC.index.duplicated().sum()
print(ndupe)



# final export (tab-delimited, wellprefix as first column)
metadata_mC.to_csv("Metadata/A07b_compiled_metadata.tsv", sep = "\t")
print("done.\n\n")