In [10]:
# # A05_compile_metadata_DNA overall cmds ===========================================

# qsub Scripts/A05_compile_DNA_metadata.sub # ‡

# # ‡ fast enough to run interactively

## alternatively, use below to run interactively

In [11]:
# # for interactive mode, set the working directory and the environment variables below,
# # then comment out the "%%bash" / "cat > ..." lines of a cell and run its python code in-notebook


# import os
# os.chdir('../') # move to $dir_proj
# os.environ['metadat_well'] = "Metadata/A01c_well_filepath.csv"
# os.environ['ref_chromsizes'] = "/u/project/cluo/chliu/Genomes/IGVF_hg38_pluslambda/chromsizes.tsv"

# # alternatively, loop through "../snmCT_parameters.env":
# import os
# import pandas as pd

# envvar_needed = ['dir_proj', 'metadat_well', 'ref_chromsizes']
# try:
#     os.environ['metadat_well']
# except KeyError:
#     envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None
#                ).set_axis(['varname', 'varpath'], axis = 1
#                ).query('varname in @envvar_needed')
#     for index, row in envspec.iterrows():
#         os.environ[row["varname"]] = row["varpath"]
# os.chdir(os.environ['dir_proj'])


## A05a. DNA+RNA: fastp trimming

In [12]:
%%bash
cat > ../Scripts/A05a_trimming.py

# A05a_trimming.py =============================================================



# setup ========================================================================

import os
import re
import pandas as pd
import glob

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_fastp_report(filepath):
    """
    extract trimming metrics from the 'summary' section of one fastp .json report

    filepath: path to the .json report
    returns: dict with keys nreads_pretrim, percreads_passtrim, q20_pretrim,
        q20_posttrim, r1_len, r2_len, gc_perc
    raises: OSError if the file cannot be opened, ValueError if it is not valid
        JSON, KeyError if an expected 'summary' field is absent
    """
    import json  # local import: json is not part of this script's import block

    # the report is a nested document rather than a table, so read it with json
    with open(filepath) as report:
        summary = json.load(report)['summary']
    before = summary['before_filtering']
    after = summary['after_filtering']

    # a report with zero input reads would otherwise raise ZeroDivisionError
    nreads_pretrim = before['total_reads']
    if nreads_pretrim:
        percreads_passtrim = after['total_reads'] / nreads_pretrim
    else:
        percreads_passtrim = float('nan')

    dict_out = {
        'nreads_pretrim' : nreads_pretrim,
        'percreads_passtrim' : percreads_passtrim,
        # q20_* columns now read the q20 rate (previously filled from 'q30_rate')
        'q20_pretrim' : before['q20_rate'],
        'q20_posttrim' : after['q20_rate'],
        'r1_len' : after['read1_mean_length'],
        'r2_len' : after['read2_mean_length'],
        'gc_perc' : after['gc_content']}
    return(dict_out)



# gather metadata ==============================================================

print("\n\nfastp .json...")

# expected fastp report of every well; only reports present on disk are parsed
filelist = metadata_well['A03a_json_fastp']
boolean_fileexists = [os.path.exists(f) for f in filelist]
list_fastp = [parse_fastp_report(f) for f in filelist[boolean_fileexists]]
df_fastp = pd.DataFrame(list_fastp,
                        index = metadata_well['wellprefix'][boolean_fileexists])

# percent files missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
boolean_filemissing = [not f for f in boolean_fileexists]
if sum(boolean_filemissing) != 0:
    print("missing " + str(sum(boolean_filemissing)) + " files:")
    print(filelist[boolean_filemissing].to_string())

# column QC
print("number of NAs per column:")
print(df_fastp.isna().sum().to_string())

print("number of duplicated wells:")
ndupe = df_fastp.index.duplicated().sum()
print(ndupe)

# final export
# (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
print("exporting Metadata/A05a_trimming.tsv of shape: {}".format(df_fastp.shape))
df_fastp.to_csv("Metadata/A05a_trimming.tsv", sep = '\t')
print("\n\n")


## A05b. DNA: bismark mapping rates

In [13]:
%%bash
cat > ../Scripts/A05b_DNA_maprate.py

# A05b_DNA_maprate.py ==========================================================



# setup ========================================================================

import os
import glob
import itertools
import re
import pandas as pd

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)


def parse_bismark_report(filepath):

    """
    collect selected metrics from one bismark .txt report
    adapted from YAP @ https://github.com/lhqing/cemba_data, extended to cover
    both paired-end (PE) & single-end (SE) reports
    terms of limited interest are left commented out in the lookup table
    note that paired-end metrics usually count fragments rather than reads

    filepath: path to the bismark report
    returns: dict mapping tidy metric name -> value as a string
        (text after the colon up to the first tab, with '%' stripped from the ends)
    """

    # report label (text left of the colon) -> tidy output name
    tidy_names = {
        'Sequence pairs analysed in total': 'TotalReadPairsIn',
        'Sequences analysed in total': 'TotalReadsIn',
        'Number of paired-end alignments with a unique best hit': 'UniqueMappedPairs',
        'Number of alignments with a unique best hit from the different alignments': 'UniqueMappedReads',
        'Mapping efficiency': 'MappingRate',
#         'Sequence pairs with no alignments under any condition': 'UnmappedPairs',
#         'Sequences with no alignments under any condition': 'UnmappedReads',
#         'Sequences did not map uniquely': 'AmbigReads',
#         'Sequence pairs did not map uniquely': 'AmbigPairs',
#         'CT/GA/CT': 'ReadsOT',
#         'GA/CT/CT': 'ReadsOB',
#         'GA/CT/GA': 'ReadsCTOT',
#         'CT/GA/GA': 'ReadsCTOB',
#         'CT/CT': 'ReadsOT',
#         'CT/GA': 'ReadsOB',
#         'GA/CT': 'ReadsCTOT',
#         'GA/GA': 'ReadsCTOB',
#         'Total number of C\'s analysed': 'TotalC',
        'C methylated in CpG context': 'BismarkmCGRate',
        'C methylated in CHG context': 'BismarkmCHGRate',
        'C methylated in CHH context': 'BismarkmCHHRate',
        'C methylated in unknown context (CN or CHN)' : 'BismarkmCNCHNRate',
        'C methylated in Unknown context (CN or CHN)' : 'BismarkmCNCHNRate'
        }

    with open(filepath) as report:
        parsed = {}
        for line in report:
            fields = line.split(':')
            # only lines with exactly one colon are "label: value" entries
            if len(fields) != 2:
                continue
            label, value = fields
            if label in tidy_names:
                parsed[tidy_names[label]] = value.strip().split('\t')[0].strip('%')

    return(parsed)





# gather metadata ==============================================================


# paired-end -------------------------------------------------------------------

def compile_bismark_logs(logtype):
    """
    parse every available bismark report of one alignment mode and export the table

    logtype: suffix shared by the metadata column ('A04a_txt_bismark_<logtype>')
        and the output file ('Metadata/A05b_DNA_maprate_<logtype>.tsv')
    returns: DataFrame of parsed metrics, indexed by 'wellprefix'
    """
    filepath_out = "Metadata/A05b_DNA_maprate_" + logtype + ".tsv"

    print("\n\n" + logtype + " logs...")
    filelist = metadata_well['A04a_txt_bismark_' + logtype]
    boolean_fileexists = [os.path.exists(f) for f in filelist]
    list_bismark = [parse_bismark_report(f) for f in filelist[boolean_fileexists]]
    df_bismark = pd.DataFrame(list_bismark,
                              index = metadata_well['wellprefix'][boolean_fileexists])

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    boolean_filemissing = [not f for f in boolean_fileexists]
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_bismark.isna().sum().to_string())

    print("number of duplicated wells:")
    ndupe = df_bismark.index.duplicated().sum()
    print(ndupe)

    # final export
    # (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(filepath_out, df_bismark.shape))
    df_bismark.to_csv(filepath_out, sep = '\t')
    print("\n\n")

    return(df_bismark)


# paired-end alignments
df_bismark_PE = compile_bismark_logs("PE")

# read 1 singletons from trimming ----------------------------------------------
df_bismark_SE1trim = compile_bismark_logs("SE1trim")

# read 2 singletons from trimming ----------------------------------------------
df_bismark_SE2trim = compile_bismark_logs("SE2trim")

# read 1 singletons unmapped in paired-end mode --------------------------------
df_bismark_SE1unmap = compile_bismark_logs("SE1unmap")

# read 2 singletons unmapped in paired-end mode --------------------------------
df_bismark_SE2unmap = compile_bismark_logs("SE2unmap")


## A05c. DNA: picard deduplication 

In [14]:
%%bash
cat > ../Scripts/A05c_DNA_dedupe.py

# A05c_DNA_dedupe.py ===========================================================

# setup ========================================================================

import os
import glob
import pandas as pd
import numpy as np

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)

# columns kept from each picard metrics table
picard_columns = ['UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED', 'PERCENT_DUPLICATION']

# fallback row for unreadable files: labelled with the same column names as a
# successfully parsed row, so both kinds of row align when the DataFrame is built
nulltable = pd.Series([np.nan] * len(picard_columns), index = picard_columns)

def parse_picard_dedupe(filepath):
    """
    read the first data row of one picard metrics file

    filepath: path to the tab-delimited metrics file (lines starting with '#' are skipped)
    returns: Series indexed by the names in picard_columns; all-NaN if the
        file cannot be read or lacks those columns (the filepath is then printed)
    """
    try:
        data_dedupe = pd.read_csv(filepath, delimiter = "\t",
                         comment = "#", nrows = 1)[picard_columns].transpose()[0]
        return(data_dedupe)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit still stop the script
        print("error reading file: " + filepath)
        return(nulltable.copy())

tidy_name_dict = {'PERCENT_DUPLICATION' : 'picard_perc_dupe',
                  'READ_PAIRS_EXAMINED' : 'picard_npairsin',
                  'UNPAIRED_READS_EXAMINED' : 'picard_nreadsin'}



# gather metadata ==============================================================

# paired-end -------------------------------------------------------------------

def compile_picard_logs(readtype):
    """
    parse every available picard metrics file of one read type and export the table

    readtype: suffix shared by the metadata column ('A04b_txt_picard_<readtype>')
        and the output file ('Metadata/A05c_DNA_picard_<readtype>.tsv')
    returns: DataFrame of parsed metrics with tidied column names, indexed by 'wellprefix'
    """
    filepath_out = "Metadata/A05c_DNA_picard_" + readtype + ".tsv"

    print("\n\n" + readtype + " logs...")
    filelist = metadata_well['A04b_txt_picard_' + readtype]
    boolean_fileexists = [os.path.exists(f) for f in filelist]
    list_picard = [parse_picard_dedupe(f) for f in filelist[boolean_fileexists]]
    df_picard = pd.DataFrame(list_picard,
                             index = metadata_well['wellprefix'][boolean_fileexists]
                             ).rename(columns = tidy_name_dict)

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    boolean_filemissing = [not f for f in boolean_fileexists]
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_picard.isna().sum().to_string())

    print("number of duplicated wells:")
    ndupe = df_picard.index.duplicated().sum()
    print(ndupe)

    # final export
    # (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(filepath_out, df_picard.shape))
    df_picard.to_csv(filepath_out, sep = '\t')
    print("\n\n")

    return(df_picard)


# paired-end
df_picard_PE = compile_picard_logs("PE")

# single-end -------------------------------------------------------------------
df_picard_SE = compile_picard_logs("SE")


## A05d. DNA: mC fractions

In [15]:
%%bash
cat > ../Scripts/A05d_DNA_global_mCfracs.py

# A05d_DNA_global_mCfracs.py ==========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)


# gather metadata --------------------------------------------------------------

# one mC fraction table is expected per unique 'batchnum' in the metadata
filelist=pd.Series([ "Metadata/A04d_mCfrac_" + str(i) + ".tsv"
                for i in pd.unique(metadata_well['batchnum']) ])
boolean_fileexists = [os.path.exists(f) for f in filelist]

# percent files missing
# (reported before the tables are read, so the list of missing files is still
#  shown if pd.concat below fails because no table exists)
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
boolean_filemissing = [not f for f in boolean_fileexists]
if sum(boolean_filemissing) != 0:
    print("missing " + str(sum(boolean_filemissing)) + " files:")
    print(filelist[boolean_filemissing].to_string())

# stack the per-batch tables and index by well
list_mCfracs = [ pd.read_csv(f, delimiter="\t") for f in filelist[boolean_fileexists] ]
df_mCfracs = pd.concat(list_mCfracs)
df_mCfracs = df_mCfracs.rename(columns = {"Well" : "wellprefix"})
df_mCfracs = df_mCfracs.set_index("wellprefix")

# column QC
print("number of NAs per column:")
print(df_mCfracs.isna().sum().to_string())

print("number of duplicated wells:")
ndupe = df_mCfracs.index.duplicated().sum()
print(ndupe)

# final export
# (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
print("exporting Metadata/A05d_DNA_global_mCfracs.tsv of shape: {}".format(df_mCfracs.shape))
df_mCfracs.to_csv("Metadata/A05d_DNA_global_mCfracs.tsv", sep = '\t')
print("\n\n")


## A05e. DNA: samtools stats

In [16]:
%%bash
cat > ../Scripts/A05e_DNA_samtools.py

# A05e_DNA_samtools.py =========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_samstats(filepath):
    """
    collect selected summary metrics from one samtools stats text file

    filepath: path to the stats file
    returns: dict mapping tidy metric name -> value as a string
        (text after the colon, up to the first tab)
    """

    # label (text left of the colon) -> tidy output name
    tidy_names = {
        'raw total sequences': 'FilteredSeqCount',
        'error rate': 'ErrorRate',
        'insert size average': 'InsertSizeAvg',
        'insert size standard deviation': 'InsertSizeSD',
        }

    with open(filepath) as report:
        parsed = {}
        for line in report:
            fields = line.split(':')
            # only lines with exactly one colon are "label: value" entries
            if len(fields) != 2:
                continue
            label, value = fields
            if label in tidy_names:
                parsed[tidy_names[label]] = value.strip().split('\t')[0]

    return(parsed)




# gather metadata --------------------------------------------------------------

# paired-end -------------------------------------------------------------------

def compile_samstats(readtype, description, drop_columns = ()):
    """
    parse every available samtools stats file of one read type and export the table

    readtype: suffix shared by the metadata column ('A04e_txt_samstats_<readtype>')
        and the output file ('Metadata/A05e_DNA_samstats_<readtype>.tsv')
    description: label printed at the top of the log section
    drop_columns: metric columns removed before export; names that are absent
        are skipped instead of raising KeyError (e.g. when no report was parsed)
    returns: DataFrame of parsed metrics, indexed by 'wellprefix'
    """
    filepath_out = "Metadata/A05e_DNA_samstats_" + readtype + ".tsv"

    print("\n\n" + description + "...")
    filelist = metadata_well["A04e_txt_samstats_" + readtype]
    boolean_fileexists = [os.path.exists(f) for f in filelist]
    list_samstats = [ parse_samstats(f) for f in filelist[boolean_fileexists] ]
    df_samstats = pd.DataFrame(list_samstats,
                               index = metadata_well['wellprefix'][boolean_fileexists]
                               ).drop(list(drop_columns), axis = 1, errors = 'ignore')

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    boolean_filemissing = [not f for f in boolean_fileexists]
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_samstats.isna().sum().to_string())

    print("number of duplicated wells:")
    ndupe = df_samstats.index.duplicated().sum()
    print(ndupe)

    # final export
    # (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(filepath_out, df_samstats.shape))
    df_samstats.to_csv(filepath_out, sep = '\t')
    print("\n\n")

    return(df_samstats)


# paired-end
df_samstats_PE = compile_samstats("PE", "paired-end")

# single-end -------------------------------------------------------------------
# insert size columns are excluded from the single-end table
df_samstats_SE = compile_samstats("SE", "single-end",
                                  drop_columns = ['InsertSizeAvg', 'InsertSizeSD'])


## A05f. DNA Coverage

In [17]:
%%bash
cat > ../Scripts/A05f_DNA_cov.py

# A05f_DNA_cov.py ==============================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd
import numpy as np

# load the well-level metadata table; its path is taken from the 'metadat_well' environment variable
filepath_wellmetadat = os.environ['metadat_well']
metadata_well = pd.read_csv(filepath_wellmetadat)


# gather metadata --------------------------------------------------------------


# extract autosomal ------------------------------------------------------------

# the chromsizes table is read once (a duplicate first computation of
# total_autosomal_bases was overwritten here and has been removed)
# candidate names chr1..chr98; only those present in the chromsizes file are kept
target_chroms = ["chr" + str(i) for i in range(1, 99)]
autosomal_chroms = \
    pd.read_csv(os.environ['ref_chromsizes'],
                sep = "\t", header = None, index_col = 0)
autosomal_chroms = autosomal_chroms[autosomal_chroms.index.isin(target_chroms)]
# column-wise sum: a Series with one entry per non-index column of the chromsizes file
total_autosomal_bases = autosomal_chroms.sum()
target_chroms = autosomal_chroms.index



# gather metadata: base-lvl unique coverage levels -----------------------------

print("processing autosomal num sites with at least 1-fold coverage.")
print("if any filenames printed below, potentially corrupt files:")
def parse_coverage_unique(filepath):
    """
    fraction of target-chromosome bases reported in one whitespace-delimited
    per-chromosome site-count file (column 0 = count, column 1 = chromosome name)

    filepath: path to the site-count file
    returns: summed counts over chromosomes in the module-level target_chroms,
        divided by the first entry of total_autosomal_bases (a fraction, not
        multiplied by 100); np.nan if the file cannot be parsed, in which case
        the quoted filepath is printed
    """
    try:
        # raw string: "\s" in a plain string literal is an invalid escape sequence
        percent_coverage = \
            pd.read_csv(filepath, delimiter = r"\s+", header = None, index_col=1)
        percent_coverage = (
            percent_coverage.loc[
                np.intersect1d(target_chroms, percent_coverage.index), 0
            ].sum() / total_autosomal_bases).to_list()[0]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit still stop the script
        print("'" + filepath + "'")
        percent_coverage = np.nan
    return(percent_coverage)

# expected site-count file of every well; only files present on disk are parsed
filelist = metadata_well['A04f_txt_covnsites']
boolean_fileexists = [os.path.exists(f) for f in filelist]
list_unique = [parse_coverage_unique(file) for file in filelist[boolean_fileexists]]
# column named at construction: assigning .columns afterwards fails on an
# empty frame (no file found), which has zero columns
df_unique = pd.DataFrame(list_unique,
                        index = metadata_well['wellprefix'][boolean_fileexists],
                        columns = ["CoveragePerc1x"])


# percent files missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
boolean_filemissing = [not f for f in boolean_fileexists]
if sum(boolean_filemissing) != 0:
    print("missing " + str(sum(boolean_filemissing)) + " files:")
    print(filelist[boolean_filemissing].to_string())

# column QC
print("number of NAs per column:")
print(df_unique.isna().sum().to_string())

print("number of duplicated wells:")
ndupe = df_unique.index.duplicated().sum()
print(ndupe)

# final export
# (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
print("exporting Metadata/A05f_DNA_cov_percent1x.tsv of shape: {}".format(df_unique.shape))
df_unique.to_csv("Metadata/A05f_DNA_cov_percent1x.tsv", sep = '\t')
print("\n\n")



# total coverage levels for chrX/chrY ------------------------------------------

print("processing total coverage levels per chrom.")
print("if any filenames printed below, potentially corrupt files:")
def parse_coverage_total(filepath):
    """
    chrX / chrY coverage ratio from one whitespace-delimited per-chromosome
    coverage file (column 0 = chromosome name, column 1 = coverage value)

    filepath: path to the coverage file
    returns: np.nan if neither chrX nor chrY is listed, np.inf if chrX is
        listed without chrY, otherwise chrX divided by chrY (first value column);
        np.nan with the quoted filepath printed if anything fails, which
        includes a file listing chrY but not chrX (the chrX lookup raises KeyError)
    """
    try:
        # raw string: "\s" in a plain string literal is an invalid escape sequence
        total_cov_by_chr = pd.read_csv(filepath, delimiter = r"\s+", header = None, index_col=0)
        has_chrX = any(total_cov_by_chr.index=="chrX")
        has_chrY = any(total_cov_by_chr.index=="chrY")
        if not has_chrX and not has_chrY:
            coverage_XdivY = np.nan
        elif has_chrX and not has_chrY:
            coverage_XdivY = np.inf
        else:
            coverage_XdivY = total_cov_by_chr.loc['chrX', ] / total_cov_by_chr.loc['chrY', ]
            coverage_XdivY = coverage_XdivY.tolist()[0]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit still stop the script
        print("'" + filepath + "'")
        coverage_XdivY = np.nan
    return(coverage_XdivY)

# expected total-coverage file of every well; only files present on disk are parsed
filelist = metadata_well['A04f_txt_covtot']
boolean_fileexists = [os.path.exists(f) for f in filelist]
list_total = [parse_coverage_total(file) for file in filelist[boolean_fileexists]]
# column named at construction: assigning .columns afterwards fails on an
# empty frame (no file found), which has zero columns
df_total = pd.DataFrame(list_total,
             index = metadata_well['wellprefix'][boolean_fileexists],
             columns = ["CoverageXdivY"])


# percent files missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
boolean_filemissing = [not f for f in boolean_fileexists]
if sum(boolean_filemissing) != 0:
    print("missing " + str(sum(boolean_filemissing)) + " files:")
    print(filelist[boolean_filemissing].to_string())

# column QC
print("number of NAs per column:")
print(df_total.isna().sum().to_string())

print("number of duplicated wells:")
ndupe = df_total.index.duplicated().sum()
print(ndupe)

# final export
# (shape is passed as one tuple; unpacking it into the single "{}" printed only the row count)
print("exporting Metadata/A05f_DNA_cov_chrXdivY.tsv of shape: {}".format(df_total.shape))
df_total.to_csv("Metadata/A05f_DNA_cov_chrXdivY.tsv", sep = '\t')
print("\n\n")

## A05. run helper script

In [18]:
%%bash
cat > ../Scripts/A05_compile_DNA_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A05_compile_DNA.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=12G
#$ -N A05_compile_DNA
#$ -t 1-6
#$ -hold_jid A04a_bismark,A04b_filter_mC,A04c_make_allc,A04d_mCfracs,A04f_global_mC_stats,A04g_coverage_DNA



echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "



# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment VAR=value line of the parameter file
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# run each helper script (A05*) ------------------------------------------------

# note: in practice these can each be submitted interactively/as its own task,
# as some of these scripts should be much lower resource than others;
# however, this -t 1-6 job parallelization is just for tidiness

echo "metadata script # $SGE_TASK_ID running:"

# map the array task id onto one helper script
case $SGE_TASK_ID in
  1) helper_script="Scripts/A05a_trimming.py" ;;
  2) helper_script="Scripts/A05b_DNA_maprate.py" ;;
  3) helper_script="Scripts/A05c_DNA_dedupe.py" ;;
  4) helper_script="Scripts/A05d_DNA_global_mCfracs.py" ;;
  5) helper_script="Scripts/A05e_DNA_samtools.py" ;;
  6) helper_script="Scripts/A05f_DNA_cov.py" ;;
  *) helper_script="" ;;
esac

# keep the helper's exit status, so a failed python script is visible in the
# log and in the job's exit code instead of always being reported as completed
exit_status=0
if [[ -n "$helper_script" ]]; then
    echo "python $helper_script"
    python "$helper_script"
    exit_status=$?
else
    echo "no helper script is assigned to task id '$SGE_TASK_ID'; nothing run."
fi


if [[ $exit_status -eq 0 ]]; then
    echo "completed 'A05_compile_DNA_metadata.'"
else
    echo "'A05_compile_DNA_metadata' failed: python exited with status $exit_status"
fi

echo " "
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

exit $exit_status

