In [None]:
# ## just one script to run
# qsub Scripts/A06_compile_DNA_metadata.sub

## A06a. DNA+RNA: fastp trimming

In [None]:
%%bash
cat > ../Scripts/A06a_trimming.py

# A06a_trimming.py =============================================================

# setup ------------------—------------------—----------------------------------

import re
import pandas as pd
import glob

import os
# Path of the well-level metadata table, taken from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Metadata table; its 'A03a_json_fastp' and 'wellprefix' columns are used below.
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_fastp_report(filepath):
    """
    Extract summary QC metrics from one fastp JSON report.

    Parameters
    ----------
    filepath : str or path-like
        Location of the fastp `.json` report.

    Returns
    -------
    dict
        nreads_pretrim     : total reads before filtering
        percreads_passtrim : reads after filtering / reads before filtering
                             (NaN when the report lists zero reads before filtering)
        q20_pretrim        : Q20 rate before filtering
        q20_posttrim       : Q20 rate after filtering
        r1_len, r2_len     : mean read 1 / read 2 length after filtering
        gc_perc            : GC content after filtering

    Raises
    ------
    OSError if the file cannot be opened; KeyError if an expected field is absent.
    """
    # local import: the script header does not import json
    import json

    # the report is a nested JSON document; read it directly instead of coercing
    # it into a DataFrame first
    with open(filepath) as handle:
        summary = json.load(handle)['summary']
    before = summary['before_filtering']
    after = summary['after_filtering']

    nreads_in = before['total_reads']
    # a report with no input reads would otherwise raise ZeroDivisionError
    frac_pass = after['total_reads'] / nreads_in if nreads_in else float('nan')

    dict_out = {
        'nreads_pretrim' : nreads_in,
        'percreads_passtrim' : frac_pass,
        # the q20_* columns previously carried the Q30 rate; use the Q20 rate
        # so that the values match the column names
        'q20_pretrim' : before['q20_rate'],
        'q20_posttrim' : after['q20_rate'],
        'r1_len' : after['read1_mean_length'],
        'r2_len' : after['read2_mean_length'],
        'gc_perc' : after['gc_content']}
    return dict_out



# gather metadata ------------------—------------------—------------------------

# parse one fastp report per metadata row, keeping the metadata row order
fastp_rows = []
for json_path in metadata_well['A03a_json_fastp']:
    fastp_rows.append(parse_fastp_report(json_path))

# rows are labelled by well prefix and written as a tab-separated table
trimming_table = pd.DataFrame(fastp_rows, index=metadata_well['wellprefix'])
trimming_table.to_csv("Metadata/A06a_trimming.tsv", sep='\t')

## A06b. DNA: bismark mapping rates

In [None]:
%%bash
cat > ../Scripts/A06b_DNA_maprate.py

# A06b_DNA_maprate.py ==========================================================

# setup ------------------—------------------—----------------------------------

import glob
import itertools
import re
import pandas as pd

import os
# Path of the well-level metadata table, taken from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Metadata table; its 'wellprefix' and 'A04a_txt_bismark_*' columns are used below.
metadata_well = pd.read_csv(filepath_wellmetadat)

    
def parse_bismark_report(filepath):
    """
    Collect selected "label: value" fields from one bismark report (.txt).

    Adapted from YAP @ https://github.com/lhqing/cemba_data so that both
    paired-end and single-end reports are handled. Paired-end metrics usually
    count fragments rather than reads. Labels of limited interest are kept
    below as commented-out entries.

    Returns a dict mapping the tidy field name to the value as a string: the
    text after the colon, up to the first tab, with '%' stripped from its ends.
    Only lines containing exactly one ':' are considered.
    """

    wanted_fields = {
        'Sequence pairs analysed in total': 'TotalReadPairsIn',
        'Sequences analysed in total': 'TotalReadsIn',
        'Number of paired-end alignments with a unique best hit': 'UniqueMappedPairs',
        'Number of alignments with a unique best hit from the different alignments': 'UniqueMappedReads',
        'Mapping efficiency': 'MappingRate',
#         'Sequence pairs with no alignments under any condition': 'UnmappedPairs',
#         'Sequences with no alignments under any condition': 'UnmappedReads',
#         'Sequences did not map uniquely': 'AmbigReads',
#         'Sequence pairs did not map uniquely': 'AmbigPairs',
#         'CT/GA/CT': 'ReadsOT',
#         'GA/CT/CT': 'ReadsOB',
#         'GA/CT/GA': 'ReadsCTOT',
#         'CT/GA/GA': 'ReadsCTOB',
#         'CT/CT': 'ReadsOT',
#         'CT/GA': 'ReadsOB',
#         'GA/CT': 'ReadsCTOT',
#         'GA/GA': 'ReadsCTOB',
#         'Total number of C\'s analysed': 'TotalC',
        'C methylated in CpG context': 'BismarkmCGRate',
        'C methylated in CHG context': 'BismarkmCHGRate',
        'C methylated in CHH context': 'BismarkmCHHRate',
        # both capitalisations of "unknown" map to the same output field
        'C methylated in unknown context (CN or CHN)': 'BismarkmCNCHNRate',
        'C methylated in Unknown context (CN or CHN)': 'BismarkmCNCHNRate',
    }

    parsed = {}
    with open(filepath) as handle:
        for raw_line in handle:
            pieces = raw_line.split(':')
            # skip anything that is not a single "label:value" pair
            if len(pieces) != 2:
                continue
            label, value = pieces
            if label in wanted_fields:
                parsed[wanted_fields[label]] = value.strip().split('\t')[0].strip('%')

    return parsed



# gather metadata ------------------—------------------—------------------------

# One table is compiled per bismark report type. Keys are the metadata columns
# holding the report paths, values are the output files:
#   PE        paired-end bismark report file
#   SE1trim   read 1 singletons from trimming
#   SE1unmap  read 1 bismark unmapped in PE mode
#   SE2trim   read 2 singletons from trimming
#   SE2unmap  read 2 bismark unmapped in PE mode
bismark_outputs = {
    'A04a_txt_bismark_PE': "Metadata/A06b_DNA_maprate_PE.tsv",
    'A04a_txt_bismark_SE1trim': "Metadata/A06b_DNA_maprate_SE1trim.tsv",
    'A04a_txt_bismark_SE1unmap': "Metadata/A06b_DNA_maprate_SE1unmap.tsv",
    'A04a_txt_bismark_SE2trim': "Metadata/A06b_DNA_maprate_SE2trim.tsv",
    'A04a_txt_bismark_SE2unmap': "Metadata/A06b_DNA_maprate_SE2unmap.tsv",
}

for report_column, out_path in bismark_outputs.items():
    # parse every report listed in this column, in metadata row order
    bismark_rows = [parse_bismark_report(path) for path in metadata_well[report_column]]
    bismark_table = pd.DataFrame(bismark_rows, index=metadata_well['wellprefix'])
    bismark_table.to_csv(out_path, sep='\t')





## A06c. DNA: picard deduplication 

In [None]:
%%bash
cat > ../Scripts/A06c_DNA_dedupe.py

# A06c_DNA_dedupe.py ===========================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd
import numpy as np

import os
# Path of the well-level metadata table, taken from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Metadata table; its 'wellprefix' and 'A04a_txt_picard_*' columns are used below.
metadata_well = pd.read_csv(filepath_wellmetadat)

# picard .log files

# columns taken from the picard metrics table; also label the fallback row
_picard_fields = ['UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED', 'PERCENT_DUPLICATION']

# All-missing row returned when a report cannot be parsed. It carries the same
# labels as a successfully parsed row, so both kinds of row line up under the
# same three columns when they are combined into one DataFrame.
nulltable = pd.Series([pd.NA, pd.NA, pd.NA], index=_picard_fields, dtype=object)

def parse_dedupe(filepath):
    """
    Read the duplication metrics from one picard log / metrics file.

    Lines starting with '#' are treated as comments and only the first data row
    of the tab-separated table is read.

    Parameters
    ----------
    filepath : str or path-like
        Location of the picard output for one well.

    Returns
    -------
    pandas.Series
        Indexed by 'UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED' and
        'PERCENT_DUPLICATION'. All values are pd.NA when the file is missing,
        empty, or lacks the expected columns (best-effort: no exception is
        propagated for those cases).
    """
    try:
        metrics = pd.read_csv(filepath, delimiter="\t", comment="#", nrows=1)
        return metrics[_picard_fields].transpose()[0]
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still propagate;
        # a copy is returned so callers cannot modify the shared fallback row
        return nulltable.copy()

# picard column name -> tidy output column name
tidy_name_dict = {
    'PERCENT_DUPLICATION': 'picard_perc_dupe',
    'READ_PAIRS_EXAMINED': 'picard_npairsin',
    'UNPAIRED_READS_EXAMINED': 'picard_nreadsin',
}



# gather metadata ------------------—------------------—------------------------

# (metadata column with report paths, count column to discard, output file)
picard_jobs = [
    # paired end: the unpaired-read count is dropped
    ('A04a_txt_picard_PE', "picard_nreadsin", "Metadata/A06c_DNA_picard_PE.tsv"),
    # single end: the read-pair count is dropped
    ('A04a_txt_picard_SE', "picard_npairsin", "Metadata/A06c_DNA_picard_SE.tsv"),
]

for report_column, unused_count, out_path in picard_jobs:
    dedupe_rows = [parse_dedupe(path) for path in metadata_well[report_column]]
    dedupe_table = pd.DataFrame(dedupe_rows, index=metadata_well['wellprefix'])
    dedupe_table = dedupe_table.rename(columns=tidy_name_dict)
    dedupe_table = dedupe_table.drop(unused_count, axis=1)
    dedupe_table.to_csv(out_path, sep='\t')





## A06d. DNA: mC fractions

In [None]:
%%bash
cat > ../Scripts/A06d_DNA_mCfracs.py

# A06d_DNA_mCfracs.py ==========================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd

import os
# well-level metadata table, located via the `metadat_plate` environment variable
filepath_wellmetadat = os.environ['metadat_plate']
metadata_well = pd.read_csv(filepath_wellmetadat)


# gather metadata ------------------—------------------—------------------------

# read one mC-fraction table per distinct batch number found in the metadata
batch_tables = []
for batch in pd.unique(metadata_well['batchnum']):
    batch_tables.append(
        pd.read_csv("Metadata/A04d_mCfrac_" + str(batch) + ".tsv", delimiter="\t"))

# stack the batches and rename the well column to 'wellprefix'
compiled_mCfracs = pd.concat(batch_tables).rename(columns={"Well": "wellprefix"})

compiled_mCfracs.to_csv("Metadata/A06d_DNA_compiled_mCfracs.tsv", sep='\t', index=False)

## A06e. DNA: samtools stats

In [None]:
%%bash
cat > ../Scripts/A06e_DNA_samtools.py

# A06e_DNA_samtools.py =========================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd

import os
# Path of the well-level metadata table, taken from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Metadata table; its 'wellprefix' and 'A04e_txt_samstats_*' columns are used below.
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_samstats(filepath):
    """
    Collect selected "label: value" fields from one samtools stats text file.

    Only lines containing exactly one ':' are considered, and the text before
    the colon must equal one of the labels below exactly.

    Returns a dict mapping the tidy field name to the value as a string (the
    text after the colon, stripped, up to the first tab).
    """

    wanted_fields = {
        'raw total sequences': 'FilteredSeqCount',
        'error rate': 'ErrorRate',
        'insert size average': 'InsertSizeAvg',
        'insert size standard deviation': 'InsertSizeSD',
    }

    parsed = {}
    with open(filepath) as handle:
        for raw_line in handle:
            pieces = raw_line.split(':')
            # skip anything that is not a single "label:value" pair
            if len(pieces) != 2:
                continue
            label, value = pieces
            if label in wanted_fields:
                parsed[wanted_fields[label]] = value.strip().split('\t')[0]

    return parsed




# gather metadata ------------------—------------------—------------------------


# metadata_well is already loaded in the setup section of this script, so the
# table is not read from disk a second time here.

# (metadata column with report paths, columns to discard, output file)
samstats_jobs = [
    # paired-end
    ("A04e_txt_samstats_PE", [], "Metadata/A06e_DNA_samstats_PE.tsv"),
    # single-end: the insert size columns are dropped
    ("A04e_txt_samstats_SE", ["InsertSizeAvg", "InsertSizeSD"], "Metadata/A06e_DNA_samstats_SE.tsv"),
]

for report_column, unused_columns, out_path in samstats_jobs:
    samstats_rows = [parse_samstats(path) for path in metadata_well[report_column]]
    samstats_table = pd.DataFrame(samstats_rows, index=metadata_well["wellprefix"])
    if unused_columns:
        # raises KeyError if the columns are absent, as before
        samstats_table = samstats_table.drop(unused_columns, axis=1)
    samstats_table.to_csv(out_path, sep='\t')




## A06f. DNA: coverage

In [None]:
%%bash
cat > ../Scripts/A06f_DNA_cov.py

# A06f_DNA_cov.py ==============================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd
import numpy as np

import os
# well-level metadata table, located via the `metadat_plate` environment variable
filepath_wellmetadat = os.environ['metadat_plate']
metadata_well = pd.read_csv(filepath_wellmetadat)

# Autosomes chr1 .. chr22. range() excludes its stop value, so the stop must be
# 23; the previous range(1, 22) silently left chr22 out of both the covered-base
# sum and the denominator below.
target_chroms = ["chr" + str(i) for i in range(1, 23)]

# Sum of the sizes of the target chromosomes. The chromsizes table has no header:
# column 0 (used as index) holds the chromosome name and column 1 its size.
total_autosomal_bases = \
    pd.read_csv("/u/project/cluo/chliu/Genomes/human_gencode_v40/chromsizes.tsv",
                sep = "\t", header = None, index_col = 0).loc[target_chroms, 1].sum()



# gather metadata ------------------—------------------—------------------------

# unique coverage levels
# (metadata_well is already loaded in the setup section; it is not re-read here)

def parse_coverage_unique(filepath, chroms=None, total_bases=None):
    """
    Fraction of the target chromosomes' bases reported as covered in one file.

    The file is read as a headerless tab-separated table whose first column is
    the chromosome name and whose second column is a per-chromosome count. The
    counts of the chromosomes that are both requested and present in the file
    are summed and divided by `total_bases`.

    Parameters
    ----------
    filepath : str or path-like
        Per-well coverage table.
    chroms : sequence of str, optional
        Chromosomes to include; defaults to the module-level `target_chroms`.
    total_bases : number, optional
        Denominator; defaults to the module-level `total_autosomal_bases`.

    Returns
    -------
    float, or pd.NA when the file is missing or cannot be parsed (best-effort).
    """
    if chroms is None:
        chroms = target_chroms
    if total_bases is None:
        total_bases = total_autosomal_bases

    try:
        per_chrom = pd.read_csv(filepath, delimiter = "\t", header = None, index_col=0)
        # restrict to requested chromosomes actually present, so that a
        # chromosome absent from the file does not raise a KeyError
        present = np.intersect1d(chroms, per_chrom.index)
        percent_coverage = per_chrom.loc[present, 1].sum() / total_bases
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still propagate
        percent_coverage = pd.NA
    return percent_coverage

# one coverage fraction per metadata row, in metadata row order
coverage_fractions = []
for cov_path in metadata_well['A04f_txt_covtot']:
    coverage_fractions.append(parse_coverage_unique(cov_path))

# single-column table labelled by well prefix
coverage_table = pd.DataFrame(coverage_fractions, index=metadata_well['wellprefix'])
coverage_table.columns = ["CoveragePerc1x"]
coverage_table.to_csv("Metadata/A06f_DNA_cov_percent1x.tsv", sep='\t')

# total coverage levels

def parse_coverage_total(filepath):
    """
    Ratio of the chrX count to the chrY count in one per-chromosome table.

    The file is read as a headerless whitespace-delimited table in which the
    first field is a count and the second field is the chromosome name (the
    name is used as the row index).

    Parameters
    ----------
    filepath : str or path-like
        Per-well table of counts by chromosome.

    Returns
    -------
    The value for 'chrX' divided by the value for 'chrY', or pd.NA when the file
    is missing, cannot be parsed, or lacks either chromosome (best-effort).
    """
    try:
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        total_cov_by_chr = pd.read_csv(filepath, delimiter = r"\s+", header = None, index_col=1)
        coverage_XdivY = total_cov_by_chr.loc['chrX', 0] / total_cov_by_chr.loc['chrY', 0]
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still propagate
        coverage_XdivY = pd.NA
    return coverage_XdivY

# one chrX / chrY ratio per metadata row, in metadata row order
xy_ratios = []
for nsites_path in metadata_well['A04f_txt_covnsites']:
    xy_ratios.append(parse_coverage_total(nsites_path))

# single-column table labelled by well prefix
xy_table = pd.DataFrame(xy_ratios, index=metadata_well['wellprefix'])
xy_table.columns = ["CoverageXdivY"]
xy_table.to_csv("Metadata/A06f_DNA_cov_chrXdivY.tsv", sep='\t')


## A06z. run helper scripts

In [None]:
%%bash
cat > ../Scripts/A06_compile_DNA_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A06_compile_DNA.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=12G
#$ -N A06_compile_DNA
#$ -t 1-6




# log which host the task runs on and when it started
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "



# environment init ------------------—------------------—-----------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# run each helper script (A06*) ------------------—------------------—----------

# note: in practice these can each be submitted interactively/as its own task,
# as some of these scripts should be much lower resource than others;
# however, this -t 1-6 job parallelization is just for tidiness

echo "metadata script # $SGE_TASK_ID running:"

# map the array task id to its helper script; any other id runs nothing
helper_script=""
case $SGE_TASK_ID in
  1) helper_script="Scripts/A06a_trimming.py" ;;
  2) helper_script="Scripts/A06b_DNA_maprate.py" ;;
  3) helper_script="Scripts/A06c_DNA_dedupe.py" ;;
  4) helper_script="Scripts/A06d_DNA_mCfracs.py" ;;
  5) helper_script="Scripts/A06e_DNA_samtools.py" ;;
  6) helper_script="Scripts/A06f_DNA_cov.py" ;;
esac

if [[ -n "$helper_script" ]]; then
    echo "python $helper_script"
    python "$helper_script"
fi


echo "completed 'A06_compile_DNA_metadata.'"

echo " "
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

