In [None]:
# ## just one script to run
# qsub Scripts/A07_compile_RNA_metadata.sub

## A07a. DNA+RNA: fastp trimming



In [None]:
%%bash
cat > ../Scripts/A07a_trimming.py

# A07a_trimming.py =============================================================

# no need to re-run this script if its output was already produced during DNA metadata compilation (A06a)

# setup ------------------—------------------—----------------------------------

import re
import pandas as pd
import glob

import os
# Path to the per-well metadata table, read from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Parsed with pandas' default comma delimiter; this script uses its
# 'A03a_json_fastp' and 'wellprefix' columns.
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_fastp_report(filepath):
    """Pull the per-well trimming summary out of one fastp JSON report.

    Parameters
    ----------
    filepath : str or path-like
        Location of the fastp JSON report.

    Returns
    -------
    dict
        nreads_pretrim      `total_reads` before filtering
        percreads_passtrim  `total_reads` after filtering divided by the
                            value before filtering (NaN when the report
                            lists zero input reads)
        q20_pretrim         `q30_rate` before filtering (see note below)
        q20_posttrim        `q30_rate` after filtering (see note below)
        r1_len, r2_len      `read1_mean_length` / `read2_mean_length`
                            after filtering
        gc_perc             `gc_content` after filtering

    Raises
    ------
    KeyError
        If the report lacks any of the summary fields read here.
    """
    import json  # local import: only this helper needs it

    # only the 'summary' section is used, so read the JSON directly rather
    # than building a DataFrame from the whole nested report
    with open(filepath) as report:
        summary = json.load(report)['summary']
    before = summary['before_filtering']
    after = summary['after_filtering']

    nreads_pretrim = before['total_reads']
    if nreads_pretrim:
        percreads_passtrim = after['total_reads'] / nreads_pretrim
    else:
        # a report with no input reads would otherwise raise
        # ZeroDivisionError and abort the compilation for every well
        percreads_passtrim = float('nan')

    # NOTE(review): the two 'q20_*' keys are filled from fastp's `q30_rate`.
    # Either the key names or the source field look mislabeled; both are kept
    # as-is so the output columns stay stable -- confirm which was intended.
    dict_out = {
        'nreads_pretrim': nreads_pretrim,
        'percreads_passtrim': percreads_passtrim,
        'q20_pretrim': before['q30_rate'],
        'q20_posttrim': after['q30_rate'],
        'r1_len': after['read1_mean_length'],
        'r2_len': after['read2_mean_length'],
        'gc_perc': after['gc_content']}
    return dict_out



# gather metadata ------------------—------------------—------------------------

# parse one fastp report per well, in metadata-table order
records_fastp = list(map(parse_fastp_report, metadata_well['A03a_json_fastp']))

# rows are labelled by well prefix; columns are the keys returned by the parser
table_fastp = pd.DataFrame(records_fastp,
                           index=metadata_well['wellprefix'])
del records_fastp

# tab-separated output, path relative to the working directory
table_fastp.to_csv("Metadata/A07a_trimming.tsv", sep='\t')
del table_fastp

## A07b. RNA: STAR mapping rates

In [None]:
%%bash
cat > ../Scripts/A07b_RNA_maprate.py

# A07b_RNA_maprate.py ==========================================================

# setup ------------------—------------------—----------------------------------

import glob
import itertools
import re
import pandas as pd

import os
# Path to the per-well metadata table, read from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Parsed with pandas' default comma delimiter; this script uses its
# 'A05a_txt_star_*' and 'wellprefix' columns.
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_star_report(filepath):
    """
    Collect selected summary metrics from one STAR log file.

    Only lines of the form "<label> | <value>" (exactly one '|') are
    considered. A line is kept when its whitespace-stripped label is a key of
    the lookup table below; the value is stored as a string with surrounding
    whitespace and any leading/trailing '%' removed.

    Note (carried over from the original author): for paired-end runs these
    metrics usually count fragments rather than individual reads.

    Returns a dict mapping the short metric names to those string values;
    labels that never appear in the file are simply absent.
    """

    # STAR label -> output column name; commented-out entries are metrics the
    # log also reports but that are deliberately not collected
    wanted = {
        'Number of input reads': 'NumReadsIn',
        'Average input read length': 'AvgLengthIn',
        'Uniquely mapped reads number': 'NumReadsUniqueMapped',
        'Uniquely mapped reads %': 'PercentReadsUniqueMapped',
        'Average mapped length': 'AvgLengthMapped',
        'Number of splices: Total': 'NumTotSplices',
        'Number of splices: Annotated (sjdb)': 'NumAnnotSplices',
        # 'Number of splices: GT/AG': 'NumGTAGSplices',
        # 'Number of splices: GC/AG': 'NumGCAGSplices',
        # 'Number of splices: AT/AC': 'NumATACSplices',
        'Mismatch rate per base, %': 'RateBaseMismatch',
        'Deletion rate per base': 'RateBaseDeletion',
        'Deletion average length': 'AvgLengthDeletion',
        'Insertion rate per base': 'RateBaseInsertion',
        'Insertion average length': 'AvgLengthInsertion',
        # 'Number of reads mapped to multiple loci': 'NumReadsMultiMap',
        '% of reads mapped to multiple loci': 'PercentReadsMultiMap',
        # 'Number of reads mapped to too many loci': 'NumReadsTooManyLoci',
        '% of reads mapped to too many loci': 'PercentReadsTooManyLoci',
        # 'Number of reads unmapped: too many mismatches': 'NumReadsTooManyMismatch',
        '% of reads unmapped: too many mismatches': 'PercentReadsTooManyMismatch',
        # 'Number of reads unmapped: too short': 'NumReadsTooShort',
        '% of reads unmapped: too short': 'PercentReadsTooShort',
        # 'Number of reads unmapped: other': 'NumReadsUnmappedOther',
        '% of reads unmapped: other': 'PercentReadsUnmappedOther',
        # 'Number of chimeric reads': 'NumReadsChimeric',
        # '% of chimeric reads': 'PercentReadsChimeric',
    }

    metrics = {}
    with open(filepath) as handle:
        for raw_line in handle:
            fields = raw_line.split('|')
            if len(fields) != 2:
                # section headers, blank lines, or anything with extra '|'
                continue
            label = fields[0].strip()
            if label in wanted:
                metrics[wanted[label]] = fields[1].strip().strip('%')

    return metrics
    
    


# gather metadata ------------------—------------------—------------------------

# The metadata table holds one STAR log column per mapping mode; every mode is
# compiled the same way, so loop over (log column, output file) pairs instead
# of repeating the block three times.
star_runs = [
    ('A05a_txt_star_PE', "Metadata/A07b_RNA_maprate_PE.tsv"),    # paired-end
    ('A05a_txt_star_SE1', "Metadata/A07b_RNA_maprate_SE1.tsv"),  # single-end, r1
    ('A05a_txt_star_SE2', "Metadata/A07b_RNA_maprate_SE2.tsv"),  # single-end, r2
]

for column_logs, path_out in star_runs:
    # one parsed report (dict of metric -> string value) per well
    list_star = [parse_star_report(file) for file in metadata_well[column_logs]]
    df_star = pd.DataFrame(list_star,
                           index=metadata_well['wellprefix'])
    del list_star
    df_star.to_csv(path_out, sep='\t')
    # drop the table before the next mode is loaded
    del df_star

## A07c. RNA: feature counts

In [None]:
%%bash
cat > ../Scripts/A07c_RNA_featcounts.py

# A07c_RNA_featcounts.py =======================================================

# setup ------------------—------------------—----------------------------------

import pandas as pd

import os
# Path to the per-well metadata table, read from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Parsed with pandas' default comma delimiter; this script uses only its
# 'platenum' column.
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_featurecounts(filepath):
    """
    Reshape one tab-separated featureCounts summary into a per-sample table.

    The input has one row per status category (first column) and one column
    per input file. The output has one row per sample and four columns:
    'TotalReadsFiltered' (sum over every status category in the file),
    'Assigned', 'Unassigned_NoFeatures' and 'Unassigned_Ambiguity'.

    Sample names are the second '/'-separated component of each input-file
    column header, so headers are assumed to look like "<dir>/<sample>/..."
    -- TODO confirm against the featureCounts call that wrote the summary.
    """

    summary_raw = pd.read_csv(filepath, sep='\t')
    status_column = summary_raw.columns[0]

    # flip the table: samples as rows, status categories as columns
    counts = summary_raw.set_index(status_column).T
    counts.index = [bam_path.split("/")[1] for bam_path in counts.index]

    # total is taken across all categories before the table is narrowed down
    # (original note: from A05c .Aligned.bam --> .Final.bam)
    counts = counts.assign(TotalReadsFiltered=counts.sum(axis=1))

    # original note: other unassigned categories should be zero
    # (non-mapped reads filtered out), hence they are not reported
    columns_kept = ['TotalReadsFiltered', 'Assigned',
                    'Unassigned_NoFeatures', 'Unassigned_Ambiguity']
    return counts[columns_kept]



# gather metadata ------------------—------------------—------------------------

# featureCounts summaries are written one per plate number
batchnums = pd.unique(metadata_well['platenum'])


def _compile_featurecounts(level, batches):
    """
    Gather the PE / SE1 / SE2 featureCounts summaries for one feature level.

    For each mapping mode, reads "featurecounts_<level>/<mode>_<batch>.summary"
    for every batch, stacks the per-batch tables row-wise, and prefixes the
    columns with "<mode>_". The three prefixed tables are then joined
    column-wise on their shared sample index.
    """
    per_mode = []
    for mode in ("PE", "SE1", "SE2"):
        frames = [parse_featurecounts("featurecounts_" + level + "/" + mode
                                      + "_" + str(i) + ".summary")
                  for i in batches]
        per_mode.append(pd.concat(frames).add_prefix(mode + "_"))
    return pd.concat(per_mode, axis=1)


# gene-level
fcgene_joined = _compile_featurecounts("gene", batchnums)
fcgene_joined.index.names = ["wellprefix"]

fcgene_joined.to_csv("Metadata/A07c_RNA_featcounts_gene.tsv", sep='\t')


# exon-level (inputs come from featurecounts_exon/)
fcexon_joined = _compile_featurecounts("exon", batchnums)
fcexon_joined.index.names = ["wellprefix"]

fcexon_joined.to_csv("Metadata/A07c_RNA_featcounts_exon.tsv", sep='\t')



## A07d. RNA: samtools stats

In [None]:
%%bash
cat > ../Scripts/A07d_RNA_samtools.py

# A07d_RNA_samtools.py =========================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd

import os
# Path to the per-well metadata table, read from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Parsed with pandas' default comma delimiter; this script uses its
# 'A05e_txt_samtools_*' and 'wellprefix' columns.
metadata_well = pd.read_csv(filepath_wellmetadat)

# import samtools stats
def parse_samstats(filepath):
    """
    Collect selected values from one samtools-stats summary text file.

    Only lines containing exactly one ':' are considered, and the text before
    the colon must match a key of the lookup table exactly (it is not
    stripped). The stored value is the first tab-separated field of the
    whitespace-stripped text after the colon, kept as a string.

    NOTE(review): the exact-match requirement means lines carrying a leading
    "SN" tag would not be picked up; presumably the input was already reduced
    to the summary-number lines upstream -- confirm against the A05e step.

    Returns a dict mapping the short metric names to those string values.
    """

    # samtools label -> output column name
    wanted = {
        'raw total sequences': 'FilteredSeqCount',
        'error rate': 'ErrorRate',
        'insert size average': 'InsertSizeAvg',
        'insert size standard deviation': 'InsertSizeSD',
    }

    stats = {}
    with open(filepath) as handle:
        for raw_line in handle:
            fields = raw_line.split(':')
            if len(fields) != 2:
                # no colon, or more than one: not a simple "label: value" line
                continue
            label, value = fields
            if label in wanted:
                stats[wanted[label]] = value.strip().split('\t')[0]

    return stats




# gather metadata ------------------—------------------—------------------------

# Each mapping mode has its own samtools-stats column in the metadata table
# and its own output TSV. The single-end outputs omit the two insert-size
# columns (presumably because insert size is only defined for read pairs).
samstats_runs = [
    ("PE", []),                                    # paired-end
    ("SE1", ["InsertSizeAvg", "InsertSizeSD"]),    # single-end, read 1
    ("SE2", ["InsertSizeAvg", "InsertSizeSD"]),    # single-end, read 2
]

for mode, columns_dropped in samstats_runs:
    # one parsed report (dict of metric -> string value) per well
    list_samstats = [parse_samstats(file)
                     for file in metadata_well["A05e_txt_samtools_" + mode]]
    df_samstats = pd.DataFrame(list_samstats,
                               index=metadata_well["wellprefix"])
    del list_samstats
    if columns_dropped:
        df_samstats = df_samstats.drop(columns_dropped, axis=1)
    df_samstats.to_csv("Metadata/A07d_RNA_samstats_" + mode + ".tsv", sep='\t')
    # drop the table before the next mode is loaded
    del df_samstats

## A07e. RNA picard

In [None]:
%%bash
cat > ../Scripts/A07e_RNA_picard.py

# A07e_RNA_picard.py ===========================================================

# setup ------------------—------------------—----------------------------------

import glob
import pandas as pd

import os
# Path to the per-well metadata table, read from the `metadat_plate`
# environment variable (a KeyError is raised here if it is not set).
filepath_wellmetadat = os.environ['metadat_plate']
# Parsed with pandas' default comma delimiter; this script uses its
# 'A05e_txt_picard_*' and 'wellprefix' columns.
metadata_well = pd.read_csv(filepath_wellmetadat)

# read picard log files
def parse_picard_rna(filepath):
    """
    Return the first data row of a tab-separated picard metrics file.

    Lines starting with '#' are skipped, the first remaining line is used as
    the header, and only one data row is read. The result is a Series indexed
    by the header names (and named 0, the row label).
    """
    metrics_table = pd.read_csv(filepath, sep="\t", comment="#", nrows=1)
    # row label 0 is the single metrics row requested above
    first_row = metrics_table.loc[0]
    return first_row



# gather metadata ------------------—------------------—------------------------

# The three mapping modes (PE, SE1, SE2) are compiled identically: one picard
# metrics column in the metadata table per mode, one output TSV per mode.
for mode in ("PE", "SE1", "SE2"):
    # one Series of picard metrics per well
    list_picard = [parse_picard_rna(file)
                   for file in metadata_well['A05e_txt_picard_' + mode]]
    # rows labelled by well prefix; the three identifier columns are removed
    # and the remaining metric names get a "picard_" prefix
    df_picard = pd.DataFrame(list_picard,
                             index=metadata_well['wellprefix']
                             ).drop(["SAMPLE", "LIBRARY", "READ_GROUP"], axis=1
                             ).add_prefix("picard_")
    df_picard.columns = df_picard.columns.str.lower()

    del list_picard
    df_picard.to_csv("Metadata/A07e_RNA_picard_" + mode + ".tsv", sep='\t')
    # drop the table before the next mode is loaded
    del df_picard

In [None]:
%%bash
cat > ../Scripts/A07_compile_RNA_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A07_compile_RNA.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=12G
#$ -N A07_compile_RNA
#$ -t 2-5

# Array job: each task ID runs one of the A07* metadata helper scripts (see
# the case statement below). With the default "-t 2-5" range, task 1
# (A07a_trimming.py) is not submitted; widen the range to include it.
# The job exits with the helper script's exit status so that a failed helper
# is not reported as a successful task.


echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "


# environment init -------------------------------------------------------------
# lines marked "# <--" are the site-specific ones

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every KEY=VALUE pair from the parameter file, skipping lines that
# start with '#'
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# run each helper script (A07*) ------------------------------------------------

# note: in practice these can each be submitted interactively/as its own task,
# as some of these scripts should be much lower resource than others;
# the -t 2-5 job parallelization is just for tidiness

echo "metadata script # $SGE_TASK_ID running:"

# exit status of the helper run by this task (stays 0 if it succeeds)
status=0

case "$SGE_TASK_ID" in

  1)
    echo "python Scripts/A07a_trimming.py" # usually already run in A06a
    python Scripts/A07a_trimming.py || status=$?
    ;;

  2)
    echo "python Scripts/A07b_RNA_maprate.py"
    python Scripts/A07b_RNA_maprate.py || status=$?
    ;;

  3)
    echo "python Scripts/A07c_RNA_featcounts.py"
    python Scripts/A07c_RNA_featcounts.py || status=$?
    ;;

  4)
    echo "python Scripts/A07d_RNA_samtools.py"
    python Scripts/A07d_RNA_samtools.py || status=$?
    ;;

  5)
    echo "python Scripts/A07e_RNA_picard.py"
    python Scripts/A07e_RNA_picard.py || status=$?
    ;;

  *)
    # not an error, but say so instead of silently doing nothing
    echo "no metadata script is assigned to task ID '$SGE_TASK_ID'; nothing was run"
    ;;
esac


if [ "$status" -eq 0 ]; then
    echo "completed 'A07_compile_RNA_metadata.'"
else
    echo "FAILED 'A07_compile_RNA_metadata.' (exit status $status)"
fi

echo " "
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

exit "$status"

