In [None]:
# # A07_compile_metadata_RNA overall cmds ===========================================
# # just one script to run

# qsub Scripts/A07_compile_RNA_metadata.sub # ‡

# # ‡ fast enough to run interactively

In [None]:
# # for interactive mode, just need to specify working dir & 2 environment variables below
# # then comment out the "%%bash / cat" lines and run python code in-notebook

# import os
# os.chdir('../') # move to $dir_proj
# os.environ['metadat_well'] = "Metadata/A01c_well_filepath.csv"
# os.environ['ref_chromsizes'] = "/u/project/cluo/chliu/Genomes/IGVF_hg38_pluslambda/chromsizes.tsv"

# # alternatively, loop through "../snmCT_parameters.env":
# import os
# import pandas as pd

# envvar_needed = ['dir_proj', 'metadat_well']
# try:
#     os.environ['metadat_well']
# except KeyError:
#     envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None
#                ).set_axis(['varname', 'varpath'], axis = 1
#                ).query('varname in @envvar_needed')
#     for index, row in envspec.iterrows():
#         os.environ[row["varname"]] = row["varpath"]
# os.chdir(os.environ['dir_proj'])


## A07a. DNA+RNA: fastp trimming



In [None]:
%%bash
cat > ../Scripts/A07a_trimming.py

# A07a_trimming.py =============================================================



# setup ========================================================================

import os
import re
import pandas as pd
import glob

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header);
# the code below reads its 'wellprefix' and 'A03a_json_fastp' columns
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_fastp_report(filepath):
    """
    Pull the trimming summary metrics for one well out of a fastp .json report.

    Parameters
    ----------
    filepath : str
        path to a fastp JSON report; only its top-level "summary" section
        ("before_filtering" / "after_filtering") is read

    Returns
    -------
    dict with keys
        nreads_pretrim      total reads before filtering
        percreads_passtrim  reads after filtering / reads before filtering
                            (NaN when the report lists zero input reads)
        q20_pretrim         q20_rate before filtering
        q20_posttrim        q20_rate after filtering
        r1_len, r2_len      mean read lengths after filtering
                            (r2_len is None when the report has no read 2 entry)
        gc_perc             gc_content after filtering

    Raises
    ------
    OSError / ValueError if the file cannot be opened or is not valid JSON;
    KeyError if a required summary field is absent.
    """
    import json  # local import: the script header does not import json

    with open(filepath) as report:
        summary = json.load(report)['summary']
    before = summary['before_filtering']
    after = summary['after_filtering']

    nreads_pretrim = before['total_reads']
    # guard the ratio so an empty well yields NaN instead of ZeroDivisionError
    if nreads_pretrim:
        percreads_passtrim = after['total_reads'] / nreads_pretrim
    else:
        percreads_passtrim = float('nan')

    dict_out = {
        'nreads_pretrim' : nreads_pretrim,
        'percreads_passtrim' : percreads_passtrim,
        # these keys previously held 'q30_rate'; they now hold the q20 rate
        # that the column names advertise
        'q20_pretrim' : before['q20_rate'],
        'q20_posttrim' : after['q20_rate'],
        'r1_len' : after['read1_mean_length'],
        'r2_len' : after.get('read2_mean_length'),
        'gc_perc' : after['gc_content']}
    return(dict_out)



# gather metadata ==============================================================

print("\n\nfastp .json...")

# one fastp report path per well; entries that are not strings (e.g. a blank
# cell read in as NaN) are counted as missing rather than crashing os.path.exists
filelist = metadata_well['A03a_json_fastp']
boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]
boolean_filemissing = [not found for found in boolean_fileexists]

# one row per well whose report exists, indexed by wellprefix
list_fastp = [parse_fastp_report(f) for f in filelist[boolean_fileexists]]
df_fastp = pd.DataFrame(list_fastp,
                        index = metadata_well['wellprefix'][boolean_fileexists])

# percent files missing
print("number of target files: " + str(len(filelist)))
print("fraction files missing: ")
if len(boolean_fileexists) > 0:
    print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
else:
    print(float('nan')) # no target files listed at all
if sum(boolean_filemissing) != 0:
    print("missing " + str(sum(boolean_filemissing)) + " files:")
    print(filelist[boolean_filemissing].to_string())

# column QC
print("number of NAs per column:")
print(df_fastp.isna().sum().to_string())

print("number of duplicated wells:")
ndupe = df_fastp.index.duplicated().sum()
print(ndupe)

# final export
# (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
print("exporting Metadata/A07a_trimming.tsv of shape: {}".format(df_fastp.shape))
df_fastp.to_csv("Metadata/A07a_trimming.tsv", sep = '\t')
print("\n\n")


## A07b. RNA: STAR mapping rates

In [None]:
%%bash
cat > ../Scripts/A07b_RNA_maprate.py

# A07b_RNA_maprate.py ==========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import itertools
import re
import pandas as pd

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header);
# the code below reads its 'wellprefix' and 'A06a_txt_star_*' columns
metadata_well = pd.read_csv(filepath_wellmetadat)

def parse_star_report(filepath):

    """
    Read one STAR final-log style text file into a flat dict.

    Every line of the form "<label> | <value>" whose stripped label is listed
    below is kept; the value is stored as a string with surrounding whitespace
    and any leading/trailing '%' removed. Lines without exactly one '|' and
    lines with an unlisted label are skipped.

    note: for paired-end runs these metrics usually count fragments, not reads
    """

    # STAR label --> output column name
    # (commented-out entries are metrics deliberately not exported)
    wanted = {
        'Number of input reads': 'NumReadsIn',
        'Average input read length': 'AvgLengthIn',
        'Uniquely mapped reads number': 'NumReadsUniqueMapped',
        'Uniquely mapped reads %': 'PercentReadsUniqueMapped',
        'Average mapped length': 'AvgLengthMapped',
        'Number of splices: Total': 'NumTotSplices',
        'Number of splices: Annotated (sjdb)': 'NumAnnotSplices',
#         'Number of splices: GT/AG': 'NumGTAGSplices',
#         'Number of splices: GC/AG': 'NumGCAGSplices',
#         'Number of splices: AT/AC': 'NumATACSplices',
        'Mismatch rate per base, %': 'RateBaseMismatch',
        'Deletion rate per base': 'RateBaseDeletion',
        'Deletion average length': 'AvgLengthDeletion',
        'Insertion rate per base': 'RateBaseInsertion',
        'Insertion average length': 'AvgLengthInsertion',
#         'Number of reads mapped to multiple loci': 'NumReadsMultiMap',
        '% of reads mapped to multiple loci': 'PercentReadsMultiMap',
#         'Number of reads mapped to too many loci': 'NumReadsTooManyLoci',
        '% of reads mapped to too many loci': 'PercentReadsTooManyLoci',
#         'Number of reads unmapped: too many mismatches': 'NumReadsTooManyMismatch',
        '% of reads unmapped: too many mismatches': 'PercentReadsTooManyMismatch',
#         'Number of reads unmapped: too short': 'NumReadsTooShort',
        '% of reads unmapped: too short': 'PercentReadsTooShort',
#         'Number of reads unmapped: other': 'NumReadsUnmappedOther',
        '% of reads unmapped: other': 'PercentReadsUnmappedOther',
#         'Number of chimeric reads': 'NumReadsChimeric',
#         '% of chimeric reads': 'PercentReadsChimeric',
    }

    parsed = {}
    with open(filepath) as handle:
        for entry in handle:
            pieces = entry.split('|')
            if len(pieces) != 2:
                # section headers, blank lines, or lines with several '|'
                continue
            label = pieces[0].strip()
            if label in wanted:
                parsed[wanted[label]] = pieces[1].strip().strip('%')

    return(parsed)
    
    


# gather metadata --------------------------------------------------------------



# paired-end -------------------------------------------------------------------

def _compile_star_logs(readtype, column, outpath):
    """
    Parse every existing STAR log listed in metadata_well[column], print a
    short QC report (missing files, NAs per column, duplicated wells) and
    write the per-well table to `outpath` as a .tsv.

    readtype : label used in the progress message ("PE", "SE1", "SE2")
    column   : metadata_well column holding one log path per well
    outpath  : destination .tsv
    returns  : the exported DataFrame (index = wellprefix)
    """
    print("\n\n" + readtype + " logs...")
    filelist = metadata_well[column]
    # non-string entries (e.g. a blank cell read in as NaN) count as missing
    boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]
    boolean_filemissing = [not found for found in boolean_fileexists]
    list_star = [parse_star_report(f) for f in filelist[boolean_fileexists]]
    df_star = pd.DataFrame(list_star,
                           index = metadata_well['wellprefix'][boolean_fileexists])

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    if len(boolean_fileexists) > 0:
        print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    else:
        print(float('nan')) # no target files listed at all
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_star.isna().sum().to_string())

    print("number of duplicated wells:")
    print(df_star.index.duplicated().sum())

    # final export
    # (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(outpath, df_star.shape))
    df_star.to_csv(outpath, sep = '\t')
    print("\n\n")
    return(df_star)


# paired-end -------------------------------------------------------------------
df_star_PE = _compile_star_logs(
    "PE", 'A06a_txt_star_PE', "Metadata/A07b_RNA_maprate_PE.tsv")

# single-end, r1 ---------------------------------------------------------------
df_star_SE1 = _compile_star_logs(
    "SE1", 'A06a_txt_star_SE1', "Metadata/A07b_RNA_maprate_SE1.tsv")

# single-end, r2 ---------------------------------------------------------------
df_star_SE2 = _compile_star_logs(
    "SE2", 'A06a_txt_star_SE2', "Metadata/A07b_RNA_maprate_SE2.tsv")


## A07c. RNA deduplication rate

In [None]:
%%bash
cat > ../Scripts/A07c_RNA_dedupe.py

# A07c_RNA_dedupe.py ===========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd
import numpy as np

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header);
# the code below reads its 'wellprefix' and 'A06b_bam_dedupe_*' columns
metadata_well = pd.read_csv(filepath_wellmetadat)

# picard .log files

# metrics kept from the first data row of each picard log
_picard_dedupe_columns = ['UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED', 'PERCENT_DUPLICATION']

# placeholder for unreadable logs; labelled with the same metric names as a
# successful parse so both kinds of result line up as columns in pd.DataFrame
# (an unlabelled array would instead create stray 0/1/2 columns)
nulltable = pd.Series([pd.NA, pd.NA, pd.NA], index = _picard_dedupe_columns)

def parse_picard_dedupe(filepath):
    """
    Read the duplication metrics from one picard log.

    Lines starting with '#' are skipped, the next line is taken as the
    tab-delimited header and only the first data row is read.

    Parameters
    ----------
    filepath : str
        path to the picard metrics text file

    Returns
    -------
    pd.Series indexed by UNPAIRED_READS_EXAMINED, READ_PAIRS_EXAMINED and
    PERCENT_DUPLICATION. If the file cannot be read or lacks those columns,
    a message is printed and an all-NA Series with the same index is returned
    (best-effort: one bad well should not abort the whole compilation).
    """
    try:
        data_dedupe = pd.read_csv(filepath, delimiter = "\t",
                         comment = "#", nrows = 1)[
                             _picard_dedupe_columns
                         ].transpose()[0]
        return(data_dedupe)
    except Exception: # not a bare except: Ctrl-C / SystemExit must still propagate
        print("error reading file: " + str(filepath))
        return(nulltable.copy())

# NOTE(review): tidy_name_dict is never applied anywhere in this script, so the
# exported columns keep picard's upper-case names -- confirm whether a
# .rename(columns = tidy_name_dict) was intended before the export
tidy_name_dict = {'PERCENT_DUPLICATION' : 'picard_perc_dupe',
                  'READ_PAIRS_EXAMINED' : 'picard_npairsin',
                  'UNPAIRED_READS_EXAMINED' : 'picard_nreadsin'}



# gather metadata ==============================================================

# paired-end -------------------------------------------------------------------

def _compile_picard_dedupe(readtype, column, outpath):
    """
    Parse every existing picard duplication log listed in metadata_well[column],
    print a short QC report (missing files, NAs per column, duplicated wells)
    and write the per-well table to `outpath` as a .tsv.

    readtype : label used in the progress message ("PE", "SE1", "SE2")
    column   : metadata_well column holding one log path per well
    outpath  : destination .tsv
    returns  : the exported DataFrame (index = wellprefix)
    """
    print("\n\n" + readtype + " logs...")
    # NOTE(review): the column names say "bam", yet each path is parsed as a
    # tab-delimited text metrics file -- confirm these columns hold the picard
    # log paths and not the deduplicated .bam paths
    filelist = metadata_well[column]
    # non-string entries (e.g. a blank cell read in as NaN) count as missing
    boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]
    boolean_filemissing = [not found for found in boolean_fileexists]
    list_picard = [parse_picard_dedupe(f) for f in filelist[boolean_fileexists]]
    df_picard = pd.DataFrame(list_picard,
                             index = metadata_well['wellprefix'][boolean_fileexists])

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    if len(boolean_fileexists) > 0:
        print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    else:
        print(float('nan')) # no target files listed at all
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_picard.isna().sum().to_string())

    print("number of duplicated wells:")
    print(df_picard.index.duplicated().sum())

    # final export
    # (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(outpath, df_picard.shape))
    df_picard.to_csv(outpath, sep = '\t')
    print("\n\n")
    return(df_picard)


# paired-end -------------------------------------------------------------------
df_picard_PE = _compile_picard_dedupe(
    "PE", 'A06b_bam_dedupe_PE', "Metadata/A07c_RNA_picard_PE.tsv")

# single-end, read 1 -----------------------------------------------------------
df_picard_SE1 = _compile_picard_dedupe(
    "SE1", 'A06b_bam_dedupe_SE1', "Metadata/A07c_RNA_picard_SE1.tsv")

# single-end, read 2 -----------------------------------------------------------
df_picard_SE2 = _compile_picard_dedupe(
    "SE2", 'A06b_bam_dedupe_SE2', "Metadata/A07c_RNA_picard_SE2.tsv")


## A07d. RNA: feature counts

In [None]:
%%bash
cat > ../Scripts/A07d_RNA_featcounts.py

# A07d_RNA_featcounts.py =======================================================

# setup ------------------------------------------------------------------------

import os
import pandas as pd

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header)
metadata_well = pd.read_csv(filepath_wellmetadat)
# distinct 'platenum' values; one featureCounts .summary file per value and
# read type is looked up further below
batchnums = pd.unique(metadata_well['platenum'])


def parse_featurecounts(filepath):
    """
    Reshape one tab-delimited featureCounts .summary table into one row per sample.

    The first column of the file supplies the category names; every other
    column is one sample, labelled by the second "/"-separated component of
    its header. Returns a DataFrame (rows = samples) restricted to
    TotalReadsFiltered (sum over all categories), Assigned,
    Unassigned_NoFeatures and Unassigned_Ambiguity.
    """

    raw_summary = pd.read_csv(filepath, delimiter='\t')
    category_labels = raw_summary.iloc[ :, 0]

    # header looks like "<dir>/<sample>/..." -- keep the <sample> component
    sample_labels = []
    for bam_path in raw_summary.columns[1:]:
        sample_labels.append(bam_path.split("/")[1])

    # flip to samples x categories, then attach tidy labels on both axes
    per_sample = raw_summary.iloc[:, 1:].transpose()
    per_sample = per_sample.set_axis(sample_labels, axis = 0)
    per_sample = per_sample.set_axis(category_labels, axis = 1)

    # row total across every category (from A06c .Aligned.bam --> .Final.bam)
    per_sample['TotalReadsFiltered'] = per_sample.sum(axis = 1)

    # other unassigned features should be zero (non-mapped filtered out)
    columns_kept = ['TotalReadsFiltered', 'Assigned',
                    'Unassigned_NoFeatures', 'Unassigned_Ambiguity']
    return(per_sample[columns_kept])



# gather metadata --------------------------------------------------------------

# gene-level -------------------------------------------------------------------

def _compile_featurecounts(level, outpath):
    """
    Collect the featureCounts .summary tables for one quantification level,
    print a short QC report and write the joined per-well table to `outpath`.

    For each read type (PE, SE1, SE2) the files
    "featurecounts_<level>/<readtype>_<platenum>.summary" are looked up for
    every value in `batchnums`, parsed, stacked row-wise, and their columns
    prefixed with "<readtype>_". The three tables are then joined column-wise.

    level   : "gene" or "exon" (directory suffix and progress-message label)
    outpath : destination .tsv
    returns : the exported DataFrame (index named "wellprefix")
    raises  : FileNotFoundError when no summary file exists for a read type
    """
    print("\n\n" + level + "-level quants...")

    tables = []
    for readtype in ["PE", "SE1", "SE2"]:
        filelist = pd.Series(
            ["featurecounts_" + level + "/" + readtype + "_" + str(i) + ".summary"
             for i in batchnums])
        boolean_fileexists = [os.path.exists(f) for f in filelist]
        boolean_filemissing = [not found for found in boolean_fileexists]

        # percent files missing -- reported per read type, so gaps in the
        # PE / SE1 summaries are visible too (not just the last list built)
        print(readtype + " summaries:")
        print("number of target files: " + str(len(filelist)))
        print("fraction files missing: ")
        if len(boolean_fileexists) > 0:
            print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
        else:
            print(float('nan')) # no plates listed at all
        if sum(boolean_filemissing) != 0:
            print("missing " + str(sum(boolean_filemissing)) + " files:")
            print(filelist[boolean_filemissing].to_string())

        # pd.concat([]) would fail with an uninformative "No objects to concatenate"
        if not any(boolean_fileexists):
            raise FileNotFoundError(
                "no featurecounts_" + level + "/" + readtype + "_*.summary files found")

        stacked = pd.concat([parse_featurecounts(f) for f in filelist[boolean_fileexists]])
        tables.append(stacked.add_prefix(readtype + "_"))

    joined = pd.concat(tables, axis = 1)
    joined.index.names = ["wellprefix"]

    # column QC
    print("number of NAs per column:")
    print(joined.isna().sum().to_string())

    print("number of duplicated wells:")
    print(joined.index.duplicated().sum())

    # final export
    # (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(outpath, joined.shape))
    joined.to_csv(outpath, sep = '\t')
    return(joined)


# gene-level -------------------------------------------------------------------
fcgene_joined = _compile_featurecounts("gene", "Metadata/A07d_RNA_featcounts_gene.tsv")

# exon-level -------------------------------------------------------------------
fcexon_joined = _compile_featurecounts("exon", "Metadata/A07d_RNA_featcounts_exon.tsv")


## A07e. RNA: samtools stats

In [None]:
%%bash
cat > ../Scripts/A07e_RNA_samtools.py

# A07e_RNA_samtools.py =========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header);
# the code below reads its 'wellprefix' and 'A06e_txt_samtools_*' columns
metadata_well = pd.read_csv(filepath_wellmetadat)

# import samtools stats
def parse_samstats(filepath):
    """
    Read selected "<name>:<TAB><value>" entries from a samtools stats summary.

    A line is used only if it contains exactly one ':' and the text before it
    equals one of the names below exactly (it is not stripped, so a line that
    still carries a leading field such as "SN<TAB>" will not match). The
    stored value is the first tab-separated field after the colon, kept as a
    string. Returns a dict keyed by the tidy names on the right-hand side.
    """

    # samtools stats name --> output column name
    wanted = {
        'raw total sequences': 'FilteredSeqCount',
        'error rate': 'ErrorRate',
        'insert size average': 'InsertSizeAvg',
        'insert size standard deviation': 'InsertSizeSD',
        }

    parsed = {}
    with open(filepath) as handle:
        for entry in handle:
            pieces = entry.split(':')
            if len(pieces) != 2:
                # no colon, or a trailing comment that itself contains ':'
                continue
            name, remainder = pieces
            if name in wanted:
                parsed[wanted[name]] = remainder.strip().split('\t')[0]

    return(parsed)




# gather metadata --------------------------------------------------------------


# paired-end  ------------------------------------------------------------------

def _compile_samstats(readtype, column, outpath):
    """
    Parse every existing samtools stats file listed in metadata_well[column],
    print a short QC report (missing files, NAs per column, duplicated wells)
    and write the per-well table to `outpath` as a .tsv.

    The InsertSizeAvg / InsertSizeSD columns are dropped before export.

    readtype : label used in the progress message ("PE", "SE1", "SE2")
    column   : metadata_well column holding one stats path per well
    outpath  : destination .tsv
    returns  : the exported DataFrame (index = wellprefix)
    """
    print("\n\n" + readtype + " logs...")
    filelist = metadata_well[column]
    # non-string entries (e.g. a blank cell read in as NaN) count as missing
    boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]
    boolean_filemissing = [not found for found in boolean_fileexists]
    list_samstats = [parse_samstats(f) for f in filelist[boolean_fileexists]]
    # errors = "ignore": when nothing could be parsed the columns do not exist,
    # and a KeyError here would hide the missing-file report printed below
    df_samstats = pd.DataFrame(list_samstats,
                               index = metadata_well['wellprefix'][boolean_fileexists]
                               ).drop(["InsertSizeAvg", "InsertSizeSD"], axis = 1,
                                      errors = "ignore")

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    if len(boolean_fileexists) > 0:
        print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    else:
        print(float('nan')) # no target files listed at all
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_samstats.isna().sum().to_string())

    print("number of duplicated wells:")
    print(df_samstats.index.duplicated().sum())

    # final export
    # (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(outpath, df_samstats.shape))
    df_samstats.to_csv(outpath, sep = '\t')
    print("\n\n")
    return(df_samstats)


# paired-end  ------------------------------------------------------------------
df_samstats_PE = _compile_samstats(
    "PE", 'A06e_txt_samtools_PE', "Metadata/A07e_RNA_samstats_PE.tsv")

# single-end, read 1 -----------------------------------------------------------
df_samstats_SE1 = _compile_samstats(
    "SE1", 'A06e_txt_samtools_SE1', "Metadata/A07e_RNA_samstats_SE1.tsv")

# single-end, read 2 -----------------------------------------------------------
df_samstats_SE2 = _compile_samstats(
    "SE2", 'A06e_txt_samtools_SE2', "Metadata/A07e_RNA_samstats_SE2.tsv")



## A07f. RNA picard rna metrics

In [None]:
%%bash
cat > ../Scripts/A07f_RNA_picard.py

# A07f_RNA_picard.py ===========================================================

# setup ------------------------------------------------------------------------

import os
import glob
import pandas as pd

# location of the well-level metadata table, taken from the `metadat_well`
# environment variable (a KeyError is raised here if it is not set)
filepath_wellmetadat = os.environ['metadat_well']
# parsed with pd.read_csv defaults (comma-separated, first row = header);
# the code below reads its 'wellprefix' and 'A06e_txt_picard_*' columns
metadata_well = pd.read_csv(filepath_wellmetadat)

# read picard log files
def parse_picard_rna(filepath):
    """
    Return the first metrics row of a picard log as a Series.

    Lines starting with '#' are skipped, the next line is used as the
    tab-delimited header and only one data row is read. The result is indexed
    by the header names.
    """
    metrics_row = pd.read_csv(filepath, sep = "\t", comment = "#", nrows = 1)
    # after flipping, column 0 is that single row, labelled by metric name
    metrics_by_name = metrics_row.T
    return(metrics_by_name[0])



# gather metadata --------------------------------------------------------------


# paired-end -------------------------------------------------------------------

def _compile_picard_rna(readtype, column, outpath):
    """
    Parse every existing picard metrics file listed in metadata_well[column],
    print a short QC report (missing files, NAs per column, duplicated wells)
    and write the per-well table to `outpath` as a .tsv.

    The SAMPLE / LIBRARY / READ_GROUP columns are dropped; the remaining
    column names are prefixed with "picard_" and lower-cased.

    readtype : label used in the progress message ("PE", "SE1", "SE2")
    column   : metadata_well column holding one metrics path per well
    outpath  : destination .tsv
    returns  : the exported DataFrame (index = wellprefix)
    """
    print("\n\n" + readtype + " logs...")
    filelist = metadata_well[column]
    # non-string entries (e.g. a blank cell read in as NaN) count as missing
    boolean_fileexists = [isinstance(f, str) and os.path.exists(f) for f in filelist]
    boolean_filemissing = [not found for found in boolean_fileexists]
    list_picardrna = [parse_picard_rna(f) for f in filelist[boolean_fileexists]]
    # errors = "ignore": when nothing could be parsed the columns do not exist,
    # and a KeyError here would hide the missing-file report printed below
    df_picardrna = pd.DataFrame(list_picardrna,
                                index = metadata_well['wellprefix'][boolean_fileexists]
                                ).drop(["SAMPLE", "LIBRARY", "READ_GROUP"], axis = 1,
                                       errors = "ignore"
                                       ).add_prefix("picard_")
    df_picardrna.columns = df_picardrna.columns.str.lower()

    # percent files missing
    print("number of target files: " + str(len(filelist)))
    print("fraction files missing: ")
    if len(boolean_fileexists) > 0:
        print(round(1 - sum(boolean_fileexists)/len(boolean_fileexists), 3))
    else:
        print(float('nan')) # no target files listed at all
    if sum(boolean_filemissing) != 0:
        print("missing " + str(sum(boolean_filemissing)) + " files:")
        print(filelist[boolean_filemissing].to_string())

    # column QC
    print("number of NAs per column:")
    print(df_picardrna.isna().sum().to_string())

    print("number of duplicated wells:")
    print(df_picardrna.index.duplicated().sum())

    # final export
    # (shape is passed as one tuple: unpacking it into a single "{}" printed only the row count)
    print("exporting {} of shape: {}".format(outpath, df_picardrna.shape))
    df_picardrna.to_csv(outpath, sep = '\t')
    print("\n\n")
    return(df_picardrna)


# paired-end -------------------------------------------------------------------
df_picardrna_PE = _compile_picard_rna(
    "PE", 'A06e_txt_picard_PE', "Metadata/A07f_RNA_picard_PE.tsv")

# single-end, read 1 -----------------------------------------------------------
df_picardrna_SE1 = _compile_picard_rna(
    "SE1", 'A06e_txt_picard_SE1', "Metadata/A07f_RNA_picard_SE1.tsv")

# single-end, read 2 -----------------------------------------------------------
df_picardrna_SE2 = _compile_picard_rna(
    "SE2", 'A06e_txt_picard_SE2', "Metadata/A07f_RNA_picard_SE2.tsv")

## A07. run helper script

In [None]:
%%bash
cat > ../Scripts/A07_compile_RNA_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A07_compile_RNA.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=2:00:00,h_data=8G
#$ -N A07_compile_RNA
#$ -t 2-6
#$ -hold_jid A06a_star,A06b_starfilt,A06d_featurecounts,A06e_samstat_star

# A07_compile_RNA_metadata.sub =================================================
# array job: each task (SGE_TASK_ID) runs one of the Scripts/A07*.py metadata
# compilers; the job exits with that script's exit status.
# lines marked "<--" are site/project specific.


echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " $(date)
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment NAME=value line of the parameter file
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# run each helper script (A07*) ------------------------------------------------

# note: in practice these can each be submitted interactively/as its own task,
# as some of these scripts should be much lower resource than others;
# the -t 2-6 job parallelization is just for tidyness

echo "metadata script # $SGE_TASK_ID running:"

case "$SGE_TASK_ID" in

  1) # usually already run in A06a; run -t 1-6 instead of 2-6 if not yet run
    helper_script=Scripts/A07a_trimming.py
    ;;

  2)
    helper_script=Scripts/A07b_RNA_maprate.py
    ;;

  3)
    helper_script=Scripts/A07c_RNA_dedupe.py
    ;;

  4)
    helper_script=Scripts/A07d_RNA_featcounts.py
    ;;

  5)
    helper_script=Scripts/A07e_RNA_samtools.py
    ;;

  6)
    helper_script=Scripts/A07f_RNA_picard.py
    ;;

  *)
    # previously a silent no-op that still reported "completed"
    echo "unrecognized SGE_TASK_ID '$SGE_TASK_ID' (expected 1-6); no metadata script was run" >&2
    exit 1
    ;;
esac

echo "python $helper_script"
python "$helper_script"
exit_status=$? # keep the python result so the scheduler sees failures


if [ "$exit_status" -eq 0 ]; then
  echo "completed 'A07_compile_RNA_metadata.'"
else
  echo "FAILED: python $helper_script exited with status $exit_status" >&2
fi

echo " "
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " $(date)
echo " "

exit "$exit_status"

