# GermlineMasterWorkflow.wdl output check development notes

In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Directory expected to hold the "call_..." subdirectories that
# get_rc_codes_df scans (hard-coded, machine-specific path).
x_dir = '/Users/yo/zzIForge/fullyJan10/'
# Only warn when the path is missing; execution continues either way.
if not os.path.isdir(x_dir):
    print('directory not found\n', x_dir)

In [None]:
def get_rc_codes_df(x_dir):
    """Summarize return codes and BAM/BAI file sizes per "call..." subdirectory.

    Usage: return_codes_dataframe = get_rc_codes_df(x_dir)

    Args:
        x_dir: the directory with the "call..." subdirectories (else you get
            an empty dataframe). May be absolute or relative.

    Returns:
        rc_df: pandas dataframe indexed by call-directory name, with string
            columns:
              'rc'      - the return code read from a file named "rc"; it is
                          recorded only when it is a good return code ('0')
              'bam'     - size in bytes of a "*.bam" file below the call dir
              'bam.bai' - size in bytes of a "*.bai" file below the call dir
            Cells for which nothing was found keep the value 'unk'. When
            several matching files exist below one call directory, the one
            visited last by os.walk wins.

    Raises:
        OSError: if x_dir cannot be listed or a file cannot be read/stat'ed.
    """
    good_return_codes = {'0'}
    cols_list = ['rc', 'bam', 'bam.bai']

    # The base directory must come first in os.path.join: with the arguments
    # swapped an absolute x_dir swallows the entry name and every entry
    # (including plain files) looks like a directory.
    call_dir_list = [
        cd for cd in os.listdir(x_dir)
        if cd.startswith('call') and os.path.isdir(os.path.join(x_dir, cd))
    ]

    rc_df = pd.DataFrame('unk', index=call_dir_list, columns=cols_list)

    # Walk each call directory on its own so that a file is attributed to
    # exactly the call directory that contains it, rather than to every call
    # directory whose name happens to be a substring of the path.
    for call_dir in call_dir_list:
        call_root = os.path.join(x_dir, call_dir)
        for dir_name, _subdirs, files_list in os.walk(call_root):
            for filename in files_list:
                full_filename = os.path.join(dir_name, filename)

                if filename == 'rc':
                    # Only the first line matters; an empty file yields ''
                    # and is simply treated as "no good return code".
                    with open(full_filename, 'r') as fh:
                        return_code = fh.readline().strip()
                    if return_code in good_return_codes:
                        rc_df.loc[call_dir, 'rc'] = return_code
                    continue

                if filename in cols_list:
                    # Files literally named 'bam' or 'bam.bai' are not
                    # size-reported (same as the previous behaviour).
                    continue

                fext = os.path.splitext(filename)[1]
                if fext == '.bam':
                    column = 'bam'
                elif fext == '.bai':
                    column = 'bam.bai'
                else:
                    continue
                # stat only the files whose size is actually reported.
                rc_df.loc[call_dir, column] = str(os.stat(full_filename).st_size)

    return rc_df

In [None]:
# Build the return-code / file-size table for x_dir and leave it as the last
# expression so the notebook displays it as the cell output.
somedf = get_rc_codes_df(x_dir)
somedf

## wdl sequence of calls:
    * BashPreamble: src/shell/shell_preamble.sh (every task starts with it)
    * most scripts also use src/shell/shared_functions.sh
1) GermlineMasterWorkflow.wdl (iffie) calls TestTasks/Runtrim_sequences.wdl calls Tasks/trim_sequences.wdl runs:
    * /bin/bash trim_sequences.sh (PairedEnd, SampleName, Adapters, CutAdapt, CutAdaptThreads, TrimEnvProfile)
        * TrimSeqScript = src/shell/trim_sequences.sh (uses shared_functions.sh)
        * BashSharedFunctions = src/shell/shared_functions.sh
    * parameter settings
        * TrimEnvProfile = Config/TrimEnvProfile.file
        * CutAdapt = /usr/local/apps/bioapps/python/Python-3.6.1/bin
        * PairedEnd = true or false
        * SampleName = WGS_chr1_5X_E0.005_chunk1
        * Adapters = /Inputs/TruSeqAdaptors.fasta
        * CutAdaptThreads = 8
        * DebugMode = DNE
    * Puts Out: glob("*.fastq.gz") somewhere in hyper-cyber space
    
3) GermlineMasterWorkflow.wdl calls TestTasks/Runalignment.wdl (diffie) calls Tasks/alignment.wdl runs:
    * AlignmentScript: src/shell/alignment.sh 
        * calls: Sentieon /usr/local/apps/bioapps/sentieon/sentieon-genomics-201808.01
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputBams: SampleName.aligned.sorted.bam
        * OutputBais: SampleName.aligned.sorted.bam.bai
    
4) GermlineMasterWorkflow.wdl calls Tasks/merge_aligned_bam.wdl runs:
    * MergeBamScript: src/shell/merge_bams.sh (using shared_functions.sh)
    * with Config/MergeBamEnvProfile.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputBams: SampleName.aligned.sorted.merged.bam
        * OutputBais: SampleName.aligned.sorted.merged.bam.bai
        
5) GermlineMasterWorkflow.wdl calls Tasks/dedup.wdl runs:
    * DedupScript: src/shell/dedup.sh (using shared_functions.sh)
    * with Config/DedupEnvProfile.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputBams: SampleName.aligned.sorted.deduped.bam
        * OutputBais: SampleName.aligned.sorted.deduped.bam.bai
        
6) GermlineMasterWorkflow.wdl calls Tasks/deliver_alignment.wdl runs:
    * DeliveryAlignment_Script: src/shell/deliver_alignment.sh (using shared_functions.sh)
    * using: GermlineMasterWorkflow.FilledIn.json
    * no output but: dowop Delivery/Alignment
    
7) GermlineMasterWorkflow.wdl calls Tasks/realignment.wdl runs:
    * RealignmentScript: src/shell/realignment.sh (using shared_functions.sh)
    * using: Reference/Mills_and_1000G_gold_standard.indels.hg38.vcf
    * RealignEnvProfile: Config/RealignEnvProfile.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputBams: SampleName.aligned.sorted.deduped.realigned.bam
        * OutputBais: SampleName.aligned.sorted.deduped.realigned.bam.bai
        
8) GermlineMasterWorkflow.wdl calls Tasks/bqsr.wdl runs:
    * BqsrScript: src/shell/bqsr.sh
    * using: 
        * Reference/Mills_and_1000G_gold_standard.indels.hg38.vcf
        * Reference/dbsnp_138.hg38.vcf
        * Config/BqsrEnvProfile.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * RecalTable: SampleName.recal_data.table
        
9) GermlineMasterWorkflow.wdl calls Tasks/haplotyper.wdl runs:
    * HaplotyperScript: src/shell/haplotyper.sh (using shared_functions.sh)
    * using:
        * HaplotyperExtraOptionsString: (magic string)
        * Config/HaplotyperEnvProfile.file
        * Config/HaplotyperVCFSourceField.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputVcf: SampleName.vcf
        * OutputVcfIdx: SampleName.vcf.idx
        
10) GermlineMasterWorkflow.wdl calls Tasks/vqsr.wdl runs:
    * VqsrScript: src/shell/vqsr.sh 
    * using magic strings:
        * VqsrSnpResourceString
        * VqsrIndelResourceString
        * AnnotateText
    * using:
        * InputVcf && InputVcfIdx (the VCF and its index output by Haplotyper)
        * Config/VqsrEnvProfile.file
    * Warns: cpu, s_vmem, h_vmem
    * Puts Out:
        * OutputVcf: SampleName.SNP.recaled.vcf
        * OutputVcfIdx: SampleName.SNP.recaled.vcf.idx
        
11) GermlineMasterWorkflow.wdl calls Tasks/deliver_HaplotyperVC.wdl runs:
    * DeliveryHaplotyperVC_Script: src/shell/deliver_haplotyperVC.sh (using shared_functions.sh)
    * to dakine: Delivery/HaplotyperVC
    * using: GermlineMasterWorkflow.FilledIn.json    

In [65]:
import knpackage.toolbox as kn

# Location and name of the filled-in workflow JSON to inspect.
json_dir = '/Users/yo/zzIForge/Jsons'
json_farameters_file = 'GermlineMasterWorkflow.FilledIn.json'
json_parameters = kn.get_run_parameters(json_dir, json_farameters_file)

# Print every parameter whose key contains `dakine`, prefixed with the
# zero-based position of that key among all parameters.
dakine = 'WorkflowJson'
lineno = 0
for k, v in json_parameters.items():
    lineno += 1
    if dakine in k:
        # lineno was already advanced for this entry, hence the "- 1".
        print('%i %30s\n\t%s\n'%(lineno - 1, k, v))

7 GermlineMasterWF.DAB.WorkflowJson
	/projects/bioinformatics/DEL/Jsons/GermlineMasterWorkflow.FilledIn.json

14 GermlineMasterWF.DHVC.WorkflowJson
	/projects/bioinformatics/DEL/Jsons/GermlineMasterWorkflow.FilledIn.json

