In [1]:
import warnings
warnings.filterwarnings('ignore')
# pandas future guard: the filter above silences FutureWarning noise (note: it hides ALL warnings)

import os
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
import yaml

# Root directory that contains the MayomicsVC clone; the config location uses the same root.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
clone_base = '/projects/bioinformatics/DEL'
config_base = clone_base

# Top of the WDL source tree inside the clone.
wdl_directory = os.path.join(clone_base,'MayomicsVC/src/wdl')

# Displayed as the cell result: the entries directly under the WDL directory.
os.listdir(wdl_directory)

['GermlineMasterWorkflow.wdl',
 'DeliveryOfHaplotyperVC',
 'DeliveryOfAlignment',
 'SomaticMasterWorkflow.wdl',
 'HaplotyperVC',
 'Alignment',
 'SomaticVC',
 'DeliveryOfSomaticVC']

In [2]:
def get_task_types(wdl_directory=None):
    """ Usage: types_list = get_task_types(wdl_directory)
    Collect the distinct leading words found in the task sections of every
    .wdl file below a directory.

    Within each file, collection is switched on by a line whose first word is
    'task' (the words of the following lines are collected) and switched off by
    a line whose first word is 'command', 'runtime' or 'output'.  Blank lines
    and lines starting with '#', '<' or '>' are ignored.

    Args:
        wdl_directory:  root of the tree to search (the current working
                        directory is used when None or not a directory)

    Returns:
        sorted list of the unique first words that were collected
    """
    search_root = wdl_directory
    if search_root is None or not os.path.isdir(search_root):
        search_root = os.getcwd()

    ignored_prefixes = ('#', '<', '>')
    start_words = ('task',)
    stop_words = ('command', 'runtime', 'output')

    collected = set()
    for folder, _subfolders, filenames in os.walk(search_root):
        for filename in filenames:
            if not filename.endswith('.wdl'):
                continue

            # every file starts outside of any task section
            collecting = False
            with open(os.path.join(folder, filename), 'r') as handle:
                raw_lines = handle.readlines()

            for raw_line in raw_lines:
                text = raw_line.strip()
                if not text or text.startswith(ignored_prefixes):
                    continue

                head = text.split()[0]
                if head in stop_words:
                    collecting = False
                elif collecting:
                    collected.add(head)
                # the switch-on word itself is only collected when already collecting
                if head in start_words:
                    collecting = True

    return sorted(collected)

# Report the distinct leading words (the WDL types) found in the task sections,
# one per line, indented with a tab.
types_list = get_task_types(wdl_directory)
print('There are %i unique types in all the wdl files:\n'%(len(types_list)))
for wurd in types_list:
    print('\t',wurd)

There are 5 unique types in all the wdl files:

	 Array[File]
	 Boolean
	 File
	 File?
	 String


In [3]:
def get_wdl_variables_dict(wdl_directory=None):
    """ Usage  config_ordered_dict = get_wdl_variables_dict(wdl_directory)
    Get the sorted list of variables and their types from the wdl files in a directory tree

    A declaration is any line (not blank, not starting with '#', '<' or '>')
    whose first word is one of the known types; its second word is taken as
    the variable name.  Files whose name starts with '.' are skipped.

    Args:
        wdl_directory:       (default to run directory if not valid directory name)

    Returns:
        config_ordered_dict: python OrderedDict of  variable_name: "type_name",
                             ordered by variable name.  A variable declared with
                             several types lists each distinct type once, in the
                             order first seen, e.g. '"File, File?"'.

    """
    # Same fallback as get_task_types / get_wdl_files_spreadsheet: without it
    # os.walk(None) raises instead of honouring the documented default.
    if wdl_directory is None or not os.path.isdir(wdl_directory):
        wdl_directory = os.getcwd()

    skip_line_chars = ('#', '<', '>')
    task_types = ('Array[File]', 'Boolean', 'File', 'File?', 'String')

    # variable name -> list of distinct types in first-seen order
    types_by_variable = {}
    for this_dir, _dirs, files in os.walk(wdl_directory):
        for file in files:
            if not file.endswith('.wdl') or file.startswith('.'):
                continue
            with open(os.path.join(this_dir, file), 'r') as fh:
                for line in fh:
                    words = line.split()
                    # need both a type and a name; a lone type word is not a declaration
                    if len(words) < 2 or words[0][0] in skip_line_chars:
                        continue
                    type_name, variable_name = words[0], words[1]
                    if type_name not in task_types:
                        continue
                    seen_types = types_by_variable.setdefault(variable_name, [])
                    # record each type once so repeated declarations do not
                    # keep growing the value ('"File, File?, File"')
                    if type_name not in seen_types:
                        seen_types.append(type_name)

    config_od = OrderedDict()
    for variable_name in sorted(types_by_variable):
        config_od[variable_name] = '"' + ', '.join(types_by_variable[variable_name]) + '"'

    return config_od

config_vars_dict_ret = get_wdl_variables_dict(wdl_directory)
print('found %i variables'%(len(config_vars_dict_ret)))

# Rebuild the mapping in key order, then print one right-aligned line per variable.
config_od = OrderedDict(sorted(config_vars_dict_ret.items()))
for k, v in config_od.items():
    print('%30s: %20s'%(k,v))


found 117 variables
                      Adapters:               "File"
               AlignEnvProfile:               "File"
             AlignHardMemLimit:             "String"
               AlignOutputBais:        "Array[File]"
               AlignOutputBams:        "Array[File]"
             AlignSoftMemLimit:             "String"
               AlignmentScript:               "File"
                  AnnotateText:             "String"
         BWAExtraOptionsString:             "String"
                  BashPreamble:               "File"
           BashSharedFunctions:               "File"
                          Bqsr:            "Boolean"
                BqsrEnvProfile:               "File"
              BqsrHardMemLimit:             "String"
                BqsrKnownSites:             "String"
                    BqsrScript:               "File"
              BqsrSoftMemLimit:             "String"
                    CenterName:             "String"
              ChunkSizeInB

In [4]:
def get_wdl_files_spreadsheet(wdl_directory=None):
    """ wdl_files_spreadsheet = get_wdl_files_spreadsheet(wdl_directory)
    get the section statistics spreadsheet for a directory tree of wdl files 

    Every .wdl file below the directory (names starting with '.' or '_' are
    skipped) becomes one row.  Each keyword column holds the number of lines
    in that file that contain the keyword as a substring, so e.g. a line with
    'inputs' also counts for 'input'.  Rows are keyed by bare file name: if two
    directories hold a file with the same name, the one walked last wins.

    Args:
        wdl_directory:       (default to run directory if not valid directory name)

    Returns:
        wdl_files_spreadsheet: pandas dataframe with rows of wdl filenames, cols of wdl keywords
                               (index named 'WDL file'; first column 'src_path' is the file's
                               directory with the parent of the walked root shortened to '..')

    """
    if wdl_directory is not None and os.path.isdir(wdl_directory):
        may_dir = wdl_directory
    else:
        may_dir = os.getcwd()

    # Strip relative to the directory actually walked; using the raw argument
    # here failed with TypeError for None and mis-stripped for invalid names.
    dir_name_stripper, _ = os.path.split(may_dir)

    prefix_ignores = ('.', '_')

    # assemble dictionary - wdl file names: full path
    mdl_dict = OrderedDict()
    for dir_name, _dir_list, files_list in os.walk(may_dir):
        for file_name in files_list:
            if file_name.endswith('.wdl') and not file_name.startswith(prefix_ignores):
                mdl_dict[file_name] = os.path.join(dir_name, file_name)

    # define the rest of the columns in terms of wdl keywords
    srch_dict = OrderedDict([('imports','import'),
                             ('workflows','workflow'),
                             ('tasks','task'),
                             ('inputs','input'),
                             ('outputs','output'),
                             ('commands', 'command')])

    # for each wdl file: count keywords, get top path; the frame is built once
    # at the end instead of being patched cell-by-cell through chained indexing
    # (which does not write through under pandas copy-on-write).
    rows = []
    for file_name, src_path in mdl_dict.items():
        lines = []
        try:
            with open(src_path, 'r') as fh:
                lines = fh.readlines()
        except (OSError, UnicodeDecodeError):
            # best effort: report the file and keep its counts at zero
            print('Fails to open:\n', src_path)

        row = OrderedDict()
        # replace the full path name with top path name for readability
        full_path, _ = os.path.split(src_path)
        relative = os.path.relpath(full_path, dir_name_stripper or os.curdir)
        row['src_path'] = '..' if relative == os.curdir else os.path.join('..', relative)

        # count occurrence of keywords (substring match, once per line)
        for col_name, key_word in srch_dict.items():
            row[col_name] = sum(1 for l in lines if key_word in l)
        rows.append(row)

    wdl_df = pd.DataFrame(rows,
                          index=pd.Index(list(mdl_dict), name='WDL file'),
                          columns=['src_path'] + list(srch_dict))
    # keep integer count columns even when no wdl file was found
    wdl_df = wdl_df.astype({col_name: 'int64' for col_name in srch_dict})

    return wdl_df

# Build the keyword-count table for the WDL tree, print its shape, and display it.
wdl_df = get_wdl_files_spreadsheet(wdl_directory)
print(wdl_df.shape)
wdl_df

(30, 7)


Unnamed: 0_level_0,src_path,imports,workflows,tasks,inputs,outputs,commands
WDL file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GermlineMasterWorkflow.wdl,../wdl,10,1,0,10,0,0
SomaticMasterWorkflow.wdl,../wdl,10,4,0,16,0,0
DeliverHaplotyperVC_ForWorkflow.wdl,../wdl/DeliveryOfHaplotyperVC/Workflow,1,1,0,1,0,0
Testdeliver_haplotyperVC.wdl,../wdl/DeliveryOfHaplotyperVC/TestTasks,1,1,0,0,0,0
deliver_HaplotyperVC.wdl,../wdl/DeliveryOfHaplotyperVC/Tasks,0,1,1,0,0,1
TestDeliverAlignment.wdl,../wdl/DeliveryOfAlignment/Workflow,1,1,0,0,0,0
DeliverAlignment_ForWorkflow.wdl,../wdl/DeliveryOfAlignment/Workflow,1,1,0,1,0,0
Testdeliver_alignment.wdl,../wdl/DeliveryOfAlignment/TestTasks,1,1,0,0,0,0
deliver_alignment.wdl,../wdl/DeliveryOfAlignment/Tasks,0,1,1,0,1,1
TestHaplotyperVC.wdl,../wdl/HaplotyperVC/Workflow,4,1,0,3,0,0


In [6]:
import yaml

def read_json_to_df(full_filename):
    """ Usage: json_df = read_json_to_df(full_filename)
    read a json file into a spreadsheet

    Args:
        full_filename:  path of the json file

    Returns:
        json_df:        pandas dataframe indexed by 'Variable Name' with one
                        column, 'Variable Value'.  The frame is empty when the
                        file does not exist or its top level is not an object.

    Raises:
        json.JSONDecodeError (a ValueError) when the file exists but is not valid json.

    NOTE(review): values are handed to DataFrame.from_dict unchanged; list or
    object values may not fit a single column -- confirm against the real inputs.
    """
    # Local import keeps this cell independent of the notebook's import cell.
    # The file is json, so the standard-library parser replaces the previous
    # yaml.load(fh) call, which PyYAML >= 6 rejects without a Loader argument
    # and which can construct arbitrary objects on older versions.
    import json

    run_parameters = {}
    if os.path.isfile(full_filename):
        with open(full_filename, 'r') as file_handle:
            loaded = json.load(file_handle)
        # anything other than a json object cannot be tabulated as name/value rows
        if isinstance(loaded, dict):
            run_parameters = loaded

    json_df = pd.DataFrame.from_dict(run_parameters, orient='index', columns=['Variable Value'])
    json_df.index.name = 'Variable Name'

    return json_df

# Load the filled-in Germline master-workflow json and display it as a table.
# Note: despite its name, try_dir holds the path of a file, not of a directory.
config_base = clone_base
try_dir = os.path.join(config_base, 'Jsons/GermlineMasterWorkflow.FilledIn.json')
jojo_df = read_json_to_df(try_dir)
print('spreadsheet_size = ', jojo_df.shape)
jojo_df

spreadsheet_size =  (193, 1)


Unnamed: 0_level_0,Variable Value
Variable Name,Unnamed: 1_level_1
GermlineMasterWF.Bqsr,true
GermlineMasterWF.DAB.BashPreamble,/projects/bioinformatics/DEL/MayomicsVC/src/sh...
GermlineMasterWF.DAB.BashSharedFunctions,/projects/bioinformatics/DEL/MayomicsVC/src/sh...
GermlineMasterWF.DAB.DebugMode,-d
GermlineMasterWF.DAB.DeliveryAlignment_Script,/projects/bioinformatics/DEL/MayomicsVC/src/sh...
GermlineMasterWF.DAB.DeliveryFolder_Alignment,/projects/bioinformatics/DEL/Delivery/Alignmen...
GermlineMasterWF.DAB.SampleName,WGS_chr20_21_22
GermlineMasterWF.DAB.WorkflowJson,/projects/bioinformatics/DEL/Jsons/SomaticMast...
GermlineMasterWF.DHVC.BashPreamble,/projects/bioinformatics/DEL/MayomicsVC/src/sh...
GermlineMasterWF.DHVC.BashSharedFunctions,/projects/bioinformatics/DEL/MayomicsVC/src/sh...


## develop test script

```python
%%writefile check_test_ouput.py
"""
check_test_ouput.py
Usage:
python ~/python/check_test_ouput.py -d /projects/mgc/Project_1/DEL/MVP/cromwell-executions/GermlineMasterWF/
or 
python ~/python/check_test_ouput.py -d `pwd`

check return codes and std_out in directory tree
"""

import os
import argparse

# Contents of an 'rc' file (first line) that count as success.
good_return_codes_list = ['0', '0\n']
# File names inspected while walking the tree; all other files are ignored.
file_names_to_show_list = ['rc', 'stdout', 'stderr']

def check_rc_codes(x_directory=None, show_stds=False):
    """Walk a directory tree and report the return code stored in every 'rc' file.

    For each non-empty 'rc' file the first line is compared (ignoring
    surrounding whitespace) with the good return codes: a match prints a
    'good rc' line with the shortened directory, anything else prints a
    warning with the code and the full file name.  When show_stds is true the
    contents of non-empty 'stdout' and 'stderr' files are echoed as well.

    Args:
        x_directory:  root of the tree to scan (the current working directory
                      is used when None or not a directory)
        show_stds:    also print the contents of stdout / stderr files

    Returns:
        None; all results are printed.
    """
    if x_directory is not None and os.path.isdir(x_directory):
        dir_tree_root = x_directory
    else:
        dir_tree_root = os.getcwd()
    root_trim_str, _ = os.path.split(dir_tree_root)

    # compare stripped values so trailing blanks or newline variants still match
    good_codes = set(code.strip() for code in good_return_codes_list)

    for dir_name, dir_list, files_list in os.walk(dir_tree_root):
        for filename in files_list:
            if filename not in file_names_to_show_list:
                continue
            # stdout / stderr can be large: do not read them unless they will be shown
            if filename != 'rc' and not show_stds:
                continue

            full_filename = os.path.join(dir_name, filename)
            # errors='replace': one undecodable byte in a log must not abort the scan
            with open(full_filename, 'r', errors='replace') as fh:
                lines = fh.readlines()
            if len(lines) <= 0:
                continue

            if filename == 'rc':
                if lines[0].strip() in good_codes:
                    # an empty prefix (relative root) would make str.replace
                    # insert '..' between every character, so leave it as is
                    if root_trim_str:
                        top_dir = dir_name.replace(root_trim_str, '..')
                    else:
                        top_dir = dir_name
                    print('good rc:  %s'%(top_dir))
                else:
                    print('\n\tBad Dog! Bad Dog!')
                    print('code = %s in \n%s\n'%(str(lines[0]).strip(), full_filename))
            else:
                # print() adds its own newline; drop the one read from the file
                # so the echoed text is not double spaced
                for line in lines:
                    print(line.rstrip('\n'))
                    
def str_to_bool(text):
    """Convert a command-line word to a boolean.

    argparse's type=bool is just bool(str), which is True for every non-empty
    word (including 'False'); this converter reads the word instead.

    Raises:
        argparse.ArgumentTypeError for a word that is not a recognised boolean.
    """
    word = str(text).strip().lower()
    if word in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if word in ('0', 'false', 'f', 'no', 'n', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (text,))


def parse_cli_args(argv=None):
    """Parse the script's command line (sys.argv when argv is None).

    Options:
        -d DIR               root directory of the tree to check
        -show_stds [BOOL]    echo stdout / stderr contents; the bare flag means
                             True, an explicit value is read by str_to_bool,
                             and leaving the option out means False
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', type=str)
    parser.add_argument('-show_stds', nargs="?", const=True, default=False, type=str_to_bool)
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_cli_args()
    check_rc_codes(args.d, args.show_stds)
```