In [1]:
import sys, glob, os, yaml
import numpy as np
import pandas as pd

# FUNCTION TO CREATE BASH SCRIPTS

In [7]:
def make_single_drug_bash_script(script_outFile, out_dir, yaml_prefix, num_config_files, drug, drug_abbr):
    '''
    Write a bash script that runs the analysis pipeline for a single drug.

    The generated script defines the drug name and abbreviation, lists the config
    files config_files/<yaml_prefix>_01.yaml ... <yaml_prefix>_<num_config_files>.yaml
    in a bash array, runs scripts 01-03 once per config file, and ends with a single
    call to 04_compute_univariate_stats.py.

    Arguments:

        script_outFile: the .sh file to create (if one with the same name exists, it will be overwritten).
                        Its parent directory is created if it does not exist yet
        out_dir: the output analysis directory. It is the same one listed within each config file
        yaml_prefix: the config file names are this prefix followed by a number. This should be either
                     binary, atu, or mic to reflect the types of analyses
        num_config_files: the total number of config_files to write to this script
        drug: full drug name, i.e. Isoniazid
        drug_abbr: drug abbreviation, i.e. INH

    Raises:

        AssertionError: if yaml_prefix is not one of binary, atu, or mic
    '''

    assert yaml_prefix in ["binary", "atu", "mic"], f"yaml_prefix must be binary, atu, or mic, not {yaml_prefix!r}"

    # make sure the directory that will hold the script exists; a bare file name has
    # an empty dirname, which os.makedirs would reject
    script_dir = os.path.dirname(script_outFile)
    if script_dir:
        os.makedirs(script_dir, exist_ok=True)

    # the drug name and abbreviation, followed by the opening of the config_file array
    header = (
        f'drug="{drug}"\n'
        f'drug_abbr="{drug_abbr}"\n\n'
        "# list of config files to use\n"
        "config_array=(\n"
    )

    # one array entry per config file; numbers are zero-padded to two digits to keep them in order
    config_entries = "".join(
        f" 'config_files/{yaml_prefix}_{i:02d}.yaml'\n"
        for i in range(1, num_config_files + 1)
    )

    # scripts to run for each config file
    per_config_loop = (
        'for i in ${!config_array[@]}; do\n'
        '    python3 -u 01_make_model_inputs.py "${config_array[$i]}" "$drug" "$drug_abbr"\n'
        '    python3 -u 02_regression_with_bootstrap.py "${config_array[$i]}" "$drug" "$drug_abbr"\n'
        '    python3 -u 03_model_analysis.py "${config_array[$i]}" "$drug" "$drug_abbr"\n'
        'done\n\n'
    )

    # the final script, run once after all of the config files
    final_command = f'python3 -u 04_compute_univariate_stats.py "$drug" "{yaml_prefix.upper()}" "{out_dir}"'

    with open(script_outFile, "w") as file:
        file.write(header + config_entries + ")\n\n" + per_config_loop + final_command)

In [9]:
# drug = "Delamanid"

# drug_abbr_dict = {"Delamanid": "DLM",
#                   "Bedaquiline": "BDQ",
#                   "Clofazimine": "CFZ",
#                   "Ethionamide": "ETH",
#                   "Linezolid": "LZD",
#                   "Moxifloxacin": "MXF",
#                   "Capreomycin": "CAP",
#                   "Amikacin": "AMI",
#                   "Pyrazinamide": "PZA",
#                   "Kanamycin": "KAN",
#                   "Levofloxacin": "LEV",
#                   "Streptomycin": "STM",
#                   "Ethambutol": "EMB",
#                   "Isoniazid": "INH",
#                   "Rifampicin": "RIF"
#                  }

# out_dir = "/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue"

# make_single_drug_bash_script(f"bash_scripts/run_{drug_abbr_dict[drug]}.sh", out_dir, "binary", 20, drug, drug_abbr_dict[drug])

In [19]:
# full drug name -> abbreviation used in the bash script file names
drug_abbr_dict = {
    "Delamanid": "DLM",
    "Bedaquiline": "BDQ",
    "Clofazimine": "CFZ",
    "Ethionamide": "ETH",
    "Linezolid": "LZD",
    "Moxifloxacin": "MXF",
    "Capreomycin": "CAP",
    "Amikacin": "AMI",
    "Pyrazinamide": "PZA",
    "Kanamycin": "KAN",
    "Levofloxacin": "LEV",
    "Streptomycin": "STM",
    "Ethambutol": "EMB",
    "Isoniazid": "INH",
    "Rifampicin": "RIF",
}


# example to make all 15 bash scripts with the ATU config files
out_dir = "/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue"

# NOTE(review): 8 config files are listed per script here, while the ATU cell in this
# notebook currently defines 12 parameter combinations -- confirm that 8 is intended
for drug, drug_abbr in drug_abbr_dict.items():
    make_single_drug_bash_script(f"bash_scripts/run_{drug_abbr}.sh", out_dir, "atu", 8, drug, drug_abbr)

# CONFIG FILES FOR THE BINARY ANALYSIS: SHOULD BE 20 TOTAL (16 WITH amb_mode DROP + 4 WITH amb_mode AF)

In [5]:
# make directory to store config files if it doesn't exist
os.makedirs("config_files", exist_ok=True)

# names of the parameters that vary between config files, in the same order as
# the values in each row of all_combos
param_names = ["pheno_category_lst", "tiers_lst", "pool_type", "synonymous", "amb_mode"]

all_combos = [[["WHO"], ["1"], "poolSeparate", False, "DROP"],
              [["WHO"], ["1"], "poolALL", False, "DROP"],
              [["WHO"], ["1"], "unpooled", False, "DROP"],
              [["WHO"], ["1"], "poolSeparate", True, "DROP"],
              ################################################
              [["WHO"], ["1", "2"], "poolSeparate", False, "DROP"],
              [["WHO"], ["1", "2"], "poolALL", False, "DROP"],
              [["WHO"], ["1", "2"], "unpooled", False, "DROP"],
              [["WHO"], ["1", "2"], "poolSeparate", True, "DROP"],
              ################################################
              [["ALL"], ["1"], "poolSeparate", False, "DROP"],
              [["ALL"], ["1"], "poolALL", False, "DROP"],
              [["ALL"], ["1"], "unpooled", False, "DROP"],
              [["ALL"], ["1"], "poolSeparate", True, "DROP"],
              ################################################
              [["ALL"], ["1", "2"], "poolSeparate", False, "DROP"],
              [["ALL"], ["1", "2"], "poolALL", False, "DROP"],
              [["ALL"], ["1", "2"], "unpooled", False, "DROP"],
              [["ALL"], ["1", "2"], "poolSeparate", True, "DROP"],
              ################################################
              [["WHO"], ["1"], "poolSeparate", False, "AF"],
              [["WHO"], ["1", "2"], "poolSeparate", False, "AF"],
              [["ALL"], ["1"], "poolSeparate", False, "AF"],
              [["ALL"], ["1", "2"], "poolSeparate", False, "AF"]
            ]

# example set of kwargs -- KEEP UPDATED!
# read through a context manager so the handle to config.yaml is closed right away
with open("config.yaml") as template_file:
    kwargs = yaml.safe_load(template_file)


# config files run from 1 - len(all_combos)
for i, combo in enumerate(all_combos, start=1):

    # start every file from an untouched copy of the template so that no state
    # carries over from one config file to the next
    config = dict(kwargs)

    # constant for all cases
    config["binary"] = True
    config["atu_analysis"] = False

    # delete unpooled because the key was updated to pool_type
    for key in ("model_prefix", "unpooled"):
        config.pop(key, None)

    # update param combinations
    config.update(zip(param_names, combo))

    # numbers less than 10 get a 0 in front of them to keep the files in order
    with open(f"config_files/binary_{i:02d}.yaml", "w") as file:
        yaml.dump(config, file, default_flow_style=False, sort_keys=False)

# CONFIG FILES FOR THE CC vs. CC-ATU ANALYSES: SHOULD BE 12 TOTAL (SO FAR)

In [6]:
# make directory to store config files if it doesn't exist, so this cell does not
# depend on another cell having created it
os.makedirs("config_files", exist_ok=True)

# names of the parameters that vary between config files, in the same order as
# the values in each row of all_combos
param_names = ["tiers_lst", "pool_type", "atu_analysis_type"]

all_combos = [[["1"], "poolSeparate", "CC"],
              [["1"], "poolALL", "CC"],
              [["1"], "unpooled", "CC"],
              ############################################
              [["1"], "poolSeparate", "CC-ATU"],
              [["1"], "poolALL", "CC-ATU"],
              [["1"], "unpooled", "CC-ATU"],
              ############################################
              [["1", "2"], "poolSeparate", "CC"],
              [["1", "2"], "poolALL", "CC"],
              [["1", "2"], "unpooled", "CC"],
              ############################################
              [["1", "2"], "poolSeparate", "CC-ATU"],
              [["1", "2"], "poolALL", "CC-ATU"],
              [["1", "2"], "unpooled", "CC-ATU"]
            ]

# example set of kwargs
# read through a context manager so the handle to config.yaml is closed right away
with open("config.yaml") as template_file:
    kwargs = yaml.safe_load(template_file)

# config files run from 1 - len(all_combos)
for i, combo in enumerate(all_combos, start=1):

    # start every file from an untouched copy of the template so that no state
    # carries over from one config file to the next
    config = dict(kwargs)

    # constant for all cases
    config["binary"] = True
    config["atu_analysis"] = True
    config["synonymous"] = False
    config["amb_mode"] = "DROP"

    # not relevant, but set them all to WHO here for consistency
    # NOTE(review): this is the plain string "WHO", whereas the binary config cell
    # stores pheno_category_lst as a list (["WHO"]) -- confirm which form the
    # downstream scripts expect
    config["pheno_category_lst"] = "WHO"

    # delete unpooled because the key was updated to pool_type
    for key in ("model_prefix", "unpooled"):
        config.pop(key, None)

    # update param combinations
    config.update(zip(param_names, combo))

    # numbers less than 10 get a 0 in front of them to keep the files in order
    with open(f"config_files/atu_{i:02d}.yaml", "w") as file:
        yaml.dump(config, file, default_flow_style=False, sort_keys=False)

# TODO: CONFIG FILES FOR THE MIC ANALYSIS

In [None]:
# # not relevant, but the parameter will get ignored in the scripts
# phenos = ["WHO"]
# tiers = [["1"], ["1", "2"]]
# unpooled = [False, True]
# syn = [False, True]
# amb_mode = ["DROP", "AF"]

# all_combos = list(itertools.product(*[phenos, tiers, unpooled, syn, amb_mode]))
# print(len(all_combos))

# # example set of kwargs
# kwargs = yaml.safe_load(open("config.yaml"))

# # config files run from 1 - len(all_combos)
# for i in list(range(1, len(all_combos)+1)):
        
#     # if the number is less than 10, add a 0 in front of it to keep them in order
#     if i < 10:
#         num_str = f"0{i}"
#     else:
#         num_str = str(i)
    
#     with open(f"config_files/mic_{num_str}.yaml", "r+") as file:
        
#         kwargs["binary"] = True
#         kwargs["atu_analysis"] = False
#         yaml.dump(kwargs, file, default_flow_style=False, sort_keys=False)