In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, sys, yaml, subprocess, itertools, sparse

# Reference tables used throughout the notebook. All paths are relative to the notebook's folder.
who_variants_combined = pd.read_csv("who_confidence_2021.csv")
drug_gene_mapping = pd.read_csv("../data/drug_gene_mapping.csv")
samples_summary = pd.read_csv("../data/samples_summary.csv")

# Coll 2014 SNP scheme: strip the literal "lineage" prefix from the "#lineage" column, then rename it to "Lineage"
snp_scheme = pd.read_csv("../data/coll2014_SNP_scheme.tsv", sep="\t")
snp_scheme["#lineage"] = snp_scheme["#lineage"].str.replace("lineage", "")
snp_scheme.rename(columns={"#lineage": "Lineage"}, inplace=True)

# per-sample lineage calls (read with low_memory=False so each column is parsed in one pass)
lineages = pd.read_csv("../data/combined_lineages_samples.csv", low_memory=False)

# SOLO results: the column names are taken from row index 1 of the CSV, and only tier 1 rows are kept
solo_results = pd.read_csv("../results/SOLO_Final_Aug2023.csv", header=[1]).query("tier==1")
# solo_results = pd.read_excel("../results/SOLO primary_STATA_ver18Feb2023.xlsx", sheet_name=None)

# if len(solo_results) == 1:
#     solo_results = solo_results[list(solo_results.keys())[0]]
    
# solo_results = solo_results.rename(columns={"variant":"mutation"})
# solo_results["Initial_Confidence_Grading"] = solo_results["Initial_Confidence_Grading"].replace('4) Not assoc w R - interim', '4) Not assoc w R - Interim')
    
import warnings
warnings.filterwarnings(action='ignore')

# utils files are in a separate folder
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "utils"))
from stats_utils import *
from data_utils import *

# CHANGE ANALYSIS DIR BEFORE RUNNING THE NOTEBOOK!
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'
from functools import reduce

  solo_results = pd.read_csv("../results/SOLO_Final_Aug2023.csv", header=[1]).query("tier==1")


In [2]:
# Every sub-directory of analysis_dir is treated as one drug. Only real, non-hidden directories
# are kept so that stray files or hidden folders (e.g. a README or .ipynb_checkpoints) are not
# mistaken for drugs by the code that loops over drugs_lst.
drugs_lst = [
    entry for entry in os.listdir(analysis_dir)
    if not entry.startswith(".") and os.path.isdir(os.path.join(analysis_dir, entry))
]

# full drug name -> abbreviation
drug_abbr_dict = {"Delamanid": "DLM",
                  "Bedaquiline": "BDQ",
                  "Clofazimine": "CFZ",
                  "Ethionamide": "ETA",
                  "Linezolid": "LZD",
                  "Moxifloxacin": "MXF",
                  "Capreomycin": "CAP",
                  "Amikacin": "AMI",
                  "Pretomanid": "PTM",
                  "Pyrazinamide": "PZA",
                  "Kanamycin": "KAN",
                  "Levofloxacin": "LEV",
                  "Streptomycin": "STM",
                  "Ethambutol": "EMB",
                  "Isoniazid": "INH",
                  "Rifampicin": "RIF"
                 }

# Write Final Dataframes for the Binary Analysis to an Excel File

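Each drug will have a separate Excel file. Each file will have one sheet per model that was run and produced results: up to 12 when all tier and phenotype combinations are enabled, and at most 6 when only the Tier 1 models are enabled.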

In [3]:
def get_unpooled_table_by_tier(drug, tiers_lst, folder, model_prefix):
    '''
    Build the per-mutation results table for a single model.

    Reads model_analysis.csv and LRT_results.csv from analysis_dir/drug/folder/model_prefix, joins them on
    the mutation name and returns a fixed, ordered set of columns.

    Arguments:
        drug: name of the drug folder inside analysis_dir
        tiers_lst: list of tiers in the model. The LAST element is written to the Tier column of every row
        folder: analysis type folder
        model_prefix: path of the model within the analysis type folder

    Returns:
        dataframe with one row per mutation that is present in both input files
    '''

    model_dir = os.path.join(analysis_dir, drug, folder, model_prefix)

    # regression results. Rows whose mutation name contains 'PC' are dropped
    # (presumably the principal component covariates -- TODO confirm)
    regression_df = pd.read_csv(os.path.join(model_dir, "model_analysis.csv"))
    regression_df = regression_df.query("~mutation.str.contains('PC')")

    # LRT results. The first row is the FULL model, whose p-values are NaN, so it is skipped before
    # add_pval_corrections (a helper from the utils modules) is applied
    lrt_df = pd.read_csv(os.path.join(model_dir, "LRT_results.csv"))
    lrt_df = add_pval_corrections(lrt_df.iloc[1:, ])

    # both files must describe exactly the same set of mutations
    unmatched_mutations = set(regression_df["mutation"].values).symmetric_difference(lrt_df["mutation"].values)
    assert len(unmatched_mutations) == 0

    # Bonferroni and coefficient columns are not reported
    unwanted_pattern = "|".join(["Bonferroni", "coef"])
    reported_columns = regression_df.columns[~regression_df.columns.str.contains(unwanted_pattern)]
    lrt_columns = ["mutation", "LRT_pval", "BH_LRT_pval", "LRT_neutral_pval", "BH_LRT_neutral_pval"]

    merged_df = regression_df[reported_columns].merge(lrt_df[lrt_columns], on="mutation", how="inner")
    merged_df["Tier"] = tiers_lst[-1]

    # output columns, in the desired order: annotations, p-values, counts, then the binary statistics and their bounds
    annotation_cols = ['mutation', 'Tier', 'predicted_effect', 'position', 'confidence', 'Odds_Ratio']
    pvalue_cols = ['pval', 'BH_pval', 'neutral_pval', 'BH_neutral_pval',
                   'LRT_pval', 'BH_LRT_pval', 'LRT_neutral_pval', 'BH_LRT_neutral_pval']
    count_cols = ['Num_Isolates', "Mut_R", "Mut_S", "NoMut_S", "NoMut_R"]
    statistic_cols = ['PPV', 'NPV', 'Sens', 'Spec', 'LR+', 'LR-']
    bound_cols = ['PPV_LB', 'PPV_UB', 'NPV_LB', 'NPV_UB', 'Sens_LB', 'Sens_UB',
                  'Spec_LB', 'Spec_UB', 'LR+_LB', 'LR+_UB', 'LR-_LB', 'LR-_UB']

    return merged_df[annotation_cols + pvalue_cols + count_cols + statistic_cols + bound_cols]

In [4]:
def add_significance_category(df, drug, model_path):
    '''
    Add the regression_confidence grading to a table of model results.

    Categories are assigned from the fewest requirements to the most:
        1. Possible Assoc w R / S: significant odds ratio (BH_pval <= thresh) in the given direction, plus
           - pncA mutations (relaxed): Mut_R >= 2 and PPV >= 0.5 (for R) or NPV >= 0.5 (for S)
           - all other mutations: Num_Isolates >= 5 and PPV_LB >= 0.25 (for R) or NPV_LB >= 0.25 (for S)
        2. Assoc w R / S: a Possible mutation in Tier 1 that is also significant in the LRT (BH_LRT_pval <= thresh)
        3. Neutral: not significant in regression, significant in either neutral test, and Num_Isolates >= 5
        4. Uncertain: everything else

    The significance threshold is 0.01 for models that include tier 2 genes or silent variants, and 0.05 otherwise.
    Because the upstream table writes a single Tier value to every row of a model, a tiers=1+2 model contains only
    one unique Tier (2). The tier 2 check therefore looks for any row with Tier >= 2 instead of only counting the
    number of unique tiers, which never equals 2 for such a table.

    Arguments:
        df: results table with the columns mutation, Tier, synonymous, BH_pval, Odds_Ratio, Mut_R, PPV, NPV,
            Num_Isolates, PPV_LB, NPV_LB, BH_LRT_pval, BH_neutral_pval and BH_LRT_neutral_pval
        drug: drug name (not used for grading; kept so that existing calls keep working)
        model_path: path of the model; "withSyn" in the path marks a model with silent variants

    Returns:
        copy of df with a fresh integer index, integer Tier and synonymous columns, and the regression_confidence column
    '''

    col_name = "regression_confidence"
    df = df.reset_index(drop=True)
    df[["Tier", "synonymous"]] = df[["Tier", "synonymous"]].astype(int)

    # create the grading column up front so that the comparisons below work even if no row meets any criterion
    if col_name not in df.columns:
        df[col_name] = pd.Series(np.nan, index=df.index, dtype=object)

    # lower significance threshold for tier 2 genes and silent variants
    includes_tier2 = bool(df["Tier"].nunique() == 2 or (df["Tier"] >= 2).any())
    if includes_tier2 or "withSyn" in model_path:
        thresh = 0.01
    else:
        thresh = 0.05

    is_pncA = df["mutation"].str.contains('pncA')
    significant = df["BH_pval"] <= thresh
    increases_resistance = df["Odds_Ratio"] > 1
    decreases_resistance = df["Odds_Ratio"] < 1

    # relaxed thresholds for pncA
    df.loc[is_pncA & significant & increases_resistance & (df["Mut_R"] >= 2) & (df["PPV"] >= 0.5), col_name] = "Possible Assoc w R"
    df.loc[is_pncA & significant & decreases_resistance & (df["Mut_R"] >= 2) & (df["NPV"] >= 0.5), col_name] = "Possible Assoc w S"

    # "regular" thresholds for non-pncA mutations
    # anything without Num_Isolates >= 5 is Uncertain because it's too rare to make conclusions
    frequent_enough = df["Num_Isolates"] >= 5
    df.loc[(~is_pncA) & significant & increases_resistance & frequent_enough & (df["PPV_LB"] >= 0.25), col_name] = "Possible Assoc w R"
    df.loc[(~is_pncA) & significant & decreases_resistance & frequent_enough & (df["NPV_LB"] >= 0.25), col_name] = "Possible Assoc w S"

    # get additional evidence from the LRT to be classified as Assoc w R/S. Only classify Tier 1 here,
    # so the highest grading a Tier 2 mutation can reach is Possible
    lrt_supported_tier1 = (df["BH_LRT_pval"] <= thresh) & (df["Tier"] == 1)
    df.loc[(df[col_name] == "Possible Assoc w R") & lrt_supported_tier1, col_name] = "Assoc w R"
    df.loc[(df[col_name] == "Possible Assoc w S") & lrt_supported_tier1, col_name] = "Assoc w S"

    # neutral mutations: not significant in regression AND significant in the neutral LRT test or the permutation neutral test AND present at high enough frequency
    significant_neutral = (df["BH_neutral_pval"] <= thresh) | (df["BH_LRT_neutral_pval"] <= thresh)
    df.loc[(df["BH_pval"] > thresh) & significant_neutral & frequent_enough, col_name] = "Neutral"

    # NOTE: an earlier, disabled version of this function also merged in the MIC model results (MIC_coef, BH_MIC_pval)
    # and, for Possible or Tier 2 mutations with a significant MIC coefficient, upgraded them to Assoc w R/S when the
    # sign of the MIC coefficient agreed with the odds ratio and downgraded them to Uncertain when it disagreed,
    # recording the affected mutations in a Use_MIC_Evidence column. That step is not applied here.

    # all other are uncertain
    df[col_name] = df[col_name].fillna("Uncertain")
    return df

In [5]:
def add_single_lineage_annotations_to_model(drug, df, folder, model_path):
    '''
    Flag mutations that are only seen in one lineage, for use in further analyses.

    The model matrix of the given model is restricted to samples whose Coll2014 entry contains no comma and each
    sample is labelled with that entry. A feature whose carriers (matrix value equal to 1) all share a single
    Coll2014 value gets that value in the new single_lineage column of df; every other mutation gets NaN.

    The comparison uses the Coll2014 value exactly as given, i.e. the deepest level that was called. So mutations
    that are all present in L2 are not flagged unless L2 itself was the deepest category identified for all carriers.

    Arguments:
        drug, folder, model_path: locate model_matrix.pkl within analysis_dir
        df: results table with a mutation column. It is modified in place and also returned
    '''

    matrix_file = os.path.join(analysis_dir, drug, folder, model_path, "model_matrix.pkl")
    genotype_matrix = pd.read_pickle(matrix_file)

    # samples with exactly one Coll2014 entry
    unambiguous_lineages = lineages.query("~Coll2014.str.contains(',')")
    has_lineage = genotype_matrix.index.isin(unambiguous_lineages['Sample_ID'].values)

    annotated_matrix = genotype_matrix.loc[has_lineage].merge(
        unambiguous_lineages[["Coll2014", "Sample_ID"]],
        left_index=True,
        right_on="Sample_ID",
    )

    # every column other than the two that came from the lineage table is a model feature
    feature_names = [name for name in annotated_matrix.columns if name not in ["Sample_ID", "Coll2014"]]

    single_lineage_mutations = {}
    for feature in feature_names:
        carrier_rows = annotated_matrix[feature] == 1
        carrier_lineages = np.unique(annotated_matrix.loc[carrier_rows, "Coll2014"].values)

        # keep the feature only if all of its carriers are in the same lineage
        if len(carrier_lineages) == 1:
            single_lineage_mutations[feature] = carrier_lineages[0]

    # mutations that are not keys of the dictionary are mapped to NaN
    df["single_lineage"] = df["mutation"].map(single_lineage_mutations)
    return df

In [6]:
def _model_matrix_columns(drug, read_folder, model_path):
    '''
    Return the column names of a model's model_matrix.pkl, or an empty list if that model was not run
    (no model_matrix.pkl at that location). Any other problem with reading the file is raised.
    '''
    matrix_file = os.path.join(analysis_dir, drug, read_folder, model_path, "model_matrix.pkl")

    if not os.path.isfile(matrix_file):
        return []

    return list(pd.read_pickle(matrix_file).columns)


def export_binary_analyses(drugs_lst, read_folder, write_folder, analyses_lst, pooled_model_variants=False):
    '''
    Write one Excel file per drug to ../results/write_folder, with one sheet per model in analyses_lst.

    For every model that has a model_analysis.csv file, the results table is built, graded and annotated with
    single-lineage flags. Mutations that are covered by another model are then removed so that each mutation
    is reported once per phenotype category:
        - models with synonymous mutations keep only the mutations that are not in the matching noSyn model
        - for mutations (non-LoF, non-inframe) tested in both the pooled and unpooled noSyn models, the
          pooled_model_variants boolean selects which statistics to keep: True keeps the ones from the pooled
          model, False keeps the ones from the unpooled model

    Models that end up with no rows get no sheet. A drug without any sheet gets no Excel file.

    Arguments:
        drugs_lst: names of the drug folders in analysis_dir
        read_folder: analysis type folder to read from. The Phenos column is only added if it is "BINARY"
        write_folder: folder in ../results to write to. Created if it does not exist
        analyses_lst: model paths relative to analysis_dir/drug/read_folder
        pooled_model_variants: see above
    '''

    os.makedirs(f"../results/{write_folder}", exist_ok=True)

    for drug in np.sort(drugs_lst):

        all_analyses = {}

        for model_path in analyses_lst:

            # some may not be there. Usually this is Pretomanid because there are no tier 2 genes or WHO phenotypes
            if not os.path.isfile(os.path.join(analysis_dir, drug, read_folder, model_path, "model_analysis.csv")):
                continue

            tiers_lst = ["1", "2"] if "1+2" in model_path else ["1"]

            # copy so that the columns added below are not written to a slice of another dataframe
            add_analysis = get_unpooled_table_by_tier(drug, tiers_lst, read_folder, model_path).copy()

            add_analysis["pool_type"] = model_path.split("_")[-1]
            add_analysis["synonymous"] = int("withSyn" in model_path)

            add_analysis = add_analysis[add_analysis.columns[~add_analysis.columns.str.contains("|".join(["coef", "Bonferroni"]))]]
            add_analysis = add_significance_category(add_analysis, drug, model_path)

            # add annotation denoting whether a mutation is only present in a single lineage
            add_analysis = add_single_lineage_annotations_to_model(drug, add_analysis, read_folder, model_path)

            # exclude mutations that are already covered in other models
            exclude_mutations = []

            # for models with synonymous mutations, keep only the data for the synonymous ones
            # the data for nonsyn mutations will come from the noSyn models
            if "withSyn" in model_path:
                exclude_mutations += _model_matrix_columns(drug, read_folder, model_path.replace("withSyn", "noSyn"))

            # select which model results to keep for mutations (non-LoF, non-inframe) tested in both the pooled and unpooled models
            # there are no pooled + synonymous models, so this only concerns the noSyn models
            if "noSyn" in model_path:
                if pooled_model_variants and "unpooled" in model_path:
                    # keep the values estimated in the pooled model
                    exclude_mutations += _model_matrix_columns(drug, read_folder, model_path.replace("unpooled", "poolSeparate"))
                elif not pooled_model_variants and "poolSeparate" in model_path:
                    # keep the values estimated in the unpooled model
                    exclude_mutations += _model_matrix_columns(drug, read_folder, model_path.replace("poolSeparate", "unpooled"))

            add_analysis = add_analysis.query("mutation not in @exclude_mutations").copy()

            # the phenotype category is only relevant for the binary analysis
            if read_folder == "BINARY":
                add_analysis["Phenos"] = "ALL" if "phenos=ALL" in model_path else "WHO"

            add_analysis = add_analysis.rename(columns={"confidence": "confidence_V1",
                                                        "Mut_R": "Present_R",
                                                        "NoMut_R": "Absent_R",
                                                        "Mut_S": "Present_S",
                                                        "NoMut_S": "Absent_S"
                                                       })

            if len(add_analysis) > 0:
                sheet_name = model_path.replace("phenos=", "").replace("/", ",").replace("tiers=", "T").replace("dropAF_", "")
                all_analyses[sheet_name] = add_analysis

        # an Excel workbook needs at least one sheet, so there is nothing to write for a drug without results
        if len(all_analyses) == 0:
            print(f"No analyses found for {drug}, no Excel file written")
            continue

        with pd.ExcelWriter(f"../results/{write_folder}/{drug}.xlsx") as file:
            for key, val in all_analyses.items():
                val.to_excel(file, sheet_name=key, index=False)

        print(f"Finished {len(all_analyses)} analyses for {drug}")

In [7]:
# Models to export, as paths relative to analysis_dir/<drug>/BINARY. The path encodes the tiers, the phenotype
# category, whether synonymous mutations are included (withSyn / noSyn) and the pooling type (unpooled / poolSeparate).
# The Tier 1+2 models are currently disabled.
binary_analyses_lst = [
                        ########### Tier 1, WHO phenos ###########
                        "tiers=1/phenos=WHO/dropAF_noSyn_unpooled",
                        "tiers=1/phenos=WHO/dropAF_noSyn_poolSeparate",
                        "tiers=1/phenos=WHO/dropAF_withSyn_unpooled",
                        # ########### Tiers 1 + 2, WHO phenos ###########
                        # "tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled",
                        # "tiers=1+2/phenos=WHO/dropAF_noSyn_poolSeparate",
                        # "tiers=1+2/phenos=WHO/dropAF_withSyn_unpooled",
                        ########### Tier 1, ALL phenos ###########
                        "tiers=1/phenos=ALL/dropAF_noSyn_unpooled",
                        "tiers=1/phenos=ALL/dropAF_noSyn_poolSeparate",
                        "tiers=1/phenos=ALL/dropAF_withSyn_unpooled",
                        # ########### Tiers 1 + 2, ALL phenos ###########
                        # "tiers=1+2/phenos=ALL/dropAF_noSyn_unpooled",
                        # "tiers=1+2/phenos=ALL/dropAF_noSyn_poolSeparate",
                        # "tiers=1+2/phenos=ALL/dropAF_withSyn_unpooled",
                      ]

# Export the same models twice, reading from BINARY both times:
# results/BINARY_POOL keeps the pooled-model statistics for mutations tested in both pooled and unpooled models,
# results/BINARY keeps the unpooled-model statistics for them.
export_binary_analyses(drugs_lst, "BINARY", "BINARY_POOL", binary_analyses_lst, pooled_model_variants=True)
export_binary_analyses(drugs_lst, "BINARY", "BINARY", binary_analyses_lst, pooled_model_variants=False)

Finished 6 analyses for Amikacin
Finished 6 analyses for Bedaquiline
Finished 6 analyses for Capreomycin
Finished 6 analyses for Clofazimine
Finished 6 analyses for Delamanid
Finished 6 analyses for Ethambutol
Finished 6 analyses for Ethionamide
Finished 6 analyses for Isoniazid
Finished 6 analyses for Kanamycin
Finished 6 analyses for Levofloxacin
Finished 5 analyses for Linezolid
Finished 6 analyses for Moxifloxacin
Finished 3 analyses for Pretomanid
Finished 6 analyses for Pyrazinamide
Finished 6 analyses for Rifampicin
Finished 6 analyses for Streptomycin
Finished 6 analyses for Amikacin
Finished 6 analyses for Bedaquiline
Finished 6 analyses for Capreomycin
Finished 6 analyses for Clofazimine
Finished 6 analyses for Delamanid
Finished 6 analyses for Ethambutol
Finished 6 analyses for Ethionamide
Finished 6 analyses for Isoniazid
Finished 6 analyses for Kanamycin
Finished 6 analyses for Levofloxacin
Finished 5 analyses for Linezolid
Finished 6 analyses for Moxifloxacin
Finished 3 a

In [156]:
# drugs_use_mic = []

# for drug in drugs_lst:

#     results = pd.read_excel(f"../results/BINARY/{drug}.xlsx", sheet_name=None)

#     for key, df in results.items():
#         if len(df.query("Use_MIC_Evidence == 1")) > 0:
#             drugs_use_mic.append(drug)
#             break

# print(len(drugs_use_mic))

7


In [157]:
# mut_use_mic = {}

# for drug in drugs_use_mic:

#     mut_use_mic[drug] = []
#     results = pd.read_excel(f"../results/BINARY/{drug}.xlsx", sheet_name=None)

#     for key, df in results.items():
#         pheno_category = df.Phenos.values[0]
#         if len(df.query("Use_MIC_Evidence == 1")) > 0:
#             muts_lst = list(df.query("Use_MIC_Evidence == 1").mutation.values)
#             mut_use_mic[drug] += [f"{pheno_category}_{mut}" for mut in muts_lst]

In [159]:
# pd.DataFrame.from_dict(mut_use_mic, orient='index').T

Unnamed: 0,Pyrazinamide,Amikacin,Ethambutol,Bedaquiline,Rifampicin,Isoniazid,Ethionamide
0,ALL_pncA_p.Glu15*,WHO_eis_p.Met1?,WHO_embB_c.2499G>A,WHO_Rv0678_p.Cys46Arg,ALL_rpoB_p.Val695Leu,WHO_inhA_c.-100C>A,WHO_ethA_p.His281Pro
1,ALL_pncA_c.3G>A,,ALL_ubiA_p.Glu273Asp,WHO_Rv0678_p.Ile67Ser,,ALL_katG_p.Phe167Cys,WHO_ethA_p.Trp256fs
2,,,ALL_embC_c.-900C>T,WHO_mmpL5_p.Thr794Ile,,,WHO_ethA_p.Thr321Pro
3,,,ALL_embB_c.3165C>A,WHO_Rv0678_p.Gln51fs,,,WHO_ethA_p.Leu225fs
4,,,ALL_embB_c.2499G>A,,,,WHO_ethA_p.Tyr211Ser
5,,,,,,,WHO_ethA_p.Asp464fs
6,,,,,,,WHO_mshA_lof
7,,,,,,,ALL_inhA_c.-100C>A
8,,,,,,,ALL_inhA_c.9A>C


In [8]:
def get_all_results_single_drug(drug, excel_dir):
    '''
    Read every sheet of ../results/excel_dir/drug.xlsx and combine the sheets by phenotype category.

    Sheets with "WHO" in their name go to the WHO results, otherwise sheets with "ALL" in their name go to the
    ALL results. The name of any other sheet is printed and the sheet is ignored. Model-specific columns
    (Phenos, pool_type, synonymous, confidence_V1 and the uncorrected p-values) are dropped.

    Whether the ALL results are concatenated depends on whether any ALL sheet was found, not on the name of the drug.

    Returns:
        (WHO_combined, ALL_combined). WHO_combined is a dataframe. ALL_combined is a dataframe, or an empty
        list if the file has no ALL sheets (as is the case for Pretomanid).

    Raises:
        ValueError: the file has no WHO sheets
        AssertionError: for drugs other than Pretomanid and Linezolid, the number of WHO and ALL sheets differ
            or a mutation occurs in more than one sheet of the same phenotype category
    '''

    full_results_excel = pd.read_excel(f"../results/{excel_dir}/{drug}.xlsx", sheet_name=None)
    del_cols = ["Phenos", "pool_type", "synonymous", "confidence_V1", "pval", "neutral_pval", "LRT_pval", "LRT_neutral_pval"]

    WHO_combined = []
    ALL_combined = []

    for name, df in full_results_excel.items():

        df = df[df.columns[~df.columns.isin(del_cols)]]

        if "WHO" in name:
            WHO_combined.append(df)
        elif "ALL" in name:
            ALL_combined.append(df)
        else:
            print(name)

    # nothing can be combined without WHO results, so say which file is the problem
    if len(WHO_combined) == 0:
        raise ValueError(f"No WHO sheets found in ../results/{excel_dir}/{drug}.xlsx")

    # no ALL phenotype for Pretomanid.
    # For one of the WHO models for Linezolid, pooling mutations doesn't make a difference, so the number of models is different between WHO and ALL
    if drug not in ["Pretomanid", "Linezolid"]:
        assert len(WHO_combined) == len(ALL_combined)
        assert len(pd.concat(WHO_combined)) == len(pd.concat(WHO_combined).drop_duplicates("mutation"))
        assert len(pd.concat(ALL_combined)) == len(pd.concat(ALL_combined).drop_duplicates("mutation"))

    WHO_combined = pd.concat(WHO_combined)

    # without ALL sheets there is nothing to concatenate, and the empty list is returned
    if len(ALL_combined) > 0:
        ALL_combined = pd.concat(ALL_combined)

    return WHO_combined, ALL_combined

In [9]:
def clean_WHO_results_write_to_csv(drug, in_folder, out_folder, tiers_lst=[1]):
    '''
    Write the final results table for a drug that only has results for the WHO dataset.

    The WHO results are read from ../results/in_folder/drug.xlsx, the statistics columns get a WHO_ prefix,
    and the variants in solo_results for this drug and these tiers that were not in any regression model are
    appended as Uncertain with Reason = Not Graded. The table is sorted by WHO_Odds_Ratio (descending) and
    written to ../results/out_folder/drug.csv.
    '''

    # statistics columns to keep. These are the columns that get the WHO_ prefix
    cols_lst = ['Odds_Ratio', 'BH_pval', 'BH_neutral_pval', 'BH_LRT_pval', 'BH_LRT_neutral_pval',
                'Num_Isolates', 'Present_R', 'Present_S', 'Absent_S', 'Absent_R',
                'PPV', 'NPV', 'Sens', 'Spec',
                'PPV_LB', 'PPV_UB', 'NPV_LB', 'NPV_UB', 'Sens_LB', 'Sens_UB', 'Spec_LB', 'Spec_UB',
                'regression_confidence', 'single_lineage']

    # the ALL results are not used here
    WHO_results_single_drug, _ = get_all_results_single_drug(drug, in_folder)
    WHO_results_single_drug = WHO_results_single_drug[["mutation", "predicted_effect"] + cols_lst]

    # fix LoF naming
    WHO_results_single_drug = WHO_results_single_drug.assign(
        mutation=WHO_results_single_drug.mutation.str.replace("lof", "LoF"),
        predicted_effect=WHO_results_single_drug.predicted_effect.str.replace("lof", "LoF"),
    )

    # any mutations that were not in any regression model are added back in here as Uncertain with additional info in the Reason column
    missing_mut_df = (
        solo_results
        .query("drug==@drug & tier in @tiers_lst & variant not in @WHO_results_single_drug.mutation.values")
        [['variant', 'effect']]
        .rename(columns={'variant': 'mutation', 'effect': 'predicted_effect'})
        .assign(regression_confidence='Uncertain', Reason='Not Graded')
    )

    who_prefixed_names = {col: f"WHO_{col}" for col in cols_lst}
    save_df = pd.concat([WHO_results_single_drug, missing_mut_df], axis=0)
    save_df = save_df.rename(columns=who_prefixed_names)
    save_df = save_df.sort_values("WHO_Odds_Ratio", ascending=False)

    # there is no ALL dataset to combine with, so the final grading is a copy of the WHO grading.
    # The column is made so that the file has the same final grading column as the files of the other drugs
    save_df["FINAL CONFIDENCE GRADING"] = save_df["WHO_regression_confidence"]

    # rename columns for consistency with SOLO, then save
    output_df = save_df.rename(columns={'WHO_regression_confidence': 'Initial confidence grading WHO dataset'})
    output_df.to_csv(f"../results/{out_folder}/{drug}.csv", index=False)

In [10]:
# def upgrade_components_of_pooled_variants(df):
    
#     '''
#     Use this function if you want to upgrade component variants of LoF and inframe pooling based on the pooled results. Used only to apply the additional grading criterion. 
#     LoFs in katG, pncA, Rv0678, pepQ, ddn, fbiA, fbiB, fbiC, fgd!, Rv2983, gid, ethA, tlyA
#     '''
    
#     reason_col = 'AddGrading'
    
#     # useful to keep this for later
#     df.insert(1, 'Gene', df['mutation'].str.split('_', expand=True)[0].values)
    
#     #  dictionary mapping genes to effects of LoF variants
#     lof_features = ["frameshift", "start_lost", "stop_gained", "feature_ablation"]
#     lof_confidence_dict = dict(zip(df.loc[df['predicted_effect'].str.lower() == 'lof']['Gene'].values, df.loc[df['predicted_effect'].str.lower() == 'lof']['regression_confidence']))

#     # dictionary mapping genes to effects of inframe variants
#     inframe_features = ["inframe_insertion", "inframe_deletion"]
#     inframe_confidence_dict = dict(zip(df.loc[df['predicted_effect'].str.lower() == 'inframe']['Gene'].values, df.loc[df['predicted_effect'].str.lower() == 'inframe']['regression_confidence']))

#     # only make the replacement for variants that are Uncertain (basically don't override any categorizations made by the fully data-driven approach)
#     df.loc[(df['predicted_effect'].isin(lof_features)) & (df['regression_confidence'] == 'Uncertain'), reason_col] = 'LoF component'
#     df.loc[df[reason_col] == 'LoF component', 'regression_confidence_upgrade'] = df.loc[df[reason_col] == 'LoF component']['Gene'].map(lof_confidence_dict)

#     df.loc[(df['predicted_effect'].isin(inframe_features)) & (df['regression_confidence'] == 'Uncertain'), reason_col] = 'inframe component'
#     df.loc[df[reason_col] == 'inframe component', 'regression_confidence_upgrade'] = df.loc[df[reason_col] == 'inframe component']['Gene'].map(inframe_confidence_dict)

#     # if no update was actually made (meaning the confidence of the pooled variant was also Uncertain, then revert reason to NaN so that it is only not NaN for variants that have been updated)
#     df.loc[df['regression_confidence'] == df['regression_confidence_upgrade'], reason_col] = np.nan

#     # fill NaNs with the original value (meaning no upgrade was made because they are NOT Lof or inframe component variants)
#     df['regression_confidence_upgrade'] = df['regression_confidence_upgrade'].fillna(df['regression_confidence'])

#     # then keep just the one column
#     del df['regression_confidence']

#     return df.rename(columns={'regression_confidence_upgrade': 'regression_confidence'})

In [11]:
def _confidence_by_mutation(results_df):
    '''
    Return a dictionary mapping each mutation to its regression_confidence. If a mutation occurs more than once, the first row is used.
    '''
    first_rows = results_df.dropna(subset=["mutation"]).drop_duplicates("mutation")
    return dict(zip(first_rows["mutation"], first_rows["regression_confidence"]))


def _prefix_dataset_columns(results_df, cols_lst, prefix):
    '''
    Keep mutation, predicted_effect and the columns in cols_lst, and add prefix_ to the names of the columns in cols_lst.
    '''
    selected = results_df[["mutation", "predicted_effect"] + cols_lst]
    return selected.rename(columns={col: f"{prefix}_{col}" for col in cols_lst})


def combine_WHO_ALL_results_write_to_csv(drug, in_folder, out_folder, tiers_lst=[1]):
    '''
    Combine the gradings from the WHO and ALL datasets into a final grading and write it to ../results/out_folder/drug.csv.

    The final grading starts from the WHO grading (or the ALL grading if the mutation was not in a WHO model).
    A mutation that was not tested in a dataset is treated as Ungraded in that dataset. Then:
        - Assoc w R - Interim: ALL = Assoc w R and WHO is Uncertain, Neutral, Ungraded or Possible Assoc w R
        - Assoc w S - Interim: ALL = Assoc w S and WHO is Uncertain, Neutral, Ungraded or Possible Assoc w S
        - Neutral: ALL = Neutral and WHO = Ungraded
        - Uncertain:
            - ALL = Uncertain, regardless of WHO
            - WHO and ALL are associated (Possible or not) with opposite phenotypes
            - ALL is Possible and WHO is Uncertain, Neutral, Ungraded or Possible
            - ALL = Neutral and WHO = Uncertain
            - any grading that is still Possible after the changes above

    Variants in solo_results for this drug and these tiers that were not in any regression model are appended as
    Uncertain with Reason = Not Graded. The table is sorted by WHO_Odds_Ratio (descending) before writing.

    Raises:
        AssertionError: a mutation was assigned to more than one of the up/downgrade lists, or a mutation
            occurs more than once in the combined table
    '''

    WHO_results_single_drug, ALL_results_single_drug = get_all_results_single_drug(drug, in_folder)

    # one lookup per dataset instead of searching the dataframes again for every mutation
    WHO_conf_dict = _confidence_by_mutation(WHO_results_single_drug)
    ALL_conf_dict = _confidence_by_mutation(ALL_results_single_drug)

    all_mutations = set(WHO_conf_dict).union(ALL_conf_dict)

    R_interim_lst = []
    S_interim_lst = []
    uncertain_lst = []
    neutral_lst = []

    for mutation in all_mutations:

        # if a mutation was not tested in a model, assign it to Ungraded for the purposes of combining results. It will still be NaN in the other columns of the dataframe though
        ALL_conf = ALL_conf_dict.get(mutation, "Ungraded")
        WHO_conf = WHO_conf_dict.get(mutation, "Ungraded")

        # regardless of what the WHO grading is, if ALL is uncertain, keep uncertain because ALL is a bigger, more representative dataset
        if ALL_conf == "Uncertain":
            uncertain_lst.append(mutation)

        # if the two phenotypic categories disagree in the sign of the OR (and have significant ORs), make uncertain
        # this includes both the Possible and strict categories
        if "Assoc w R" in WHO_conf and "Assoc w S" in ALL_conf:
            uncertain_lst.append(mutation)

        if "Assoc w R" in ALL_conf and "Assoc w S" in WHO_conf:
            uncertain_lst.append(mutation)

        # because ALL is a bigger, more representative dataset, make interim if WHO = uncertain/neutral and ALL = assoc
        # if ALL is in the possible category, however, downgrade to uncertain
        if WHO_conf in ["Uncertain", "Neutral", "Ungraded"]:
            if ALL_conf == 'Assoc w R':
                R_interim_lst.append(mutation)
            elif ALL_conf == 'Assoc w S':
                S_interim_lst.append(mutation)
            elif "Possible" in ALL_conf:
                uncertain_lst.append(mutation)

        if ALL_conf == "Neutral":
            # Neutral if absent from WHO analysis
            if WHO_conf == 'Ungraded':
                neutral_lst.append(mutation)

            # if WHO is Uncertain, make overall Uncertain
            elif WHO_conf == "Uncertain":
                uncertain_lst.append(mutation)

        if WHO_conf == "Possible Assoc w R" and ALL_conf == "Assoc w R":
            R_interim_lst.append(mutation)

        if WHO_conf == "Possible Assoc w S" and ALL_conf == "Assoc w S":
            S_interim_lst.append(mutation)

        if "Possible" in WHO_conf and "Possible" in ALL_conf:
            uncertain_lst.append(mutation)

    # check that the 4 up/downgrade lists are mutually exclusive (otherwise would indicate a bug)
    change_lists = [("R interim", R_interim_lst), ("S interim", S_interim_lst), ("uncertain", uncertain_lst), ("neutral", neutral_lst)]

    for i, (first_name, first_lst) in enumerate(change_lists):
        for second_name, second_lst in change_lists[i+1:]:
            overlap = set(first_lst).intersection(second_lst)
            assert len(overlap) == 0, f"{drug}: mutations in both the {first_name} and {second_name} lists: {sorted(overlap)}"

    # all columns to keep in each of the WHO and ALL dataset dataframes
    cols_lst = ['Odds_Ratio', 'BH_pval', 'BH_neutral_pval', 'BH_LRT_pval', 'BH_LRT_neutral_pval',
                'Num_Isolates', 'Present_R', 'Present_S', 'Absent_S', 'Absent_R', 'PPV',
                'NPV', 'Sens', 'Spec', 'PPV_LB', 'PPV_UB', 'NPV_LB',
                'NPV_UB', 'Sens_LB', 'Sens_UB', 'Spec_LB', 'Spec_UB', 'regression_confidence', 'single_lineage']

    WHO_final = _prefix_dataset_columns(WHO_results_single_drug, cols_lst, "WHO")
    ALL_final = _prefix_dataset_columns(ALL_results_single_drug, cols_lst, "ALL")

    final_df = WHO_final.merge(ALL_final, on=["mutation", "predicted_effect"], how="outer").drop_duplicates().reset_index(drop=True)

    # start with WHO confidences first, then make up- or downgrades depending on the ALL results
    final_df["regression_confidence"] = final_df["WHO_regression_confidence"].fillna(final_df["ALL_regression_confidence"])

    # upgrades to interim
    final_df.loc[final_df["mutation"].isin(R_interim_lst), "regression_confidence"] = "Assoc w R - Interim"
    final_df.loc[final_df["mutation"].isin(S_interim_lst), "regression_confidence"] = "Assoc w S - Interim"
    final_df.loc[final_df["mutation"].isin(neutral_lst), "regression_confidence"] = "Neutral"

    # downgrades to uncertain and all remaining possibles downgraded to uncertain
    final_df.loc[(final_df["mutation"].isin(uncertain_lst)) | (final_df["regression_confidence"].str.contains("Possible")), "regression_confidence"] = "Uncertain"

    # rename columns for consistency with SOLO
    final_df = final_df.rename(columns={'WHO_regression_confidence': 'Initial confidence grading WHO dataset',
                                        'ALL_regression_confidence': 'Initial confidence grading ALL dataset',
                                        'regression_confidence': 'FINAL CONFIDENCE GRADING'
                                       })

    # check that no mutations have been duplicated
    assert len(final_df.mutation.unique()) == len(final_df), f"{drug}: mutations are duplicated in the combined results"

    # fix LoF naming
    final_df["mutation"] = final_df.mutation.str.replace("lof", "LoF")
    final_df["predicted_effect"] = final_df.predicted_effect.str.replace("lof", "LoF")

    # reorder columns so that the MIC columns are at the end. This changes nothing if there are no MIC columns
    final_df = final_df[np.concatenate([final_df.columns[~final_df.columns.str.contains('MIC')],  final_df.columns[final_df.columns.str.contains('MIC')]])]

    # any mutations that were not in any regression model are added back in here as Uncertain with additional info in the Reason column
    missing_mut_df = solo_results.query("drug==@drug & tier in @tiers_lst & variant not in @final_df.mutation.values")[['variant', 'effect']].rename(columns={'variant': 'mutation', 'effect': 'predicted_effect'})
    missing_mut_df['FINAL CONFIDENCE GRADING'] = 'Uncertain'
    missing_mut_df['Reason'] = 'Not Graded'

    pd.concat([final_df, missing_mut_df], axis=0).sort_values("WHO_Odds_Ratio", ascending=False).to_csv(f"../results/{out_folder}/{drug}.csv", index=False)

In [12]:
def write_results_for_all_drugs(in_folder, out_folder, tiers_lst=[1]):
    '''
    Write one grading CSV per drug into ../results/{out_folder}, creating that folder if needed.

    Arguments:
        in_folder: name of the model results folder, passed through unchanged to the per-drug writer functions
        out_folder: name of the folder under ../results that the per-drug writers save to
        tiers_lst: list of tiers to include, passed through to the per-drug writers (default [1])

    Every drug in drugs_lst is processed in alphabetical order and its name is printed once it has been written.
    '''

    # makedirs with exist_ok avoids the check-then-create race of isdir + mkdir
    os.makedirs(f"../results/{out_folder}", exist_ok=True)

    for drug in np.sort(drugs_lst):

        # Pretomanid goes through clean_WHO_results_write_to_csv, every other drug through combine_WHO_ALL_results_write_to_csv
        # NOTE(review): presumably because Pretomanid only has WHO dataset results -- confirm
        if drug == "Pretomanid":
            write_single_drug = clean_WHO_results_write_to_csv
        else:
            write_single_drug = combine_WHO_ALL_results_write_to_csv

        # forward the caller's tiers instead of a hard-coded [1]; pass a copy so the shared default list can never be mutated downstream
        write_single_drug(drug, in_folder, out_folder, tiers_lst=list(tiers_lst))

        print(drug)

# Write Results for Pooled and Unpooled Models for All Drugs

In [13]:
# results from the BINARY models are written to ../results/UNPOOLED
write_results_for_all_drugs(in_folder="BINARY", out_folder="UNPOOLED", tiers_lst=[1])

Amikacin
Bedaquiline
Capreomycin
Clofazimine
Delamanid
Ethambutol
Ethionamide
Isoniazid
Kanamycin
Levofloxacin
Linezolid
Moxifloxacin
Pretomanid
Pyrazinamide
Rifampicin
Streptomycin


In [14]:
# results from the BINARY_POOL models are written to ../results/POOLED
write_results_for_all_drugs(in_folder="BINARY_POOL", out_folder="POOLED", tiers_lst=[1])

Amikacin
Bedaquiline
Capreomycin
Clofazimine
Delamanid
Ethambutol
Ethionamide
Isoniazid
Kanamycin
Levofloxacin
Linezolid
Moxifloxacin
Pretomanid
Pyrazinamide
Rifampicin
Streptomycin


# Compare Gradings between Pooled and Unpooled Models

In [15]:
def _resolve_grading_pair(unpooled_grading, pooled_grading):
    '''
    Resolve one pair of differing gradings (unpooled model, pooled model) into a single grading.

    Rules, applied in order:
        1. One grading is exactly 'Uncertain':
            a. the other is exactly 'Neutral' or contains 'Interim' -> 'Uncertain'
            b. otherwise -> the other grading with ' - Interim' appended
        2. Neither grading is 'Uncertain' and exactly one contains 'Interim' -> that interim grading

    Returns None when no rule applies (both or neither grading is interim and neither is 'Uncertain') or when a grading is missing.
    '''
    gradings = [unpooled_grading, pooled_grading]

    # a missing grading (NaN) cannot be resolved, and substring checks on it would raise a TypeError
    if not all(isinstance(grading, str) for grading in gradings):
        return None

    unpooled_is_interim = 'Interim' in unpooled_grading
    pooled_is_interim = 'Interim' in pooled_grading

    if 'Uncertain' in gradings:

        # Uncertain + Neutral (regardless of order) = Uncertain, and Uncertain + interim = Uncertain
        if 'Neutral' in gradings or unpooled_is_interim or pooled_is_interim:
            return 'Uncertain'

        # the other grading is not interim, so downgrade it to interim
        other_grading = pooled_grading if unpooled_grading == 'Uncertain' else unpooled_grading
        return other_grading + ' - Interim'

    # neither is Uncertain: if exactly one is interim, keep the interim grading
    # NOTE(review): this does not check that both gradings point in the same direction (R vs. S). The analysis assumes
    # such conflicts were already set to Uncertain within each model -- confirm
    if unpooled_is_interim != pooled_is_interim:
        return unpooled_grading if unpooled_is_interim else pooled_grading

    return None


def resolve_pooled_unpooled_model_results(drug, include_silent=False):
    '''
    Compare the final confidence gradings from the pooled and unpooled models for a single drug and resolve the variants whose gradings differ.

    Reads ../results/POOLED/{drug}.csv and ../results/UNPOOLED/{drug}.csv and uses the notebook-level solo_results dataframe to add the SOLO
    grading for comparison. Nothing is written to disk by this function.

    Arguments:
        drug: drug name, used for the CSV file names and to select rows of solo_results
        include_silent: if False (default), variants whose predicted_effect is synonymous_variant, stop_retained_variant, or
                        initiator_codon_variant are dropped from both tables before comparing

    Returns:
        A dataframe with one row per variant whose gradings differ, with columns mutation, FINAL CONFIDENCE GRADING_unpooled,
        FINAL CONFIDENCE GRADING_pooled, RESOLVED FINAL GRADING, and SOLO_FINAL_GRADING. Returns None if no variant differs.
        RESOLVED FINAL GRADING is left empty for pairs that match none of the rules in _resolve_grading_pair.

    Prints the drug name if any difference is something other than a Neutral/Uncertain disagreement, and prints a notice if any
    variant could not be resolved.
    '''

    final_col = 'RESOLVED FINAL GRADING'
    unpooled_col = 'FINAL CONFIDENCE GRADING_unpooled'
    pooled_col = 'FINAL CONFIDENCE GRADING_pooled'
    silent_effects = ['synonymous_variant', 'stop_retained_variant', 'initiator_codon_variant']

    pooled_results = pd.read_csv(f"../results/POOLED/{drug}.csv")
    unpooled_results = pd.read_csv(f"../results/UNPOOLED/{drug}.csv")

    # there should not be any differences in the mutations in the two sets. These lists exclude all pooled mutations and all components of pooled variants
    # i.e. there are no LoF variants and also no frameshift variants because they are components of LoF variants
    assert len(set(pooled_results.mutation).symmetric_difference(set(unpooled_results.mutation))) == 0

    if not include_silent:
        pooled_results = pooled_results.query("predicted_effect not in @silent_effects")
        unpooled_results = unpooled_results.query("predicted_effect not in @silent_effects")

    combined_df = unpooled_results[["mutation", "FINAL CONFIDENCE GRADING"]].merge(pooled_results[["mutation", "FINAL CONFIDENCE GRADING"]], on="mutation", suffixes=["_unpooled", "_pooled"], how="outer")

    # most of the variants have the same grading in pooled and unpooled models, so take that one
    combined_df.loc[combined_df[unpooled_col] == combined_df[pooled_col], final_col] = combined_df[unpooled_col]

    # add solo gradings for comparison
    solo_gradings = solo_results.query("drug==@drug")[["variant", "FINAL CONFIDENCE GRADING"]].rename(columns={"variant": "mutation", "FINAL CONFIDENCE GRADING": "SOLO_FINAL_GRADING"})
    combined_df = combined_df.merge(solo_gradings, on="mutation", how="left")

    # keep only the variants whose gradings differ: these are exactly the rows without a resolved grading yet
    diff_df = combined_df.loc[pd.isnull(combined_df[final_col])].reset_index(drop=True)

    # print the drug name if there are differences for some variants that are NOT Uncertain/Neutral discrepancies
    # na=False makes a missing grading count as a real discrepancy instead of producing NaN in the mask
    unpooled_benign = diff_df[unpooled_col].str.contains('Neutral|Uncertain', na=False)
    pooled_benign = diff_df[pooled_col].str.contains('Neutral|Uncertain', na=False)

    if (~(unpooled_benign & pooled_benign)).sum() > 0:
        print(drug)

    # resolve differences
    # can assume that there are no cases of one being assoc w R and the other being assoc w S because already assigned those to uncertain within each model above
    # highly unlikely to have passed that filter
    for i, unpooled_grading, pooled_grading in zip(diff_df.index, diff_df[unpooled_col], diff_df[pooled_col]):

        resolved_grading = _resolve_grading_pair(unpooled_grading, pooled_grading)

        # pairs that match no rule keep an empty resolved grading
        if resolved_grading is not None:
            diff_df.loc[i, final_col] = resolved_grading

    # make unresolved variants visible instead of letting them pass silently with an empty resolved grading
    num_unresolved = pd.isnull(diff_df[final_col]).sum()
    if num_unresolved > 0:
        print(f"{drug}: {num_unresolved} variant(s) with differing gradings could not be resolved")

    if len(diff_df) > 0:
        return diff_df
    else:
        return None

In [16]:
# Map each drug to the dataframe of variants whose gradings differ between pooled and unpooled models (None if there are none)
pooled_unpooled_resolved = {}

# drugs_lst comes from os.listdir, whose order is arbitrary, so sort it like the other per-drug loops
# to keep the printed drug names and the dictionary order reproducible
for drug in np.sort(drugs_lst):
    pooled_unpooled_resolved[drug] = resolve_pooled_unpooled_model_results(drug, include_silent=False)

Pyrazinamide
Ethambutol
Kanamycin
Bedaquiline
Isoniazid
Ethionamide


In [94]:
# Ad hoc check: load the unpooled Bedaquiline results to inspect their gradings
df = pd.read_csv("../results/UNPOOLED/Bedaquiline.csv")

In [96]:
# list the distinct final gradings present in the loaded results
df['FINAL CONFIDENCE GRADING'].unique()

array(['Assoc w R', 'Uncertain', 'Assoc w R - Interim', 'Neutral',
       'Assoc w S - Interim'], dtype=object)

In [98]:
# for the variants graded 'Assoc w S - Interim', show the initial gradings from the WHO and ALL datasets
df.loc[
    df['FINAL CONFIDENCE GRADING'] == 'Assoc w S - Interim',
    ['Initial confidence grading WHO dataset', 'Initial confidence grading ALL dataset']
]

Unnamed: 0,Initial confidence grading WHO dataset,Initial confidence grading ALL dataset
241,Uncertain,Assoc w S
258,Uncertain,Assoc w S


In [17]:
drug = 'Pyrazinamide'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,pncA_p.Val93Ala,Assoc w R,Uncertain,Assoc w R - Interim,2) Assoc w R - Interim
1,pncA_p.Val155Ala,Assoc w R,Uncertain,Assoc w R - Interim,2) Assoc w R - Interim


In [18]:
drug = 'Ethambutol'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,ubiA_p.Ala237Val,Uncertain,Assoc w R - Interim,Uncertain,3) Uncertain significance


In [19]:
drug = 'Kanamycin'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,whiB7_p.Leu42Pro,Assoc w S - Interim,Uncertain,Uncertain,3) Uncertain significance


In [20]:
drug = 'Bedaquiline'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,Rv0678_p.Cys46Arg,Assoc w R - Interim,Assoc w R,Assoc w R - Interim,2) Assoc w R - Interim


In [21]:
drug = 'Isoniazid'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,ahpC_c.-48G>A,Assoc w R,Assoc w R - Interim,Assoc w R - Interim,3) Uncertain significance
1,ahpC_c.-54C>T,Assoc w R,Assoc w R - Interim,Assoc w R - Interim,3) Uncertain significance
2,katG_p.Gln525Pro,Assoc w R,Assoc w R - Interim,Assoc w R - Interim,3) Uncertain significance
5,katG_p.Ser140Asn,Uncertain,Assoc w S,Assoc w S - Interim,3) Uncertain significance


In [22]:
drug = 'Ethionamide'

# show the actual discrepancies that should be resolved: variants where at least one of the pooled and unpooled gradings
# is something other than Neutral/Uncertain
pooled_unpooled_resolved[drug].loc[
    (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_unpooled'].str.contains('Neutral|Uncertain'))
    | (~pooled_unpooled_resolved[drug]['FINAL CONFIDENCE GRADING_pooled'].str.contains('Neutral|Uncertain'))
]

Unnamed: 0,mutation,FINAL CONFIDENCE GRADING_unpooled,FINAL CONFIDENCE GRADING_pooled,RESOLVED FINAL GRADING,SOLO_FINAL_GRADING
0,inhA_c.-770T>C,Assoc w R,Assoc w R - Interim,Assoc w R - Interim,2) Assoc w R - Interim
1,inhA_c.-770T>A,Assoc w R,Assoc w R - Interim,Assoc w R - Interim,2) Assoc w R - Interim
2,inhA_c.-796C>T,Uncertain,Assoc w R,Assoc w R - Interim,3) Uncertain significance
3,ethA_p.Cys403Trp,Assoc w R - Interim,Uncertain,Uncertain,3) Uncertain significance


# Combine results for all drugs into a single grading file

In [23]:
def write_final_results_dataframe_single_drug(drug, pooled_unpooled_resolved):
    '''
    Combine the unpooled and pooled gradings for a single drug and save the result to ../results/FINAL/{drug}.csv.

    Arguments:
        drug: drug name, used for the CSV file names and to select rows of the notebook-level solo_results dataframe
        pooled_unpooled_resolved: dictionary mapping each drug to the dataframe returned by resolve_pooled_unpooled_model_results
                                  (columns mutation, FINAL CONFIDENCE GRADING_pooled, RESOLVED FINAL GRADING, ...) or to None if
                                  no gradings differ for that drug

    The saved table is the unpooled results table with these columns added or updated:
        UNPOOLED CONFIDENCE GRADING: the FINAL CONFIDENCE GRADING column of the unpooled results, renamed
        POOLED CONFIDENCE GRADING: the grading of the same mutation in the pooled results
        Reason: set to 'Pooled Unpooled Different' for every mutation listed in the resolved dataframe
        FINAL CONFIDENCE GRADING: the resolved grading where there is one, otherwise the unpooled grading
        SOLO FINAL CONFIDENCE GRADING: the SOLO grading of the mutation, for easy comparison
    '''
    grading_col = 'FINAL CONFIDENCE GRADING'

    unpooled_results = pd.read_csv(f"../results/UNPOOLED/{drug}.csv").rename(columns={grading_col: 'UNPOOLED CONFIDENCE GRADING'})
    pooled_results = pd.read_csv(f"../results/POOLED/{drug}.csv")

    # take the pooled grading from the pooled results themselves. The resolved dataframe may have been built without silent
    # variants, so a mutation missing from it did not necessarily get the same grading from both models
    pooled_result_dict = dict(zip(pooled_results['mutation'], pooled_results[grading_col]))
    unpooled_results['POOLED CONFIDENCE GRADING'] = unpooled_results['mutation'].map(pooled_result_dict)

    # fall back to the unpooled grading only for mutations without a grading in the pooled results
    unpooled_results['POOLED CONFIDENCE GRADING'] = unpooled_results['POOLED CONFIDENCE GRADING'].fillna(unpooled_results['UNPOOLED CONFIDENCE GRADING'])

    resolved_df = pooled_unpooled_resolved[drug]

    if resolved_df is not None:

        update_dict = dict(zip(resolved_df['mutation'], resolved_df['RESOLVED FINAL GRADING']))

        # flag the mutations whose gradings were compared and found to differ
        unpooled_results.loc[unpooled_results['mutation'].isin(resolved_df['mutation']), 'Reason'] = 'Pooled Unpooled Different'

        # mutations without a resolved grading (not in the dictionary, or left unresolved) keep the unpooled grading
        unpooled_results[grading_col] = unpooled_results['mutation'].map(update_dict).fillna(unpooled_results['UNPOOLED CONFIDENCE GRADING'])

    # no differences, so just copy the unpooled grading
    else:
        unpooled_results[grading_col] = unpooled_results['UNPOOLED CONFIDENCE GRADING']

    # makedirs with exist_ok avoids the check-then-create race of isdir + mkdir
    os.makedirs("../results/FINAL", exist_ok=True)

    # add SOLO results for easy comparison
    solo_gradings = solo_results.query("drug==@drug")[['variant', grading_col]].rename(columns={'variant': 'mutation', grading_col: 'SOLO FINAL CONFIDENCE GRADING'})
    unpooled_results.merge(solo_gradings, on='mutation', how='left').to_csv(f"../results/FINAL/{drug}.csv", index=False)

In [24]:
# write the combined grading file of every drug in alphabetical order, printing each drug name once its file is written
for drug in np.sort(drugs_lst):
    write_final_results_dataframe_single_drug(drug=drug, pooled_unpooled_resolved=pooled_unpooled_resolved)
    print(drug)

Amikacin
Bedaquiline
Capreomycin
Clofazimine
Delamanid
Ethambutol
Ethionamide
Isoniazid
Kanamycin
Levofloxacin
Linezolid
Moxifloxacin
Pretomanid
Pyrazinamide
Rifampicin
Streptomycin


In [295]:
# stack the FINAL grading tables of all drugs into a single dataframe, with a Drug column recording the source drug of each row
results_all_drugs = pd.concat(
    [pd.read_csv(f"../results/FINAL/{drug}.csv").assign(Drug=drug) for drug in drugs_lst],
    axis=0
)

# compare lengths. Regression includes inframe mutations
print(results_all_drugs.shape, solo_results.shape)

(21799, 55) (21589, 114)
