In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, sys, yaml, subprocess, itertools, sparse

# Reference tables used by the cells below. Paths are relative to the notebook's working directory.
who_variants_combined = pd.read_csv("who_confidence_2021.csv")
drug_gene_mapping = pd.read_csv("../data/drug_gene_mapping.csv")
samples_summary = pd.read_csv("../data/samples_summary.csv")

# Coll2014 SNP scheme: remove the literal text "lineage" from the values of the "#lineage" column,
# then rename that column to "Lineage".
snp_scheme = pd.read_csv("../data/coll2014_SNP_scheme.tsv", sep="\t")
snp_scheme["#lineage"] = snp_scheme["#lineage"].str.replace("lineage", "")
snp_scheme.rename(columns={"#lineage": "Lineage"}, inplace=True)

lineages = pd.read_csv("../data/combined_lineages_samples.csv", low_memory=False)

# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}
solo_results = pd.read_excel("../data/SOLO primary_STATA_ver18Feb2023.xlsx", sheet_name=None)

# If the workbook has exactly one sheet, unwrap it into a plain DataFrame.
# NOTE(review): with more than one sheet, solo_results stays a dict and the .rename(columns=...) call below will fail.
if len(solo_results) == 1:
    solo_results = solo_results[list(solo_results.keys())[0]]

# use the same column name ("mutation") as the model result tables so they can be merged
solo_results = solo_results.rename(columns={"variant":"mutation"})

# NOTE(review): this silences every warning for the rest of the session, including pandas deprecation / copy warnings
import warnings
warnings.filterwarnings(action='ignore')

# utils files are in a separate folder
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "utils"))
# NOTE(review): star imports hide which names come from these modules (e.g. add_pval_corrections is used below but not defined in this notebook)
from stats_utils import *
from data_utils import *

# CHANGE ANALYSIS DIR BEFORE RUNNING THE NOTEBOOK!
# Root directory of the model outputs; the functions below read <analysis_dir>/<drug>/<folder>/<model path>/...
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

Any mutation with Num_Isolates < 5 should be graded Uncertain.

Tier 2 can only be in the highest categories if there is MIC evidence supporting it

Write an email to everybody about the results for the new drugs: BDQ, LZD, PTM, DLM, CFZ

Associations are added to the list only if they are found in both the WHO and ALL datasets.

In [3]:
# Drug names are the sub-directories of analysis_dir (model outputs are read from <analysis_dir>/<drug>/...).
# Only directories are kept so that stray files in analysis_dir (logs, hidden files, ...) are not treated as drugs,
# and the list is sorted so the order does not depend on the file system.
drugs_lst = sorted(
    entry for entry in os.listdir(analysis_dir)
    if os.path.isdir(os.path.join(analysis_dir, entry))
)

# Full drug name -> three-letter abbreviation
drug_abbr_dict = {"Delamanid": "DLM",
                  "Bedaquiline": "BDQ",
                  "Clofazimine": "CFZ",
                  "Ethionamide": "ETH",
                  "Linezolid": "LZD",
                  "Moxifloxacin": "MXF",
                  "Capreomycin": "CAP",
                  "Amikacin": "AMI",
                  "Pretomanid": "PTM",
                  "Pyrazinamide": "PZA",
                  "Kanamycin": "KAN",
                  "Levofloxacin": "LEV",
                  "Streptomycin": "STM",
                  "Ethambutol": "EMB",
                  "Isoniazid": "INH",
                  "Rifampicin": "RIF"
                 }

# Write Final Dataframes for the Binary Analysis to an Excel File

Each drug will have a separate Excel file. Each file will have one sheet per model: up to 12 sheets (the models listed in `binary_analyses_lst`), fewer if some models were not run for a drug.

For INH, it is surprising that hadA variants are found with high associations. They might be homoplastic!
For PZA, it is surprising that clpC1_c.2302T>C co-occurs with pncA_p.His57Asp.

In [4]:
def get_unpooled_table_by_tier(drug, tiers_lst, folder, model_prefix):
    '''
    Build the per-mutation results table for a single model by joining its regression results with its LRT results.

    Arguments:

        drug = drug name (sub-directory of analysis_dir)
        tiers_lst = list of gene tiers in the model; the LAST element is written to the "Tier" column of every row
        folder = analysis folder under the drug directory (e.g. "BINARY")
        model_prefix = model path under that folder

    Returns a dataframe with one row per mutation that is present in both result files, restricted to a fixed set
    of columns in a fixed order. A KeyError is raised if any of those columns is missing.
    '''

    model_dir = os.path.join(analysis_dir, drug, folder, model_prefix)

    ################## 1. READ IN RIDGE REGRESSION RESULTS ##################
    # rows whose mutation name contains "PC" are removed
    regression_results = pd.read_csv(os.path.join(model_dir, "model_analysis.csv")).query("~mutation.str.contains('PC')")

    ################## 2. READ IN LRT RESULTS ##################
    lrt_results = pd.read_csv(os.path.join(model_dir, "LRT_results.csv"))

    # the first row is the FULL model, whose p-values are NaN, so it is skipped. The remaining rows can then be inner-merged.
    # add_pval_corrections comes from the utils modules; presumably it adds the BH_* columns selected below (TODO confirm)
    lrt_results = add_pval_corrections(lrt_results.iloc[1:, ])

    ################## 3. COMBINE INTO A SINGLE DATAFRAME ##################
    # Bonferroni and coefficient columns of the regression results are not carried over
    excluded_cols = regression_results.columns.str.contains("|".join(["Bonferroni", "coef"]))
    lrt_cols = ["mutation", "LRT_pval", "BH_LRT_pval", "LRT_neutral_pval", "BH_LRT_neutral_pval"]

    combined_results = regression_results[regression_results.columns[~excluded_cols]].merge(
        lrt_results[lrt_cols], on="mutation", how="inner"
    )
    combined_results["Tier"] = tiers_lst[-1]

    # columns to return, in the desired order: annotations, regression and LRT p-values, then univariate statistics
    annotation_cols = ['mutation', 'Tier', 'predicted_effect', 'position', 'confidence', 'Odds_Ratio']
    pval_cols = ['pval', 'BH_pval', 'neutral_pval', 'BH_neutral_pval',
                 'LRT_pval', 'BH_LRT_pval', 'LRT_neutral_pval', 'BH_LRT_neutral_pval']
    count_cols = ['Num_Isolates', "Mut_R", "Mut_S", "NoMut_S", "NoMut_R"]
    stat_cols = ['PPV', 'NPV', 'Sens', 'Spec', 'LR+', 'LR-']
    bound_cols = ['PPV_LB', 'PPV_UB', 'NPV_LB', 'NPV_UB', 'Sens_LB', 'Sens_UB',
                  'Spec_LB', 'Spec_UB', 'LR+_LB', 'LR+_UB', 'LR-_LB', 'LR-_UB']

    keep_cols = annotation_cols + pval_cols + count_cols + stat_cols + bound_cols

    return combined_results[keep_cols]

In [5]:
def add_significance_category(df, drug, model_path):
    '''
    Add a "regression_confidence" grading to every mutation in the results table of a single model, plus the
    MIC evidence columns ("MIC_coef", "BH_MIC_pval") used to refine it.

    Gradings are assigned from the fewest requirements to the most:

        1. "Possible Assoc w R" / "Possible Assoc w S": BH_pval < thresh, Num_Isolates >= 5, and
           (Odds_Ratio > 1 and PPV_LB >= 0.25) or (Odds_Ratio < 1 and NPV_LB >= 0.25).
        2. "Assoc w R - strict" / "Assoc w S - strict": a Possible mutation in a Tier 1 model with BH_LRT_pval < thresh.
        3. "Neutral": BH_neutral_pval < thresh, BH_LRT_neutral_pval < thresh and Num_Isolates >= 5.
           This step overrides gradings from steps 1 and 2.
        4. MIC evidence, applied ONLY to mutations still graded "Possible ..." and only when the MIC coefficient
           is significant (BH_MIC_pval < thresh): a coefficient that agrees in direction upgrades the mutation to
           the strict category, one that disagrees downgrades it to "Uncertain".
        5. Everything left ungraded is "Uncertain".

    thresh is 0.01 if the model is a Tier 2 model or a synonymous model (judged from the first row), otherwise 0.05.

    Arguments:

        df = results table of one model. Needs the columns mutation, Tier, synonymous, Odds_Ratio, BH_pval,
             BH_LRT_pval, BH_neutral_pval, BH_LRT_neutral_pval, Num_Isolates, PPV_LB, NPV_LB
        drug = drug name (sub-directory of analysis_dir)
        model_path = path of the model. The first and last parts of the path are used to locate the matching MIC model

    Returns a new dataframe with a fresh integer index; the input dataframe is not modified.
    Raises FileNotFoundError if the matching MIC model_analysis.csv does not exist.
    '''

    col_name = "regression_confidence"
    df = df.reset_index(drop=True)
    df[["Tier", "synonymous"]] = df[["Tier", "synonymous"]].astype(int)

    # lower significance threshold for tier 2 genes and synonymous mutations.
    # An empty table has no first row to inspect, so it falls back to the default threshold instead of raising.
    if len(df) > 0 and (df["Tier"].values[0] == 2 or df["synonymous"].values[0] == 1):
        thresh = 0.01
    else:
        thresh = 0.05

    # start with every mutation ungraded (missing); graded rows are filled in below
    if col_name not in df.columns:
        df[col_name] = pd.Series([None] * len(df), index=df.index, dtype=object)

    # anything without Num_Isolates >= 5 is Uncertain because it's too rare to make conclusions
    enough_isolates = df["Num_Isolates"] >= 5
    regression_significant = df["BH_pval"] < thresh

    df.loc[regression_significant & (df["Odds_Ratio"] > 1) & enough_isolates & (df["PPV_LB"] >= 0.25), col_name] = "Possible Assoc w R"
    df.loc[regression_significant & (df["Odds_Ratio"] < 1) & enough_isolates & (df["NPV_LB"] >= 0.25), col_name] = "Possible Assoc w S"

    # get additional evidence from the LRT to be classified as Assoc w R/S - strict. Only classify Tier 1 here
    # Tier 2 mutations can only be upgraded to the highest categories if there is MIC evidence supporting it
    lrt_significant_tier1 = (df["BH_LRT_pval"] < thresh) & (df["Tier"] == 1)
    df.loc[(df[col_name] == "Possible Assoc w R") & lrt_significant_tier1, col_name] = "Assoc w R - strict"
    df.loc[(df[col_name] == "Possible Assoc w S") & lrt_significant_tier1, col_name] = "Assoc w S - strict"

    # neutral mutations: significant in both neutral tests (regression and LRT) AND present at high enough frequency
    df.loc[(df["BH_neutral_pval"] < thresh) & (df["BH_LRT_neutral_pval"] < thresh) & enough_isolates, col_name] = "Neutral"

    # phenos = ALL/WHO is the middle part of the path, so split and keep the first and last parts
    path_parts = model_path.split("/")
    MIC_model_path = "/".join([path_parts[0], path_parts[-1]])
    MIC_model_analysis = pd.read_csv(os.path.join(analysis_dir, drug, "MIC", MIC_model_path, "model_analysis.csv"))

    # rows whose mutation name contains "PC" are removed, as for the binary models
    MIC_model_analysis = MIC_model_analysis.loc[~MIC_model_analysis["mutation"].str.contains("PC")]

    df = df.merge(MIC_model_analysis[["mutation", "coef", "BH_pval"]].rename(columns={"coef": "MIC_coef", "BH_pval": "BH_MIC_pval"}), on="mutation", how="left")

    # AT THIS POINT, THE HIGHEST A TIER 2 MUTATION CAN BE IS AT POSSIBLE
    # Only mutations graded "Possible ..." are adjusted with MIC evidence. In particular a "Neutral" mutation is
    # never regraded here: it has not passed the BH_pval and PPV_LB / NPV_LB requirements, so promoting it to a
    # strict category on MIC evidence alone would break the guarantees of the strict categories.
    possible = df[col_name].isin(["Possible Assoc w R", "Possible Assoc w S"])

    # if the MIC coefficient is missing or not significant, then don't make any changes
    mic_significant = df["MIC_coef"].notna() & (df["BH_MIC_pval"] < thresh)

    # all masks are computed before any grading is changed, so the four assignments below do not interact
    candidate_R = possible & mic_significant & (df["Odds_Ratio"] > 1)
    candidate_S = possible & mic_significant & (df["Odds_Ratio"] < 1)
    confirmed_R = candidate_R & (df["MIC_coef"] > 0)
    confirmed_S = candidate_S & (df["MIC_coef"] < 0)

    # MIC evidence agrees and is significant --> upgrade. MIC evidence is significant but disagrees --> downgrade
    df.loc[confirmed_R, col_name] = "Assoc w R - strict"
    df.loc[candidate_R & ~confirmed_R, col_name] = "Uncertain"
    df.loc[confirmed_S, col_name] = "Assoc w S - strict"
    df.loc[candidate_S & ~confirmed_S, col_name] = "Uncertain"

    # all other are uncertain
    df[col_name] = df[col_name].fillna("Uncertain")
    return df

In [6]:
def add_single_lineage_annotations_to_model(drug, df, folder, model_path):
    '''
    Flag the mutations that occur in isolates of one lineage only, for use in further analyses.

    The model matrix of the model is restricted to isolates with a single Coll2014 lineage call (no comma in the call).
    For every feature of the matrix, the lineages of the isolates where the feature equals 1 are collected. If there is
    exactly one, it is written to a new "single_lineage" column of df; all other mutations get NaN.

    The comparison is done on the Coll2014 call exactly as it is stored, i.e. at the deepest level that was called. So
    mutations that are all in L2 are not flagged unless L2 itself was the deepest category fast-lineage-caller identified.

    The column is added to df in place, and df is also returned.
    '''

    model_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, folder, model_path, "model_matrix.pkl"))

    # isolates with exactly one lineage call
    single_call_lineages = lineages.query("~Coll2014.str.contains(',')").rename(columns={"Coll2014": "Lineage"})

    # keep the isolates with a single call and attach their lineage (the matrix index holds the sample IDs)
    has_single_call = model_matrix.index.isin(single_call_lineages['sample_id'].values)
    matrix_with_lineage = model_matrix.loc[has_single_call].merge(
        single_call_lineages[["Lineage", "sample_id"]], left_index=True, right_on="sample_id"
    )

    feature_cols = [col for col in matrix_with_lineage.columns if col not in ("sample_id", "Lineage")]
    single_lineage_mutations = {}

    for feature in feature_cols:
        carrier_lineages = matrix_with_lineage.loc[matrix_with_lineage[feature] == 1]["Lineage"].unique()

        # only features whose carriers all share one lineage are recorded
        if len(carrier_lineages) == 1:
            single_lineage_mutations[feature] = carrier_lineages[0]

    # mutations that are not in the dictionary (including any that are not in the model matrix) map to NaN
    df["single_lineage"] = df["mutation"].map(single_lineage_mutations)
    return df

In [7]:
def export_binary_analyses(drugs_lst, read_folder, write_folder, analyses_lst):
    '''
    Write one Excel file per drug to ../results/<write_folder>/<drug>.xlsx, with one sheet per available model.

    Arguments:

        drugs_lst = drug names (sub-directories of analysis_dir); they are processed in sorted order
        read_folder = analysis folder to read the model results from (e.g. "BINARY")
        write_folder = folder under ../results to write to. It is created if needed, including ../results itself
        analyses_lst = model paths, in the order the sheets should appear

    A model is skipped if its model_matrix.pkl does not exist. A drug without any available model is skipped with a
    message, because an Excel workbook cannot be written without sheets.

    Each sheet holds the results table of the model with the regression_confidence grading, the single_lineage
    annotation and the SOLO Initial_Confidence_Grading. Sheets of withSyn models keep only the synonymous-type
    mutations and sheets of poolSeparate models keep only the pooled (lof / inframe) features.
    '''

    os.makedirs(f"../results/{write_folder}", exist_ok=True)

    for drug in np.sort(drugs_lst):

        all_analyses = {}

        for model_path in analyses_lst:

            # some may not be there. Usually this is Pretomanid because there are no tier 2 genes or WHO phenotypes
            if not os.path.isfile(os.path.join(analysis_dir, drug, read_folder, model_path, "model_matrix.pkl")):
                continue

            tiers_lst = ["1", "2"] if "1+2" in model_path else ["1"]
            add_analysis = get_unpooled_table_by_tier(drug, tiers_lst, read_folder, model_path)

            # model settings, taken from the last part of the path (e.g. dropAF_noSyn_unpooled)
            add_analysis["pool_type"] = model_path.split("_")[-1]
            add_analysis["synonymous"] = int("withSyn" in model_path)

            add_analysis = add_analysis[add_analysis.columns[~add_analysis.columns.str.contains("|".join(["coef", "Bonferroni"]))]]
            add_analysis = add_significance_category(add_analysis, drug, model_path)

            # add annotation denoting whether a mutation is only present in a single lineage
            add_analysis = add_single_lineage_annotations_to_model(drug, add_analysis, read_folder, model_path)

            # for models with synonymous mutations, keep only the data for the synonymous ones
            # the data for nonsyn mutations will come from the noSyn models
            if "withSyn" in model_path:
                add_analysis = add_analysis.query("predicted_effect in ['synonymous_variant', 'stop_retained_variant', 'initiator_codon_variant']")

            # for models with pooling, only keep the results for the pooled mutations
            if "poolSeparate" in model_path:
                add_analysis = add_analysis.query("predicted_effect in ['lof', 'inframe']")

            # add SOLO V2 gradings
            add_analysis = add_analysis.merge(solo_results.query("drug==@drug")[["mutation", "Initial_Confidence_Grading"]], on="mutation", how="left")

            # the phenotype category is only relevant for the binary analysis
            if read_folder == "BINARY":
                add_analysis["Phenos"] = "ALL" if "phenos=ALL" in model_path else "WHO"

            add_analysis = add_analysis.rename(columns={"confidence": "confidence_V1"})

            # short sheet name, e.g. tiers=1/phenos=WHO/dropAF_noSyn_unpooled --> T1,WHO,noSyn_unpooled
            sheet_name = model_path.replace("phenos=", "").replace("/", ",").replace("tiers=", "T").replace("dropAF_", "")
            all_analyses[sheet_name] = add_analysis

        if len(all_analyses) == 0:
            print(f"No analyses found for {drug}, so no Excel file was written")
            continue

        with pd.ExcelWriter(f"../results/{write_folder}/{drug}.xlsx") as file:
            for key, val in all_analyses.items():
                val.to_excel(file, sheet_name=key, index=False)

        print(f"Finished {len(all_analyses)} analyses for {drug}")

In [8]:
# Model paths are generated in a fixed order to preserve the model hierarchy:
# phenotype set (WHO, then ALL) --> gene tiers (1, then 1+2) --> model variant
_tier_sets = ("tiers=1", "tiers=1+2")
_model_variants = ("dropAF_noSyn_unpooled", "dropAF_noSyn_poolSeparate", "dropAF_withSyn_unpooled")

# binary models: <tiers>/<phenos>/<variant>
binary_analyses_lst = [
    f"{tiers}/phenos={phenos}/{variant}"
    for phenos in ("WHO", "ALL")
    for tiers in _tier_sets
    for variant in _model_variants
]

# MIC models have no phenotype level: <tiers>/<variant>
mic_analyses_lst = [
    f"{tiers}/{variant}"
    for tiers in _tier_sets
    for variant in _model_variants
]
    
# # export_binary_analyses(drugs_lst, "BINARY", "NEW", binary_analyses_lst)

#new_drugs_lst = ["Pretomanid", "Bedaquiline", "Linezolid", "Delamanid", "Clofazimine", "Moxifloxacin", "Levofloxacin"]
# Drugs to export in this run. The cells below also loop over new_drugs_lst.
new_drugs_lst = ["Rifampicin"]
# Writes ../results/NEW/<drug>.xlsx with one sheet per available model of binary_analyses_lst.
# NOTE(review): the output saved with this cell is "KeyError: 'pval'", i.e. the last recorded run of this call failed.
# The traceback was not kept, so the failing lookup still has to be identified.
export_binary_analyses(new_drugs_lst, "BINARY", "NEW", binary_analyses_lst)

KeyError: 'pval'

In [40]:
def create_significance_summary_by_drug(drug, folder):
    '''
    Count the mutations in each regression_confidence category for every model (sheet) of a drug.

    Arguments:

        drug = drug name; the file ../results/<folder>/<drug>.xlsx is read
        folder = folder under ../results that holds the exported Excel files

    Returns a tuple of

        summary dataframe: one row per category (in a fixed order), one column per sheet, integer counts.
                           Categories that do not occur get 0, and values outside the six categories are not reported
        dictionary of {sheet name: dataframe} with the full contents of the Excel file
    '''

    col_name = "regression_confidence"
    sig_groups = ["Assoc w R - strict", "Possible Assoc w R", "Uncertain", "Possible Assoc w S", "Assoc w S - strict", "Neutral"]

    drug_excel_file = pd.read_excel(f"../results/{folder}/{drug}.xlsx", sheet_name=None)

    # one single-column table of category counts per sheet. The leading empty dataframe keeps the result well-defined
    # when the file has no sheets
    counts_per_model = [pd.DataFrame()]
    counts_per_model += [pd.DataFrame(sheet[col_name].value_counts(dropna=False)) for sheet in drug_excel_file.values()]

    # align the counts on the category names; a category missing from a model is counted as 0
    summary_df = pd.concat(counts_per_model, axis=1).fillna(0).astype(int)
    summary_df.columns = drug_excel_file.keys()

    # report exactly the six categories in the desired order, adding rows of zeros for those that never occur
    summary_df = summary_df.reindex(sig_groups, fill_value=0)

    return summary_df, drug_excel_file

In [41]:
# Both dictionaries are keyed by drug name:
#   summary_dfs_dict = counts of mutations per regression_confidence category and model
#   full_results_dict = full contents of the drug's Excel file, as {sheet name: dataframe}
summary_dfs_dict = {}
full_results_dict = {}

# new_drugs_lst = ["Pretomanid", "Bedaquiline", "Clofazimine", "Linezolid", "Delamanid"]
# Only the drugs in new_drugs_lst are loaded, so the lookups in the cells below work only for those drugs.
for drug in np.sort(new_drugs_lst):
    
    # reads ../results/NEW/<drug>.xlsx, which must already have been written by export_binary_analyses
    res = create_significance_summary_by_drug(drug, "NEW")
    summary_dfs_dict[drug] = res[0]
    full_results_dict[drug] = res[1]
        
# with pd.ExcelWriter(f"../results/BINARY/unpooled_significance_summaries.xlsx") as file:
#     for drug, df in summary_dfs_dict.items():
#         df.to_excel(file, sheet_name=drug)

In [34]:
# Show the category counts per model for Levofloxacin.
# NOTE(review): summary_dfs_dict only holds the drugs of new_drugs_lst, so Levofloxacin must be in that list.
summary_dfs_dict["Levofloxacin"]

Unnamed: 0,"T1,WHO,noSyn_unpooled","T1,WHO,noSyn_poolSeparate","T1,WHO,withSyn_unpooled","T1+2,WHO,noSyn_unpooled","T1+2,WHO,noSyn_poolSeparate","T1+2,WHO,withSyn_unpooled","T1,ALL,noSyn_unpooled","T1,ALL,noSyn_poolSeparate","T1,ALL,withSyn_unpooled","T1+2,ALL,noSyn_unpooled","T1+2,ALL,noSyn_poolSeparate","T1+2,ALL,withSyn_unpooled"
Assoc w R - strict,19,0,0,0,0,0,25,0,0,0,0,0
Possible Assoc w R,5,0,0,3,1,0,3,0,1,3,0,0
Uncertain,410,2,282,647,5,193,768,3,562,1179,7,376
Possible Assoc w S,1,0,1,0,0,0,5,0,2,3,0,0
Assoc w S - strict,2,0,0,0,0,0,2,0,0,0,0,0
Neutral,0,0,0,0,0,0,0,0,2,0,0,0


In [56]:
# For every model (sheet) of Pretomanid, print the mutations that were graded Neutral.
# NOTE(review): this sets the notebook-level variable `drug`, which the next two cells rely on.
drug = "Pretomanid"
for key, df in full_results_dict[drug].items():
    print(key, df.query("regression_confidence=='Neutral'").mutation.unique())

T1,ALL,noSyn_unpooled ['fbiB_p.Leu447Arg']
T1,ALL,noSyn_poolSeparate []
T1,ALL,withSyn_unpooled ['fgd1_c.465C>T' 'fbiC_c.165G>A']


In [59]:
# Full rows of the Neutral mutations in the T1,ALL,noSyn_unpooled sheet of `drug` (set in an earlier cell)
full_results_dict[drug]["T1,ALL,noSyn_unpooled"].query("regression_confidence=='Neutral'")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence_V1,Odds_Ratio,pval,BH_pval,neutral_pval,BH_neutral_pval,...,LR-_LB,LR-_UB,pool_type,synonymous,regression_confidence,MIC_coef,BH_MIC_pval,single_lineage,Initial_Confidence_Grading,Phenos
11,fbiB_p.Leu447Arg,1,missense_variant,3642874,,1.00167,0.103,0.583,0.0,0.0,...,0.741723,1.121633,unpooled,0,Neutral,-0.051964,0.453061,,3) Uncertain significance,ALL


In [60]:
# Full rows of the Neutral mutations in the T1,ALL,withSyn_unpooled sheet of `drug` (set in an earlier cell)
full_results_dict[drug]["T1,ALL,withSyn_unpooled"].query("regression_confidence=='Neutral'")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence_V1,Odds_Ratio,pval,BH_pval,neutral_pval,BH_neutral_pval,...,LR-_LB,LR-_UB,pool_type,synonymous,regression_confidence,MIC_coef,BH_MIC_pval,single_lineage,Initial_Confidence_Grading,Phenos
35,fgd1_c.465C>T,1,synonymous_variant,491247,,0.986894,0.867,0.991,0.0,0.0,...,1.003662,1.020359,unpooled,1,Neutral,-0.058697,0.262212,4.2.1,3) Uncertain significance,ALL
37,fbiC_c.165G>A,1,synonymous_variant,1303095,,0.983564,0.805,0.991,0.001,0.004419,...,1.011145,1.034373,unpooled,1,Neutral,0.030477,0.481622,4.3.3,3) Uncertain significance,ALL


In [44]:
# Neutral mutations for Moxifloxacin: Tier 1, WHO phenotypes, no synonymous mutations, unpooled.
# NOTE(review): requires Moxifloxacin to be in new_drugs_lst when full_results_dict was built.
full_results_dict["Moxifloxacin"]['T1,WHO,noSyn_unpooled'].query("regression_confidence=='Neutral'")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence_V1,Odds_Ratio,pval,BH_pval,neutral_pval,BH_neutral_pval,...,LR-_LB,LR-_UB,pool_type,synonymous,regression_confidence,MIC_coef,BH_MIC_pval,single_lineage,Initial_Confidence_Grading,Phenos
81,gyrB_p.Met291Ile,1,missense_variant,6112,5) Not assoc w R,1.00096,0.453,0.829,0.014,0.028627,...,1.090823,1.114121,unpooled,0,Neutral,0.100452,0.262691,,5) Not assoc w R,WHO


In [48]:
# Neutral mutations for Moxifloxacin: Tier 1, WHO phenotypes, sheet of the model with synonymous mutations
full_results_dict["Moxifloxacin"]['T1,WHO,withSyn_unpooled'].query("regression_confidence=='Neutral'")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence_V1,Odds_Ratio,pval,BH_pval,neutral_pval,BH_neutral_pval,...,LR-_LB,LR-_UB,pool_type,synonymous,regression_confidence,MIC_coef,BH_MIC_pval,single_lineage,Initial_Confidence_Grading,Phenos
124,gyrA_c.1629C>T,1,synonymous_variant,8930,,0.988166,0.804,0.826804,0.0,0.0,...,1.000371,1.002047,unpooled,1,Neutral,-0.014564,0.506034,,3) Uncertain significance,WHO


In [49]:
# Neutral mutations for Moxifloxacin: Tier 1, ALL phenotypes, no synonymous mutations, unpooled
full_results_dict["Moxifloxacin"]['T1,ALL,noSyn_unpooled'].query("regression_confidence=='Neutral'")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence_V1,Odds_Ratio,pval,BH_pval,neutral_pval,BH_neutral_pval,...,LR-_LB,LR-_UB,pool_type,synonymous,regression_confidence,MIC_coef,BH_MIC_pval,single_lineage,Initial_Confidence_Grading,Phenos
434,gyrA_p.Thr572Asn,1,missense_variant,9016,3) Uncertain significance,0.986382,0.501,0.857256,0.007,0.014457,...,1.000065,1.000581,unpooled,0,Neutral,0.009454,0.474483,4.8,3) Uncertain significance,ALL


In [62]:
# Which SOLO initial confidence gradings occur for Pretomanid?
solo_results.query("drug=='Pretomanid'").Initial_Confidence_Grading.unique()

array(['3) Uncertain significance', '5) Not assoc w R'], dtype=object)

In [63]:
# Pretomanid mutations that SOLO graded "5) Not assoc w R", for comparison with the Neutral mutations found above
solo_results.query("drug=='Pretomanid' & Initial_Confidence_Grading == '5) Not assoc w R'")

Unnamed: 0,drug,mutation,Initial_Confidence_Grading
35488,Pretomanid,fgd1_c.960T>C,5) Not assoc w R


In [54]:
# SOLO gradings of the three Moxifloxacin mutations that were graded Neutral in the cells above
solo_results.query("drug=='Moxifloxacin' & mutation in ['gyrB_p.Met291Ile', 'gyrA_c.1629C>T', 'gyrA_p.Thr572Asn']")

Unnamed: 0,drug,mutation,Initial_Confidence_Grading
34245,Moxifloxacin,gyrA_c.1629C>T,3) Uncertain significance
34844,Moxifloxacin,gyrA_p.Thr572Asn,3) Uncertain significance
35322,Moxifloxacin,gyrB_p.Met291Ile,5) Not assoc w R


In [38]:
# Levofloxacin (Tier 1, WHO phenotypes, noSyn, unpooled): every mutation that is not Uncertain, with the
# odds ratio and the BH-corrected p-values that the grading is based on.
# NOTE(review): requires Levofloxacin to be in new_drugs_lst when full_results_dict was built.
full_results_dict["Levofloxacin"]['T1,WHO,noSyn_unpooled'].query("regression_confidence != 'Uncertain'")[["mutation", "regression_confidence", "Odds_Ratio", "BH_pval", "BH_neutral_pval", "BH_LRT_pval", "BH_LRT_neutral_pval"]]

Unnamed: 0,mutation,regression_confidence,Odds_Ratio,BH_pval,BH_neutral_pval,BH_LRT_pval,BH_LRT_neutral_pval
0,gyrA_p.Asp94Gly,Assoc w R - strict,4.358019,0.0,1.0,0.0,1.0
1,gyrA_p.Ala90Val,Assoc w R - strict,3.102918,0.0,1.0,0.0,1.0
2,gyrA_p.Asp94Ala,Assoc w R - strict,1.979966,0.0,1.0,3.637538e-286,1.0
3,gyrA_p.Asp94Asn,Assoc w R - strict,1.776615,0.0,1.0,5.302805e-222,1.0
4,gyrA_p.Ser91Pro,Assoc w R - strict,1.748225,0.0,1.0,3.93679e-214,1.0
5,gyrA_p.Asp94Tyr,Assoc w R - strict,1.643989,0.0,1.0,2.147853e-170,1.0
6,gyrA_p.Asp94His,Assoc w R - strict,1.355066,0.0,1.0,2.098103e-68,1.0
7,gyrA_p.Gly88Cys,Assoc w R - strict,1.269439,0.0,1.0,5.833249e-35,1.0
8,gyrB_p.Asp461Asn,Assoc w R - strict,1.210282,0.0,1.0,1.0790079999999999e-24,1.0
9,gyrB_p.Asp461His,Assoc w R - strict,1.189096,0.0,1.0,1.752478e-15,1.0


In [11]:
def combine_results_single_drug(full_results_dict, drug):
    '''
    Stack the sheets of a drug into one table for the WHO phenotypes and one table for ALL phenotypes.

    Arguments:

        full_results_dict = dictionary of {drug: {sheet name: dataframe}}
        drug = drug to combine

    A sheet is assigned by its name: "WHO" in the name is checked first, then "ALL". Names with neither are printed
    and left out. Columns that describe the model settings, confidence_V1 and the uncorrected p-values are dropped.

    Returns (WHO table, ALL table). For Pretomanid the WHO sheets are neither checked nor combined, so the first
    element is the plain list of the WHO sheets instead of a dataframe.

    Raises AssertionError if the numbers of WHO and ALL sheets differ (not checked for Pretomanid) or if a mutation
    occurs more than once in a combined table.
    '''

    del_cols = ["Phenos", "pool_type", "synonymous", "confidence_V1", "pval", "neutral_pval", "LRT_pval", "LRT_neutral_pval"]

    sheets_by_phenos = {"WHO": [], "ALL": []}

    for sheet_name, sheet in full_results_dict[drug].items():

        trimmed = sheet.drop(columns=del_cols, errors="ignore")

        # "WHO" takes precedence if a name happens to contain both
        matched = [phenos for phenos in ("WHO", "ALL") if phenos in sheet_name]

        if matched:
            sheets_by_phenos[matched[0]].append(trimmed)
        else:
            print(sheet_name)

    WHO_combined = sheets_by_phenos["WHO"]
    ALL_combined = sheets_by_phenos["ALL"]

    if drug != "Pretomanid":
        assert len(WHO_combined) == len(ALL_combined)
        WHO_combined = pd.concat(WHO_combined)

        # every mutation must come from exactly one model
        assert not WHO_combined.duplicated("mutation").any()

    ALL_combined = pd.concat(ALL_combined)
    assert not ALL_combined.duplicated("mutation").any()

    return WHO_combined, ALL_combined

In [12]:
def _assert_grading_invariants(results_df, label):
    '''
    Check that the gradings in a combined results table respect the grading rules:

        - every mutation with Num_Isolates < 5 is Uncertain
        - every mutation graded as (possibly) associated with R has PPV_LB >= 0.25
        - every mutation graded as (possibly) associated with S has NPV_LB >= 0.25

    label is only used in the error messages.
    '''
    assert len(results_df.query("regression_confidence != 'Uncertain' & Num_Isolates < 5")) == 0, f"{label}: graded mutation with Num_Isolates < 5"
    assert len(results_df.query("regression_confidence not in ['Uncertain', 'Neutral'] & Odds_Ratio > 1 & PPV_LB < 0.25")) == 0, f"{label}: R-associated mutation with PPV_LB < 0.25"
    assert len(results_df.query("regression_confidence not in ['Uncertain', 'Neutral'] & Odds_Ratio < 1 & NPV_LB < 0.25")) == 0, f"{label}: S-associated mutation with NPV_LB < 0.25"


# combined results per drug, separately for the WHO phenotypes and for ALL phenotypes
WHO_phenos_results = {}
ALL_phenos_results = {}

for drug in new_drugs_lst:
    WHO_df, ALL_df = combine_results_single_drug(full_results_dict, drug)

    # the WHO results of Pretomanid are not combined by combine_results_single_drug, so they are left out
    if drug != "Pretomanid":
        WHO_phenos_results[drug] = WHO_df
        _assert_grading_invariants(WHO_df, f"{drug} (WHO)")

    ALL_phenos_results[drug] = ALL_df
    _assert_grading_invariants(ALL_df, f"{drug} (ALL)")


# one Excel file per phenotype set, with one sheet per drug
for fName, results_dict in [("../results/NewDrugs_WHO_results.xlsx", WHO_phenos_results),
                            ("../results/NewDrugs_ALL_results.xlsx", ALL_phenos_results)]:

    # a workbook cannot be written without sheets (e.g. the WHO file when only Pretomanid was analyzed)
    if len(results_dict) == 0:
        print(f"No results to write to {fName}")
        continue

    with pd.ExcelWriter(fName) as file:
        for key, val in results_dict.items():
            val.to_excel(file, sheet_name=key, index=False)

In [13]:
# for drug in ALL_phenos_results.keys():
    
#     if len(ALL_phenos_results[drug].loc[~pd.isnull(ALL_phenos_results[drug]["MIC"])]) > 0:
#         #print(ALL_phenos_results[drug].loc[~pd.isnull(ALL_phenos_results[drug]["MIC"])])
#         print(drug)

In [14]:
# for drug in WHO_phenos_results.keys():
    
#     if len(WHO_phenos_results[drug].loc[~pd.isnull(WHO_phenos_results[drug]["MIC"])]) > 0:
#         #print(WHO_phenos_results[drug].loc[~pd.isnull(WHO_phenos_results[drug]["MIC"])])
#         print(drug)

# MIC Analyses

In [None]:
# # hard coded paths to preserve the model hierarchy
# mic_analyses_lst = ["tiers=1/dropAF_noSyn_poolSeparate",
#                     "tiers=1/dropAF_noSyn_unpooled",
#                     "tiers=1+2/dropAF_noSyn_poolSeparate",
#                     "tiers=1+2/dropAF_noSyn_unpooled",
#                    ]

# export_binary_mic_analyses(drugs_lst, "MIC", mic_analyses_lst)

# Binary Analysis Summaries File

## Make an Excel file summarizing the results for each drug for the binary analyses

## Break down results by FDR (Significant), OR > 1, Primary Analysis, PPV ≥ 25%, and True Positive (# with variant and resistant) ≥ 5

In [None]:
# def generate_summary_data(drug, folder):
    
#     # read in Excel file with all the sheets 
#     df = pd.read_excel(f"../results/{folder}/{drug}.xlsx", sheet_name=None)

#     # combine sheets into a single dataframe and keep only the first instance of every mutation
#     df = pd.concat(list(df.values())).drop_duplicates("mutation", keep="first")

#     # OR > 1 or OR < 1 --> associated with resistance or susceptibility
#     df["OR>1"] = (df["Odds_Ratio"] > 1)
#     df["PPV_LB≥0.25"] = (df["PPV_LB"] >= 0.25)
#     df["TP≥5"] = (df["TP"] >= 5)

#     df[["OR>1", "PPV_LB≥0.25", "TP≥5"]] = df[["OR>1", "PPV_LB≥0.25", "TP≥5"]].fillna(0).astype(int)
#     df["pooled"] = [0 if "unpooled" in val else 1 for val in df["pool_type"].values]

#     summary_cols = ["OR>1", "PPV_LB≥0.25", "TP≥5", "Significant", "Tier", "Phenos", "synonymous", "pooled"]
#     summary = pd.DataFrame(df[summary_cols].value_counts()).reset_index().rename(columns={0:"Count"})
#     summary = summary[["Count"] + summary_cols].sort_values(by=summary_cols, ascending=[False, False, False, False, True, False, True, True])

#     summary.loc[(summary["OR>1"] == 1) & 
#                 (summary["PPV_LB≥0.25"] == 1) &
#                 (summary["TP≥5"] == 1), "Expert_PASS"
#                 ] = 1

#     summary["Expert_PASS"] = summary["Expert_PASS"].fillna(0).astype(int)

#     return summary.rename(columns={"Tier": "Gene_Tier", "pooled": "LOF_inframe_pooling"}).reset_index(drop=True)

In [None]:
# analysis_summaries = {}
# folder = "BINARY"

# for drug in np.sort(drugs_lst):
    
#     if os.path.isfile(f"../results/{folder}/{drug}.xlsx"):
#         analysis_summaries[drug] = generate_summary_data(drug, folder)
    
# # write results to an Excel file, where each sheet name is a drug
# with pd.ExcelWriter("../results/ALLDrugs_summaries.xlsx") as file:
   
#     for key, val in analysis_summaries.items():
#         val.to_excel(file, sheet_name=key, index=False)

# CC vs. CC-ATU Analyses

Export only variants that would pass the Expert rule:

<ul>
    <li>Odds Ratio > 1</li>
    <li>N_resistant (i.e. true positive) ≥ 5</li>
    <li>PPV_LB ≥ 0.25</li>
    <li>Benjamini-Hochberg p-value < 0.01</li>
</ul>

In [None]:
# def export_cc_atu_analyses(drugs_lst, pval_thresh=0.01):
#     '''
#     This function combines the CC and CC-ATU analyses for each drug. 
#     '''
    
#     if not os.path.isdir("../results/ATU"):
#         os.mkdir("../results/ATU")
    
#     # hard coded paths to preserve the model hierarchy
#     analysis_paths = ["tiers=1/dropAF_noSyn_poolSeparate",
#                       "tiers=1/dropAF_noSyn_poolALL",
#                       "tiers=1/dropAF_noSyn_unpooled",
#                       "tiers=1+2/dropAF_noSyn_poolSeparate",
#                       "tiers=1+2/dropAF_noSyn_poolALL",
#                       "tiers=1+2/dropAF_noSyn_unpooled",
#     ]
    
#     for drug in drugs_lst:
        
#         all_analyses = {}
        
#         for i, model_path in enumerate(analysis_paths):

#             if os.path.isfile(os.path.join(analysis_dir, drug, "ATU", model_path, "model_analysis_with_stats_CC.csv")):
            
#                 cc = pd.read_csv(os.path.join(analysis_dir, drug, "ATU", model_path, "model_analysis_with_stats_CC.csv"))
#                 cc_atu = pd.read_csv(os.path.join(analysis_dir, drug, "ATU", model_path, "model_analysis_with_stats_CC_ATU.csv"))

#                 cc.loc[(cc["Odds_Ratio"] > 1) & 
#                        (cc["TP"] >= 5) &
#                        (cc["PPV_LB"] >= 0.25) & 
#                        (cc["BH_pval"] < pval_thresh), "Expert_PASS"
#                       ] = 1

#                 cc_atu.loc[(cc_atu["Odds_Ratio"] > 1) & 
#                            (cc_atu["TP"] >= 5) &
#                            (cc_atu["PPV_LB"] >= 0.25) & 
#                            (cc_atu["BH_pval"] < pval_thresh), "Expert_PASS"
#                           ] = 1

#                 cc["ATU"] = 0
#                 cc_atu["ATU"] = 1

#                 combined_df = pd.concat([cc.query("Expert_PASS == 1"), cc_atu.query("Expert_PASS == 1")])
#                 del combined_df["Expert_PASS"]
                
#                 combined_df["Tier"] = [2 if "+2" in model_path else 1][0]
#                 combined_df["Phenos"] = ["ALL" if "ALL" in model_path else "WHO"][0]
#                 combined_df["pool_type"] = model_path.split("_")[-1]
#                 combined_df["synonymous"] = int("withSyn" in model_path)
#                 combined_df["HET"] = ["DROP" if "drop" in model_path else "AF"][0]

#                 # remove principal components
#                 combined_df = combined_df.loc[~combined_df["mutation"].str.contains("PC", case=True)]
            
#                 all_analyses[f"Model_{i+1}"] = combined_df.sort_values("Odds_Ratio", ascending=False)

#         if len(all_analyses) > 0:
                
#             with pd.ExcelWriter(f"../results/ATU/{drug}.xlsx") as file:

#                 for key, val in all_analyses.items():
#                     val.to_excel(file, sheet_name=key, index=False)
                  
#             print(f"Finished {drug}!")

In [None]:
# export_cc_atu_analyses(drugs_lst)

# Volcano Plots

In [None]:
def volcano_plot(drug, folder, plot_x, pval_col="BH_pval", color_col="Phenos", saveFig=None):
    '''
    Draw a two-panel volcano plot of the regression results for a single drug.

    The left panel shows Tier 1 mutations and the right panel shows Tier 2 mutations. Results are read from the
    unpooled model sheets of ../results/{folder}/{drug}.xlsx. If that workbook does not exist, nothing is plotted
    and the function returns None.

    Arguments:

        drug = drug name. Selects the workbook; for "Pretomanid" only the Model_11 sheet is read and the Tier 2 panel is left unformatted
        folder = subfolder of ../results that contains the workbook
        plot_x = column to plot on the x-axis. "coef" is computed here as log(Odds_Ratio); any other value must already be a column in the sheets
        pval_col = p-value column used both for the significance coloring and for the y-axis
        color_col = not used by this function. It is kept in the signature so existing calls keep working
        saveFig = file name to save the plot to. If it is None, the plot is rendered in the notebook.

    Coloring: a point is labelled with its Phenos value ("WHO" or "ALL") when pval_col < 0.05 (Tier 1) or pval_col < 0.01 (Tier 2),
    and "NOT" otherwise.

    The y-axis is log(-log(pval_col)). If the smallest p-value is exactly 0, all p-values are first shifted up by the smallest
    positive p-value so that the logarithm is defined.
    '''

    # read in Excel file with all the sheets
    fName = f"../results/{folder}/{drug}.xlsx"

    # nothing to plot if the results workbook has not been written
    if not os.path.isfile(fName):
        return None

    # plot only unpooled mutation results
    if drug == "Pretomanid":
        sheet_names = ["Model_11"]
    else:
        sheet_names = ["Model_3", "Model_7", "Model_11", "Model_15"]

    sheets = pd.read_excel(fName, sheet_name=sheet_names)

    # combine sheets into a single dataframe, dropping rows without an odds ratio.
    # Mutations that appear in several sheets are NOT de-duplicated.
    # ignore_index gives the combined frame a unique index so the boolean .loc assignments below are unambiguous
    plot_df = pd.concat([single_df.loc[~pd.isnull(single_df["Odds_Ratio"])] for single_df in sheets.values()],
                        axis=0,
                        ignore_index=True
                       )

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # add color categories: everything starts as insignificant, then significant points get their phenotype group.
    # Significance is assessed on the raw p-values, before any shift for zeros
    plot_df["color_col"] = "NOT"

    for phenos in ("WHO", "ALL"):
        for tier, alpha in ((1, 0.05), (2, 0.01)):
            significant = (plot_df["Phenos"] == phenos) & (plot_df["Tier"] == tier) & (plot_df[pval_col] < alpha)
            plot_df.loc[significant, "color_col"] = phenos

    # log(0) is undefined, so shift all p-values by the smallest positive one when an exact zero is present
    if plot_df[pval_col].min() == 0:
        positive_pvals = plot_df.loc[plot_df[pval_col] > 0, pval_col]

        # fall back to the smallest positive float if every p-value is 0
        shift = positive_pvals.min() if len(positive_pvals) > 0 else np.finfo(float).tiny
        plot_df[pval_col] += shift

    plot_df[f"neg_log_{pval_col}"] = -np.log(plot_df[pval_col])
    plot_df[f"log_neg_log_{pval_col}"] = np.log(plot_df[f"neg_log_{pval_col}"])

    tab10_hex = sns.color_palette("tab10").as_hex()
    color_palette = {"WHO": tab10_hex[0], "ALL": tab10_hex[1], "NOT": "lightgray"}

    if plot_x == "coef":
        plot_df["coef"] = np.log(plot_df["Odds_Ratio"])
    plot_y = f"log_neg_log_{pval_col}"

    for axis, tier in zip(ax, (1, 2)):
        sns.scatterplot(data=plot_df.loc[plot_df["Tier"] == tier],
                        x=plot_x,
                        y=plot_y,
                        alpha=0.8,
                        hue="color_col",
                        linewidth=0.25,
                        edgecolor='white',
                        s=30,
                        palette=color_palette,
                        ax=axis
                       )

    # x-axis is centered on the "no effect" value: 0 for log-odds, 1 otherwise
    baseline = 0 if plot_x == "coef" else 1

    # Pretomanid only has the Model_11 sheet, so its Tier 2 panel is not formatted
    panels = [(ax[0], 1)]
    if drug != "Pretomanid":
        panels.append((ax[1], 2))

    for axis, tier in panels:
        tier_df = plot_df.loc[plot_df["Tier"] == tier]
        bound = tier_df[plot_x].abs().max() * 1.1

        # an empty tier (NaN) or an infinite coefficient cannot be used as an axis limit
        if np.isfinite(bound):
            axis.set_xlim(baseline - (bound - baseline), baseline + (bound - baseline))

        # only draw a legend when there is something to label
        handles, _ = axis.get_legend_handles_labels()
        if len(handles) > 0:
            axis.legend(title='')

        num_significant = (tier_df["color_col"] != "NOT").sum()
        axis.set_title(f"Tier {tier}: {num_significant} Mutations Significant in Ridge")

    sns.despine()

    if saveFig is not None:
        # create the output directory if needed. dirname is empty for a bare file name, which needs no directory
        out_dir = os.path.dirname(saveFig)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(saveFig, dpi=300, bbox_inches="tight")
    else:
        plt.show()

In [None]:
def volcano_plot_by_significance(drug, color_dict, plot_x="coef", pval_col="AUC_BH_pval", saveFig=None):
    '''
    Draw a two-panel volcano plot for a single drug, with points colored by their regression_confidence label.

    The left panel shows Tier 1 mutations and the right panel shows Tier 2 mutations. Results are taken from the
    notebook-level dictionary full_results_dict[drug], using only the sheets Model_3, Model_7, Model_11 and Model_15.

    Arguments:

        drug = drug name; key into full_results_dict and used in the panel titles
        color_dict = dictionary mapping each regression_confidence label to a color (passed to seaborn as the palette)
        plot_x = column to plot on the x-axis. "coef" is computed here as log(Odds_Ratio); any other value must already be a column
        pval_col = p-value column to plot on the y-axis
        saveFig = file name to save the plot to. If it is None, the plot is rendered in the notebook.

    The y-axis is log(-log(pval_col)). If the smallest p-value is exactly 0, all p-values are first shifted up by the smallest
    positive p-value so that the logarithm is defined.

    Each mutation is plotted once. When a mutation appears in several sheets, the row kept is the one whose label sorts lowest
    after the strict labels are temporarily renamed to "5" and "4" (see the comment in the body).

    Raises ValueError if none of the four model sheets is present for the drug.
    '''

    model_lst = ["Model_3", "Model_7", "Model_11", "Model_15"]

    # keep only rows with an odds ratio from the selected model sheets
    dfs_lst = [df.loc[~pd.isnull(df["Odds_Ratio"])] for key, df in full_results_dict[drug].items() if key in model_lst]

    if len(dfs_lst) == 0:
        raise ValueError(f"None of {model_lst} found in full_results_dict for {drug}")

    plot_df = pd.concat(dfs_lst, axis=0)

    # log(0) is undefined, so shift all p-values by the smallest positive one when an exact zero is present
    if plot_df[pval_col].min() == 0:
        positive_pvals = plot_df.loc[plot_df[pval_col] > 0, pval_col]

        # fall back to the smallest positive float if every p-value is 0
        shift = positive_pvals.min() if len(positive_pvals) > 0 else np.finfo(float).tiny
        plot_df[pval_col] += shift

    plot_df[f"neg_log_{pval_col}"] = -np.log(plot_df[pval_col])
    plot_df[f"log_neg_log_{pval_col}"] = np.log(plot_df[f"neg_log_{pval_col}"])

    # Temporarily rename the strict labels to digits. Digits sort before letters, so after the descending sort below
    # the strict rows end up last within each mutation
    plot_df["regression_confidence"] = plot_df["regression_confidence"].replace("Assoc w R - strict", "5").replace("Assoc w S - strict", "4")

    # lots of overlap between the model sheets, so don't double plot. With a descending sort and keep="last", the row kept
    # for each mutation is the one whose label sorts lowest: "4" (Assoc w S - strict), then "5" (Assoc w R - strict),
    # then "Assoc w R", "Assoc w S", and finally "Evidence against Assoc"
    plot_df = plot_df.sort_values("regression_confidence", ascending=False).drop_duplicates("mutation", keep="last")

    # restore the strict labels and rename the null category to match the keys of the color palette
    plot_df["regression_confidence"] = plot_df["regression_confidence"].replace("5", "Assoc w R - strict").replace("4", "Assoc w S - strict").replace("Evidence against Assoc", "No Assoc")

    if plot_x == "coef":
        plot_df["coef"] = np.log(plot_df["Odds_Ratio"])

    plot_y = f"log_neg_log_{pval_col}"
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    for axis, tier in zip(ax, (1, 2)):
        sns.scatterplot(data=plot_df.loc[plot_df["Tier"] == tier],
                        x=plot_x,
                        y=plot_y,
                        alpha=1,
                        hue="regression_confidence",
                        linewidth=0.25,
                        edgecolor='white',
                        s=30,
                        palette=color_dict,
                        ax=axis
                       )

    # x-axis is centered on the "no effect" value: 0 for log-odds, 1 otherwise
    baseline = 0 if plot_x == "coef" else 1

    for axis, tier in zip(ax, (1, 2)):
        bound = plot_df.loc[plot_df["Tier"] == tier, plot_x].abs().max() * 1.1

        # an empty tier (NaN) or an infinite coefficient cannot be used as an axis limit, so leave the default limits
        if np.isfinite(bound):
            axis.set_xlim(baseline - (bound - baseline), baseline + (bound - baseline))

    # show a single legend on the Tier 1 panel. If Tier 1 has nothing to label, leave the Tier 2 legend in place
    handles, _ = ax[0].get_legend_handles_labels()
    if len(handles) > 0:
        ax[0].legend(title='')

        if ax[1].get_legend() is not None:
            ax[1].get_legend().remove()

    ax[0].set_title(f"{drug}, Tier 1")
    ax[1].set_title(f"{drug}, Tier 2")

    sns.despine()

    if saveFig is not None:
        # create the output directory if needed. dirname is empty for a bare file name, which needs no directory
        out_dir = os.path.dirname(saveFig)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(saveFig, dpi=300, bbox_inches="tight")
    else:
        plt.show()

In [None]:
# Palette for the regression_confidence labels used in the volcano plots.
# Hex codes were picked with https://imagecolorpicker.com/en
color_dict = dict([
    ("Assoc w R - strict", "#c34a4f"),
    ("Assoc w R", "#e2a19e"),
    ("Assoc w S - strict", "#2b79c4"),
    ("Assoc w S", "#acbdd4"),
    ("No Assoc", "lightgray"),
])

In [None]:
# Rifampicin volcano plot using the LRT_BH_pval column on the y-axis; written to ../results/Figures/RIF_volcano.png instead of being shown inline
volcano_plot_by_significance("Rifampicin", color_dict, pval_col="LRT_BH_pval", saveFig="../results/Figures/RIF_volcano.png")

In [None]:
# for drug in np.sort(drugs_lst):
#     try:
#         volcano_plot_by_significance(drug, color_dict, pval_col="LRT_BH_pval")
#     except:
#         print(drug)

In [None]:
# Rebuild the concatenated Rifampicin results at notebook scope so they can be inspected directly
drug = "Rifampicin"
model_lst = ["Model_3", "Model_7", "Model_11", "Model_15"]
dfs_lst = []

for key, df in full_results_dict[drug].items():
    if key not in model_lst:
        continue

    # only rows that have an odds ratio are kept
    dfs_lst.append(df[df["Odds_Ratio"].notna()])

# Mutations that appear in several model sheets are kept as separate rows here (no de-duplication at this step)
plot_df = pd.concat(dfs_lst, axis=0)

In [None]:
# Tier 2 'Assoc w R - strict' rows that remain after keeping only the first row of each mutation across all concatenated sheets
plot_df.drop_duplicates("mutation", keep="first").query("Tier==2 & regression_confidence=='Assoc w R - strict'")

In [None]:
# Same filter, but de-duplicating on (mutation, Tier), so a mutation's first Tier 2 row is kept even if it also has a Tier 1 row
plot_df.drop_duplicates(["mutation", "Tier"], keep="first").query("Tier==2 & regression_confidence=='Assoc w R - strict'")

In [None]:
# All rows for rpoC_p.Val483Gly across the concatenated sheets, ordered by regression_confidence label (descending string order)
plot_df.query("mutation=='rpoC_p.Val483Gly'").sort_values("regression_confidence", ascending=False)

In [None]:
# One example row per distinct regression_confidence label, in descending string order of the label
plot_df.drop_duplicates("regression_confidence").sort_values("regression_confidence", ascending=False)

In [None]:
# Every Tier 2 'Assoc w R - strict' row, without any de-duplication
plot_df.query("Tier==2 & regression_confidence=='Assoc w R - strict'")

In [None]:
# volcano_plot("Delamanid", "BINARY", "coef")

In [None]:
# Print the drug name once for every selected model sheet (key contains "_3", "_7", "_11" or "_15") in which at least one
# Rv2752c mutation has a regression_confidence other than '6) No Assoc'. Isoniazid is skipped.
# NOTE(review): '6) No Assoc' only appears elsewhere in this notebook in commented-out label lists, while the plotting code
# renames 'Evidence against Assoc' to 'No Assoc' — confirm that full_results_dict still uses the numbered label.
for drug in drugs_lst:
    if drug == "Isoniazid":
        continue

    for key, df in full_results_dict[drug].items():
        if not any(suffix in key for suffix in ("_3", "_7", "_11", "_15")):
            continue

        if len(df.query("mutation.str.contains('Rv2752c') & regression_confidence != '6) No Assoc'")) > 0:
            print(drug)

In [None]:
# Number of distinct Rifampicin mutations with High_Predictive_Value == 1 in Model_3 or Model_11
len(set(full_results_dict["Rifampicin"]["Model_3"].query("High_Predictive_Value==1").mutation).union(full_results_dict["Rifampicin"]["Model_11"].query("High_Predictive_Value==1").mutation))

In [None]:
# Number of distinct Rifampicin mutations labelled 'Assoc w R - strict' in Model_3 or Model_11
len(set(full_results_dict["Rifampicin"]["Model_3"].query("regression_confidence=='Assoc w R - strict'").mutation).union(full_results_dict["Rifampicin"]["Model_11"].query("regression_confidence=='Assoc w R - strict'").mutation))

In [None]:
# Display the Rifampicin entry of summary_dfs_dict (built earlier in the notebook)
summary_dfs_dict["Rifampicin"]

In [None]:
# Rifampicin Model_7 rows labelled 'Assoc w R - strict'
full_results_dict["Rifampicin"]["Model_7"].query("regression_confidence=='Assoc w R - strict'")

In [None]:
# Print, one per line, the Rifampicin Model_15 mutations labelled 'Assoc w R - strict'
for mut in (
    full_results_dict["Rifampicin"]["Model_15"]
    .query("regression_confidence=='Assoc w R - strict'")["mutation"]
    .tolist()
):
    print(mut)

In [None]:
# Rifampicin Model_7 rows labelled 'Assoc w R - strict' (same query as an earlier cell)
full_results_dict["Rifampicin"]["Model_7"].query("regression_confidence=='Assoc w R - strict'")