In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse

# WHO catalogue variants with their confidence grading, one row per (drug, variant)
who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv", usecols=["drug", "confidence", "variant"])

# read_csv returns the `usecols` columns in *file* order, not in the order listed, so the columns are
# renamed by name instead of by position. This yields the same labels as the positional assignment
# whenever the file stores them as gene_name, variant, variant_category.
# NOTE(review): assumes `variant` holds the V1 names and `variant_category` the V2 names -- confirm against the file
variant_mapping = pd.read_csv("../data/v1_to_v2_variants_mapping.csv", usecols=["gene_name", "variant", "raw_variant_mapping_data.variant_category"])
variant_mapping = variant_mapping.rename(columns={"gene_name": "gene",
                                                  "variant": "V1",
                                                  "raw_variant_mapping_data.variant_category": "V2"})[["gene", "V1", "V2"]]
variant_mapping["mutation"] = variant_mapping["gene"] + "_" + variant_mapping["V2"]

# combine with the new names to get a dataframe with the confidence levels and variant mappings between 2021 and 2022
who_variants_combined = who_variants.merge(variant_mapping[["V1", "mutation"]], left_on="variant", right_on="V1", how="inner")

# check that all mutations are there
assert len(set(who_variants_combined.mutation) - set(variant_mapping.mutation)) == 0, "merged mutations missing from the mapping"

# every WHO variant must have found a V1 partner in the mapping (the inner join would silently drop the others)
assert len(set(who_variants_combined["V1"]).symmetric_difference(set(who_variants["variant"]))) == 0, "some WHO variants were not mapped"

# de-duplicate while V1 is still present, then keep only drug, confidence and the new mutation name
who_variants_combined = who_variants_combined.drop(columns="variant").drop_duplicates().drop(columns="V1")
print(who_variants_combined.shape)

# pd.set_option('display.float_format', lambda x: '%.4f' % x)
import warnings

# root directory holding one sub-directory of model outputs per drug
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

samples_summary = pd.read_csv("../data/samples_summary.csv")

# silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

(17348, 3)


# Write Final Dataframes to an Excel File

Each drug will have a separate Excel file. Each file will have 16 sheets, one for each model.

For INH, it is surprising that hadA variants show strong associations. They might be homoplastic!
For PZA, it is surprising that clpC1_c.2302T>C co-occurs with pncA_p.His57Asp.

In [101]:
# full drug name -> three-letter abbreviation
drug_abbr_dict = {
    "Delamanid": "DLM", "Bedaquiline": "BDQ", "Clofazimine": "CFZ",
    "Ethionamide": "ETH", "Linezolid": "LZD", "Moxifloxacin": "MXF",
    "Capreomycin": "CAP", "Amikacin": "AMI", "Pyrazinamide": "PZA",
    "Kanamycin": "KAN", "Levofloxacin": "LEV", "Streptomycin": "STM",
    "Ethambutol": "EMB", "Isoniazid": "INH", "Rifampicin": "RIF",
}

# drugs whose analyses are complete; currently every drug that has an abbreviation, in the same order
finished_drugs = list(drug_abbr_dict.keys())

In [138]:
# Model output directories, relative to <analysis_dir>/<drug>/. The position in this list determines
# the sheet name (Model_1 ... Model_16) in each drug's Excel file.
analysis_paths = ["tiers=1/phenos=WHO/dropAF_noSyn",
                  "tiers=1/phenos=WHO/dropAF_noSyn_unpooled",
                  "tiers=1/phenos=WHO/dropAF_withSyn",
                  "tiers=1+2/phenos=WHO/dropAF_noSyn",
                  "tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled",
                  "tiers=1+2/phenos=WHO/dropAF_withSyn",
                  "tiers=1/phenos=ALL/dropAF_noSyn",
                  "tiers=1/phenos=ALL/dropAF_noSyn_unpooled",
                  "tiers=1/phenos=ALL/dropAF_withSyn",
                  "tiers=1+2/phenos=ALL/dropAF_noSyn",
                  "tiers=1+2/phenos=ALL/dropAF_noSyn_unpooled",
                  "tiers=1+2/phenos=ALL/dropAF_withSyn",
                  "tiers=1/phenos=WHO/encodeAF_noSyn",
                  "tiers=1+2/phenos=WHO/encodeAF_noSyn",
                  "tiers=1/phenos=ALL/encodeAF_noSyn",
                  "tiers=1+2/phenos=ALL/encodeAF_noSyn",
]

# drug name -> results of all models for that drug, stacked in model order into a single dataframe
drug_analyses = {}
for drug in sorted(finished_drugs):

    # sheet name -> annotated results of one model
    all_analyses = {}

    for i, model_path in enumerate(analysis_paths):
        add_analysis = pd.read_csv(os.path.join(analysis_dir, drug, model_path, "model_analysis_with_stats.csv"))

        # annotate every row with the model settings encoded in the directory name
        add_analysis["Tier"] = 2 if "+2" in model_path else 1
        add_analysis["Phenos"] = "ALL" if "ALL" in model_path else "WHO"
        add_analysis["unpooled"] = int("unpooled" in model_path)
        add_analysis["synonymous"] = int("withSyn" in model_path)
        add_analysis["HET"] = "DROP" if "drop" in model_path else "AF"

        # remove principal components
        # NOTE(review): this drops every feature whose name contains "PC" anywhere -- confirm no variant name does
        add_analysis = add_analysis.loc[~add_analysis["mutation"].str.contains("PC", case=True)]

        all_analyses[f"Model_{i+1}"] = add_analysis

    # one workbook per drug, one sheet per model
    with pd.ExcelWriter(f"../results/{drug}.xlsx") as file:

        for key, val in all_analyses.items():
            val.to_excel(file, sheet_name=key, index=False)

    # keep the results in memory as well, so that later cells can look them up by drug name
    drug_analyses[drug] = pd.concat(list(all_analyses.values()), ignore_index=True)

# Analysis Summaries File

## Make an Excel file summarizing the results for each drug

In [139]:
# # Significant, Assoc_Resistance, Primary_Analysis, Tier
# all_permutations = list(itertools.product(*[[1, 0], [1, 0], [1, 0], [1, 2]]))
# all_permutations = np.array([np.array(array) for array in all_permutations])

# # remove the impossible ones (they'll be 0 anyway, but not worth reporting)
# # Primary analysis is not tier 2, and secondary analysis is not tier 1
# all_permutations_df = pd.DataFrame(all_permutations)
# all_permutations_df.columns = ["Significant", "Assoc_Resistance", "Primary_Analysis", "Tier"]
# all_permutations_df = all_permutations_df.query("~(Primary_Analysis==1 & Tier==2) | (Primary_Analysis==0 & Tier==1)")

# all_permutations = all_permutations_df.values
# print(all_permutations.shape)
# all_permutations

# # Significant, Assoc_Resistance, Tier
# all_permutations = list(itertools.product(*[[1, 0], [1, 0], [1, 2]]))

# Every 0/1 combination of (Significant, Assoc_Resistance, Primary_Analysis), one combination per row,
# ordered from all ones down to all zeros
all_permutations = np.array(list(itertools.product([1, 0], repeat=3)))
print(all_permutations.shape)
all_permutations

(8, 3)


array([[1, 1, 1],
       [1, 1, 0],
       [1, 0, 1],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 0]])

In [140]:
def generate_summary_data(drug, all_permutations):
    '''
    Count the mutations of one drug by significance, direction of association and analysis type.

    Arguments:

        drug = drug name; the results are read from ../results/{drug}.xlsx (one sheet per model)
        all_permutations = 2D array-like whose rows are (Significant, Assoc_Resistance, Primary_Analysis)
                           combinations that must appear in the output, with a count of 0 if never observed

    Returns a dataframe with the columns Count, Significant, Primary_Analysis, Assoc_Resistance, sorted by
    Significant, Assoc_Resistance and Primary_Analysis in descending order.
    '''
    summary_cols = ["Significant", "Assoc_Resistance", "Primary_Analysis"]

    # read in Excel file with all the sheets (sheet name -> dataframe)
    sheets = pd.read_excel(f"../results/{drug}.xlsx", sheet_name=None)

    # combine sheets into a single dataframe and keep only the first instance of every mutation,
    # i.e. the row from the earliest sheet in which the mutation appears
    df = pd.concat(list(sheets.values())).drop_duplicates("mutation", keep="first")

    # annotation for primary vs. secondary analysis
    is_primary = ((df["Tier"] == 1) &
                  (df["Phenos"] == 'WHO') &
                  (df["unpooled"] == 0) &
                  (df["synonymous"] == 0) &
                  (df["HET"] == 'DROP'))
    df["Primary_Analysis"] = is_primary.astype(int)

    # OR > 1 or OR < 1 --> associated with resistance or susceptibility
    df["Assoc_Resistance"] = (df["Odds_Ratio"] > 1).astype(int)

    # number of mutations for every observed combination
    counts = df.groupby(summary_cols).size()

    # Add the requested combinations that were never observed with a count of 0. This is done with a single
    # reindex: the earlier row-by-row search only inspected existing rows, so it looped forever whenever the
    # missing combination sorted after all observed ones (e.g. when only [0, 0, 0] was absent).
    requested = pd.MultiIndex.from_tuples([tuple(int(flag) for flag in combination) for combination in all_permutations],
                                          names=summary_cols)
    full_index = counts.index.union(requested).set_names(summary_cols)

    summary = (counts.reindex(full_index, fill_value=0)
                     .rename("Count")
                     .reset_index()
                     .sort_values(by=summary_cols, ascending=[False, False, False])
                     .reset_index(drop=True))

    # same column layout as before: the count first, then the flags
    return summary[["Count", "Significant", "Primary_Analysis", "Assoc_Resistance"]]

In [141]:
# drug name -> summary table of mutation counts, filled in alphabetical order of the drug names
analysis_summaries = {}

for drug in np.sort(finished_drugs):
    summary_table = generate_summary_data(drug, all_permutations)
    analysis_summaries[drug] = summary_table
    print(f"Finished {drug}!")

# write results to an Excel file, where each sheet name is a drug
with pd.ExcelWriter("../results/ALLDrugs_summaries.xlsx") as writer:
    for drug_sheet, drug_summary in analysis_summaries.items():
        drug_summary.to_excel(writer, sheet_name=drug_sheet, index=False)

Finished Amikacin!
Finished Bedaquiline!
Finished Capreomycin!
Finished Clofazimine!
Finished Delamanid!
Finished Ethambutol!
Finished Ethionamide!
Finished Isoniazid!
Finished Kanamycin!
Finished Levofloxacin!
Finished Linezolid!
Finished Moxifloxacin!
Finished Pyrazinamide!
Finished Rifampicin!
Finished Streptomycin!


# Volcano Plots

In [None]:
def volcano_plot(df, drug, plot_x="log-OR", plot_y="log_neg_log_pval", pval_col="BH_pval", color_col="Significant", or_thresh=0, saveFig=None):
    '''
    This function generates a volcano scatterplot of p-values against odds ratios to visualize the results for each drug. 
    
    It generates 2 plots for each drug: one for the primary analyses and another for the additional variants picked up by the secondary analyses. Separating them makes the secondary results
    easier to see because the effect sizes for those are much smaller, and they can get obscured by the primary analysis results. 
    
    Arguments:
    
        df = dataframe of results. Must contain Odds_Ratio, Tier, Phenos, unpooled, synonymous, HET, pval_col and color_col
        drug = drug name for plotting
        plot_x = column to plot as the x variable. Can be a column of df or one of the computed columns listed below
        plot_y = column to plot as the y variable. Can be a column of df or one of the computed columns listed below
        pval_col = p-value column to plot (i.e. p-value, Bonferroni, or Benjamini-Hochberg)
        color_col = column in df to color points by. Default is Significant. Columns holding only 0/1 are drawn in
                    blue (1) and light gray (0); any other column uses the default seaborn palette
        or_thresh = to clean up the plot, you can exclude some mutations with very small effect sizes. or_thresh should be a value in [0, ∞). i.e. a threshold of 0.01 will
                    exclude variants with odds ratios in [0.99, 1.01]. With the default of 0, only odds ratios of exactly 1 are excluded
        saveFig = file name to save plot to. If it is None, the plot is rendered in the notebook.
        
    Primary analysis: Tier = 1, Phenos = WHO, unpooled = False, synonymous = False, HET mutations = DROP
    
    The function computes the log odds ratio ("log-OR"), negative logarithm of p-values ("neg_log_pval"), and the logarithm of the negative log of 
    p-values ("log_neg_log_pval"). All of these can be plotted. 
    
    Returns the filtered dataframe with the computed columns and an "Analysis" column (Primary or Secondary). df itself is not modified.
    '''
    lower, upper = 1 - or_thresh, 1 + or_thresh
    
    # explicit copy so that the column assignments below never touch (or warn about) the caller's dataframe
    plot_df = df.query("Odds_Ratio < @lower | Odds_Ratio > @upper").copy()
    
    if or_thresh > 0:
        print(f"Excluded {len(df) - len(plot_df)} variants from plotting")
    
    # p-values of exactly 0 have no logarithm: shift every p-value by the smallest positive one
    if plot_df[pval_col].min() == 0:
        positive_pvals = plot_df.loc[plot_df[pval_col] > 0, pval_col]
        
        # if every p-value is 0 there is no positive value to borrow, so use the smallest positive float
        offset = positive_pvals.min() if len(positive_pvals) > 0 else np.finfo(float).tiny
        plot_df[pval_col] += offset
        
    plot_df["neg_log_pval"] = -np.log(plot_df[pval_col])
    plot_df["log_neg_log_pval"] = np.log(plot_df["neg_log_pval"])
    plot_df["log-OR"] = np.log(plot_df["Odds_Ratio"])
    
    is_primary = ((plot_df["Tier"]==1) & 
                  (plot_df["Phenos"]=='WHO') & 
                  (plot_df["unpooled"]==0) & 
                  (plot_df["synonymous"]==0) & 
                  (plot_df["HET"]=='DROP'))
    plot_df["Analysis"] = np.where(is_primary, "Primary", "Secondary")
    
    # the blue / light gray palette is keyed on 0 and 1, so it can only be used for binary columns
    if set(plot_df[color_col].dropna().unique()) <= {0, 1}:
        palette = {1:sns.color_palette("tab10").as_hex()[0], 0:"lightgray"}
    else:
        palette = None
    
    # value around which the x-axis is centered: no effect is 0 on the log scale and 1 for raw odds ratios
    x_is_log = plot_x[:3] == 'log'
    center = 0 if x_is_log else 1
    x_label = "Regression Log-Odds" if x_is_log else "Odds Ratio"
    y_label = "Log(-Log(p-value))" if plot_y[:3] == 'log' else "-Log(p-value)"
        
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    
    for axis, analysis in zip(ax, ["Primary", "Secondary"]):
        
        panel_df = plot_df.query("Analysis == @analysis")
        
        sns.scatterplot(data=panel_df, 
                        x=plot_x, 
                        y=plot_y, 
                        alpha=0.8,
                        hue=color_col, 
                        linewidth=0.25,
                        edgecolor='white',
                        s=30,
                        palette=palette,
                        ax=axis
                       )
        
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)
        axis.legend().set_visible(False)
        axis.set_title(f"{analysis} Analysis Results for {drug}")
        
        # symmetric x-limits around the center with a 10% margin. Skipped when the panel has no finite,
        # non-zero spread (e.g. no variants in this analysis), because NaN limits would raise an error
        bound = panel_df[plot_x].sub(center).abs().max() * 1.1
        if np.isfinite(bound) and bound > 0:
            axis.set_xlim(center - bound, center + bound)

    sns.despine()
    
    if saveFig is not None:
        fig.savefig(saveFig, dpi=300, bbox_inches="tight")
    else:
        plt.show()
        
    return plot_df

In [None]:
# for drug in finished_drugs:
#     _ = volcano_plot(drug_analyses[drug], drug, "log-OR", "log_neg_log_pval", "BH_pval", "Analysis", 0, saveFig=f"../results/{drug}_volcano.png")

# TODO: Get model metrics for every drug, and write them to a single CSV

In [3]:
# results_df = pd.DataFrame(columns=["Drug", "Sensitivity", "Sens_Lower", "Sens_Upper", "Specificity", "Spec_Lower", "Spec_Upper", 
#                                    "accuracy", "accuracy_Lower", "accuracy_Upper", "AUC", "AUC_Lower", "AUC_Upper"]).set_index("Drug")

# def get_model_summary(drug, analyses_dict, results_df):
    
#     var_lst = ["Sens", "Spec", "AUC", "accuracy"]
#     name_lst = ["Sensitivity", "Specificity", "AUC", "accuracy"]
#     summary_df = analyses_dict[drug]    
    
#     # add the actual values
#     results_df.loc[drug, name_lst] = summary_df.query("BS==0")[var_lst].values[0]
    
#     # add the confidence intervals
#     for i, variable in enumerate(var_lst):
#         lower, upper = np.percentile(summary_df.query("BS==1")[variable], q=[2.5, 97.5])
#         results_df.loc[drug, [variable + "_Lower", variable + "_Upper"]] = [lower, upper]
#         assert lower <= results_df.loc[drug, name_lst[i]]
#         assert upper >= results_df.loc[drug, name_lst[i]]
    
#     return results_df

In [4]:
# analyses_dict = {}
# for drug in os.listdir(analysis_dir):
    
#     if os.path.isfile(os.path.join(analysis_dir, drug, "core_logReg_summary.csv")):
#         df = pd.read_csv(os.path.join(analysis_dir, drug, "core_logReg_summary.csv"))
#         if len(df) == 1001:
#             analyses_dict[drug] = df
            
# for drug in list(analyses_dict.keys()):
#     results_df = get_model_summary(drug, analyses_dict, results_df)