In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse

# presumably the WHO 2021 catalogue confidence gradings (inferred from the file name) — TODO confirm
who_variants_combined = pd.read_csv("who_confidence_2021.csv")

# pd.set_option('display.float_format', lambda x: '%.4f' % x)
samples_summary = pd.read_csv("../data/samples_summary.csv")
# root directory of the model outputs; the export cell below reads <analysis_dir>/<drug>/BINARY/<model_path>/
# NOTE(review): absolute, machine-specific path — the notebook only runs where this mount exists
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

import warnings
# NOTE(review): silences every warning category for the rest of the session, including pandas
# chained-assignment and numpy divide-by-zero warnings
warnings.filterwarnings(action='ignore')

# Write Final Dataframes for the Binary Analysis to an Excel File

Each drug will have a separate Excel file. Each file will have 16 sheets, one for each model.

For INH, it is surprising that hadA variants show strong associations with resistance; these variants might be homoplastic.
For PZA, it is surprising that clpC1_c.2302T>C co-occurs with pncA_p.His57Asp.

In [21]:
# Drugs whose model runs are complete; later cells iterate over this list.
finished_drugs = [
    "Delamanid",
    "Bedaquiline",
    "Clofazimine",
    "Ethionamide",
    "Linezolid",
    "Moxifloxacin",
    "Capreomycin",
    "Amikacin",
    "Pyrazinamide",
    "Kanamycin",
    "Levofloxacin",
    "Streptomycin",
    "Ethambutol",
    "Isoniazid",
    "Rifampicin",
]

# Full drug name -> three-letter abbreviation.
drug_abbr_dict = dict(
    Delamanid="DLM",
    Bedaquiline="BDQ",
    Clofazimine="CFZ",
    Ethionamide="ETH",
    Linezolid="LZD",
    Moxifloxacin="MXF",
    Capreomycin="CAP",
    Amikacin="AMI",
    Pyrazinamide="PZA",
    Kanamycin="KAN",
    Levofloxacin="LEV",
    Streptomycin="STM",
    Ethambutol="EMB",
    Isoniazid="INH",
    Rifampicin="RIF",
)

In [22]:
# The 16 model configurations, in sheet order (Model_1 ... Model_16). The first entry is the
# primary analysis (Tier 1, WHO phenotypes, pooled, no synonymous variants, heterozygous calls dropped).
analysis_paths = ["tiers=1/phenos=WHO/dropAF_noSyn",
                  "tiers=1/phenos=WHO/dropAF_noSyn_unpooled",
                  "tiers=1/phenos=WHO/dropAF_withSyn",
                  "tiers=1+2/phenos=WHO/dropAF_noSyn",
                  "tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled",
                  "tiers=1+2/phenos=WHO/dropAF_withSyn",
                  "tiers=1/phenos=ALL/dropAF_noSyn",
                  "tiers=1/phenos=ALL/dropAF_noSyn_unpooled",
                  "tiers=1/phenos=ALL/dropAF_withSyn",
                  "tiers=1+2/phenos=ALL/dropAF_noSyn",
                  "tiers=1+2/phenos=ALL/dropAF_noSyn_unpooled",
                  "tiers=1+2/phenos=ALL/dropAF_withSyn",
                  "tiers=1/phenos=WHO/encodeAF_noSyn",
                  "tiers=1+2/phenos=WHO/encodeAF_noSyn",
                  "tiers=1/phenos=ALL/encodeAF_noSyn",
                  "tiers=1+2/phenos=ALL/encodeAF_noSyn",
]

# make sure the output directory exists before any workbook is written
os.makedirs("../results", exist_ok=True)

# drug name -> combined results dataframe (all models, first occurrence of each mutation kept).
# This is what the volcano-plot cell further down looks up with drug_analyses[drug].
drug_analyses = {}
for drug in np.sort(finished_drugs):
    
    all_analyses = {}
    
    for i, model_path in enumerate(analysis_paths):
        add_analysis = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY", model_path, "model_analysis_with_stats.csv"))

        # annotate every row with the model configuration encoded in the directory name
        add_analysis["Tier"] = 2 if "+2" in model_path else 1
        add_analysis["Phenos"] = "ALL" if "ALL" in model_path else "WHO"
        add_analysis["unpooled"] = int("unpooled" in model_path)
        add_analysis["synonymous"] = int("withSyn" in model_path)
        add_analysis["HET"] = "DROP" if "drop" in model_path else "AF"
        
        # remove principal components
        add_analysis = add_analysis.loc[~add_analysis["mutation"].str.contains("PC", case=True)]
        
        all_analyses[f"Model_{i+1}"] = add_analysis
    
    # one workbook per drug, one sheet per model
    with pd.ExcelWriter(f"../results/{drug}.xlsx") as file:

        for key, val in all_analyses.items():
            val.to_excel(file, sheet_name=key, index=False)

    # Keep the combined table in memory. It is combined the same way generate_summary_data combines
    # the sheets it reads back: concatenate in model order and keep each mutation's first occurrence,
    # so a variant is attributed to the earliest model that reports it.
    # NOTE(review): confirm this is the frame intended for the volcano plots.
    drug_analyses[str(drug)] = (
        pd.concat(list(all_analyses.values()), ignore_index=True)
          .drop_duplicates("mutation", keep="first")
    )

# Analysis Summaries File

## Make an Excel file summarizing the results for each drug

## Break down results by FDR (Significant), OR > 1, Primary Analysis, PPV ≥ 25%, and N_resistant ≥ 5

True positives (the `TP` column) = N_resistant, so the N_resistant ≥ 5 criterion is applied as `TP >= 5`.

In [139]:
# Enumerate every 0/1 combination of the three summary flags so that combinations with no
# variants can still appear in the summary table with a count of zero. The zero rows are
# optional; they just give every drug's table the same length, which is easier to read.

# Column order: Significant, OR > 1, Primary_Analysis
all_permutations = np.array(list(itertools.product([1, 0], repeat=3)))
print(all_permutations.shape)
all_permutations

(8, 3)


array([[1, 1, 1],
       [1, 1, 0],
       [1, 0, 1],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 0]])

In [19]:
# sheet_name=None loads every sheet of the workbook as a dict of {sheet name: dataframe}
df = pd.read_excel(f"../results/Bedaquiline.xlsx", sheet_name=None)

# stack the sheets in workbook order; each mutation keeps only its first occurrence
df = pd.concat(list(df.values())).drop_duplicates("mutation", keep="first")

# 1 for rows from the primary analysis configuration, 0 for everything else (secondary analyses)
df["Primary_Analysis"] = (
    (df["Tier"] == 1)
    & (df["Phenos"] == 'WHO')
    & (df["unpooled"] == 0)
    & (df["synonymous"] == 0)
    & (df["HET"] == 'DROP')
).astype(int)

# OR > 1 or OR < 1 --> associated with resistance or susceptibility
df["OR>1"] = (df["Odds_Ratio"] > 1).astype(int)

# 1 for variants meeting all grading criteria: TP >= 5, PPV lower bound >= 0.25, OR > 1, and significant
df["PASS"] = (
    (df["TP"] >= 5)
    & (df["PPV_LB"] >= 0.25)
    & (df["Odds_Ratio"] > 1)
    & (df["Significant"] == 1)
).astype(int)

In [20]:
# display the combined Bedaquiline table with the Primary_Analysis, OR>1 and PASS flags added above
df

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,...,LR-_LB,LR-_UB,Tier,Phenos,unpooled,synonymous,HET,Primary_Analysis,OR>1,PASS
0,Rv0678_lof,lof,,,4.504952,3.903240,5.266060,2.808526e-76,4.774494e-74,4.774494e-74,...,0.542028,0.608505,1,WHO,0,0,DROP,1,1,1
1,mmpS5_c.-74G>T,upstream_gene_variant,778979,3) Uncertain significance,1.700938,1.528633,2.019422,3.629015e-15,6.854806e-14,6.169325e-13,...,0.949391,0.975535,1,WHO,0,0,DROP,1,1,1
2,Rv0678_p.Met146Thr,missense_variant,779426,3) Uncertain significance,1.450660,1.382118,1.502118,1.935615e-65,1.645273e-63,3.290546e-63,...,0.980664,0.994998,1,WHO,0,0,DROP,1,1,1
3,Rv0678_p.Leu117Arg,missense_variant,779336;779339,3) Uncertain significance,1.351214,1.246566,1.408410,1.258784e-20,4.279866e-19,2.139933e-18,...,0.985028,0.997272,1,WHO,0,0,DROP,1,1,1
4,pepQ_lof,lof,,,1.313033,1.217284,1.364183,3.292058e-15,6.995624e-14,5.596499e-13,...,0.986526,0.997986,1,WHO,0,0,DROP,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Rv1979c_p.Ser425Ala,missense_variant,2221892,,0.994501,0.989337,1.000017,6.390134e-02,7.381079e-02,1.000000e+00,...,,,2,ALL,0,0,AF,0,0,0
300,Rv1979c_p.Asp414His,missense_variant,2221925,,0.992515,0.988979,1.000019,3.634261e-02,6.115459e-02,1.000000e+00,...,,,2,ALL,0,0,AF,0,0,0
301,lpqB_lof,lof,,,0.992508,0.982633,0.999450,6.843857e-02,7.860348e-02,1.000000e+00,...,,,2,ALL,0,0,AF,0,0,0
336,lpqB_p.Pro425Ser,missense_variant,3623638,,0.989749,0.985986,1.000018,3.314178e-02,7.097806e-02,1.000000e+00,...,,,2,ALL,0,0,AF,0,0,0


In [140]:
def generate_summary_data(drug, all_permutations):
    '''
    Count the variants of one drug in every combination of the summary flags.

    The drug's workbook (../results/{drug}.xlsx, one sheet per model) is read, the sheets are stacked,
    and only the first occurrence of each mutation is kept. Three 0/1 flags are then derived:

        Primary_Analysis = Tier 1, WHO phenotypes, pooled, no synonymous variants, HET = DROP
        OR>1             = Odds_Ratio > 1 (associated with resistance rather than susceptibility)
        PASS             = TP >= 5, PPV_LB >= 0.25, Odds_Ratio > 1 and Significant == 1

    Arguments:

        drug = drug name, used to locate the workbook
        all_permutations = 2-D array with one row per flag combination that must appear in the output.
                    With 3 columns the flags are (Significant, OR>1, Primary_Analysis); with 4 columns
                    they are (PASS, Significant, OR>1, Primary_Analysis).

    Returns a dataframe with a Count column followed by the flag columns, one row per row of
    all_permutations (count 0 where no variant has that combination), sorted in descending flag order.

    Raises ValueError if all_permutations is not 2-D with 3 or 4 columns.
    '''
    candidate_cols = ["PASS", "Significant", "OR>1", "Primary_Analysis"]

    all_permutations = np.asarray(all_permutations)
    if all_permutations.ndim != 2 or all_permutations.shape[1] not in (3, 4):
        raise ValueError("all_permutations must be a 2-D array with 3 or 4 columns")

    # the grouping columns are the trailing candidates, so their number matches the grid width
    summary_cols = candidate_cols[-all_permutations.shape[1]:]

    # read in Excel file with all the sheets
    sheets = pd.read_excel(f"../results/{drug}.xlsx", sheet_name=None)

    # combine sheets into a single dataframe and keep only the first instance of every mutation.
    # ignore_index avoids the repeated row labels that stacking the sheets would otherwise produce.
    df = pd.concat(list(sheets.values()), ignore_index=True).drop_duplicates("mutation", keep="first")

    # annotation for primary vs. secondary analysis
    df["Primary_Analysis"] = ((df["Tier"] == 1) &
                              (df["Phenos"] == 'WHO') &
                              (df["unpooled"] == 0) &
                              (df["synonymous"] == 0) &
                              (df["HET"] == 'DROP')).astype(int)

    # OR > 1 or OR < 1 --> associated with resistance or susceptibility
    df["OR>1"] = (df["Odds_Ratio"] > 1).astype(int)

    # annotation for variants that meet every grading criterion
    df["PASS"] = ((df["TP"] >= 5) &
                  (df["PPV_LB"] >= 0.25) &
                  (df["Odds_Ratio"] > 1) &
                  (df["Significant"] == 1)).astype(int)

    # number of variants per observed flag combination (rows with a missing flag are not counted)
    counts = df.groupby(summary_cols).size()

    # Expand to the full grid in one step. Combinations that were not observed get a count of 0,
    # wherever they fall in the ordering.
    full_grid = pd.MultiIndex.from_arrays(list(all_permutations.T), names=summary_cols)
    summary = counts.reindex(full_grid, fill_value=0).rename("Count").reset_index()

    summary = summary.sort_values(by=summary_cols, ascending=False)

    # rearrange columns so that Count comes first
    summary = summary[summary.columns[::-1]].reset_index(drop=True)

    return summary.rename(columns={"Tier": "Gene_Tier"})

In [141]:
# Build one summary table per drug, in alphabetical order, printing progress as each completes
analysis_summaries = {}

for drug in np.sort(finished_drugs):
    drug_summary = generate_summary_data(drug, all_permutations)
    analysis_summaries[drug] = drug_summary
    print(f"Finished {drug}!")

# Collect every table in a single workbook: the sheet name is the drug name
with pd.ExcelWriter("../results/ALLDrugs_summaries.xlsx") as writer:

    for sheet_label, summary_table in analysis_summaries.items():
        summary_table.to_excel(writer, sheet_name=sheet_label, index=False)

Finished Amikacin!
Finished Bedaquiline!
Finished Capreomycin!
Finished Clofazimine!
Finished Delamanid!
Finished Ethambutol!
Finished Ethionamide!
Finished Isoniazid!
Finished Kanamycin!
Finished Levofloxacin!
Finished Linezolid!
Finished Moxifloxacin!
Finished Pyrazinamide!
Finished Rifampicin!
Finished Streptomycin!


# Volcano Plots

In [None]:
def volcano_plot(df, drug, plot_x="log-OR", plot_y="log_neg_log_pval", pval_col="BH_pval", color_col="Significant", or_thresh=0, saveFig=None):
    '''
    This function generates a volcano scatterplot of p-values against odds ratios to visualize the results for each drug. 
    
    It generates 2 panels for each drug: one for the primary analysis and another for the secondary analyses. Separating them makes the secondary results
    easier to see because the effect sizes for those are much smaller, and they can get obscured by the primary analysis results. 
    
    Arguments:
    
        df = dataframe of results. Must contain Odds_Ratio, Significant, Tier, Phenos, unpooled, synonymous, HET and the column named by pval_col. It is not modified.
        drug = drug name, used in the panel titles
        plot_x = column to plot as the x variable. Names starting with "log" are centered on 0, anything else on 1.
        plot_y = column to plot as the y variable
        pval_col = p-value column to transform (i.e. p-value, Bonferroni, or Benjamini-Hochberg)
        color_col = currently unused: points are always colored by the Significant column (1 = blue, 0 = light gray).
                    The parameter is kept so that existing calls keep working.
        or_thresh = to clean up the plot, you can exclude some mutations with very small effect sizes. or_thresh should be a value in [0, ∞). i.e. a threshold of 0.01 will
                    exclude variants with odds ratios in [0.99, 1.01]
        saveFig = file name to save the plot to. If it is None, the plot is rendered in the notebook. When a file name is given, the figure is closed after saving.
        
    Primary analysis: Tier = 1, Phenos = WHO, unpooled = False, synonymous = False, HET mutations = DROP
    
    The function computes the log odds ratio ("log-OR"), the negative logarithm of the p-values ("neg_log_pval"), and the logarithm of that ("log_neg_log_pval").
    All of these can be plotted. 
    
    Returns the filtered dataframe with those three columns and an "Analysis" column (Primary / Secondary) added.
    '''
    lower, upper = 1 - or_thresh, 1 + or_thresh

    # copy AFTER filtering so that the column assignments below act on an independent frame
    plot_df = df.query("Odds_Ratio < @lower | Odds_Ratio > @upper").copy()
    
    if or_thresh > 0:
        print(f"Excluded {len(df) - len(plot_df)} variants from plotting")
    
    # p-values of exactly 0 cannot be log-transformed: shift every p-value by the smallest positive one
    if plot_df[pval_col].min() == 0:
        positive_pvals = plot_df.loc[plot_df[pval_col] > 0, pval_col]
        if len(positive_pvals) > 0:
            plot_df[pval_col] += positive_pvals.min()
        else:
            # every p-value is 0, so there is no positive value to borrow
            plot_df[pval_col] += np.finfo(float).tiny
        
    plot_df["neg_log_pval"] = -np.log(plot_df[pval_col])
    plot_df["log_neg_log_pval"] = np.log(plot_df["neg_log_pval"])
    plot_df["log-OR"] = np.log(plot_df["Odds_Ratio"])

    is_primary = ((plot_df["Tier"]==1) & 
                  (plot_df["Phenos"]=='WHO') & 
                  (plot_df["unpooled"]==0) & 
                  (plot_df["synonymous"]==0) & 
                  (plot_df["HET"]=='DROP'))
    plot_df["Analysis"] = np.where(is_primary, "Primary", "Secondary")

    # the x axis is centered on the "no effect" value: 0 on a log scale, 1 for raw odds ratios
    x_is_log = plot_x[:3] == 'log'
    center = 0 if x_is_log else 1
    x_label = "Regression Log-Odds" if x_is_log else "Regression Odds Ratio"
    y_label = "Log(-Log(p-value))" if plot_y[:3] == 'log' else "-Log(p-value)"

    significance_palette = {1: sns.color_palette("tab10").as_hex()[0], 0: "lightgray"}
        
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    for axis, analysis in zip(ax, ["Primary", "Secondary"]):
        panel_df = plot_df.loc[plot_df["Analysis"] == analysis]

        sns.scatterplot(data=panel_df, 
                        x=plot_x, 
                        y=plot_y, 
                        alpha=0.8,
                        hue="Significant", 
                        linewidth=0.25,
                        edgecolor='white',
                        s=30,
                        palette=significance_palette,
                        legend=False,
                        ax=axis
                       )

        axis.set_title(f"{analysis} Analysis Results for {drug}")
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)

        # Symmetric x limits around the center. They are skipped when the panel is empty (NaN) or
        # holds an infinite value such as log(0), because set_xlim rejects NaN/Inf limits.
        half_width = (panel_df[plot_x] - center).abs().max() * 1.1
        if np.isfinite(half_width) and half_width > 0:
            axis.set_xlim(center - half_width, center + half_width)

    sns.despine(fig=fig)
    
    if saveFig is not None:
        fig.savefig(saveFig, dpi=300, bbox_inches="tight")
        # release the figure so that saving many drugs in a loop does not accumulate open figures
        plt.close(fig)
    else:
        plt.show()
        
    return plot_df

In [None]:
# Save a two-panel volcano figure for every finished drug; the returned dataframe is discarded.
# NOTE(review): this needs drug_analyses[drug] to hold the results dataframe of each drug — confirm it is populated before running.
for drug in finished_drugs:
    _ = volcano_plot(
        df=drug_analyses[drug],
        drug=drug,
        plot_x="log-OR",
        plot_y="log_neg_log_pval",
        pval_col="BH_pval",
        color_col="Analysis",
        or_thresh=0,
        saveFig=f"../results/{drug}_volcano.png",
    )

# CC vs. CC-ATU Analyses

## This section is not finalized yet

In [7]:
# Delamanid model outputs for the CC and CC-ATU comparison (Tier 1, heterozygous calls dropped, no synonymous variants)
out_dir = "/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Delamanid/ATU/tiers=1/dropAF_noSyn"

# both result tables live in the same directory and are loaded the same way
cc_analysis, cc_atu_analysis = (
    pd.read_csv(os.path.join(out_dir, file_name))
    for file_name in ("model_analysis_CC.csv", "model_analysis_CC_ATU.csv")
)
cc_analysis.shape

(454, 11)

In [10]:
# CC model: rows with BH-adjusted p-value < 0.05 whose mutation name contains "PC" (the principal components).
# The "PC" mask is computed on the full frame and applied to the BH-filtered subset.
cc_analysis.query("BH_pval < 0.05").loc[cc_analysis["mutation"].str.contains("PC")]

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
0,PC0,0.270883,0.107039,0.448694,0.00094,0.005333,0.426666,1.311122,1.112977,1.566265,
1,PC4,0.254505,0.080691,0.419788,0.001351,0.007573,0.613431,1.289823,1.084036,1.52164,
5,PC1,0.151696,-0.050061,0.287617,0.036386,0.042249,1.0,1.163806,0.951172,1.333246,
453,PC3,-0.26701,-0.440578,-0.094574,0.00151,0.008258,0.68543,0.765666,0.643665,0.90976,


In [18]:
# CC model: shape of the subset whose coefficient bounds are both positive or both negative (interval excludes 0)
cc_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").shape

(63, 11)

In [19]:
# CC-ATU model: shape of the subset whose coefficient bounds are both positive or both negative (interval excludes 0)
cc_atu_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").shape

(59, 11)

In [21]:
# features whose coefficient interval excludes 0 in the CC-ATU model but not in the CC model
set(cc_atu_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").mutation) - set(cc_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").mutation)



{'PC1',
 'ddn_p.Gly81Ser',
 'ddn_p.Tyr65Ser',
 'fbiA_p.Arg321Ser',
 'fgd1_p.Arg64Ser',
 'fgd1_p.Val170Met'}

In [22]:
# features whose coefficient interval excludes 0 in the CC model but not in the CC-ATU model
set(cc_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").mutation) - set(cc_atu_analysis.query("((coef_LB > 0 & coef_UB > 0) | (coef_LB < 0 & coef_UB < 0))").mutation)


{'PC3',
 'Rv2983_p.Gly145Arg',
 'ddn_p.Gly34Arg',
 'fbiA_c.-97A>T',
 'fbiB_p.Arg174His',
 'fbiB_p.Arg253His',
 'fbiB_p.Leu447Arg',
 'fbiC_p.Ala333Val',
 'fbiC_p.Ser762Asn',
 'fbiC_p.Thr273Ala'}

In [12]:
# CC-ATU model: rows with BH-adjusted p-value < 0.05 whose mutation name contains "PC" (the principal components).
# The "PC" mask is computed on the full frame and applied to the BH-filtered subset.
cc_atu_analysis.query("BH_pval < 0.05").loc[cc_atu_analysis["mutation"].str.contains("PC")]

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
0,PC4,0.228386,0.158683,0.293854,5.16107e-11,1.952605e-09,2.343126e-08,1.25657,1.171967,1.341588,
1,PC1,0.144476,0.073678,0.210999,1.74821e-05,0.0002736853,0.007936874,1.155434,1.07646,1.234912,


# TODO: Get model metrics for every drug, and write them to a single CSV

In [3]:
# results_df = pd.DataFrame(columns=["Drug", "Sensitivity", "Sens_Lower", "Sens_Upper", "Specificity", "Spec_Lower", "Spec_Upper", 
#                                    "accuracy", "accuracy_Lower", "accuracy_Upper", "AUC", "AUC_Lower", "AUC_Upper"]).set_index("Drug")

# def get_model_summary(drug, analyses_dict, results_df):
    
#     var_lst = ["Sens", "Spec", "AUC", "accuracy"]
#     name_lst = ["Sensitivity", "Specificity", "AUC", "accuracy"]
#     summary_df = analyses_dict[drug]    
    
#     # add the actual values
#     results_df.loc[drug, name_lst] = summary_df.query("BS==0")[var_lst].values[0]
    
#     # add the confidence intervals
#     for i, variable in enumerate(var_lst):
#         lower, upper = np.percentile(summary_df.query("BS==1")[variable], q=[2.5, 97.5])
#         results_df.loc[drug, [variable + "_Lower", variable + "_Upper"]] = [lower, upper]
#         assert lower <= results_df.loc[drug, name_lst[i]]
#         assert upper >= results_df.loc[drug, name_lst[i]]
    
#     return results_df

In [4]:
# analyses_dict = {}
# for drug in os.listdir(analysis_dir):
    
#     if os.path.isfile(os.path.join(analysis_dir, drug, "core_logReg_summary.csv")):
#         df = pd.read_csv(os.path.join(analysis_dir, drug, "core_logReg_summary.csv"))
#         if len(df) == 1001:
#             analyses_dict[drug] = df
            
# for drug in list(analyses_dict.keys()):
#     results_df = get_model_summary(drug, analyses_dict, results_df)