In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse, pickle

# Table of WHO resistance variants (per the file name), loaded once for the whole notebook.
who_variants = pd.read_csv(
    filepath_or_buffer="/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv"
)

# Show every float with 4 decimal places in rendered dataframes.
pd.options.display.float_format = lambda value: '%.4f' % value

# Comparison with and without Population Structure Correction, previously done

In [2]:
def compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0):
    '''
    Comparison to determine if population structure correction is necessary. Things to compare:

    1. Do all the same variants have non-zero coefficients (the input dataframes are only non-zero coefficients)?
    2. Compute Pearson correlation between coefficients.
    3. Do features that are significant in both models (BH_pval < alpha) have coefficients of the same sign?
    4. Do any features have confidence intervals lying entirely on opposite sides of 0 in the two models?

    Parameters
    ----------
    pop_corr : pd.DataFrame
        Model results WITH population structure correction. Must contain the columns
        "orig_variant", "coef", "pval", "BH_pval", "coef_LB" and "coef_UB".
    no_pop_corr : pd.DataFrame
        Model results WITHOUT population structure correction, same columns as `pop_corr`.
    plot : bool
        If True, draw two scatterplots (coefficient vs. coefficient, and normalized difference).
        If False, print the Pearson correlation instead.
    alpha : float
        Threshold applied to "BH_pval" in both models when counting conflicting signs.
    diff_thresh : float
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    float
        Pearson correlation between the two sets of coefficients (missing coefficients count as 0).

    Neither input dataframe is modified.
    '''

    compare_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]

    # merge outer to include anything (in case some features have 0 or non-zero coefficients, depending on whether PCs are included)
    # after the merge, the "_x" suffix is the corrected model and "_y" is the uncorrected model
    plot_df = pop_corr[compare_cols].merge(no_pop_corr[compare_cols], how="outer", on="orig_variant")

    # then drop the principal components; copy so the column assignments below act on an independent frame
    plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")].copy()

    plot_df_sig = plot_df.query("BH_pval_x < @alpha & BH_pval_y < @alpha")
    print("   ", len(plot_df_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "significant features have conflicting coefficients")

    # NaNs are 0 coefficients, fill them in for plotting and correlation computations
    plot_df[["coef_x", "coef_y"]] = plot_df[["coef_x", "coef_y"]].fillna(0)

    # coefficient without population structure correction - coefficient with correction
    plot_df["diff"] = plot_df["coef_y"] - plot_df["coef_x"]

    # the normalized difference is undefined where the corrected coefficient is 0 (filled in above),
    # so use NaN there instead of dividing by zero and getting +/-inf
    plot_df["norm_diff"] = plot_df["diff"] / plot_df["coef_x"].replace(0, np.nan)

    pearson = st.pearsonr(plot_df['coef_x'], plot_df['coef_y'])[0]

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(9, 3))
        sns.scatterplot(data=plot_df, x="coef_x", y="coef_y", ax=ax[0], alpha=0.5)
        ax[0].set_xlabel("Coef, Population Structure Correction")
        ax[0].set_ylabel("Coef, No Correction")
        # set_title takes a single label; its second positional argument is a fontdict, not more text
        ax[0].set_title(f"{np.round(pearson, 4)} Pearson R")

        sns.scatterplot(data=plot_df, x="coef_x", y="norm_diff", ax=ax[1], alpha=0.5)
        ax[1].set_xlabel("Coef, Population Structure Correction")
        ax[1].set_ylabel("")
        ax[1].set_title("Norm. Difference in Coefs.")

        sns.despine()
        plt.show()
    else:
        print("   ", np.round(pearson, 4), "Pearson R")

    # get the number of features with confidence intervals that lie on different sides of 0
    print('   ', len(plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")), 'features have confidence intervals on different sides of 0')

    # TO-DO: COMPARISON OF CONFIDENCE INTERVALS
    return pearson

# 1 variant in INH has coefficients of opposite sign in the 2 models (with vs. without population structure correction)

## It's a WHO category 5 mutation (not associated with resistance), and the confidence intervals exclude 0 in both models

## The other 2 variants do not have significant p-values, and the confidence intervals are not fully above or below 0

In [10]:
# Root directory of the per-drug analysis outputs.
# NOTE(review): `analysis_dir` was not defined in any visible cell, so this cell raised a NameError
# on a fresh kernel. The value is inferred from the absolute paths used by the later cells of this
# notebook -- confirm it is the intended location.
analysis_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis"

drug = "Isoniazid"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"

# NOTE: read_pickle can execute arbitrary code; only load pickles produced by this project's own pipeline.
pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

# after the outer merge, "_x" columns come from the corrected model and "_y" from the uncorrected one
compare_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]
plot_df = pop_corr[compare_cols].merge(no_pop_corr[compare_cols], how="outer", on="orig_variant")

# then drop the principal components (copy so later edits to plot_df do not touch a slice of the merge result)
plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")].copy()

In [18]:
# Columns to display: corrected-model stats ("_x"), uncorrected-model stats ("_y"), and the WHO grading.
show_cols = [
    "orig_variant",
    "coef_x", "BH_pval_x", "coef_LB_x", "coef_UB_x",
    "coef_y", "BH_pval_y", "coef_LB_y", "coef_UB_y",
    "confidence_WHO_2021_y",
]

# Keep variants whose confidence interval is entirely above 0 in one model and entirely below 0 in the other.
plot_df.query(
    "(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0)"
    " | "
    "(coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)"
).loc[:, show_cols]

Unnamed: 0,orig_variant,coef_x,BH_pval_x,coef_LB_x,coef_UB_x,coef_y,BH_pval_y,coef_LB_y,coef_UB_y,confidence_WHO_2021_y
1317,katG_p.Arg463Leu,-0.1853,0.0005,-0.2719,-0.0975,0.1827,0.0,0.1364,0.229,5) Not assoc w R


In [12]:
# Load one model_analysis table (Moxifloxacin, tiers 1+2, WHO phenotypes) to inspect its PC rows.
df = pd.read_csv(
    filepath_or_buffer="/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=WHO/encodeAF_noSyn/model_analysis.csv"
)

In [13]:
# Show only the rows whose variant name contains "PC" (the principal-component features).
df[df["orig_variant"].str.contains("PC")]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_LB,OR_UB
79,PC0,0.0362,-0.0177,0.0939,0.1195,0.2875,1.0,,,1.0368,0.9824,1.0984
119,PC3,0.0184,-0.0485,0.084,0.2941,0.6729,1.0,,,1.0185,0.9527,1.0876
989,PC1,-0.0889,-0.1317,-0.0515,0.0,0.0003,0.0086,,,0.915,0.8766,0.9498
992,PC2,-0.1073,-0.1743,-0.0352,0.0012,0.0342,1.0,,,0.8983,0.84,0.9654
993,PC4,-0.1635,-0.2228,-0.0962,0.0,0.0,0.0007,,,0.8492,0.8003,0.9083


# Catalog-Based Method Comparison

In [20]:
def get_logReg_summary(out_dir):
    '''
    Load the model-level and per-variant result tables stored in `out_dir`.

    Reads "logReg_summary.csv" and "model_analysis.csv" from `out_dir`, then reports (via print)
    any rows of the analysis table whose "Sens" AND "Spec" are both at least as high as the
    values in the first row of the summary table. If there are none, a message is printed instead.

    Returns the tuple (model_summary, model_analysis) of dataframes, unfiltered.
    '''
    model_summary, model_analysis = (
        pd.read_csv(os.path.join(out_dir, fname))
        for fname in ("logReg_summary.csv", "model_analysis.csv")
    )

    # thresholds come from the first row of the summary table
    summary_sens, summary_spec = (model_summary.loc[0, metric] for metric in ("Sens", "Spec"))

    at_least_as_good = model_analysis.query("Sens >= @summary_sens & Spec >= @summary_spec")

    if len(at_least_as_good) != 0:
        print(at_least_as_good)
    else:
        print("No variants have comparable sensitivity AND specificity")

    return model_summary, model_analysis

In [22]:
# Results directory to inspect: Levofloxacin, tiers 1+2, all phenotypes.
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Levofloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"
# Alternative run, kept for quick switching:
# out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn"

# `summary` is the model-level table, `analysis` the per-variant table; display the former.
summary, analysis = get_logReg_summary(out_dir=out_dir)
summary

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.9006,0.9575,0.9463,0.929


In [24]:
# Variants whose individual sensitivity exceeds the full model's sensitivity.
# The threshold is read from `summary` instead of being a number copied from the rounded display
# (0.9006), so it stays correct when `out_dir` is switched to another drug.
sens_threshold = summary.loc[0, "Sens"]
analysis.query("Sens > @sens_threshold")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
3,gyrA_p.Gly668Asp,0.6452,0.5473,0.7364,0.0,0.0,0.0,9304.0,5) Not assoc w R,1.9063,...,0.9308,0.9439,0.0891,0.0971,1.0253,1.0422,0.5986,0.7477,0.2593,0.5153
629,gyrA_p.Ser95Thr,-1.0187,-1.0779,-0.9568,0.0,0.0,0.0,7585.0,5) Not assoc w R,0.361,...,0.8944,0.9105,0.0867,0.0948,0.982,1.0026,0.9746,1.1823,0.7494,0.5033


In [25]:
# Variants whose individual specificity exceeds the full model's specificity.
# The threshold is read from `summary` instead of being a number copied from the rounded display
# (0.9575), so it stays correct when `out_dir` is switched to another drug.
spec_threshold = summary.loc[0, "Spec"]
analysis.query("Spec > @spec_threshold")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
0,gyrA_p.Asp94Gly,1.5328,1.4966,1.5798,0.0000,0.0000,0.0000,7582.0000,1) Assoc w R,4.6311,...,0.3633,0.3890,0.9899,0.9926,37.1506,51.2469,0.6162,0.6424,0.8701,0.6835
1,gyrA_p.Ala90Val,1.0799,1.0525,1.1229,0.0000,0.0000,0.0000,7570.0000,1) Assoc w R,2.9444,...,0.2090,0.2311,0.9858,0.9889,15.2741,19.9406,0.7788,0.8016,0.8363,0.6036
2,gyrA_p.Asp94Asn,0.6619,0.6320,0.7023,0.0000,0.0000,0.0000,7581.0000,1) Assoc w R,1.9384,...,0.0618,0.0764,0.9975,0.9987,27.5232,53.0658,0.9254,0.9399,0.8153,0.5336
4,gyrA_p.Asp94Ala,0.5983,0.5662,0.6363,0.0000,0.0000,0.0000,7582.0000,1) Assoc w R,1.8191,...,0.0664,0.0812,0.9935,0.9956,10.9727,16.9858,0.9239,0.9388,0.8133,0.5341
5,gyrA_p.Asp94Tyr,0.5244,0.4915,0.5648,0.0000,0.0000,0.0000,7581.0000,1) Assoc w R,1.6895,...,0.0347,0.0461,0.9986,0.9994,27.1517,69.8162,0.9548,0.9662,0.8103,0.5196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,Rv1129c_p.Glu135Gly,-0.0798,-0.1007,-0.0180,0.0030,0.0332,1.0000,,,0.9233,...,0.0000,0.0000,0.9994,0.9999,0.0000,0.0000,1.0001,1.0006,0.1971,0.5002
623,gyrA_c.1959G>C,-0.0968,-0.1307,-0.0594,0.0000,0.0000,0.0007,,,0.9077,...,0.0000,0.0006,0.9938,0.9959,0.0000,0.1261,1.0039,1.0061,0.2009,0.5025
624,gyrB_p.Gly520Ala,-0.1028,-0.1133,-0.0903,0.0000,0.0000,0.0000,6798.0000,3) Uncertain significance,0.9023,...,0.0000,0.0006,0.9974,0.9986,0.0000,0.3472,1.0010,1.0024,0.1983,0.5009
625,gyrA_c.2256G>C,-0.1341,-0.1877,-0.0545,0.0010,0.0135,1.0000,,,0.8745,...,0.0000,0.0000,0.9984,0.9993,0.0000,0.0000,1.0007,1.0016,0.1977,0.5006


In [19]:
# Results directory to inspect: Moxifloxacin, tiers 1+2, all phenotypes.
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"

# get_logReg_summary returns a (model_summary, model_analysis) tuple; unpack it and display only
# the summary table, otherwise the cell output is the plain-text repr of the whole tuple.
moxi_summary, moxi_analysis = get_logReg_summary(out_dir)
moxi_summary

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.923,0.8945,0.8983,0.9088
