In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse, pickle

# WHO resistance variants table (per the file name). It is not referenced again in the visible part of this notebook.
who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
# Render floats with 4 decimal places wherever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Comparison with and without Population Structure Correction, previously done

In [2]:
def compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0):
    '''
    Compare the coefficients of two models to judge whether population structure correction is necessary.

    Things compared:

    1. Among features that are BH-significant in both models, how many have coefficients of opposite sign.
    2. The Pearson correlation between the two sets of coefficients (features absent from one table count as 0).
    3. How many features have confidence intervals lying entirely on opposite sides of 0.

    Parameters
    ----------
    pop_corr : pd.DataFrame
        Results of the model WITH population structure correction. Must contain the columns
        "orig_variant", "coef", "pval", "BH_pval", "coef_LB" and "coef_UB".
    no_pop_corr : pd.DataFrame
        Results of the model WITHOUT correction, with the same columns.
    plot : bool
        If True, draw two scatter plots (coefficient vs. coefficient, and normalized difference);
        if False, print the Pearson correlation instead.
    alpha : float
        Threshold applied to the Benjamini-Hochberg p-values of both models.
    diff_thresh : float
        Currently unused; kept so that existing callers do not break.

    Returns
    -------
    float
        Pearson correlation between the coefficients with and without correction.
    '''

    compare_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]

    # merge outer to include everything (a feature may be present in only one of the two tables, depending on whether PCs are included).
    # After the merge, the "_x" suffix is the corrected model and "_y" the uncorrected one.
    plot_df = pop_corr[compare_cols].merge(no_pop_corr[compare_cols], how="outer", on="orig_variant")

    # then drop the principal components; .copy() so the column assignments below act on an independent frame
    plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")].copy()

    plot_df_sig = plot_df.query("BH_pval_x < @alpha & BH_pval_y < @alpha")
    print("   ", len(plot_df_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "significant features have conflicting coefficients")

    # NaNs are 0 coefficients, fill them in for plotting and correlation computations
    plot_df[["coef_x", "coef_y"]] = plot_df[["coef_x", "coef_y"]].fillna(0)

    # coefficient without population structure correction - coefficient with correction
    plot_df["diff"] = plot_df["coef_y"] - plot_df["coef_x"]
    # The ratio is undefined where the corrected coefficient is 0 (including the zeros filled in above);
    # use NaN there instead of +/-inf.
    plot_df["norm_diff"] = plot_df["diff"] / plot_df["coef_x"].replace(0, np.nan)

    pearson = st.pearsonr(plot_df['coef_x'], plot_df['coef_y'])[0]

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(9, 3))
        sns.scatterplot(data=plot_df, x="coef_x", y="coef_y", ax=ax[0], alpha=0.5)
        ax[0].set_xlabel("Coef, Population Structure Correction")
        ax[0].set_ylabel("Coef, No Correction")
        # set_title takes a single label string; its second positional argument is a font dict, not more text
        ax[0].set_title(f"{np.round(pearson, 4)} Pearson R")

        sns.scatterplot(data=plot_df, x="coef_x", y="norm_diff", ax=ax[1], alpha=0.5)
        ax[1].set_xlabel("Coef, Population Structure Correction")
        ax[1].set_ylabel("")
        ax[1].set_title("Norm. Difference in Coefs.")

        sns.despine()
        plt.show()
    else:
        print("   ", np.round(pearson, 4), "Pearson R")

    # get the number of features with confidence intervals that lie on different sides of 0
    print('   ', len(plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")), 'features have confidence intervals on different sides of 0')

    # TO-DO: COMPARISON OF CONFIDENCE INTERVALS
    # TO-DO: use diff_thresh to report features whose coefficients differ by more than a chosen threshold
    return pearson

# 1 variant in INH has coefficients of opposite sign in the 2 models (negative with population structure correction, positive without)

## It's a WHO category 5 (not associated with resistance) mutation, and the confidence intervals in both models exclude 0

## The other 2 variants do not have significant p-values, and the confidence intervals are not fully above or below 0

In [10]:
drug = "Isoniazid"
# NOTE(review): analysis_dir was not defined in any visible earlier cell, so this cell raised a NameError on a
# fresh kernel. The value below is the analysis root that the other cells of this notebook hard-code — confirm.
analysis_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"

# per-variant results of the models with and without population structure correction
pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

# columns kept from both tables; after the merge "_x" is the corrected model and "_y" the uncorrected one
compare_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]
plot_df = pop_corr[compare_cols].merge(no_pop_corr[compare_cols], how="outer", on="orig_variant")

# then drop the principal components
plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")]

In [18]:
# Columns to display; "_x" comes from pop_corr and "_y" from no_pop_corr (left and right sides of the merge).
show_cols = [
    "orig_variant",
    "coef_x", "BH_pval_x", "coef_LB_x", "coef_UB_x",
    "coef_y", "BH_pval_y", "coef_LB_y", "coef_UB_y",
    "confidence_WHO_2021_y",
]

# Variants whose confidence interval is entirely above 0 in one model and entirely below 0 in the other.
opposite_ci_query = "(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)"
plot_df.query(opposite_ci_query)[show_cols]

Unnamed: 0,orig_variant,coef_x,BH_pval_x,coef_LB_x,coef_UB_x,coef_y,BH_pval_y,coef_LB_y,coef_UB_y,confidence_WHO_2021_y
1317,katG_p.Arg463Leu,-0.1853,0.0005,-0.2719,-0.0975,0.1827,0.0,0.1364,0.229,5) Not assoc w R


In [12]:
# Load the per-variant model analysis table of the Moxifloxacin run (tiers=1+2, phenos=WHO, encodeAF_noSyn).
df = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=WHO/encodeAF_noSyn/model_analysis.csv")

In [13]:
# Show the rows whose orig_variant contains "PC" (presumably the principal-component covariates — confirm).
df.loc[df["orig_variant"].str.contains("PC")]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_LB,OR_UB
79,PC0,0.0362,-0.0177,0.0939,0.1195,0.2875,1.0,,,1.0368,0.9824,1.0984
119,PC3,0.0184,-0.0485,0.084,0.2941,0.6729,1.0,,,1.0185,0.9527,1.0876
989,PC1,-0.0889,-0.1317,-0.0515,0.0,0.0003,0.0086,,,0.915,0.8766,0.9498
992,PC2,-0.1073,-0.1743,-0.0352,0.0012,0.0342,1.0,,,0.8983,0.84,0.9654
993,PC4,-0.1635,-0.2228,-0.0962,0.0,0.0,0.0007,,,0.8492,0.8003,0.9083


# Catalog-Based Method Comparison

In [20]:
def get_logReg_summary(out_dir):
    '''
    Read the two result tables of a run and report variants that match the model on both metrics.

    Loads "logReg_summary.csv" and "model_analysis.csv" from out_dir, takes Sens and Spec from the
    first row of the summary, and prints every row of the analysis table whose Sens AND Spec are
    both at least those values (or a message when there is none).

    Returns the tuple (model_summary, model_analysis) of the two DataFrames as read.
    '''
    summary_path = os.path.join(out_dir, "logReg_summary.csv")
    analysis_path = os.path.join(out_dir, "model_analysis.csv")

    model_summary = pd.read_csv(summary_path)
    model_analysis = pd.read_csv(analysis_path)

    # model-level performance that a single variant has to match or beat
    summary_sens = model_summary.loc[0, "Sens"]
    summary_spec = model_summary.loc[0, "Spec"]

    meets_sens = model_analysis["Sens"] >= summary_sens
    meets_spec = model_analysis["Spec"] >= summary_spec
    better_variant = model_analysis.loc[meets_sens & meets_spec]

    if len(better_variant) > 0:
        print(better_variant)
    else:
        print("No variants have comparable sensitivity AND specificity")

    return model_summary, model_analysis

In [26]:
# Alternative run (Levofloxacin, phenos=ALL), kept commented out:
#out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Levofloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn"

# `analysis` (the per-variant table) is reused by the following cells; `summary` is displayed here.
summary, analysis = get_logReg_summary(out_dir)
summary

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.8336,0.9084,0.8951,0.871


In [27]:
# Variants whose Sens exceeds 0.9006.
# NOTE(review): the threshold is a hard-coded magic number — document where it comes from.
analysis.query("Sens > 0.9006")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy


In [29]:
# Show the rows whose orig_variant contains "PC" (presumably the principal-component covariates — confirm).
analysis.loc[analysis["orig_variant"].str.contains("PC")]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
35,PC2,0.166,0.0355,0.3034,0.0078,0.0301,1.0,,,1.1806,...,,,,,,,,,,
57,PC0,0.1346,0.0166,0.263,0.014,0.0492,1.0,,,1.144,...,,,,,,,,,,


In [28]:
# Variants whose Spec exceeds 0.9575.
# NOTE(review): the threshold is a hard-coded magic number — document where it comes from.
analysis.query("Spec > 0.9575")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
0,pncA_c.-11A>G,0.4088,0.3700,0.4467,0.0000,0.0000,0.0000,2289252.0000,1) Assoc w R,1.5050,...,0.0274,0.0395,0.9993,0.9999,44.4364,284.1349,0.9608,0.9731,0.8278,0.5163
1,pncA_p.His51Asp,0.3308,0.3054,0.3591,0.0000,0.0000,0.0000,2289091.0000,1) Assoc w R,1.3921,...,0.0095,0.0177,0.9998,1.0000,63.1368,inf,0.9823,0.9905,0.8246,0.5067
2,pncA_p.Val131fs,0.2900,0.2684,0.3154,0.0000,0.0000,0.0000,,,1.3365,...,0.0090,0.0165,0.9998,1.0000,54.4333,inf,0.9836,0.9912,0.8245,0.5064
3,pncA_p.Gln141Pro,0.2869,0.2529,0.3273,0.0000,0.0000,0.0000,2288820.0000,1) Assoc w R,1.3323,...,0.0151,0.0243,0.9995,0.9999,35.3919,344.8754,0.9759,0.9851,0.8256,0.5097
4,pncA_p.Leu172Pro,0.2779,0.2524,0.3046,0.0000,0.0000,0.0000,2288727.0000,1) Assoc w R,1.3204,...,0.0062,0.0129,1.0000,1.0000,inf,inf,0.9871,0.9938,0.8239,0.5046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,rpsA_p.Ile70Leu,-0.1249,-0.1356,-0.1120,0.0000,0.0000,0.0000,,,0.8826,...,0.0000,0.0000,0.9966,0.9982,0.0000,0.0000,1.0018,1.0034,0.1798,0.5013
855,PPE35_p.Pro670Leu,-0.1287,-0.1817,-0.0860,0.0000,0.0000,0.0009,2168604.0000,5) Not assoc w R,0.8792,...,0.0003,0.0024,0.9800,0.9842,0.0156,0.1377,1.0145,1.0196,0.1922,0.5084
856,Rv1258c_p.Gly363Val,-0.1288,-0.1399,-0.1158,0.0000,0.0000,0.0000,1406253.0000,3) Uncertain significance,0.8791,...,0.0000,0.0000,0.9965,0.9981,0.0000,0.0000,1.0019,1.0036,0.1799,0.5013
857,Rv3236c_c.546C>T,-0.1577,-0.1859,-0.1337,0.0000,0.0000,0.0000,,,0.8541,...,0.0000,0.0000,0.9916,0.9942,0.0000,0.0000,1.0058,1.0085,0.1835,0.5035


In [19]:
# Same comparison for the Moxifloxacin run (tiers=1+2, phenos=ALL, dropAF_withSyn).
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"

# The returned (model_summary, model_analysis) tuple is not assigned here; it is only shown as the cell output.
get_logReg_summary(out_dir)

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.923,0.8945,0.8983,0.9088


In [38]:
# phenotype table of the Pyrazinamide run (tiers=1+2, phenos=WHO, dropAF_withSyn)
df_pza = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn/phenos.csv")

lineages = pd.read_pickle("../data/combined_lineage_sample_IDs.pkl")

# Keep only the columns used below. The "Lineage" column of the pickle is not needed because it is recomputed
# from "Lineage_1"; .copy() makes the column assignments below act on an independent frame.
lineages = lineages[["Sample Name", "Sample ID", "Lineage_1"]].copy()

# the missing ones might be M. cannettii, most similar to L6 based on the other lineage callers
lineages["Lineage_1"] = lineages["Lineage_1"].fillna("6")

# top-level lineage = the part of Lineage_1 before the first "."
lineages["Lineage"] = lineages["Lineage_1"].astype(str).str.split(".").str[0]
lineages.loc[lineages["Lineage"].str.contains("BOV"), "Lineage"] = "M. bovis"

assert lineages["Lineage"].notna().all(), "every isolate must have a lineage assigned"

########## KEEP ONLY ISOLATES WITH BOTH LINEAGE AND PHENOTYPE DATA ##########

# inner merge: only isolates present in both the lineage table and the PZA phenotype table are kept
combined = lineages.merge(df_pza, left_on="Sample ID", right_on="sample_id")

In [42]:
# Mean of the phenotype column per lineage, highest first
# (presumably the fraction of resistant isolates if phenotype is coded 0/1 — confirm).
combined.groupby("Lineage")["phenotype"].mean().sort_values(ascending=False)

Lineage
M. bovis   0.9153
2          0.3750
1          0.1503
6          0.1163
4          0.1112
5          0.1111
3          0.0554
7          0.0000
Name: phenotype, dtype: float64

In [45]:
# Distinct raw Lineage_1 labels among the isolates whose Lineage is 'M. bovis'.
combined.query("Lineage == 'M. bovis'").Lineage_1.unique()

array(['BOV_AFRI', 'BOV'], dtype=object)