In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse

who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Check for analysis files to see if anything needs to be rerun

In [64]:
genos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/full_genotypes'
phenos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/phenotypes'
analysis_dir = '/n/data1/hms/dbmi/farhat/ye12/who/analysis'

pheno_drugs = os.listdir(phenos_dir)
geno_drugs = os.listdir(genos_dir)

drugs_for_analysis = list(set(geno_drugs).intersection(set(pheno_drugs)))
drugs_for_analysis = [drug.split("=")[1] for drug in drugs_for_analysis]
print(len(drugs_for_analysis), "drugs with phenotypes and genotypes")

15 drugs with phenotypes and genotypes


In [65]:
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis"
mid_dir = "tiers=1+2/phenos=WHO"
#model_prefix = "dropAF_withSyn_poolLOF"
model_prefix = "dropAF_withSyn"

In [66]:
for drug in drugs_for_analysis:
    analysis_dir = os.path.join(out_dir, drug, mid_dir, model_prefix)
    
    try:
        fNames = os.listdir(analysis_dir)
        if 'model_analysis.csv' not in fNames:
            print(drug)
    except:
        if "LOF" in model_prefix:
            print(f"{drug} did not have a unique model")
        else:
            print(f"{drug} needs all 3 scripts to be re-run")

In [67]:
mid_dirs = ["tiers=1/phenos=WHO",  "tiers=1/phenos=ALL", "tiers=1+2/phenos=WHO", "tiers=1+2/phenos=ALL"]
model_prefixes = ["dropAF_noSyn", "dropAF_noSyn_poolLOF", "dropAF_withSyn", "dropAF_withSyn_poolLOF"]

combos = list(itertools.product(mid_dirs, model_prefixes))

In [68]:
combos

[('tiers=1/phenos=WHO', 'dropAF_noSyn'),
 ('tiers=1/phenos=WHO', 'dropAF_noSyn_poolLOF'),
 ('tiers=1/phenos=WHO', 'dropAF_withSyn'),
 ('tiers=1/phenos=WHO', 'dropAF_withSyn_poolLOF'),
 ('tiers=1/phenos=ALL', 'dropAF_noSyn'),
 ('tiers=1/phenos=ALL', 'dropAF_noSyn_poolLOF'),
 ('tiers=1/phenos=ALL', 'dropAF_withSyn'),
 ('tiers=1/phenos=ALL', 'dropAF_withSyn_poolLOF'),
 ('tiers=1+2/phenos=WHO', 'dropAF_noSyn'),
 ('tiers=1+2/phenos=WHO', 'dropAF_noSyn_poolLOF'),
 ('tiers=1+2/phenos=WHO', 'dropAF_withSyn'),
 ('tiers=1+2/phenos=WHO', 'dropAF_withSyn_poolLOF'),
 ('tiers=1+2/phenos=ALL', 'dropAF_noSyn'),
 ('tiers=1+2/phenos=ALL', 'dropAF_noSyn_poolLOF'),
 ('tiers=1+2/phenos=ALL', 'dropAF_withSyn'),
 ('tiers=1+2/phenos=ALL', 'dropAF_withSyn_poolLOF')]

In [69]:
combos = [('tiers=1/phenos=WHO', 'dropAF_noSyn'),
 ('tiers=1/phenos=WHO', 'dropAF_withSyn'),
 ('tiers=1/phenos=WHO', 'dropAF_withSyn_poolLOF'),
 ('tiers=1/phenos=ALL', 'dropAF_noSyn'),
 ('tiers=1/phenos=ALL', 'dropAF_withSyn'),
 ('tiers=1+2/phenos=WHO', 'dropAF_noSyn'),
 ('tiers=1+2/phenos=WHO', 'dropAF_noSyn_poolLOF'),
 ('tiers=1+2/phenos=WHO', 'dropAF_withSyn'),
 ('tiers=1+2/phenos=WHO', 'dropAF_withSyn_poolLOF')]

In [81]:
for drug in drugs_for_analysis:
    
    for mid_dir, model_prefix in combos:
        
        if os.path.isdir(os.path.join(out_dir, drug, mid_dir, model_prefix)):
            analysis_file = os.path.join(out_dir, drug, mid_dir, model_prefix, "model_analysis.csv")
            if not os.path.isfile(analysis_file):
#                 analysis = pd.read_csv(analysis_file)

#                 if sum(analysis["coef_LB"] > analysis["coef"]) != 0:
#                     print(f"    {analysis_file}, LB")
#                 if sum(analysis["coef_UB"] < analysis["coef"]) != 0:
#                     print(f"    {analysis_file}, UB")
                print(analysis_file)

/n/data1/hms/dbmi/farhat/ye12/who/analysis/Capreomycin/tiers=1+2/phenos=WHO/dropAF_noSyn/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Capreomycin/tiers=1+2/phenos=WHO/dropAF_noSyn_poolLOF/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Ethambutol/tiers=1+2/phenos=WHO/dropAF_withSyn_poolLOF/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Isoniazid/tiers=1+2/phenos=WHO/dropAF_withSyn_poolLOF/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Rifampicin/tiers=1+2/phenos=WHO/dropAF_withSyn_poolLOF/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Streptomycin/tiers=1/phenos=ALL/dropAF_withSyn/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Amikacin/tiers=1/phenos=WHO/dropAF_withSyn/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Amikacin/tiers=1/phenos=ALL/dropAF_noSyn/model_analysis.csv
/n/data1/hms/dbmi/farhat/ye12/who/analysis/Amikacin/tiers=1/phenos=ALL/dropAF_withSyn/model_analysis.csv
/n/d

In [74]:
analysis_file = os.path.join("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Capreomycin/tiers=1+2/phenos=WHO/dropAF_withSyn/model_analysis.csv")
analysis = pd.read_csv(analysis_file)

In [2]:
def compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0):
    '''
    Comparison to determine if population structure determination is necessary. Things to compare:
    
    1. Do all the same variants have non-zero coefficients (the returned dataframes are only non-zero coefficients)?
    2. Compute Pearson correlation between coefficients.
    3. Do features have p-values that are on the same side of the threshold (0.05 or 0.01)?
    '''
    
    # merge outer to include anything (in case some features have 0 or non-zero coefficients, depending on whether PCs are included)
    plot_df = pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]].merge(no_pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]], how="outer", on="orig_variant")    
    
    # then drop the principal components
    plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")]
    
    # plot_df_missing = plot_df.loc[(pd.isnull(plot_df["coef_x"])) | (pd.isnull(plot_df["coef_y"]))]
    # print(f"{len(plot_df_missing)} coefficients are zero in one model and non-zero in the other")
    # plot_df_nominal_sig = plot_df.query("pval_x < @alpha & pval_y < @alpha")
    # print("   ", len(plot_df_nominal_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "nominally significant features have conflicting coefficients")
    
    plot_df_sig = plot_df.query("BH_pval_x < @alpha & BH_pval_y < @alpha")
    print("   ", len(plot_df_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "significant features have conflicting coefficients")    
    
    # NaNs are 0 coefficients, fill them in for plotting and correlation computations
    plot_df[["coef_x", "coef_y"]] = plot_df[["coef_x", "coef_y"]].fillna(0)
    
    # coefficient without population structure correction - coefficient with correction
    plot_df["diff"] = plot_df["coef_y"] - plot_df["coef_x"]
    plot_df["norm_diff"] = plot_df["diff"] / plot_df["coef_x"]
    
    pearson = st.pearsonr(plot_df['coef_x'], plot_df['coef_y'])[0]

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(9, 3))
        sns.scatterplot(data=plot_df, x="coef_x", y="coef_y", ax=ax[0], alpha=0.5)
        ax[0].set_xlabel("Coef, Population Structure Correction")
        ax[0].set_ylabel("Coef, No Correction")
        ax[0].set_title(np.round(pearson, 4), "Pearson R")

        sns.scatterplot(data=plot_df, x="coef_x", y="norm_diff", ax=ax[1], alpha=0.5)
        ax[1].set_xlabel("Coef, Population Structure Correction")
        ax[1].set_ylabel("")
        ax[1].set_title("Norm. Difference in Coefs.")

        sns.despine()
        plt.show()
    else:
        print("   ", np.round(pearson, 4), "Pearson R")
    
    # get features whose coefficients differ by more than some threshold (0 to 100) for percent difference
    # (difference between coefficients without and with correction is greater than the coefficient with population structure correction
    #large_diff = plot_df.query("percent_diff >= @diff_thresh | percent_diff <= -@diff_thresh")
    
    # get the number of features with confidence intervals that lie on different sides of 0
    print('   ', len(plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")), 'features have confidence intervals on different sides of 0')
    #print(f'{len(large_diff.query("coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0"))} features with very different coefficients have confidence intervals on different sides of 0')
    
    # TO-DO: COMPARISON OF CONFIDENCE INTERVALS
    return pearson

15 drugs with phenotypes and genotypes


In [5]:
drug = "Ethambutol"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"
    
pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

# number of principal components with positive coefficients (OR > 1)
sig_PC = pop_corr.loc[(pop_corr["orig_variant"].str.contains("PC")) & 
                      (((pop_corr["coef_LB"] > 0) & (pop_corr["coef_UB"] > 0)) |
                      ((pop_corr["coef_LB"] < 0) & (pop_corr["coef_UB"] < 0)))
                      ]

In [6]:
pearson_df = pd.DataFrame(columns={"Drug", "R"})

for drug in drugs_for_analysis:
    
    drug = drug.split("=")[1]
    out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
    corr_prefix = "dropAF_noSyn"
    no_corr_prefix = "dropAF_noSyn_noPopCorr"
    
    if os.path.isfile(os.path.join(out_dir, corr_prefix, "model_analysis.pkl")) and os.path.isfile(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl")):
        
        print(drug)
        pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
        no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

        # number of principal components with positive coefficients (OR > 1)
        sig_PC = pop_corr.loc[(pop_corr["orig_variant"].str.contains("PC")) & 
                              (((pop_corr["coef_LB"] > 0) & (pop_corr["coef_UB"] > 0)) |
                              ((pop_corr["coef_LB"] < 0) & (pop_corr["coef_UB"] < 0)))
                              ]
        
        if len(sig_PC) > 0:
            print("  ", np.sort(sig_PC["orig_variant"].values), "have significant coefficients")

        # variants in moxi_res that are not in the results without population structure correction
        overlap_variants = set(pop_corr.loc[~pop_corr["orig_variant"].str.contains("PC")]["orig_variant"]).intersection(set(no_pop_corr["orig_variant"]))
        if len(overlap_variants) == len(pop_corr.loc[~pop_corr["orig_variant"].str.contains("PC")]) == len(no_pop_corr):
            pearson = compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0)
            pearson_df = pd.concat([pearson_df, pd.DataFrame({"Drug": drug, "R": pearson}, index=[0])], axis=0)
        else:
            print("    Differing non-zero coefficients")        
        print("\n")

Levofloxacin
    0 significant features have conflicting coefficients
    0.9998 Pearson R
    0 features have confidence intervals on different sides of 0


Moxifloxacin
   ['PC1' 'PC2' 'PC4'] have significant coefficients
    0 significant features have conflicting coefficients
    0.9985 Pearson R
    0 features have confidence intervals on different sides of 0


Ethionamide
   ['PC3' 'PC4'] have significant coefficients
    0 significant features have conflicting coefficients
    0.9989 Pearson R
    0 features have confidence intervals on different sides of 0


Clofazimine
   ['PC1'] have significant coefficients
    0 significant features have conflicting coefficients
    0.9945 Pearson R
    0 features have confidence intervals on different sides of 0


Capreomycin
   ['PC0' 'PC1' 'PC2'] have significant coefficients
    0 significant features have conflicting coefficients
    0.9998 Pearson R
    0 features have confidence intervals on different sides of 0


Streptomycin
   ['P

In [7]:
#who_variants.loc[(who_variants.drug == 'PZA') & (who_variants.confidence.str.contains("|".join(["1"])))]

In [8]:
pearson_df.sort_values("R", ascending=False)

Unnamed: 0,Drug,R
0,Capreomycin,0.9998
0,Amikacin,0.9998
0,Levofloxacin,0.9998
0,Rifampicin,0.9995
0,Kanamycin,0.9992
0,Ethionamide,0.9989
0,Streptomycin,0.9987
0,Moxifloxacin,0.9985
0,Pyrazinamide,0.9952
0,Clofazimine,0.9945


# 1 variant in INH has positive or negative coefficients in the 2 different models

## It's a category 5 mutation, and the confidence intervals are significant

## The other 2 variants do not have significant p-values, and the confidence intervals are not fully above or below 0

In [10]:
drug = "Isoniazid"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"

pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

plot_df = pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]].merge(no_pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]], how="outer", on="orig_variant")    
    
# then drop the principal components
plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")]

In [18]:
show_cols = ["orig_variant", "coef_x", "BH_pval_x", "coef_LB_x", "coef_UB_x", "coef_y", "BH_pval_y", "coef_LB_y", "coef_UB_y", "confidence_WHO_2021_y"]
plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")[show_cols]

Unnamed: 0,orig_variant,coef_x,BH_pval_x,coef_LB_x,coef_UB_x,coef_y,BH_pval_y,coef_LB_y,coef_UB_y,confidence_WHO_2021_y
1317,katG_p.Arg463Leu,-0.1853,0.0005,-0.2719,-0.0975,0.1827,0.0,0.1364,0.229,5) Not assoc w R


In [12]:
df = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=WHO/encodeAF_noSyn/model_analysis.csv")

In [13]:
df.loc[df["orig_variant"].str.contains("PC")]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_LB,OR_UB
79,PC0,0.0362,-0.0177,0.0939,0.1195,0.2875,1.0,,,1.0368,0.9824,1.0984
119,PC3,0.0184,-0.0485,0.084,0.2941,0.6729,1.0,,,1.0185,0.9527,1.0876
989,PC1,-0.0889,-0.1317,-0.0515,0.0,0.0003,0.0086,,,0.915,0.8766,0.9498
992,PC2,-0.1073,-0.1743,-0.0352,0.0012,0.0342,1.0,,,0.8983,0.84,0.9654
993,PC4,-0.1635,-0.2228,-0.0962,0.0,0.0,0.0007,,,0.8492,0.8003,0.9083


# Catalog-Based Method Comparison

In [163]:
def compute_balanced_accuracy_score_single_variant(model_analysis, model_matrix, variant):
    
#     y_hat = model.predict(X)
    
#     tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y, y_hat).ravel()
#     sens = tp / (tp+fn)
#     spec = tn / (tn+fp)
#     auc = sklearn.metrics.roc_auc_score(y, y_hat)

    # balanced_score = balanced_accuracy_score(y, y_hat)

    matrix = model_matrix.copy()
    
    coef = model_analysis.query("orig_variant == @variant")["coef"].values[0]
    if coef > 0:
        matrix.loc[model_matrix[variant] == 1, "assoc"] = 1
        matrix.loc[model_matrix[variant] != 1, "assoc"] = 0
    else:
        matrix.loc[model_matrix[variant] == 1, "assoc"] = 0
        matrix.loc[model_matrix[variant] != 1, "assoc"] = 1
        
    return sklearn.metrics.accuracy_score(y, matrix["assoc"]), sklearn.metrics.balanced_accuracy_score(y, matrix["assoc"])

In [126]:
for i, row in model_analysis.iterrows():

    if "PC" not in row["orig_variant"]:
        model_analysis.loc[i, ["accuracy", "balanced_accuracy"]] = compute_balanced_accuracy_score_single_variant(model_analysis, model_matrix, row["orig_variant"])

In [None]:
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1/phenos=WHO/dropAF_withSyn"

In [164]:
def compute_downselected_model(out_dir):
    
    model_analysis = pd.read_csv(os.path.join(out_dir, "model_analysis.csv"))
    model_matrix = pd.read_pickle(os.path.join(out_dir, "model_matrix.pkl"))

    eigenvec_df = pd.read_pickle(os.path.join(out_dir, "model_eigenvecs.pkl"))
    eigenvec_df.columns = [f"PC{num}" for num in eigenvec_df.columns]

    df_phenos = pd.read_csv(os.path.join(out_dir, "phenos.csv"))
    y = df_phenos.sort_values("sample_id").phenotype.values
    
    # get all significant features
    downselect_matrix = model_matrix.merge(eigenvec_df, left_index=True, right_index=True)[model_analysis["orig_variant"]]
    assert len(model_analysis) == downselect_matrix.shape[1]
    X = downselect_matrix.values
    
    # fit a logistic regression model
    small_model = LogisticRegressionCV(Cs=np.logspace(-6, 6, 13), 
                                 cv=5,
                                 penalty='l2', 
                                 max_iter=10000, 
                                 multi_class='ovr',
                                 #scoring='neg_log_loss',
                                 scoring='balanced_accuracy',
                                 class_weight='balanced'
                                )

    small_model.fit(X, y)
    print(f"Regularization parameter: {small_model.C_[0]}")
    
    # get predictions and compute accuracy scores (balanced and unbalanced)
    y_hat = small_model.predict(X)
    return sklearn.metrics.accuracy_score(y, y_hat), sklearn.metrics.balanced_accuracy_score(y, y_hat)

In [127]:
model_analysis[["orig_variant", "coef", "accuracy", "balanced_accuracy"]]

Unnamed: 0,orig_variant,coef,accuracy,balanced_accuracy
0,gyrA_p.Asp94Gly,1.0758,0.8887,0.6800
1,gyrA_p.Ala90Val,0.6777,0.8652,0.5693
2,gyrA_p.Asp94Asn,0.5014,0.8830,0.5366
3,gyrA_p.Asp94Ala,0.4685,0.8770,0.5385
4,gyrA_p.Gly668Asp,0.4420,0.2082,0.5293
...,...,...,...,...
431,gyrA_p.Arg252Leu,-0.0855,0.1251,0.5017
432,gyrA_c.2487C>T,-0.0888,0.1229,0.5005
433,gyrA_p.Ala384Val,-0.1297,0.2112,0.5440
434,PC4,-0.2618,,


In [128]:
model_analysis.accuracy.max(), model_analysis.balanced_accuracy.max()

(0.8887220098306936, 0.6799731056533795)

In [129]:
sklearn.metrics.accuracy_score(y, model.predict(X)), sklearn.metrics.balanced_accuracy_score(y, model.predict(X))

(0.8791643910431458, 0.5055150806656438)