In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import glob, os, yaml, subprocess, itertools, sparse, vcf

who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
def compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0):
    '''
    Comparison to determine if population structure determination is necessary. Things to compare:
    
    1. Do all the same variants have non-zero coefficients (the returned dataframes are only non-zero coefficients)?
    2. Compute Pearson correlation between coefficients.
    3. Do features have p-values that are on the same side of the threshold (0.05 or 0.01)?
    '''
    
    # merge outer to include anything (in case some features have 0 or non-zero coefficients, depending on whether PCs are included)
    plot_df = pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]].merge(no_pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]], how="outer", on="orig_variant")    
    
    # then drop the principal components
    plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")]
    
    # plot_df_missing = plot_df.loc[(pd.isnull(plot_df["coef_x"])) | (pd.isnull(plot_df["coef_y"]))]
    # print(f"{len(plot_df_missing)} coefficients are zero in one model and non-zero in the other")
    # plot_df_nominal_sig = plot_df.query("pval_x < @alpha & pval_y < @alpha")
    # print("   ", len(plot_df_nominal_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "nominally significant features have conflicting coefficients")
    
    plot_df_sig = plot_df.query("BH_pval_x < @alpha & BH_pval_y < @alpha")
    print("   ", len(plot_df_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)")), "significant features have conflicting coefficients")    
    
    # NaNs are 0 coefficients, fill them in for plotting and correlation computations
    plot_df[["coef_x", "coef_y"]] = plot_df[["coef_x", "coef_y"]].fillna(0)
    
    # coefficient without population structure correction - coefficient with correction
    plot_df["diff"] = plot_df["coef_y"] - plot_df["coef_x"]
    plot_df["norm_diff"] = plot_df["diff"] / plot_df["coef_x"]
    
    pearson = st.pearsonr(plot_df['coef_x'], plot_df['coef_y'])[0]

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(9, 3))
        sns.scatterplot(data=plot_df, x="coef_x", y="coef_y", ax=ax[0], alpha=0.5)
        ax[0].set_xlabel("Coef, Population Structure Correction")
        ax[0].set_ylabel("Coef, No Correction")
        ax[0].set_title(np.round(pearson, 4), "Pearson R")

        sns.scatterplot(data=plot_df, x="coef_x", y="norm_diff", ax=ax[1], alpha=0.5)
        ax[1].set_xlabel("Coef, Population Structure Correction")
        ax[1].set_ylabel("")
        ax[1].set_title("Norm. Difference in Coefs.")

        sns.despine()
        plt.show()
    else:
        print("   ", np.round(pearson, 4), "Pearson R")
    
    # get features whose coefficients differ by more than some threshold (0 to 100) for percent difference
    # (difference between coefficients without and with correction is greater than the coefficient with population structure correction
    #large_diff = plot_df.query("percent_diff >= @diff_thresh | percent_diff <= -@diff_thresh")
    
    # get the number of features with confidence intervals that lie on different sides of 0
    print('   ', len(plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")), 'features have confidence intervals on different sides of 0')
    #print(f'{len(large_diff.query("coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0"))} features with very different coefficients have confidence intervals on different sides of 0')
    
    # TO-DO: COMPARISON OF CONFIDENCE INTERVALS
    return pearson

In [4]:
genos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/full_genotypes'
phenos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/phenotypes'
analysis_dir = '/n/data1/hms/dbmi/farhat/ye12/who/analysis'

pheno_drugs = os.listdir(phenos_dir)
geno_drugs = os.listdir(genos_dir)

drugs_for_analysis = list(set(geno_drugs).intersection(set(pheno_drugs)))
print(len(drugs_for_analysis), "drugs with phenotypes and genotypes")

15 drugs with phenotypes and genotypes


In [11]:
pearson_df = pd.DataFrame(columns={"Drug", "R"})
for drug in drugs_for_analysis:
    
    drug = drug.split("=")[1]
    out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
    corr_prefix = "dropAF_noSyn"
    no_corr_prefix = "dropAF_noSyn_noPopCorr"
    
    if os.path.isfile(os.path.join(out_dir, corr_prefix, "model_analysis.pkl")) and os.path.isfile(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl")):
        
        print(drug)
        pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
        no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

        # number of principal components with positive coefficients (OR > 1)
        print("   ", len(pop_corr.query("coef > 0").loc[pop_corr.query("coef > 0").orig_variant.str.contains('PC')]), "principal components have non-zero coefficients")
        
        # variants in moxi_res that are not in the results without population structure correction
        overlap_variants = set(pop_corr.loc[~pop_corr["orig_variant"].str.contains("PC")]["orig_variant"]).intersection(set(no_pop_corr["orig_variant"]))
        if len(overlap_variants) == len(pop_corr.loc[~pop_corr["orig_variant"].str.contains("PC")]) == len(no_pop_corr):
            pearson = compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0)
            pearson_df = pd.concat([pearson_df, pd.DataFrame({"Drug": drug, "R": pearson}, index=[0])], axis=0)
        else:
            print("    Differing non-zero coefficients")        
        print("\n")

Delamanid
    5 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9787 Pearson R
    0 features have confidence intervals on different sides of 0


Kanamycin
    3 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9992 Pearson R
    0 features have confidence intervals on different sides of 0


Ethionamide
    1 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9989 Pearson R
    0 features have confidence intervals on different sides of 0


Rifampicin
    2 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9995 Pearson R
    0 features have confidence intervals on different sides of 0


Amikacin
    2 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9998 Pearson R
    0 features have 

In [19]:
#who_variants.loc[(who_variants.drug == 'PZA') & (who_variants.confidence.str.contains("|".join(["1"])))]

In [15]:
pearson_df.sort_values("R", ascending=False)

Unnamed: 0,Drug,R
0,Capreomycin,0.9998
0,Amikacin,0.9998
0,Levofloxacin,0.9998
0,Rifampicin,0.9995
0,Kanamycin,0.9992
0,Ethionamide,0.9989
0,Streptomycin,0.9987
0,Moxifloxacin,0.9985
0,Pyrazinamide,0.9952
0,Clofazimine,0.9945


# 1 variant in INH has positive or negative coefficients in the 2 different models

## It's a category 5 mutation, and the confidence intervals are significant

## The other 2 variants do not have significant p-values, and the confidence intervals are not fully above or below 0

In [10]:
drug = "Isoniazid"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"

pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

plot_df = pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]].merge(no_pop_corr[["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]], how="outer", on="orig_variant")    
    
# then drop the principal components
plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")]

In [18]:
show_cols = ["orig_variant", "coef_x", "BH_pval_x", "coef_LB_x", "coef_UB_x", "coef_y", "BH_pval_y", "coef_LB_y", "coef_UB_y", "confidence_WHO_2021_y"]
plot_df.query("(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | (coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)")[show_cols]

Unnamed: 0,orig_variant,coef_x,BH_pval_x,coef_LB_x,coef_UB_x,coef_y,BH_pval_y,coef_LB_y,coef_UB_y,confidence_WHO_2021_y
1317,katG_p.Arg463Leu,-0.1853,0.0005,-0.2719,-0.0975,0.1827,0.0,0.1364,0.229,5) Not assoc w R
