In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import glob, os, yaml, subprocess, itertools, sparse, vcf

def _four_decimals(value):
    '''Render a float with exactly four digits after the decimal point.'''
    return '%.4f' % value

# every float shown in a DataFrame/Series display below uses four decimal places
pd.set_option('display.float_format', _four_decimals)

# presumably the WHO resistance-variant catalogue (judging by the file name) -- TODO confirm
who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")

In [16]:
def compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0):
    '''
    Compare model results with and without population structure correction, to judge whether
    the correction is necessary. Results are printed (and optionally plotted); nothing is returned.

    The two tables are outer-merged on "orig_variant" (suffix _x = with correction, _y = without),
    principal component rows are dropped, and the following are reported:

    1. Number of features that are significant in both models (BH_pval < alpha) but whose
       coefficients have opposite signs.
    2. Pearson correlation between the two sets of coefficients (a coefficient missing from one
       model is treated as 0).
    3. Number of features whose confidence intervals lie entirely on opposite sides of 0
       in the two models (in either direction).

    Parameters
    ----------
    pop_corr : pd.DataFrame
        Results WITH population structure correction. Must contain the columns
        orig_variant, coef, pval, BH_pval, coef_LB, coef_UB.
    no_pop_corr : pd.DataFrame
        Results WITHOUT population structure correction, same columns.
    plot : bool
        If True, draw the coefficient scatter plots (Pearson R goes in the title);
        if False, only print the Pearson R.
    alpha : float
        Threshold applied to the Benjamini-Hochberg corrected p-values.
    diff_thresh : float
        Currently unused; kept so that existing calls keep working.
    '''

    compare_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB"]

    # merge outer to include anything (in case some features have 0 or non-zero coefficients, depending on whether PCs are included)
    plot_df = pop_corr[compare_cols].merge(no_pop_corr[compare_cols], how="outer", on="orig_variant")

    # then drop the principal components; copy so the column assignments below act on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    plot_df = plot_df.loc[~plot_df["orig_variant"].str.contains("PC")].copy()

    # features significant in BOTH models whose coefficients point in opposite directions
    plot_df_sig = plot_df.query("BH_pval_x < @alpha & BH_pval_y < @alpha")
    n_conflicting = len(plot_df_sig.query("(coef_x < 0 & coef_y > 0) | (coef_x > 0 & coef_y < 0)"))
    print("   ", n_conflicting, "significant features have conflicting coefficients")

    # NaNs are 0 coefficients, fill them in for plotting and correlation computations
    plot_df[["coef_x", "coef_y"]] = plot_df[["coef_x", "coef_y"]].fillna(0)

    # coefficient without population structure correction - coefficient with correction
    plot_df["diff"] = plot_df["coef_y"] - plot_df["coef_x"]

    # the normalized difference is undefined when the corrected coefficient is 0:
    # divide by NaN there so the result is NaN instead of +/-inf
    plot_df["norm_diff"] = plot_df["diff"] / plot_df["coef_x"].replace(0, np.nan)

    pearson_r = np.round(st.pearsonr(plot_df['coef_x'], plot_df['coef_y'])[0], 4)

    if plot:
        fig, ax = plt.subplots(1, 2, figsize=(9, 3))
        sns.scatterplot(data=plot_df, x="coef_x", y="coef_y", ax=ax[0], alpha=0.5)
        ax[0].set_xlabel("Coef, Population Structure Correction")
        ax[0].set_ylabel("Coef, No Correction")
        # set_title takes a single label string (its 2nd positional argument is a fontdict),
        # so the value and its label must be combined into one string
        ax[0].set_title(f"{pearson_r} Pearson R")

        sns.scatterplot(data=plot_df, x="coef_x", y="norm_diff", ax=ax[1], alpha=0.5)
        ax[1].set_xlabel("Coef, Population Structure Correction")
        ax[1].set_ylabel("")
        ax[1].set_title("Norm. Difference in Coefs.")

        sns.despine()
        plt.show()
    else:
        print("   ", pearson_r, "Pearson R")

    # get the number of features with confidence intervals that lie on different sides of 0.
    # Both directions are counted: (positive with correction, negative without) and the reverse.
    opposite_ci = plot_df.query(
        "(coef_LB_x > 0 & coef_UB_x > 0 & coef_LB_y < 0 & coef_UB_y < 0) | "
        "(coef_LB_x < 0 & coef_UB_x < 0 & coef_LB_y > 0 & coef_UB_y > 0)"
    )
    print('   ', len(opposite_ci), 'features have confidence intervals on different sides of 0')

    # TO-DO: COMPARISON OF CONFIDENCE INTERVALS

In [17]:
# locations of the per-drug genotype, phenotype and analysis directories
genos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/full_genotypes'
phenos_dir = '/n/data1/hms/dbmi/farhat/ye12/who/phenotypes'
analysis_dir = '/n/data1/hms/dbmi/farhat/ye12/who/analysis'

pheno_drugs = os.listdir(phenos_dir)
geno_drugs = os.listdir(genos_dir)

# keep only the entries present in both directories. Sorted, because the iteration order of a
# set of strings can differ between interpreter sessions, which would make the order of the
# per-drug report below change from run to run.
drugs_for_analysis = sorted(set(geno_drugs).intersection(pheno_drugs))
print(len(drugs_for_analysis), "drugs with phenotypes and genotypes")

15 drugs with phenotypes and genotypes


In [18]:
# For every drug that has both model outputs, compare the model with population structure
# correction against the one without.
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"

for drug_dir in drugs_for_analysis:

    # entries are split on "=" and the second part is used as the drug name;
    # skip anything without "=" (e.g. stray files returned by os.listdir) instead of raising IndexError
    if "=" not in drug_dir:
        continue

    drug = drug_dir.split("=")[1]
    out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")
    corr_path = os.path.join(out_dir, corr_prefix, "model_analysis.pkl")
    no_corr_path = os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl")

    # both models must have been run for this drug
    if not (os.path.isfile(corr_path) and os.path.isfile(no_corr_path)):
        continue

    print(drug)
    # NOTE: unpickling executes arbitrary code; only load files from a trusted location
    pop_corr = pd.read_pickle(corr_path)
    no_pop_corr = pd.read_pickle(no_corr_path)

    is_pc = pop_corr["orig_variant"].str.contains("PC")

    # number of principal components with positive coefficients (OR > 1)
    # (the message previously said "non-zero", which did not match the coef > 0 filter)
    print("   ", int((is_pc & (pop_corr["coef"] > 0)).sum()), "principal components have positive coefficients")

    # only compare the models when exactly the same (non-PC) variants appear in both result tables
    corr_variants = pop_corr.loc[~is_pc, "orig_variant"]
    overlap_variants = set(corr_variants).intersection(set(no_pop_corr["orig_variant"]))
    if len(overlap_variants) == len(corr_variants) == len(no_pop_corr):
        compare_models_population_structure_correction(pop_corr, no_pop_corr, plot=False, alpha=0.05, diff_thresh=0)
    else:
        print("    Differing non-zero coefficients")
    print("\n")

Kanamycin
    3 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9992 Pearson R
    0 features have confidence intervals on different sides of 0


Rifampicin
    2 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9995 Pearson R
    0 features have confidence intervals on different sides of 0


Ethambutol
    2 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9921 Pearson R
    0 features have confidence intervals on different sides of 0


Amikacin
    2 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9998 Pearson R
    0 features have confidence intervals on different sides of 0


Delamanid
    5 principal components have non-zero coefficients
    0 significant features have conflicting coefficients
    0.9787 Pearson R
    0 features have c

In [19]:
#who_variants.loc[(who_variants.drug == 'PZA') & (who_variants.confidence.str.contains("|".join(["1"])))]

# 1 variant in INH has significant coefficients of opposite sign in the 2 models (with vs. without population structure correction)

## It's a category 5 mutation, and the confidence intervals are significant

## The other 2 variants do not have significant p-values, and the confidence intervals are not fully above or below 0

In [20]:
# Isoniazid: load the results of both models and put them side by side
drug = "Isoniazid"
corr_prefix = "dropAF_noSyn"
no_corr_prefix = "dropAF_noSyn_noPopCorr"
out_dir = os.path.join(analysis_dir, drug, "tiers=1/phenos=WHO")

pop_corr = pd.read_pickle(os.path.join(out_dir, corr_prefix, "model_analysis.pkl"))
no_pop_corr = pd.read_pickle(os.path.join(out_dir, no_corr_prefix, "model_analysis.pkl"))

# columns to carry over from each model; after the merge, _x = with correction, _y = without
merge_cols = ["orig_variant", "coef", "pval", "BH_pval", "coef_LB", "coef_UB", "confidence_WHO_2021"]
plot_df = pd.merge(pop_corr[merge_cols], no_pop_corr[merge_cols], how="outer", on="orig_variant")

# then drop the principal components
pc_rows = plot_df["orig_variant"].str.contains("PC")
plot_df = plot_df.loc[~pc_rows]

In [22]:
# variants whose coefficient changes sign between the two models (_x vs. _y)
neg_then_pos = (plot_df["coef_x"] < 0) & (plot_df["coef_y"] > 0)
pos_then_neg = (plot_df["coef_x"] > 0) & (plot_df["coef_y"] < 0)
plot_df.loc[neg_then_pos | pos_then_neg]

Unnamed: 0,orig_variant,coef_x,pval_x,BH_pval_x,coef_LB_x,coef_UB_x,confidence_WHO_2021_x,coef_y,pval_y,BH_pval_y,coef_LB_y,coef_UB_y,confidence_WHO_2021_y
731,katG_p.Asn529Thr,0.0012,0.4412,0.616,-0.0171,0.0216,3) Uncertain significance,-0.0014,0.4312,0.6037,-0.0145,0.0192,3) Uncertain significance
758,katG_c.-28G>T,0.0001,0.4942,0.6829,-0.0175,0.0176,3) Uncertain significance,-0.0008,0.4685,0.6502,-0.0185,0.0162,3) Uncertain significance
1317,katG_p.Arg463Leu,-0.1853,0.0,0.0005,-0.2719,-0.0975,5) Not assoc w R,0.1827,0.0,0.0,0.1364,0.229,5) Not assoc w R


# TO-DO: Lineage Analysis for PCs

## Further investigate katG_p.Arg463Leu. Correlated with any lineages?

In [23]:
# Parse the lineage calls: the "Lineage" column is split on "," into two calls, and each call
# is split on "(" into the lineage name and the text inside the parentheses.
lineages = pd.read_csv("data/lineages.csv")

# NOTE(review): the two-column assignment needs the split to produce exactly two columns,
# i.e. at least one isolate with two calls and none with more -- confirm against the data
lineage_calls = lineages["Lineage"].str.split(",", expand=True).rename(columns={0:"Lineage_1", 1:"Lineage_2"})
lineages[["Lineage_1", "Lineage_2"]] = lineage_calls
lineages = lineages.drop(columns="Lineage")

for slot in ("1", "2"):
    lineage_col = "Lineage_" + slot
    count_col = "Count_" + slot

    # "<lineage>(<count>)" -> lineage name and "<count>)"; then remove the closing parenthesis
    lineages[[lineage_col, count_col]] = lineages[lineage_col].str.split("(", expand=True)
    lineages[count_col] = lineages[count_col].str.strip(")")

In [24]:
# sanity check: (number of rows, number of columns) of the parsed lineage table
lineages.shape

(100, 5)

In [25]:
# show the parsed lineage table (the rendered output elides the middle rows)
lineages

Unnamed: 0,Isolate,Lineage_1,Lineage_2,Count_1,Count_2
0,11050333,2.2.1,,1/1,
1,SAMN21856363,2.2.1,,1/1,
2,SAMC246113,4.5,,1/1,
3,SAMEA1101763,3,,1/1,
4,SAMEA1019058,4.8,,1/1,
...,...,...,...,...,...
95,SAMEA7527813,3,,1/1,
96,26_51_G1159_1,3,,1/1,
97,SAMC246418,2.2.1,,1/1,
98,03_5667,2.2.1,,1/1,
