In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
from Bio import SeqIO, Seq

import glob, os, yaml, subprocess, itertools, sparse, vcf
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.distributions.empirical_distribution import ECDF
import sklearn.metrics
from sklearn.decomposition import PCA
import timeit
import scipy.stats as st
import statsmodels.api as sm
import pickle, yaml, tracemalloc
from scipy.stats import binomtest

# Root directory of the analysis outputs; later cells join drug- and
# analysis-specific sub-paths onto it.
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'
# NOTE(review): input_data_dir is not referenced in any of the visible cells — confirm it is still needed.
input_data_dir = '/n/data1/hms/dbmi/farhat/ye12/who'
# Table read from a CSV in the notebook's working directory; a later cell
# filters it on its "drug", "confidence" and "mutation" columns.
who_variants_combined = pd.read_csv("who_confidence_2021.csv")

# NOTE(review): itertools is already imported above; this repeat is harmless.
import itertools
# Wildcard import: presumably the source of get_tier2_mutations_of_interest and
# BH_FDR_correction used in later cells — TODO confirm and import them explicitly.
from stats_utils import *
import warnings
# Silences every warning for the rest of the session, so pandas warnings
# (e.g. SettingWithCopyWarning, PerformanceWarning) will not be shown.
warnings.filterwarnings("ignore")

# RIFAMPICIN TEST CASE

# 0. Original Analysis: 137 new significant resistance-associated variants

## However, 152 tier 2 mutations have OR > 1 and BH p-value < 0.01 (rpoB is the only tier 1 gene); the 137 above additionally require PPV_LB > 0.25 and TP >= 5
## These 152 mutations were studied in the next two analyses

In [2]:
# Analysis parameters for this test case; later cells reuse them to build paths.
folder = "BINARY"
phenos_name = "WHO"
drug = "Rifampicin"
drug_WHO_abbr = "RIF"

# Mutations listed for this drug whose confidence string contains "1" or "2"
# (the high-confidence resistance-associated categories of the 2021 catalogue).
is_drug = who_variants_combined["drug"] == drug_WHO_abbr
is_high_conf = who_variants_combined["confidence"].str.contains("1|2")
high_Conf_variants = who_variants_combined.loc[is_drug & is_high_conf].mutation.unique()

# these have Odds_Ratio > 1, BH pval < 0.01, and are not in the Tier 1 analysis (for RIF, it means they are not in rpoB)
tier2_mutations_of_interest = get_tier2_mutations_of_interest(analysis_dir, drug, phenos_name)

# Build the workbook path from `folder` and `drug` so that changing the drug
# above cannot silently keep loading the rifampicin results.
og_model_path = os.path.join("..", "results", folder, f"{drug}.xlsx")
og_model_analysis = pd.read_excel(og_model_path, sheet_name="Model_7")

# Count of non-rpoB mutations passing the significance and PPV/TP filters.
# NOTE(review): this uses PPV_LB > 0.25 while the excluded-samples cell uses
# PPV_LB >= 0.25 — confirm which bound is intended.
print(og_model_analysis.query("Odds_Ratio > 1 & BH_pval < 0.01 & PPV_LB > 0.25 & TP >= 5 & ~mutation.str.contains('rpoB')").shape)

152 significant tier 2 mutations associated with WHO resistance
(137, 40)


In [3]:
# Write the original model results to CSV, leaving out the last five columns.
og_model_trimmed = og_model_analysis.iloc[:, :-5]
og_model_trimmed.to_csv("original_model_analysis.csv", index=False)

In [4]:
# Association statistics and confusion-matrix summary for rpoC_p.Glu1092Asp
# in the original model results.
summary_cols = [
    'mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
    'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
    'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec',
]
og_model_analysis.loc[og_model_analysis["mutation"] == "rpoC_p.Glu1092Asp", summary_cols]

Unnamed: 0,mutation,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,Num_Isolates,Total_Isolates,TP,FP,TN,FN,PPV_LB,PPV,PPV_UB,NPV,Sens,Spec
16,rpoC_p.Glu1092Asp,1.215757,1.142315,1.29766,1.50006e-09,2.883625e-08,5e-06,2987,35401,1892,1095,22769,9645,0.615839,0.633411,0.650722,0.702443,0.163994,0.954115


# 1. Excluded Samples Analysis: 54 new significant resistance-associated variants

Removed ~5,000 samples that contain both a high-confidence rpoB mutation (Category 1 or 2 in the 2021 catalogue) and any of the ~150 significant tier 2 mutations.

An L2-penalized regression was fit to determine the effect sizes of tier 2 mutations, independent of resistance-associated mutations they may occur with. 

In [5]:
# Univariate statistics from the analysis that excluded co-mutated samples.
exclude_stats_path = os.path.join(
    analysis_dir, drug, folder, "exclude_comutation", f"{phenos_name}phenos_univariate_stats.csv"
)
count_cols = ['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']

# Filter and cast in one chain so a fresh frame is produced; assigning columns
# onto a filtered slice is chained assignment, which pandas flags with
# SettingWithCopyWarning (hidden here by the global warning filter).
exclude_df = (
    pd.read_csv(exclude_stats_path)
    # Drop rows whose mutation name contains "PC" (presumably principal-component covariates — TODO confirm).
    .loc[lambda frame: ~frame["mutation"].str.contains("PC")]
    .astype({col: int for col in count_cols})
)

# Non-rpoB mutations that remain significant after excluding co-mutated samples.
exclude_df.query("Odds_Ratio > 1 & BH_pval < 0.01 & TP >= 5 & PPV_LB >= 0.25 & ~mutation.str.contains('rpoB')")

Unnamed: 0,mutation,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,Num_Isolates,Total_Isolates,TP,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,predicted_effect,position
155,rpoC_p.Ile885Val,1.028202,1.011277,1.037013,8e-06,0.000166,0.025045,6,25963,6,...,0.00043,0.002545,0.999823,1.0,,inf,0.997895,0.999766,missense_variant,766022
230,rpoC_p.Val483Gly,1.083889,1.048381,1.129905,0.000154,0.002196,0.494025,8,25963,7,...,0.000549,0.00281,0.999733,0.999999,3.499996,231.110409,0.997668,0.999698,missense_variant,764817


In [6]:
# Same summary columns for rpoC_p.Glu1092Asp, this time from the
# excluded-samples statistics.
is_glu1092asp = exclude_df["mutation"] == "rpoC_p.Glu1092Asp"
exclude_summary_cols = [
    'mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
    'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
    'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec',
]
exclude_df.loc[is_glu1092asp, exclude_summary_cols]

Unnamed: 0,mutation,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,Num_Isolates,Total_Isolates,TP,FP,TN,FN,PPV_LB,PPV,PPV_UB,NPV,Sens,Spec
71,rpoC_p.Glu1092Asp,1.208537,1.139478,1.271025,1.527808e-09,7.001724e-08,5e-06,1018,25963,95,923,19912,5033,0.076157,0.09332,0.112876,0.798236,0.018526,0.9557


In [7]:
# Original-model row for rpoC_p.Glu1092Asp, shown again for side-by-side
# comparison with the excluded-samples row.
og_summary_cols = [
    'mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
    'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
    'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec',
]
og_glu1092asp = og_model_analysis[og_model_analysis["mutation"] == "rpoC_p.Glu1092Asp"]
og_glu1092asp[og_summary_cols]

Unnamed: 0,mutation,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,Num_Isolates,Total_Isolates,TP,FP,TN,FN,PPV_LB,PPV,PPV_UB,NPV,Sens,Spec
16,rpoC_p.Glu1092Asp,1.215757,1.142315,1.29766,1.50006e-09,2.883625e-08,5e-06,2987,35401,1892,1095,22769,9645,0.615839,0.633411,0.650722,0.702443,0.163994,0.954115


# 2. Likelihood Ratio Test: 5/152 are significant

152 tier 2 variants were found significant in the first analysis. We then performed a likelihood ratio test:

A mutation is removed from the original model input matrix, and then a new model is fit. 

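We then compare the log-likelihoods of the original model and the model with one fewer mutation using the statistic $\chi^2 = 2(\ell_{\text{full}} - \ell_{\text{reduced}})$, evaluated against a chi-squared distribution with 1 degree of freedom. If the p-value is significant, it means that removing the mutation significantly changes the fit of the model.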

In [9]:
# Load the likelihood-ratio-test results; the first CSV column becomes the index.
lrt_path = os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_results.csv")
lrt_res = pd.read_csv(lrt_path, index_col=[0])

# The first row is taken as the full (reference) model.
full_model = lrt_res.iloc[0, :]
print(lrt_res.shape)

print(full_model[["log_like", "AUC", "Sens", "Spec", "accuracy"]])

# Only rows that carry a p-value go into the multiple-testing corrections.
has_pval = ~pd.isnull(lrt_res["pval"])
lrt_res = BH_FDR_correction(lrt_res.loc[has_pval])

# Bonferroni: scale each p-value by the number of tests and cap it at 1.
n_tests = len(lrt_res["pval"])
lrt_res["Bonferroni_pval"] = np.minimum((lrt_res["pval"] * n_tests).to_numpy(), 1.0)

# Move the index into a regular column and call it "mutation".
lrt_res = lrt_res.reset_index()
lrt_res = lrt_res.rename(columns={lrt_res.columns[0]: "mutation"})

(153, 8)
log_like   -3386.116022
AUC            0.986142
Sens           0.951108
Spec           0.981836
accuracy       0.971856
Name: FULL, dtype: float64


In [10]:
# LRT rows whose BH-adjusted p-value is below 0.01.
lrt_res.loc[lrt_res["BH_pval"] < 0.01]

Unnamed: 0,mutation,penalty,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval
0,Rv2752c_p.Asn30Ser,0.01,-3399.903591,27.575136,1.511088e-07,0.985922,0.950711,0.981884,0.97176,2.3e-05,2.3e-05
1,rpoC_p.Glu1092Asp,0.01,-3398.725236,25.218427,5.119076e-07,0.985491,0.951108,0.981789,0.971824,3.9e-05,7.8e-05
2,rpoC_p.Ile491Thr,0.01,-3394.704221,17.176397,3.406427e-05,0.986078,0.950909,0.981932,0.971856,0.001726,0.005178
3,rpoC_p.Asn698Ser,0.01,-3393.990711,15.749378,7.231173e-05,0.986091,0.950909,0.981932,0.971856,0.002748,0.010991
4,rpoC_p.Pro1040Arg,0.01,-3393.209605,14.187165,0.0001654956,0.986077,0.950909,0.981932,0.971856,0.005031,0.025155


In [11]:
# For each BH threshold, count the mutations that are significant in the LRT
# and also significant with Odds_Ratio > 1 in the excluded-samples analysis.
for thresh in (0.01, 0.05):
    lrt_hits = set(lrt_res.query("BH_pval < @thresh").mutation)
    exclude_hits = exclude_df.query("Odds_Ratio > 1 & BH_pval < @thresh")["mutation"]
    print(len(lrt_hits.intersection(exclude_hits)))

2
4


In [27]:
# LRT rows whose BH-adjusted p-value is below the chosen threshold.
thresh = 0.01
lrt_res.loc[lrt_res["BH_pval"] < thresh]

Unnamed: 0,mutation,penalty,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval
0,Rv2752c_p.Asn30Ser,0.01,-3399.903591,27.575136,1.511088e-07,0.985922,0.950711,0.981884,0.97176,2.3e-05,2.3e-05
1,rpoC_p.Glu1092Asp,0.01,-3398.725236,25.218427,5.119076e-07,0.985491,0.951108,0.981789,0.971824,3.9e-05,7.8e-05
2,rpoC_p.Ile491Thr,0.01,-3394.704221,17.176397,3.406427e-05,0.986078,0.950909,0.981932,0.971856,0.001726,0.005178
3,rpoC_p.Asn698Ser,0.01,-3393.990711,15.749378,7.231173e-05,0.986091,0.950909,0.981932,0.971856,0.002748,0.010991
4,rpoC_p.Pro1040Arg,0.01,-3393.209605,14.187165,0.0001654956,0.986077,0.950909,0.981932,0.971856,0.005031,0.025155


In [46]:
# Load the pickled model input matrix and show its dimensions.
model_matrix_path = os.path.join(
    analysis_dir,
    drug,
    "BINARY/tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled",
    "model_matrix.pkl",
)
model_matrix = pd.read_pickle(model_matrix_path)
model_matrix.shape

(30984, 3609)

In [60]:
# Model-matrix columns whose name mentions rpoB.
tier1_genes = list(filter(lambda col_name: "rpoB" in col_name, model_matrix.columns))
print(len(tier1_genes))

# Mutations whose LRT BH-adjusted p-value is below the threshold; these are
# the ones paired with the rpoB columns in the interaction cell.
thresh = 0.05
passes_lrt = lrt_res["BH_pval"] < thresh
mutations_for_interact = lrt_res.loc[passes_lrt, "mutation"].values
print(len(mutations_for_interact))

764
9


In [61]:
# Pairwise interaction features: for every (rpoB column, LRT-significant
# mutation) pair, the element-wise product of the two model-matrix columns.
interact_terms = list(itertools.product(tier1_genes, mutations_for_interact))

# Collect every product first and build the frame in one step. Inserting
# thousands of columns one at a time fragments the DataFrame (pandas raises a
# PerformanceWarning for it, which the global warning filter hides) and is slow.
new_matrix = pd.DataFrame(
    {
        "-".join([term1, term2]): model_matrix[term1] * model_matrix[term2]
        for (term1, term2) in interact_terms
    },
    index=model_matrix.index,
)
print(new_matrix.shape)

# Keep only interaction columns that are non-zero for at least one sample.
new_matrix = new_matrix.loc[:, (new_matrix != 0).any()]
print(new_matrix.shape)

interact_model_matrix = pd.concat([model_matrix, new_matrix], axis=1)

# Derive the output location from analysis_dir and drug instead of repeating
# the absolute path; with the current settings it resolves to the same file.
interact_model_matrix.to_pickle(
    os.path.join(analysis_dir, drug, "BINARY/interaction", "model_matrix_LRT05_sig.pkl")
)

(30984, 6876)
(30984, 154)


In [65]:
# Directory the interaction model matrix is written to.
interaction_dir = os.path.join(analysis_dir, drug, "BINARY/interaction")
interaction_dir

'/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Rifampicin/BINARY/interaction'

In [71]:
# Show the rpoC_p.Glu1092Asp column together with every interaction column
# built from it. regex=False makes this a literal substring match: the name
# contains ".", which as a regular expression would match any character.
glu1092asp_cols = interact_model_matrix.columns.str.contains("rpoC_p.Glu1092Asp", regex=False)
interact_model_matrix.loc[:, glu1092asp_cols]

Unnamed: 0_level_0,rpoC_p.Glu1092Asp,rpoB_p.Ala1172Pro-rpoC_p.Glu1092Asp,rpoB_p.Ala286Val-rpoC_p.Glu1092Asp,rpoB_p.Ala817Val-rpoC_p.Glu1092Asp,rpoB_p.Ala977Ser-rpoC_p.Glu1092Asp,rpoB_p.Arg167His-rpoC_p.Glu1092Asp,rpoB_p.Arg219Cys-rpoC_p.Glu1092Asp,rpoB_p.Arg219Ser-rpoC_p.Glu1092Asp,rpoB_p.Arg448Lys-rpoC_p.Glu1092Asp,rpoB_p.Arg552Cys-rpoC_p.Glu1092Asp,...,rpoB_p.Val168Ala-rpoC_p.Glu1092Asp,rpoB_p.Val170Ala-rpoC_p.Glu1092Asp,rpoB_p.Val170Phe-rpoC_p.Glu1092Asp,rpoB_p.Val534Ala-rpoC_p.Glu1092Asp,rpoB_p.Val555Ala-rpoC_p.Glu1092Asp,rpoB_p.Val562Met-rpoC_p.Glu1092Asp,rpoB_p.Val581Leu-rpoC_p.Glu1092Asp,rpoB_p.Val736Leu-rpoC_p.Glu1092Asp,rpoB_p.Val916Met-rpoC_p.Glu1092Asp,rpoB_p.Val996Gly-rpoC_p.Glu1092Asp
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
