In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
from Bio import SeqIO, Seq

import glob, os, yaml, subprocess, itertools, sparse, vcf
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.distributions.empirical_distribution import ECDF
import sklearn.metrics
from sklearn.decomposition import PCA
import timeit
import scipy.stats as st
import statsmodels.api as sm
import pickle, yaml, tracemalloc
from scipy.stats import binomtest

analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'
input_data_dir = '/n/data1/hms/dbmi/farhat/ye12/who'
who_variants_combined = pd.read_csv("who_confidence_2021.csv")

import itertools
from stats_utils import *
import warnings
warnings.filterwarnings("ignore")

# RIFAMPICIN TEST CASE

# 0. Original Analysis: 137 new significant resistance-associated variants

## But, there are 152 with OR > 1 and BH p-val < 0.01 that are in tier 2 genes (rpoB is the only tier 1)
## These 152 mutations were studied in the next two analyses

In [2]:
folder = "BINARY"
phenos_name = "WHO"
drug = "Rifampicin"
drug_WHO_abbr = "RIF"

# get the number of high confidence resistance-associated 
high_Conf_variants = who_variants_combined.loc[(who_variants_combined["drug"]==drug_WHO_abbr) & 
                                               (who_variants_combined["confidence"].str.contains("|".join(["1", "2"])))
                                              ].mutation.unique()

# these have Odds_Ratio > 1, BH pval < 0.01, and are not in the Tier 1 analysis (for RIF, it means they are not in rpoB)
# DIDN'T THRESHOLD USING THE UNIVARIATE STATS BECAUSE THOSE WILL BE COMPUTED AT THE VERY END AND SHOULDN'T BE INVOLVED IN MAKING DECISIONS ABOUT THE REGRESSION ANALYSIS
tier2_mutations_of_interest = get_tier2_mutations_of_interest(analysis_dir, drug, phenos_name)

og_model_analysis = pd.read_excel("../results/BINARY/Rifampicin.xlsx", sheet_name="Model_7")
print(og_model_analysis.query("Odds_Ratio > 1 & BH_pval < 0.01 & PPV_LB > 0.25 & TP >= 5 & ~mutation.str.contains('rpoB')").shape)

152 significant tier 2 mutations associated with WHO resistance
(137, 40)


In [3]:
og_model_analysis.query("mutation=='rpoC_p.Glu1092Asp'")[['mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
       'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
       'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec']]

Unnamed: 0,mutation,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,Num_Isolates,Total_Isolates,TP,FP,TN,FN,PPV_LB,PPV,PPV_UB,NPV,Sens,Spec
16,rpoC_p.Glu1092Asp,1.215757,1.142315,1.29766,1.50006e-09,2.883625e-08,5e-06,2987,35401,1892,1095,22769,9645,0.615839,0.633411,0.650722,0.702443,0.163994,0.954115


In [133]:
og_model_analysis.shape

(3609, 40)

# 1. Excluded Samples Analysis: 54 new significant resistance-associated variants

Removed ~5,000 samples that contain both high confidence rpoB mutations (Category 1 or 2 in 2021 catalogue) and any of the ~150 significant tier2 mutations. 

An L2-penalized regression was fit to determine the effect sizes of tier 2 mutations, independent of resistance-associated mutations they may occur with. 

In [94]:
# exclude_df = pd.read_csv(os.path.join(analysis_dir, f"{drug}/{folder}/exclude_comutation/WHOphenos_univariate_stats.csv"))
# exclude_df = exclude_df.loc[~exclude_df["mutation"].str.contains("PC")]
# exclude_df[['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']] = exclude_df[['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']].astype(int)

# exclude_df.query("Odds_Ratio > 1 & BH_pval < 0.01 & TP >= 5 & PPV_LB >= 0.25 & ~mutation.str.contains('rpoB')")

In [95]:
# exclude_df.query("mutation=='rpoC_p.Glu1092Asp'")[['mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
#        'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
#        'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec']]

In [96]:
# og_model_analysis.query("mutation=='rpoC_p.Glu1092Asp'")[['mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
#        'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
#        'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec']]

# 2. Likelihood Ratio Test: 5/152 of the Above Tier 2 Mutations are Significant

152 tier 2 variants were found significant in the first analysis. We then performed a likelihood ratio test:

A mutation is removed from the original model input matrix, and then a new model is fit. 

We then compare the log-likelihoods of the original model and the model with 1 less mutation. If the p-value is significant, it means that removing the mutation significantly changes the fit of the model. 

In [108]:
LRT_res = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_results_tiers1+2.csv"))
print(LRT_res.head(1))

LRT_res = add_pval_corrections(LRT_res.iloc[1:, ], col="pval")
print(LRT_res.shape)

  mutation     log_like  chi_stat  pval       AUC      Sens      Spec  \
0     FULL -3386.116022       NaN   NaN  0.986142  0.951108  0.981836   

   accuracy  
0  0.971856  
(3609, 10)


In [242]:
0.986142-0.963773

0.022368999999999972

In [240]:
LRT_res.query("mutation in @LRT_bootstrap_StatsDiff.mutation")

Unnamed: 0,mutation,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval,Significant
0,rpoB_p.Ser450Leu,-6132.979553,5493.727061,0.0,0.963773,0.860876,0.979829,0.941195,0.0,0.0,1
26,Rv2752c_p.Asn30Ser,-3399.923547,27.61505,1.480226e-07,0.985922,0.950711,0.981884,0.97176,4.5e-05,0.000534,1
30,rpoC_p.Glu1092Asp,-3398.74157,25.251095,5.033098e-07,0.985492,0.951108,0.981789,0.971824,0.000114,0.001816,1
38,rpoC_p.Ile491Thr,-3394.700163,17.168281,3.421011e-05,0.986078,0.950909,0.981932,0.971856,0.005144,0.123464,1
41,rpoC_p.Asn698Ser,-3394.005705,15.779366,7.117452e-05,0.986091,0.950909,0.981932,0.971856,0.009514,0.256869,1
47,rpoC_p.Pro1040Arg,-3393.225162,14.218279,0.0001627815,0.986078,0.950909,0.981932,0.971856,0.017802,0.587479,0


In [241]:
LRT_bootstrap_StatsDiff.groupby("mutation")["AUC"].mean()

mutation
Rv2752c_p.Asn30Ser    0.000199
rpoB_p.Ser450Leu      0.021720
rpoC_p.Asn698Ser      0.000055
rpoC_p.Glu1092Asp     0.000479
rpoC_p.Ile491Thr      0.000079
rpoC_p.Pro1040Arg     0.000068
Name: AUC, dtype: float64

In [215]:
og_model_analysis["Significant"] = og_model_analysis["Significant"].fillna(0).astype(int)
print(f"{len(og_model_analysis.query('Odds_Ratio > 1 & Significant == 1'))} mutations are significantly associated with resistance")
print(f"{len(og_model_analysis.query('Odds_Ratio > 1 & Significant == 1').loc[~og_model_analysis['mutation'].str.contains('rpoB')])} mutations are Tier 2")

for i, row in og_model_analysis.iterrows():
    
    if "rpoB" in row["mutation"]:
        if row["BH_pval"] < 0.05:
            og_model_analysis.loc[i, "Significant"] = 1
    else:
        if row["BH_pval"] < 0.01:
            og_model_analysis.loc[i, "Significant"] = 1


og_summary = pd.DataFrame(og_model_analysis.query("Odds_Ratio > 1 & Significant == 1").confidence.value_counts(dropna=False)).reset_index().rename(columns={"index": "confidence", 
                                                                                                                                             "confidence": "orig_significant_count"}).sort_values("confidence")

og_summary = og_summary.merge(pd.DataFrame(who_variants_combined.query("drug=='RIF'").confidence.value_counts()).reset_index().rename(columns={"index": "confidence", 
                                                                                                                                                 "confidence": "who2021_count"}).sort_values("confidence"), 
                  on="confidence", how="outer")

og_summary["who2021_count"] = og_summary["who2021_count"].fillna(0)
og_summary["who2021_count"] = og_summary["who2021_count"].astype(int)

350 mutations are significantly associated with resistance
167 mutations are Tier 2


In [227]:
final = og_summary.merge(lrt_summary, on="confidence")
final[["confidence", "who2021_count", "orig_significant_count", "LRT_significant_count"]]

Unnamed: 0,confidence,who2021_count,orig_significant_count,LRT_significant_count
0,1) Assoc w R,24,24,24
1,2) Assoc w R - Interim,93,33,15
2,3) Uncertain significance,1515,117,5
3,5) Not assoc w R,28,1,1
4,,0,175,4


In [225]:
24 + 33 + 118 + 175

350

In [226]:
39 + 10

49

In [208]:
final = og_summary.merge(lrt_summary, on="confidence")
final[["confidence", "who2021_count", "orig_significant_count", "LRT_significant_count"]]

Unnamed: 0,confidence,who2021_count,orig_significant_count,LRT_significant_count
0,1) Assoc w R,24,24,24
1,2) Assoc w R - Interim,93,18,11
2,3) Uncertain significance,1515,81,2
3,5) Not assoc w R,28,1,1
4,,0,161,4


In [245]:
og_model_analysis.query('Odds_Ratio > 1 & Significant == 1').shape

(350, 40)

In [246]:
set(LRT_res.query('Significant == 1 & mutation in @mut_assoc_res')["mutation"]) - set(og_model_analysis.query('Odds_Ratio > 1 & Significant == 1')["mutation"])

{'Rv1129c_p.Asp214Asn', 'glpK_p.Ile255Leu'}

In [247]:
og_model_analysis.query('Odds_Ratio > 1 & Significant == 1').shape

(350, 40)

In [250]:
og_model_analysis.query("Odds_Ratio > 1 & mutation in ['Rv1129c_p.Asp214Asn', 'glpK_p.Ile255Leu']")

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,...,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,Tier,Phenos,pool_type,synonymous,HET
94,Rv1129c_p.Asp214Asn,missense_variant,1253895,,1.067306,0.993585,1.079323,0.004564,0.022627,1.0,...,0.99999,0.291409,14.682404,0.999644,1.000177,2,WHO,unpooled,0,DROP
106,glpK_p.Ile255Leu,missense_variant,4138993,,1.062619,1.000009,1.070424,0.001785,0.011241,1.0,...,1.0,,inf,0.999586,1.000067,2,WHO,unpooled,0,DROP


In [251]:
LRT_res.query("mutation in ['Rv1129c_p.Asp214Asn', 'glpK_p.Ile255Leu']")

Unnamed: 0,mutation,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval,Significant
36,glpK_p.Ile255Leu,-3395.126391,18.020738,2.2e-05,0.986016,0.950909,0.981932,0.971856,0.003585,0.078861,1
40,Rv1129c_p.Asp214Asn,-3394.187504,16.142963,5.9e-05,0.986032,0.950909,0.981836,0.971792,0.008153,0.21198,1


In [253]:
np.array([1, 2, 3])

NameError: name 'X' is not defined

In [219]:
mut_assoc_res = og_model_analysis.query("Odds_Ratio > 1")["mutation"].values

for i, row in LRT_res.iterrows():
    
    if "rpoB" in row["mutation"]:
        if row["BH_pval"] < 0.05:
            LRT_res.loc[i, "Significant"] = 1
    else:
        if row["BH_pval"] < 0.01:
            LRT_res.loc[i, "Significant"] = 1

LRT_res["Significant"] = LRT_res["Significant"].fillna(0).astype(int)

print(f"{len(LRT_res.query('Significant == 1 & mutation in @mut_assoc_res'))} mutations are significantly associated with resistance")
print(f"{len(LRT_res.query('Significant == 1 & mutation in @mut_assoc_res').loc[~LRT_res['mutation'].str.contains('rpoB')])} mutations are Tier 2")


lrt_summary = LRT_res.query("Significant == 1 & mutation in @mut_assoc_res").merge(who_variants_combined.query("drug=='RIF'"), 
                                      on="mutation", how="left"
                                     ).confidence.value_counts(dropna=False).reset_index().rename(columns={"index": "confidence", "confidence": "LRT_significant_count"}).sort_values("confidence")

# lrt_summary = lrt_summary.merge(pd.DataFrame(who_variants_combined.query("drug=='RIF'").confidence.value_counts()).reset_index().rename(columns={"index": "confidence", 
#                                                                                                                                                  "confidence": "who2021_count"}).sort_values("confidence"), 
#                   on="confidence", how="outer")

# lrt_summary["who2021_count"] = lrt_summary["who2021_count"].fillna(0)
# lrt_summary["who2021_count"] = lrt_summary["who2021_count"].astype(int)

49 mutations are significantly associated with resistance
6 mutations are Tier 2


In [220]:
lrt_summary

Unnamed: 0,confidence,LRT_significant_count
0,1) Assoc w R,24
1,2) Assoc w R - Interim,15
2,3) Uncertain significance,5
4,5) Not assoc w R,1
3,,4


In [149]:
np.unique([gene.split("_")[0] for gene in who_variants_combined.query("drug=='RIF' & confidence == '1) Assoc w R'")["mutation"].values])

array(['rpoB'], dtype='<U4')

In [148]:
np.unique([gene.split("_")[0] for gene in who_variants_combined.query("drug=='RIF' & confidence == '2) Assoc w R - Interim'")["mutation"].values])

array(['rpoB'], dtype='<U4')

In [150]:
np.unique([gene.split("_")[0] for gene in who_variants_combined.query("drug=='RIF' & confidence == '3) Uncertain significance'")["mutation"].values])

array(['Rv2752c', 'rpoA', 'rpoB', 'rpoC'], dtype='<U7')

In [151]:
np.unique([gene.split("_")[0] for gene in who_variants_combined.query("drug=='RIF' & confidence == '5) Not assoc w R'")["mutation"].values])

array(['Rv2752c', 'rpoA', 'rpoB', 'rpoC'], dtype='<U7')

In [117]:
LRT_res.query("BH_pval < 0.01 & mutation in @mut_assoc_res").shape

(42, 10)

In [None]:
#.query("confidence not in ['1) Assoc w R', '2) Assoc w R - Interim']")

# Perform bootstrapping for all 152 mutations to see how much AUC, Sens, Spec, and Accuracy Change

If the confidence interval lies above 0, it means that removing the feature SIGNIFICANTLY DECREASES THE PREDICTIVE POWER OF THE MODEL

CI > 0 = FEATURE IS IMPORTANT

In [254]:
# bootstrapped the difference: full model stat - (model - 1) stat
LRT_bootstrap_StatsDiff = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_bootstrap_StatsDiff.csv.gz"), compression="gzip")
print(len(LRT_bootstrap_StatsDiff.mutation.unique()))
LRT_bootstrap_StatsDiff2 = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_bootstrap_StatsDiff_2.csv.gz"), compression="gzip")
print(len(LRT_bootstrap_StatsDiff2.mutation.unique()))

6
5


In [258]:
LRT_bootstrap_StatsDiff = pd.concat([LRT_bootstrap_StatsDiff, LRT_bootstrap_StatsDiff2])

In [78]:
pval_df["AUC"].values

array([0.0, 0.010000000000000009, 0.010000000000000009,
       0.09999999999999998, 0.020000000000000018, 0.09999999999999998],
      dtype=object)

In [260]:
LRT_bootstrap_StatsDiff

Unnamed: 0,AUC,Sens,Spec,accuracy,mutation
0,1.946493e-02,0.080225,0.002110,0.027659,rpoB_p.Ser450Leu
1,2.122932e-02,0.088078,0.002849,0.030145,rpoB_p.Ser450Leu
2,2.066901e-02,0.081655,0.005795,0.030951,rpoB_p.Ser450Leu
3,2.224834e-02,0.097551,-0.001723,0.030596,rpoB_p.Ser450Leu
4,2.095228e-02,0.085680,0.004442,0.030790,rpoB_p.Ser450Leu
...,...,...,...,...,...
495,1.091928e-07,0.000000,0.000000,0.000000,Rv1129c_p.Ser362Thr
496,1.616239e-07,0.000000,0.000000,0.000000,Rv1129c_p.Ser362Thr
497,5.641436e-07,0.000000,0.000000,0.000000,Rv1129c_p.Ser362Thr
498,6.643447e-08,0.000000,0.000000,0.000000,Rv1129c_p.Ser362Thr


In [261]:
LRT_bootstrap_StatsDiff.mutation.unique()

array(['rpoB_p.Ser450Leu', 'Rv2752c_p.Asn30Ser', 'rpoC_p.Glu1092Asp',
       'rpoC_p.Ile491Thr', 'rpoC_p.Asn698Ser', 'rpoC_p.Pro1040Arg',
       'glpK_p.Leu152Arg', 'Rv2752c_p.Val396Gly', 'lpqB_p.Asp370Glu',
       'rpoA_c.-316G>A', 'Rv1129c_p.Ser362Thr'], dtype=object)

In [266]:
pval_df

Unnamed: 0,AUC,Sens,Spec,accuracy
rpoB_p.Ser450Leu,0.0,,,
Rv2752c_p.Asn30Ser,0.01,,,
rpoC_p.Glu1092Asp,0.01,,,
rpoC_p.Ile491Thr,0.1,,,
rpoC_p.Asn698Ser,0.02,,,
rpoC_p.Pro1040Arg,0.1,,,
glpK_p.Leu152Arg,0.5,,,
Rv2752c_p.Val396Gly,0.27,,,
lpqB_p.Asp370Glu,0.31,,,
rpoA_c.-316G>A,0.46,,,


In [265]:
for metric in ['AUC']:

    print(st.spearmanr(LRT_res.query("mutation in @LRT_bootstrap_StatsDiff.mutation.unique()")["BH_pval"], 
                 pval_df[metric]
                ))

SpearmanrResult(correlation=0.9338806280906226, pvalue=2.6137422278744596e-05)


In [264]:
pval_df = pd.DataFrame(columns=['AUC', 'Sens', 'Spec', 'accuracy'])

for mut in LRT_bootstrap_StatsDiff.mutation.unique():
    
    print("\n" + mut)
    
    for metric in ['AUC']:
    
#         lower, upper = np.percentile(LRT_bootstrap_StatsDiff.query("mutation==@mut")[metric], q=[2.5, 97.5])
    
#         if lower > 0 and upper > 0:
#             print(f"    {metric}: {round(lower, 4)}, {round(upper, 4)}")

        pval = 1 - np.mean(LRT_bootstrap_StatsDiff.query('mutation==@mut')[metric].values > 0)
        
        if pval < 0.05:
            print(f"{metric}: {round(pval, 2)}")
            
        pval_df.loc[mut, metric] = pval


rpoB_p.Ser450Leu
AUC: 0.0

Rv2752c_p.Asn30Ser
AUC: 0.01

rpoC_p.Glu1092Asp
AUC: 0.01

rpoC_p.Ile491Thr

rpoC_p.Asn698Ser
AUC: 0.02

rpoC_p.Pro1040Arg

glpK_p.Leu152Arg

Rv2752c_p.Val396Gly

lpqB_p.Asp370Glu

rpoA_c.-316G>A

Rv1129c_p.Ser362Thr


In [234]:
LRT_bootstrap_StatsDiff

Unnamed: 0,AUC,Sens,Spec,accuracy,mutation
0,0.019465,0.080225,0.002110,0.027659,rpoB_p.Ser450Leu
1,0.021229,0.088078,0.002849,0.030145,rpoB_p.Ser450Leu
2,0.020669,0.081655,0.005795,0.030951,rpoB_p.Ser450Leu
3,0.022248,0.097551,-0.001723,0.030596,rpoB_p.Ser450Leu
4,0.020952,0.085680,0.004442,0.030790,rpoB_p.Ser450Leu
...,...,...,...,...,...
595,0.000246,0.000302,0.000000,0.000097,rpoC_p.Pro1040Arg
596,0.000006,0.000000,0.000000,0.000000,rpoC_p.Pro1040Arg
597,-0.000002,0.000000,-0.000048,-0.000032,rpoC_p.Pro1040Arg
598,0.000222,0.000685,-0.000337,0.000000,rpoC_p.Pro1040Arg


In [77]:
LRT_res.query("mutation in @LRT_bootstrap_StatsDiff.mutation.unique()").query("BH_pval < 0.05")

Unnamed: 0,mutation,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval
0,rpoB_p.Ser450Leu,-6132.979553,5493.727061,0.0,0.963773,0.860876,0.979829,0.941195,0.0,0.0
26,Rv2752c_p.Asn30Ser,-3399.923547,27.61505,1.480226e-07,0.985922,0.950711,0.981884,0.97176,4.5e-05,0.000534
30,rpoC_p.Glu1092Asp,-3398.74157,25.251095,5.033098e-07,0.985492,0.951108,0.981789,0.971824,0.000114,0.001816
38,rpoC_p.Ile491Thr,-3394.700163,17.168281,3.421011e-05,0.986078,0.950909,0.981932,0.971856,0.005144,0.123464
41,rpoC_p.Asn698Ser,-3394.005705,15.779366,7.117452e-05,0.986091,0.950909,0.981932,0.971856,0.009514,0.256869
47,rpoC_p.Pro1040Arg,-3393.225162,14.218279,0.0001627815,0.986078,0.950909,0.981932,0.971856,0.017802,0.587479


# Model with Interaction Terms

In [8]:
interact_coef = pd.read_csv(os.path.join(analysis_dir, drug, f"BINARY/interaction/{phenos_name}_LRT05_sig_coef.csv"))
interact_coef_bs = pd.read_csv(os.path.join(analysis_dir, drug, f"BINARY/interaction/{phenos_name}_LRT05_sig_BScoef.csv"))

# interact_res = pd.read_csv(os.path.join(analysis_dir, drug, f"BINARY/interaction/{phenos_name}_LRT05_sig_BSsummaryStats.csv"))

interact_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, f"BINARY/interaction/model_matrix_{phenos_name}_LRT05_sig.pkl"))
num_samples = len(interact_matrix)

# del interact_matrix
interact_coef = get_pvalues_add_ci(interact_coef, interact_coef_bs, "mutation", num_samples, alpha=0.05)
#interact_coef = add_pval_corrections()

interact_coef = add_pval_corrections(interact_coef)

In [9]:
split_mut = interact_coef["mutation"].str.split("+", n=1, expand=True)
split_mut.columns = ["Mut1", "Mut2"]

interact_coef = pd.concat([interact_coef, split_mut], axis=1)

mut_conf_dict = dict(zip(who_variants_combined.query("drug=='RIF'")["mutation"], who_variants_combined.query("drug=='RIF'")["confidence"]))

interact_coef["conf1"] = interact_coef["Mut1"].map(mut_conf_dict)
interact_coef["conf2"] = interact_coef["Mut2"].map(mut_conf_dict)

In [20]:
interact_coef.query("BH_pval < 0.05").loc[interact_coef["mutation"].str.contains("rpoC_p.Glu1092Asp")].sort_values("coef", ascending=False)

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Mut1,Mut2,conf1,conf2
107,rpoC_p.Glu1092Asp,0.219059,0.1307759,0.276704,1.474453e-10,5.291179e-09,5.555738e-07,rpoC_p.Glu1092Asp,,5) Not assoc w R,
51,rpoB_p.Glu761Asp+rpoC_p.Glu1092Asp,0.04575,0.03516931,0.055883,6.531201e-17,4.825405e-15,2.460957e-13,rpoB_p.Glu761Asp,rpoC_p.Glu1092Asp,3) Uncertain significance,5) Not assoc w R
400,rpoB_p.Asp435Gly+rpoC_p.Glu1092Asp,0.038459,0.01972758,0.063935,0.0004819887,0.004597806,1.0,rpoB_p.Asp435Gly,rpoC_p.Glu1092Asp,2) Assoc w R - Interim,5) Not assoc w R
96,rpoB_p.Ile488Val+rpoC_p.Glu1092Asp,0.033812,0.02370705,0.043639,2.951732e-11,1.17075e-09,1.112212e-07,rpoB_p.Ile488Val,rpoC_p.Glu1092Asp,3) Uncertain significance,5) Not assoc w R
184,rpoB_p.Ser450Trp+rpoC_p.Glu1092Asp,0.026928,0.01726181,0.037756,4.941578e-07,1.02307e-05,0.001861987,rpoB_p.Ser450Trp,rpoC_p.Glu1092Asp,1) Assoc w R,5) Not assoc w R
596,rpoB_p.Asp435Asn+rpoC_p.Glu1092Asp,0.026639,1.9519510000000003e-17,0.042539,0.003542234,0.02269921,1.0,rpoB_p.Asp435Asn,rpoC_p.Glu1092Asp,,5) Not assoc w R
726,rpoB_p.Met434Val+rpoC_p.Glu1092Asp,0.025688,0.006916843,0.04585,0.008438221,0.044782,1.0,rpoB_p.Met434Val,rpoC_p.Glu1092Asp,2) Assoc w R - Interim,5) Not assoc w R
706,rpoB_p.His445Leu+rpoC_p.Glu1092Asp,0.025392,0.01397709,0.052171,0.007719784,0.04209573,1.0,rpoB_p.His445Leu,rpoC_p.Glu1092Asp,1) Assoc w R,5) Not assoc w R
469,rpoB_p.Ser441Gln+rpoC_p.Glu1092Asp,0.022871,0.009898698,0.036071,0.0009790105,0.007950241,1.0,rpoB_p.Ser441Gln,rpoC_p.Glu1092Asp,1) Assoc w R,5) Not assoc w R
268,rpoB_p.Ile480Val+rpoC_p.Glu1092Asp,0.022659,0.00934735,0.030784,2.985122e-05,0.0004244505,0.1124794,rpoB_p.Ile480Val,rpoC_p.Glu1092Asp,3) Uncertain significance,5) Not assoc w R


In [38]:
mutations_for_interact = ['Rv2752c_p.Asn30Ser', 'rpoC_p.Glu1092Asp', 'rpoC_p.Ile491Thr',
       'rpoC_p.Asn698Ser', 'rpoC_p.Pro1040Arg', 'rpoC_p.Ile491Val',
       'rpoC_p.Glu750Asp', 'rpoA_p.Val183Gly', 'rpoC_p.Asn416Ser']

In [74]:
for mut in mutations_for_interact:
    
    print(mut, interact_model_matrix[interact_model_matrix.columns[interact_model_matrix.columns.str.contains(mut)]].shape[1])

Rv2752c_p.Asn30Ser 4
rpoC_p.Glu1092Asp 125
rpoC_p.Ile491Thr 3
rpoC_p.Asn698Ser 4
rpoC_p.Pro1040Arg 6
rpoC_p.Ile491Val 8
rpoC_p.Glu750Asp 4
rpoA_p.Val183Gly 6
rpoC_p.Asn416Ser 3
