In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
from Bio import SeqIO, Seq

import glob, os, yaml, subprocess, itertools, sparse, vcf
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.distributions.empirical_distribution import ECDF
import sklearn.metrics
from sklearn.decomposition import PCA
import timeit
import scipy.stats as st
import statsmodels.api as sm
import pickle, yaml, tracemalloc
from scipy.stats import binomtest

# Root directories on the cluster file system. analysis_dir is joined with the drug name in later cells to locate results.
# NOTE(review): these are absolute, user-specific paths; the notebook only runs where they are mounted.
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'
input_data_dir = '/n/data1/hms/dbmi/farhat/ye12/who'
# Catalogue gradings; later cells use its drug, confidence and mutation columns.
who_variants_combined = pd.read_csv("who_confidence_2021.csv")

# NOTE(review): itertools is already imported in the import block above; this second import is redundant.
import itertools
# NOTE(review): star import hides where helper names come from (add_pval_corrections is presumably defined here — confirm).
from stats_utils import *
import warnings
# NOTE(review): silences every warning for the rest of the session, including pandas and numerical warnings.
warnings.filterwarnings("ignore")

# RIFAMPICIN TEST CASE

# 0. Original Analysis: 137 new significant resistance-associated variants

## However, there are 152 mutations with OR > 1 and BH p-value < 0.01 that are in Tier 2 genes (rpoB is the only Tier 1 gene).
## These 152 mutations are studied in the next two analyses.

In [4]:
folder = "BINARY"
phenos_name = "WHO"
drug = "Rifampicin"
drug_WHO_abbr = "RIF"

# Get the mutations that the catalogue grades as high-confidence resistance-associated for this drug
# (confidence categories 1 and 2). The pattern is anchored to the category prefix, e.g. "1) Assoc w R",
# so that a stray "1" or "2" elsewhere in a label cannot match, and na=False treats ungraded rows as not matching.
high_Conf_variants = who_variants_combined.loc[(who_variants_combined["drug"] == drug_WHO_abbr) &
                                               (who_variants_combined["confidence"].str.contains(r"^\s*[12]\)", na=False))
                                              ].mutation.unique()

# these have Odds_Ratio > 1, BH pval < 0.01, and are not in the Tier 1 analysis (for RIF, it means they are not in rpoB)
# The univariate stats were deliberately NOT used for thresholding: they are computed at the very end and
# should not be involved in making decisions about the regression analysis.
# sheet_name is a list, so read_excel returns a dict of DataFrames keyed by sheet name.
og_models = pd.read_excel(f"../results/{folder}/{drug}.xlsx", sheet_name=["Model_3", "Model_7", "Model_11", "Model_15"])
#print(og_who_model.query("Odds_Ratio > 1 & BH_pval < 0.01 & PPV_LB > 0.25 & TP >= 5 & ~mutation.str.contains('rpoB')").shape)



In [5]:
def get_significant_genes_by_tier(tier1_model, tier2_model, LRT_res, tier1_thresh=0.05, tier2_thresh=0.01):
    """Combine the significant, positively associated mutations of the Tier 1 and Tier 2 models.

    A mutation is kept when its Odds_Ratio is above 1 and its BH_pval is below the threshold
    for its tier. A mutation that passes in both models is reported once, as Tier 1.

    Parameters
    ----------
    tier1_model, tier2_model : pd.DataFrame
        Model results with at least the columns mutation, Odds_Ratio and BH_pval.
    LRT_res : pd.DataFrame
        Likelihood ratio test results with the columns mutation, LRT_pval, LRT_BH_pval
        and LRT_Bonferroni_pval.
    tier1_thresh : float, default 0.05
        BH_pval cutoff applied to the Tier 1 model.
    tier2_thresh : float, default 0.01
        BH_pval cutoff applied to the Tier 2 model.

    Returns
    -------
    pd.DataFrame
        One row per significant mutation, with a Tier column (1 or 2) and the three LRT
        p-value columns left-joined on mutation (NaN where LRT_res has no matching row).
    """
    print(f"Total numbers of mutations: {tier1_model.shape}, {tier2_model.shape}")

    tier1_significant = tier1_model.query("Odds_Ratio > 1 & BH_pval < @tier1_thresh")
    tier2_significant = tier2_model.query("Odds_Ratio > 1 & BH_pval < @tier2_thresh")

    # the second number counts only the Tier 2 mutations that are not already significant in Tier 1
    tier1_mutations = set(tier1_significant.mutation)
    print(f"Numbers of significant mutations: {len(tier1_mutations)}, {len(set(tier2_significant.mutation) - tier1_mutations)}")

    combined_df = pd.concat([tier1_significant, tier2_significant], axis=0)
    combined_df["Tier"] = [1]*len(tier1_significant) + [2]*len(tier2_significant)

    # Tier 1 rows come first, so keep="first" gives Tier 1 precedence for mutations found in both models
    LRT_columns = ["mutation", "LRT_pval", "LRT_BH_pval", "LRT_Bonferroni_pval"]
    return combined_df.drop_duplicates(subset="mutation", keep="first").merge(LRT_res[LRT_columns], on="mutation", how="left")

# 1. Excluded Samples Analysis: 54 new significant resistance-associated variants

Removed ~5,000 samples that contain both a high-confidence rpoB mutation (Category 1 or 2 in the 2021 catalogue) and any of the ~150 significant Tier 2 mutations.

An L2-penalized regression was fit to determine the effect sizes of tier 2 mutations, independent of resistance-associated mutations they may occur with. 

In [23]:
# exclude_df = pd.read_csv(os.path.join(analysis_dir, f"{drug}/{folder}/exclude_comutation/WHOphenos_univariate_stats.csv"))
# exclude_df = exclude_df.loc[~exclude_df["mutation"].str.contains("PC")]
# exclude_df[['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']] = exclude_df[['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']].astype(int)

# exclude_df.query("Odds_Ratio > 1 & BH_pval < 0.01 & TP >= 5 & PPV_LB >= 0.25 & ~mutation.str.contains('rpoB')")

In [24]:
# exclude_df.query("mutation=='rpoC_p.Glu1092Asp'")[['mutation', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval',
#        'Bonferroni_pval', 'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN',
#        'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'NPV', 'Sens', 'Spec']]

# 2. Likelihood Ratio Test: 5/152 of the Above Tier 2 Mutations are Significant

152 tier 2 variants were found significant in the first analysis. We then performed a likelihood ratio test:

A mutation is removed from the original model input matrix, and then a new model is fit. 

We then compare the log-likelihoods of the original model and the model with one fewer mutation. A significant p-value means that removing the mutation significantly changes the fit of the model.

In [118]:
# Display the Tier 1 likelihood ratio test results.
# NOTE(review): LRTresults_tier1 is only assigned in a cell further down the notebook, so on a fresh
# kernel run top-to-bottom this cell raises NameError; move it below that cell.
LRTresults_tier1

Unnamed: 0,log_like,chi_stat,pval,AUC,Sens,Spec,accuracy,BH_pval,Bonferroni_pval
0,-6060.916834,2301.327422,0.0,0.964044,0.904443,0.978961,0.954725,0.0,0.0
1,-5277.604669,734.703093,0.0,0.969290,0.923311,0.981392,0.962502,0.0,0.0
2,-4998.790841,177.075436,0.0,0.973489,0.932702,0.979003,0.963944,0.0,0.0
3,-4959.474858,98.443469,0.0,0.973846,0.934093,0.979129,0.964482,0.0,0.0
4,-5037.650604,254.794962,0.0,0.972094,0.929484,0.981769,0.964764,0.0,0.0
...,...,...,...,...,...,...,...,...,...
810,-4910.148492,-0.209262,1.0,0.974210,0.934788,0.978961,0.964595,1.0,1.0
811,-4910.085364,-0.335519,1.0,0.974170,0.932441,0.981141,0.965302,1.0,1.0
812,-4910.203516,-0.099214,1.0,0.974200,0.934788,0.978961,0.964595,1.0,1.0
813,-4909.935748,-0.634751,1.0,0.974187,0.934788,0.978961,0.964595,1.0,1.0


In [32]:
# Show the directory holding the Tier 1 model results for the current drug and phenotype set (nothing is read here).
os.path.join(analysis_dir, drug, f"BINARY/tiers=1/phenos={phenos_name}/dropAF_noSyn_unpooled")

'/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Rifampicin/BINARY/tiers=1/phenos=WHO/dropAF_noSyn_unpooled'

In [6]:
phenos_name = "WHO"
out_dir = os.path.join(analysis_dir, drug, f"BINARY/LRT/phenos={phenos_name}")


def _load_LRT_results(tier):
    """Read the LRT results of one tier ("1" or "2"), drop the FULL model row and add corrected p-values."""
    LRT_results = pd.read_csv(os.path.join(out_dir, f"results_tier={tier}.csv")).rename(columns={"Unnamed: 0": "mutation"})
    # remove the FULL model row, which is the first row
    return add_pval_corrections(LRT_results.iloc[1:, ])


def _load_model_analysis(tiers):
    """Read model_analysis.csv for a tier combination ("1" or "1+2") and drop rows whose mutation contains 'PC'."""
    model_dir = os.path.join(analysis_dir, drug, f"BINARY/tiers={tiers}/phenos={phenos_name}/dropAF_noSyn_unpooled")
    # NOTE(review): 'PC' presumably marks principal-component covariates; the substring match would also
    # drop any real mutation whose name contains "PC" — confirm that none do.
    return pd.read_csv(os.path.join(model_dir, "model_analysis.csv")).query("~mutation.str.contains('PC')")


def _add_LRT_pvals(model_permute, LRT_results):
    """Left-join the LRT p-values onto the model results, prefixed with LRT_ to avoid column name clashes."""
    LRT_pvals = LRT_results[["mutation", "pval", "BH_pval", "Bonferroni_pval"]].rename(columns={"pval": "LRT_pval",
                                                                                              "BH_pval": "LRT_BH_pval",
                                                                                              "Bonferroni_pval": "LRT_Bonferroni_pval"
                                                                                             })
    return model_permute.merge(LRT_pvals, on="mutation", how="left")


LRTresults_tier1 = _load_LRT_results("1")
LRTresults_tier2 = _load_LRT_results("2")

tier1_model_permute = _load_model_analysis("1")
tier2_model_permute = _load_model_analysis("1+2")

# remove the tier 1 genes for the purposes of this analysis
tier2_model_permute = tier2_model_permute.query("mutation not in @tier1_model_permute.mutation")

# check that all mutations in the permutation dataframe are in the LRT dataframe. The only difference should be the FULL model row
assert len(set(tier1_model_permute.mutation).symmetric_difference(LRTresults_tier1.mutation)) == 0, "Tier 1 model and LRT results cover different mutations"
assert len(set(tier2_model_permute.mutation).symmetric_difference(LRTresults_tier2.mutation)) == 0, "Tier 2 model and LRT results cover different mutations"

# combine results into a single dataframe for easy searching
tier1_model = _add_LRT_pvals(tier1_model_permute, LRTresults_tier1)
tier2_model = _add_LRT_pvals(tier2_model_permute, LRTresults_tier2)

print(len(tier1_model), len(tier2_model))

815 2845


In [7]:
# Catalogue confidence gradings of the Tier 1 mutations that are significant by the model p-value
# (BH_pval < thresh) but not by the likelihood ratio test (LRT_BH_pval > thresh).
# Note that there is no Odds_Ratio filter here, so mutations with Odds_Ratio <= 1 are counted too.
thresh = 0.05
tier1_model.query("BH_pval < @thresh & LRT_BH_pval > @thresh").confidence.value_counts(dropna=False)

3) Uncertain significance    34
NaN                          16
2) Assoc w R - Interim       14
5) Not assoc w R              4
Name: confidence, dtype: int64

In [8]:
# Count the Tier 1 mutations with Odds_Ratio > 1 by how the two significance calls agree:
# the model p-value (BH_pval) versus the likelihood ratio test (LRT_BH_pval), both at the same threshold.
thresh = 0.05

for significance_query, description in [
    ("BH_pval < @thresh & LRT_BH_pval > @thresh", "significant coef, but not LRT"),
    ("BH_pval > @thresh & LRT_BH_pval < @thresh", "significant LRT, but not coef"),
    ("BH_pval < @thresh & LRT_BH_pval < @thresh", "significant in both"),
]:
    num_mutations = len(tier1_model.query(f"Odds_Ratio > 1 & {significance_query}"))
    print(f"{num_mutations} {description}")

54 significant coef, but not LRT
4 significant LRT, but not coef
47 significant in both


In [9]:
# List the Tier 1 mutations with Odds_Ratio > 1 that are picked up by the LRT (LRT_BH_pval < thresh)
# but not by the coefficient association (BH_pval > thresh).
# They are not significant in the tiers 1+2 model either 
thresh = 0.05
tier1_model.query("Odds_Ratio > 1 & BH_pval > @thresh & LRT_BH_pval < @thresh")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,confidence,LRT_pval,LRT_BH_pval,LRT_Bonferroni_pval
45,rpoB_p.Asn438del,0.07156,0.001,0.41,0.82,1.074182,2) Assoc w R - Interim,1.646256e-08,2e-06,1.3e-05
65,rpoB_p.Leu430_Ser431insArg,0.050143,0.003,0.615,1.0,1.051422,,2.782667e-05,0.001194,0.022679
70,rpoB_p.Gln436del,0.048461,0.011,0.751667,1.0,1.049655,2) Assoc w R - Interim,1.692521e-05,0.000811,0.013794
97,rpoB_p.Asp435_Gln436delinsGlu,0.035628,0.034,0.961379,1.0,1.03627,2) Assoc w R - Interim,0.001195996,0.034812,0.974737


In [10]:
# Catalogue confidence gradings of the Tier 2 mutations that are significant by the model p-value
# (BH_pval < thresh) but not by the LRT (LRT_BH_pval > thresh). Tier 2 uses the stricter 0.01 threshold.
# No Odds_Ratio filter is applied here.
thresh = 0.01
tier2_model.query("BH_pval < @thresh & LRT_BH_pval > @thresh").confidence.value_counts(dropna=False)

NaN                          111
3) Uncertain significance     20
5) Not assoc w R               5
Name: confidence, dtype: int64

In [11]:
# Spot check: model and LRT results for a single Tier 2 mutation
tier2_model.query("mutation=='glpK_p.Val192fs'")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,confidence,LRT_pval,LRT_BH_pval,LRT_Bonferroni_pval
203,glpK_p.Val192fs,0.026607,0.033,1.0,1.0,1.026964,,1.0,1.0,1.0


In [12]:
# Count the Tier 2 mutations with Odds_Ratio > 1 by how the two significance calls agree:
# the model p-value (BH_pval) versus the likelihood ratio test (LRT_BH_pval), both at the same threshold.
thresh = 0.01

for significance_query, description in [
    ("BH_pval < @thresh & LRT_BH_pval > @thresh", "significant coef, but not LRT"),
    ("BH_pval > @thresh & LRT_BH_pval < @thresh", "significant LRT, but not coef"),
    ("BH_pval < @thresh & LRT_BH_pval < @thresh", "significant in both"),
]:
    num_mutations = len(tier2_model.query(f"Odds_Ratio > 1 & {significance_query}"))
    print(f"{num_mutations} {description}")

98 significant coef, but not LRT
0 significant LRT, but not coef
2 significant in both


In [13]:
# List the Tier 2 mutations with Odds_Ratio > 1 that are significant by the model p-value but not by the LRT.
# thresh is set here rather than inherited from whichever cell ran last, so the cell gives the same
# result regardless of execution order.
thresh = 0.01
tier2_model.query('Odds_Ratio > 1 & BH_pval < @thresh & LRT_BH_pval > @thresh')

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,confidence,LRT_pval,LRT_BH_pval,LRT_Bonferroni_pval
0,rpoC_p.Val483Gly,0.300945,0.0,0.0,0.0,1.351136,,0.006608,0.458527,1.000000
1,rpoC_p.Ile491Thr,0.199407,0.0,0.0,0.0,1.220678,,0.000034,0.024332,0.097328
3,rpoC_p.Ile491Val,0.194982,0.0,0.0,0.0,1.215289,3) Uncertain significance,0.000808,0.176888,1.000000
4,rpoC_p.Asn698Ser,0.190240,0.0,0.0,0.0,1.209540,,0.000071,0.033749,0.202492
5,rpoC_p.Pro1040Arg,0.179407,0.0,0.0,0.0,1.196508,3) Uncertain significance,0.000163,0.051457,0.463113
...,...,...,...,...,...,...,...,...,...,...
138,glpK_c.-7A>G,0.036804,0.0,0.0,0.0,1.037489,,0.010680,0.523861,1.000000
175,Rv2477c_p.Phe429Leu,0.030412,0.0,0.0,0.0,1.030879,,0.042472,1.000000,1.000000
234,glpK_p.Asp215Gly,0.023232,0.0,0.0,0.0,1.023504,,0.827897,1.000000,1.000000
245,Rv2752c_p.Ser189*,0.022385,0.0,0.0,0.0,1.022638,,0.825366,1.000000,1.000000


In [31]:
# Spot check: model and LRT results for three rpoC mutations of interest
tier2_model.query("mutation in ['rpoC_p.Val483Ala', 'rpoC_p.Val483Gly', 'rpoC_p.Glu1092Asp']")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,confidence,LRT_pval,LRT_BH_pval,LRT_Bonferroni_pval
0,rpoC_p.Val483Gly,0.300945,0.0,0.0,0.0,1.351136,,0.006607943,0.458527,1.0
2,rpoC_p.Glu1092Asp,0.19584,0.0,0.0,0.0,1.216332,5) Not assoc w R,5.033098e-07,0.000716,0.001432
6,rpoC_p.Val483Ala,0.159044,0.0,0.0,0.0,1.17239,,0.01559949,0.625078,1.0


In [14]:
# Names of the Tier 2 mutations that are significant by both the model p-value and the LRT.
# No Odds_Ratio filter is applied here.
thresh = 0.01
tier2_model.query('BH_pval < @thresh & LRT_BH_pval < @thresh')["mutation"].values

array(['rpoC_p.Glu1092Asp', 'Rv2752c_p.Asn30Ser'], dtype=object)

In [15]:
# Number of catalogue mutations per confidence grading for the drug under study.
# Uses drug_WHO_abbr instead of a hard-coded drug code so the cell follows the drug chosen at the top.
who_variants_combined.query("drug==@drug_WHO_abbr").confidence.value_counts()

3) Uncertain significance    1515
2) Assoc w R - Interim         93
5) Not assoc w R               28
1) Assoc w R                   24
Name: confidence, dtype: int64

# Perform bootstrapping for all 152 mutations to see how much AUC, Sens, Spec, and Accuracy Change

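If the confidence interval of the bootstrapped difference (full-model statistic minus the statistic of the model without the feature) lies above 0, it means that removing the feature SIGNIFICANTLY DECREASES THE PREDICTIVE POWER OF THE MODEL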

CI > 0 = FEATURE IS IMPORTANT

In [16]:
# # bootstrapped the difference: full model stat - (model - 1) stat
# LRT_bootstrap_StatsDiff = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_bootstrap_StatsDiff.csv.gz"), compression="gzip")
# print(len(LRT_bootstrap_StatsDiff.mutation.unique()))
# LRT_bootstrap_StatsDiff2 = pd.read_csv(os.path.join(analysis_dir, drug, "BINARY/LRT", f"{phenos_name}phenos_bootstrap_StatsDiff_2.csv.gz"), compression="gzip")
# print(len(LRT_bootstrap_StatsDiff2.mutation.unique()))

# LRT_bootstrap_StatsDiff = pd.concat([LRT_bootstrap_StatsDiff, LRT_bootstrap_StatsDiff2])

In [265]:
# Rank correlation between the LRT BH p-values and the bootstrap p-values of the same mutations.
# NOTE(review): LRT_res and pval_df are not defined above this cell in the notebook (pval_df is built in the
# cell that follows; LRT_res is not assigned anywhere visible), so this fails on a fresh top-to-bottom run.
for metric in ['AUC']:

    # pval_df is indexed by mutation and filled cell by cell, which leaves object dtype; cast before ranking
    bootstrap_pvals = pval_df[metric].astype(float)

    # Pair every bootstrap p-value with the LRT p-value of the SAME mutation. Passing the two columns
    # positionally only works if both tables happen to list the mutations in the same order.
    LRT_BH_pvals = LRT_res.set_index("mutation")["BH_pval"].reindex(bootstrap_pvals.index)
    assert not LRT_BH_pvals.isna().any(), "some bootstrapped mutations have no LRT BH p-value"

    print(st.spearmanr(LRT_BH_pvals, bootstrap_pvals))

SpearmanrResult(correlation=0.9338806280906226, pvalue=2.6137422278744596e-05)


In [264]:
# One row per mutation, one column per performance metric (only AUC is filled in below)
pval_df = pd.DataFrame(columns=['AUC', 'Sens', 'Spec', 'accuracy'])

# sort=False visits the mutations in order of first appearance, i.e. the same order as .unique()
for mut, mut_bootstraps in LRT_bootstrap_StatsDiff.groupby("mutation", sort=False):

    print("\n" + mut)

    for metric in ['AUC']:

        # Empirical p-value: the share of bootstrap replicates in which the difference is NOT above 0.
        # (A 2.5-97.5 percentile interval of the same values is the alternative check for "lies above 0".)
        share_above_zero = np.mean(mut_bootstraps[metric].values > 0)
        pval = 1 - share_above_zero

        if pval < 0.05:
            print(f"{metric}: {round(pval, 2)}")

        pval_df.loc[mut, metric] = pval


rpoB_p.Ser450Leu
AUC: 0.0

Rv2752c_p.Asn30Ser
AUC: 0.01

rpoC_p.Glu1092Asp
AUC: 0.01

rpoC_p.Ile491Thr

rpoC_p.Asn698Ser
AUC: 0.02

rpoC_p.Pro1040Arg

glpK_p.Leu152Arg

Rv2752c_p.Val396Gly

lpqB_p.Asp370Glu

rpoA_c.-316G>A

Rv1129c_p.Ser362Thr


# Model with Interaction Terms

Chose 2 variants because they were the only Tier 2 mutations that were both significantly associated in the ridge regression and significant in the LRT.

In [23]:
# Tier 2 mutations that are significant by both the model p-value and the LRT at this threshold;
# these are the mutations that get interaction terms.
thresh = 0.01
interact_mut = tier2_model.query('BH_pval < @thresh & LRT_BH_pval < @thresh')["mutation"].values
#interact_mut = ['rpoC_p.Glu1092Asp', 'Rv2752c_p.Asn30Ser']

# The commented-out block below is the code that built and pickled the interaction matrix: every selected
# mutation is multiplied with every rpoB column, and all-zero columns are dropped. It is kept as a record
# of how the pickle loaded at the end of this cell was produced.
# model_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, "BINARY/tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled/model_matrix.pkl"))
# interact_matrix = model_matrix.copy()

# # get a list of all pairs of mutations to interact
# interactions = list(itertools.product(*[interact_mut, interact_matrix.columns[interact_matrix.columns.str.contains("rpoB")]]))

# # add interactions
# for (mut1, mut2) in interactions:
#     interact_matrix[f"{mut1}+{mut2}"] = interact_matrix[mut1] * interact_matrix[mut2]
    
# # drop any columns with no signal
# interact_matrix = interact_matrix[interact_matrix.columns[~((interact_matrix == 0).all())]]

# assert len(set(model_matrix.columns) - set(interact_matrix.columns)) == 0
# print(len(interact_matrix), len(model_matrix))

# interact_matrix.to_pickle(os.path.join(analysis_dir, drug, f"BINARY/interaction/model_matrix_{phenos_name}.pkl"))

# NOTE(review): unpickling can execute arbitrary code; only load pickle files written by this project.
interact_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, f"BINARY/interaction/model_matrix_{phenos_name}.pkl"))
interact_matrix.shape

(30984, 3736)

In [24]:
# Number of interaction-matrix columns that involve each selected mutation (its own column plus its interaction terms).
# so much co-occurrence of rpoC_p.Glu1092Asp with rpoB mutations. Can also check proportion that are high confidence rpoB mutations
for mut in interact_mut:
    # regex=False: mutation names contain regex metacharacters (".", and in other names "*", "+", ">"),
    # so they must be matched as literal text rather than as a pattern
    print(mut, interact_matrix.columns.str.contains(mut, regex=False).sum())

rpoC_p.Glu1092Asp 125
Rv2752c_p.Asn30Ser 4


In [25]:
phenos_name = "WHO"
# interact_res is read for its mutation and coef columns; interact_permute is indexed by mutation name as column
interact_res = pd.read_csv(os.path.join(analysis_dir, drug, f"BINARY/interaction/{phenos_name}_coef.csv"))
interact_permute = pd.read_csv(os.path.join(analysis_dir, drug, f"BINARY/interaction/{phenos_name}_coef_permutation.csv"))


def _permutation_pval(mutation, coef):
    """One-sided permutation p-value: the proportion of permuted coefficients at least as extreme as coef."""
    permuted_coefs = interact_permute[mutation]
    # extreme means further from zero in the direction of the observed coefficient
    if coef > 0:
        return np.mean(permuted_coefs >= coef)
    return np.mean(permuted_coefs <= coef)


# assess significance using the results of the permutation test
interact_res["pval"] = [_permutation_pval(mutation, coef)
                        for mutation, coef in zip(interact_res["mutation"], interact_res["coef"])]

interact_res = add_pval_corrections(interact_res)
interact_res["Odds_Ratio"] = np.exp(interact_res["coef"])

# Interaction terms are named "<Mut1>+<Mut2>"; single mutations have no "+" and get an empty Mut2
interact_res[["Mut1", "Mut2"]] = interact_res["mutation"].str.split("+", expand=True)

# Annotate the second mutation of each row with its catalogue grading for the drug under study.
# Both tables have a mutation column, so the merge suffixes them; keep the left one under its original name.
interact_res = (interact_res
                .merge(who_variants_combined.query("drug==@drug_WHO_abbr"), left_on="Mut2", right_on="mutation", how="left")
                .drop(columns="mutation_y")
                .rename(columns={"mutation_x": "mutation"})
               )

In [26]:
# Terms involving rpoC_p.Glu1092Asp that remain significant after BH correction, most significant first
# and, within ties, largest odds ratio first.
interact_res.query("mutation.str.contains('rpoC_p.Glu1092Asp') & BH_pval < 0.05").sort_values(["BH_pval", "Odds_Ratio"], 
                                                                                           ascending=[True, False]
                                                                                          )

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,drug,confidence
219,rpoC_p.Glu1092Asp,0.219309,0.0,0.0,0.0,1.245216,rpoC_p.Glu1092Asp,,,
170,rpoC_p.Glu1092Asp+rpoB_p.Ile491Tyr,0.026609,0.0,0.0,0.0,1.026966,rpoC_p.Glu1092Asp,rpoB_p.Ile491Tyr,RIF,3) Uncertain significance
169,rpoC_p.Glu1092Asp+rpoB_p.Met434_Asn437delinsIle,0.018817,0.0,0.0,0.0,1.018995,rpoC_p.Glu1092Asp,rpoB_p.Met434_Asn437delinsIle,RIF,2) Assoc w R - Interim
171,rpoC_p.Glu1092Asp+rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoC_p.Glu1092Asp,rpoB_p.Asn260Asp,,


In [27]:
# Spot check: main-effect row of rpoC_p.Val483Ala in the interaction model
interact_res.query("mutation=='rpoC_p.Val483Ala'")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,drug,confidence
175,rpoC_p.Val483Ala,0.16795,0.0,0.0,0.0,1.182877,rpoC_p.Val483Ala,,,


In [298]:
# Spot check: main-effect row of rpoB_p.Asn260Asp in the interaction model
interact_res.query("mutation=='rpoB_p.Asn260Asp'")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,drug,confidence
91,rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoB_p.Asn260Asp,,,


In [304]:
# (rows, columns) of the interaction matrix before phenotypes are merged in
interact_matrix.shape

(30984, 3736)

In [305]:
# Join the phenotype table onto the interaction matrix by row index, then drop its phenotypic_category column.
# NOTE(review): df_phenos is not defined anywhere in the visible notebook (hidden kernel state) — confirm where it is loaded.
# NOTE(review): interact_matrix is overwritten in place, so re-running this cell merges df_phenos a second time.
interact_matrix = interact_matrix.merge(df_phenos, left_index=True, right_index=True)
del interact_matrix["phenotypic_category"]

In [28]:
# Same listing as the BH-filtered one, but filtered on the uncorrected p-value (pval < 0.05), so it also
# shows terms that do not survive multiple-testing correction. Rows are still ordered by BH_pval, then odds ratio.
interact_res.query("mutation.str.contains('rpoC_p.Glu1092Asp') & pval < 0.05").sort_values(["BH_pval", "Odds_Ratio"], 
                                                                                           ascending=[True, False]
                                                                                          )

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,drug,confidence
219,rpoC_p.Glu1092Asp,0.219309,0.0,0.0,0.0,1.245216,rpoC_p.Glu1092Asp,,,
170,rpoC_p.Glu1092Asp+rpoB_p.Ile491Tyr,0.026609,0.0,0.0,0.0,1.026966,rpoC_p.Glu1092Asp,rpoB_p.Ile491Tyr,RIF,3) Uncertain significance
169,rpoC_p.Glu1092Asp+rpoB_p.Met434_Asn437delinsIle,0.018817,0.0,0.0,0.0,1.018995,rpoC_p.Glu1092Asp,rpoB_p.Met434_Asn437delinsIle,RIF,2) Assoc w R - Interim
171,rpoC_p.Glu1092Asp+rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoC_p.Glu1092Asp,rpoB_p.Asn260Asp,,
300,rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu,0.05754,0.004,1.0,1.0,1.059228,rpoC_p.Glu1092Asp,rpoB_p.Ser450Leu,RIF,1) Assoc w R
429,rpoC_p.Glu1092Asp+rpoB_p.Asp435Gly,0.038386,0.027,1.0,1.0,1.039132,rpoC_p.Glu1092Asp,rpoB_p.Asp435Gly,RIF,2) Assoc w R - Interim
330,rpoC_p.Glu1092Asp+rpoB_p.Val168Ala,0.033776,0.006,1.0,1.0,1.034352,rpoC_p.Glu1092Asp,rpoB_p.Val168Ala,RIF,3) Uncertain significance
405,rpoC_p.Glu1092Asp+rpoB_p.Ser450Trp,0.026845,0.019,1.0,1.0,1.027209,rpoC_p.Glu1092Asp,rpoB_p.Ser450Trp,RIF,1) Assoc w R
436,rpoC_p.Glu1092Asp+rpoB_p.Asp435Asn,0.026641,0.029,1.0,1.0,1.026999,rpoC_p.Glu1092Asp,rpoB_p.Asp435Asn,,
488,rpoC_p.Glu1092Asp+rpoB_p.His445Leu,0.025338,0.043,1.0,1.0,1.025661,rpoC_p.Glu1092Asp,rpoB_p.His445Leu,RIF,1) Assoc w R


In [274]:
# Spot check: catalogue grading of rpoB_p.Ile491Tyr (matches on mutation only, across all drugs)
who_variants_combined.query("mutation=='rpoB_p.Ile491Tyr'")

Unnamed: 0,drug,confidence,mutation
4680,RIF,3) Uncertain significance,rpoB_p.Ile491Tyr


In [151]:
# Load the model results that have the univariate statistics attached, one table per analysis directory.
# NOTE(review): analysis_paths is not defined anywhere in the visible notebook; presumably it holds the
# tiers=1 and tiers=1+2 result directories, in that order — confirm.
tiers1_stats = pd.read_csv(os.path.join(analysis_paths[0], "model_analysis_with_stats.csv"))
tiers12_stats = pd.read_csv(os.path.join(analysis_paths[1], "model_analysis_with_stats.csv"))

In [152]:
# Display the Tier 1 results table, including the lower/upper bound columns
tiers1_stats

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,pval,BH_pval,Bonferroni_pval,Significant,Num_Isolates,...,NPV_LB,NPV_UB,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB
0,rpoB_p.Ser450Leu,missense_variant,761154;761155,1) Assoc w R,13.996830,0.000,0.000000,0.0,1,7536,...,0.849854,0.858186,0.638086,0.655640,0.995089,0.996740,131.630401,196.405333,0.345859,0.363410
1,rpoB_p.Asp435Val,missense_variant,761110,1) Assoc w R,2.453635,0.000,0.000000,0.0,1,695,...,0.683125,0.692907,0.055387,0.064131,0.999284,0.999828,81.961356,305.112605,0.936382,0.945054
2,rpoB_p.His445Tyr,missense_variant,761138;761139,1) Assoc w R,2.028256,0.000,0.000000,0.0,1,458,...,0.678422,0.688210,0.035491,0.042651,0.999229,0.999799,49.686851,173.868388,0.957910,0.965003
3,rpoB_p.His445Asp,missense_variant,761139,1) Assoc w R,1.953321,0.000,0.000000,0.0,1,385,...,0.677139,0.686928,0.029849,0.036471,0.999511,0.999932,65.271465,380.899057,0.963895,0.970440
4,rpoB_p.Ser450Trp,missense_variant,761155,1) Assoc w R,1.595500,0.000,0.000000,0.0,1,198,...,0.673539,0.683331,0.014594,0.019391,0.999571,0.999954,37.397049,270.739868,0.980940,0.985659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,rpoB_p.Asp853_Glu854dup,inframe_insertion,762368;762365;762352,5) Not assoc w R,0.944660,0.000,0.000000,0.0,1,32,...,0.320645,0.330445,0.000917,0.001893,0.999679,1.000000,,inf,0.998195,0.999123
811,rpoB_p.Ile925Val,missense_variant,762579,5) Not assoc w R,0.938021,0.000,0.000000,0.0,1,81,...,0.320866,0.330675,0.002399,0.003845,0.998630,0.999700,2.120413,9.123034,0.996785,0.998485
812,rpoB_c.-83A>G,upstream_gene_variant,759724,3) Uncertain significance,0.936417,0.002,0.546667,1.0,0,17,...,0.320480,0.330277,0.000383,0.001089,0.999516,0.999998,1.022873,58.144978,0.999046,0.999786
813,rpoB_p.Asn381His,missense_variant,760947,5) Not assoc w R,0.898769,0.000,0.000000,0.0,1,76,...,0.321046,0.330856,0.002510,0.003985,0.999679,1.000000,,inf,0.996100,0.997530


In [153]:
# Tier 1 results without the bound columns (names containing LB or UB), leaving only the point estimates
tiers1_stats.loc[:, ~tiers1_stats.columns.str.contains("|".join(["LB", "UB"]))]

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,pval,BH_pval,Bonferroni_pval,Significant,Num_Isolates,...,TP,FP,TN,FN,PPV,NPV,Sens,Spec,LR+,LR-
0,rpoB_p.Ser450Leu,missense_variant,761154;761155,1) Assoc w R,13.996830,0.000,0.000000,0.0,1,7536,...,7440,96,23765,4061,0.987261,0.854057,0.646900,0.995977,160.788410,0.354526
1,rpoB_p.Asp435Val,missense_variant,761110,1) Assoc w R,2.453635,0.000,0.000000,0.0,1,695,...,686,9,23852,10815,0.987050,0.688032,0.059647,0.999623,158.137418,0.940708
2,rpoB_p.His445Tyr,missense_variant,761138;761139,1) Assoc w R,2.028256,0.000,0.000000,0.0,1,458,...,448,10,23851,11053,0.978166,0.683331,0.038953,0.999581,92.946074,0.961450
3,rpoB_p.His445Asp,missense_variant,761139,1) Assoc w R,1.953321,0.000,0.000000,0.0,1,385,...,380,5,23856,11121,0.987013,0.682048,0.033041,0.999790,157.676376,0.967162
4,rpoB_p.Ser450Trp,missense_variant,761155,1) Assoc w R,1.595500,0.000,0.000000,0.0,1,198,...,194,4,23857,11307,0.979798,0.678450,0.016868,0.999832,100.622424,0.983297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,rpoB_p.Asp853_Glu854dup,inframe_insertion,762368;762365;762352,5) Not assoc w R,0.944660,0.000,0.000000,0.0,1,32,...,32,0,11501,23829,1.000000,0.325531,0.001341,1.000000,inf,0.998659
811,rpoB_p.Ile925Val,missense_variant,762579,5) Not assoc w R,0.938021,0.000,0.000000,0.0,1,81,...,73,8,11493,23788,0.901235,0.325756,0.003059,0.999304,4.398249,0.997635
812,rpoB_c.-83A>G,upstream_gene_variant,759724,3) Uncertain significance,0.936417,0.002,0.546667,1.0,0,17,...,16,1,11500,23845,0.941176,0.325364,0.000671,0.999913,7.711999,0.999416
813,rpoB_p.Asn381His,missense_variant,760947,5) Not assoc w R,0.898769,0.000,0.000000,0.0,1,76,...,76,0,11501,23785,1.000000,0.325937,0.003185,1.000000,inf,0.996815


In [148]:
# Point estimates (bound columns containing LB or UB removed) for three rpoC mutations in the tiers 1+2 results
tiers12_stats.loc[:, ~tiers12_stats.columns.str.contains("|".join(["LB", "UB"]))].query("mutation in ['rpoC_p.Glu1092Asp', 'rpoC_p.Val483Ala', 'rpoC_p.Val483Gly']")

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,pval,BH_pval,Bonferroni_pval,Significant,Num_Isolates,...,TP,FP,TN,FN,PPV,NPV,Sens,Spec,LR+,LR-
9,rpoC_p.Val483Gly,missense_variant,764817,,1.351136,0.0,0.0,0.0,1,900,...,883,17,20904,9180,0.981111,0.694854,0.087747,0.999187,107.985825,0.912995
16,rpoC_p.Glu1092Asp,missense_variant,766645,5) Not assoc w R,1.216332,0.0,0.0,0.0,1,2697,...,1711,986,19935,8352,0.634409,0.704741,0.170029,0.95287,3.60768,0.871022
27,rpoC_p.Val483Ala,missense_variant,764817,,1.17239,0.0,0.0,0.0,1,236,...,233,3,20918,9830,0.987288,0.680304,0.023154,0.999857,161.469178,0.976986


In [150]:
# Lower and upper bounds of the PPV for the same three rpoC mutations
tiers12_stats.query("mutation in ['rpoC_p.Glu1092Asp', 'rpoC_p.Val483Ala', 'rpoC_p.Val483Gly']")[["PPV_LB", "PPV_UB"]]

Unnamed: 0,PPV_LB,PPV_UB
9,0.969929,0.988959
16,0.615909,0.652616
27,0.963302,0.997371
