In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib_venn import venn2, venn3

# Set matplotlib's default figure resolution to 150 DPI for every figure created below.
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
import scipy.stats as st
import statsmodels
import statsmodels.api as sm
from functools import reduce

import glob, os, yaml, subprocess, itertools, sparse, sys, statsmodels, shutil
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Absolute path of the analysis directory.
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

# Combined lineage table; its shape is printed as a quick sanity check.
lineages = pd.read_csv("../lineages/combined_lineages_samples.csv")
print(lineages.shape)
import collections, warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Make the sibling utils/ directory importable before pulling in the helper modules.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "utils"))
from stats_utils import *
from data_utils import *

# Coll 2014 SNP scheme: strip the "lineage" prefix, expose the column as "Lineage",
# and keep the allele after the "/" in allele_change as the 'nucleotide' column.
coll2014 = pd.read_csv("../data/coll2014_SNP_scheme.tsv", sep="\t")
coll2014 = (
    coll2014
    .assign(**{"#lineage": coll2014["#lineage"].str.replace("lineage", "")})
    .rename(columns={"#lineage": "Lineage"})
)
coll2014['nucleotide'] = [allele_change.split('/')[1] for allele_change in coll2014['allele_change']]

# Full drug name -> three-letter abbreviation (insertion order is reused for drugs_lst below).
drug_abbr_dict = dict(
    Delamanid="DLM",
    Bedaquiline="BDQ",
    Clofazimine="CFZ",
    Ethionamide="ETO",
    Linezolid="LZD",
    Moxifloxacin="MXF",
    Capreomycin="CAP",
    Amikacin="AMK",
    Pretomanid="PMD",
    Pyrazinamide="PZA",
    Kanamycin="KAN",
    Levofloxacin="LFX",
    Streptomycin="STM",
    Ethambutol="EMB",
    Isoniazid="INH",
    Rifampicin="RIF",
)

cc_df = pd.read_csv("../data/drug_CC.csv")

# Provenance: WHO-catalog-V2-tier1.csv (read below) was generated once with the
# commented-out code that follows; it is kept for reference and is not run.
# who_variants_V1 = pd.read_excel("../results/WHO-catalog-V1.xlsx", sheet_name='Mutation_catalogue')
# who_variants = pd.read_csv("../results/WHO-catalog-V2-full.csv", header=[2]).query("tier==1").reset_index(drop=True)
# del who_variants['mutation']

# for col in who_variants.columns[6:46]:
#     who_variants.rename(columns={col: f"ALL_{col}"}, inplace=True)

# for col in who_variants.columns[50:97]:
#     who_variants.rename(columns={col: f"WHO_{col.replace('.1', '')}"}, inplace=True)

# who_variants.to_csv("../results/WHO-catalog-V2-tier1.csv", index=False)

# Tier-1 WHO catalogue; its 'mutation' column is dropped right after loading.
who_variants = pd.read_csv("../results/WHO-catalog-V2-tier1.csv").drop(columns='mutation')
RS_counts = pd.read_csv("../tables/Figure_3A_data.csv", index_col='Drug')

# Final regression gradings, plus the literature-based "not associated with R" list
# renamed to the Drug / mutation column convention used by results_final.
results_final = pd.read_csv("../results/Regression_Final_June2024_Tier1.csv")
not_assocR_literature = pd.read_excel("../data/NotAwR by literature.xlsx", sheet_name=0).rename(columns={'drug': 'Drug', 'variant': 'mutation'})

drug_gene_mapping = pd.read_csv("../data/drug_gene_mapping.csv")
# predicted_effect values treated as silent throughout the notebook.
silent_lst = ['synonymous_variant', 'initiator_codon_variant', 'stop_retained_variant']

drugs_lst = list(drug_abbr_dict)
sample_ids_mapping = pd.read_csv("../data/sample_ids_mapping_20220922.csv")

(52567, 10)


# Overview of Regression Gradings

In [2]:
regression_grading_cols_lst = ['REGRESSION FINAL CONFIDENCE GRADING', 'REGRESSION + GRADING RULES']
solo_cols_lst = ['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']


def _grading_totals(gradings_df, grading_cols):
    """Tabulate how many rows fall into each grading category.

    Parameters
    ----------
    gradings_df : pd.DataFrame
        Table holding one grading label per row in each of `grading_cols`.
    grading_cols : list of str
        Names of the grading columns to tabulate.

    Returns
    -------
    pd.DataFrame
        Indexed by grading label (index name 'Grading', sorted), with one
        column of counts per entry of `grading_cols`, named after that column.
    """
    # Naming the Series and its index explicitly avoids relying on the
    # 'count' column name that value_counts().reset_index() only produces in pandas >= 2.0.
    per_column_counts = [
        gradings_df[col].value_counts().rename(col).rename_axis('Grading')
        for col in grading_cols
    ]
    return pd.concat(per_column_counts, axis=1).sort_index()


regression_totals_df = _grading_totals(results_final, regression_grading_cols_lst)
solo_totals_df = _grading_totals(results_final, solo_cols_lst)

In [3]:
# Show the regression and SOLO grading totals side by side, joined on the shared grading-label index.
regression_totals_df.merge(solo_totals_df, left_index=True, right_index=True)

Unnamed: 0_level_0,REGRESSION FINAL CONFIDENCE GRADING,REGRESSION + GRADING RULES,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
Grading,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1) Assoc w R,589,589,311,253
2) Assoc w R - Interim,164,1050,146,1130
3) Uncertain significance,20617,14763,20835,14958
4) Not assoc w R - Interim,130,5098,19,4998
5) Not assoc w R,89,89,278,250


In [2]:
# Hard-coded from the table above: 'SOLO INITIAL CONFIDENCE GRADING' counts for Groups 1, 2, 4 and 5
# (every group except '3) Uncertain significance').
311+146+19+278

754

In [1]:
# Hard-coded from the table above: Uncertain count under SOLO initial grading (20835)
# minus Uncertain count under regression final grading (20617).
20835-20617

218

In [3]:
# Reduction in Uncertain mutations (218) relative to the number of SOLO-graded mutations (754).
218/754

0.2891246684350133

In [4]:
# Programmatic version of the hand arithmetic: compare the size of the Uncertain
# group under SOLO (initial grading) and under regression (final grading).
uncertain_label = '3) Uncertain significance'
solo_initial_counts = solo_totals_df['SOLO INITIAL CONFIDENCE GRADING']

solo_uncertain = solo_initial_counts[uncertain_label]
regression_uncertain = regression_totals_df['REGRESSION FINAL CONFIDENCE GRADING'][uncertain_label]

# Everything SOLO placed in a group other than Uncertain.
solo_graded = solo_initial_counts.drop(labels=uncertain_label).sum()

# (fewer Uncertain mutations under regression, SOLO-graded mutations, ratio of the two)
(solo_uncertain - regression_uncertain, solo_graded, (solo_uncertain - regression_uncertain) / solo_graded)

(218, 754, 0.2891246684350133)

In [5]:
# Rows whose grading differs between the plain regression grading and the grading after the LoF upgrade step.
regression_LoF_upgrades = results_final.loc[(results_final['REGRESSION FINAL CONFIDENCE GRADING'] != results_final['REGRESSION GRADING + LOF UPGRADE'])]
# Rows whose grading differs between the LoF-upgraded grading and the grading after all grading rules.
regression_GR_upgrades = results_final.loc[(results_final['REGRESSION GRADING + LOF UPGRADE'] != results_final['REGRESSION + GRADING RULES'])]

# The first two lines report the LoF step; they previously repeated the "grading rules" wording
# of the last two lines, which made the four printed counts indistinguishable.
print(f'{len(regression_LoF_upgrades)} variants were upgraded by the LoF rule')
print(f'{len(regression_LoF_upgrades.query("predicted_effect not in @silent_lst"))} non-silent variants were upgraded by the LoF rule')
print(f'{len(regression_GR_upgrades)} variants were upgraded by grading rules')
print(f'{len(regression_GR_upgrades.query("predicted_effect not in @silent_lst"))} non-silent variants were upgraded by grading rules')

804 variants were upgraded by grading rules
804 non-silent variants were upgraded by grading rules
5050 variants were upgraded by grading rules
146 non-silent variants were upgraded by grading rules


# Number of Variants tested only in the ALL dataset or Ungraded

In [6]:
# A mutation counts as "tested" in a dataset when its odds ratio for that dataset is non-null.
tested_in_WHO = results_final['WHO_Odds_Ratio'].notna()
tested_in_ALL = results_final['ALL_Odds_Ratio'].notna()

ungraded_muts = results_final.loc[~tested_in_WHO & ~tested_in_ALL]
graded_muts = results_final.loc[tested_in_WHO | tested_in_ALL]
WHO_only = results_final.loc[tested_in_WHO & ~tested_in_ALL]
ALL_only = results_final.loc[~tested_in_WHO & tested_in_ALL]

# Mutations tested in both datasets, and the subset whose initial grading agrees between them.
WHO_ALL_graded = results_final.loc[tested_in_WHO & tested_in_ALL]
WHO_ALL_same_grading = WHO_ALL_graded.loc[(WHO_ALL_graded['Initial confidence grading WHO dataset'] == WHO_ALL_graded['Initial confidence grading ALL dataset'])]

# The percent sign now sits inside the parentheses ("(11.1%)" rather than "(11.1)%").
print(f"{len(ungraded_muts)} ({len(ungraded_muts) / len(results_final):.1%}) mutations were not tested in either phenotypic dataset")
print(f"{len(WHO_only)}/{len(graded_muts)} ({len(WHO_only) / len(graded_muts):.1%}) mutations were only tested in the WHO dataset")
print(f"{len(ALL_only)}/{len(graded_muts)} ({len(ALL_only) / len(graded_muts):.1%}) mutations were only tested in the ALL dataset")
print(f"{len(WHO_ALL_same_grading)} ({len(WHO_ALL_same_grading) / len(WHO_ALL_graded):.1%}) mutations have the same grading in both datasets")

2388 (11.1)% mutations were not tested in either phenotypic dataset
0/19201 (0.0)% mutations were only tested in the WHO dataset
6710/19201 (34.9)% mutations were only tested in the ALL dataset
12173 (97.5)% mutations have the same grading in both datasets


In [7]:
# Sanity check: mutations tested in at least one dataset minus those tested only in ALL
# should equal those tested in both datasets (this holds when WHO_only is empty).
len(graded_muts) - len(ALL_only), len(WHO_ALL_graded)

(12491, 12491)

# Number of Variants that meet different Criteria for resolving WHO and ALL discrepancies

In [12]:
WHO_col = 'Initial confidence grading WHO dataset'
ALL_col = 'Initial confidence grading ALL dataset'

# Gradings treated as one "not resistant" bucket when comparing the two datasets.
not_R_or_neutral = ['Not assoc w R', 'Neutral']

# One row per reported combination: (label, accepted WHO gradings, accepted ALL gradings).
discrepancy_cases = [
    ('Both Assoc w R:', ['Assoc w R'], ['Assoc w R']),
    ('Both Not assoc w R or Neutral:', not_R_or_neutral, not_R_or_neutral),
    ('WHO = Neutral, ALL = Uncertain:', ['Neutral'], ['Uncertain']),
    ('WHO = Uncertain, ALL = Neutral:', ['Uncertain'], ['Neutral']),
    ('WHO = Assoc w R, ALL = Uncertain:', ['Assoc w R'], ['Uncertain']),
    ('WHO = Not assoc w R, ALL = Uncertain:', ['Not assoc w R'], ['Uncertain']),
    ('WHO = Uncertain, ALL = Assoc w R:', ['Uncertain'], ['Assoc w R']),
    ('WHO = Uncertain, ALL = Not assoc w R:', ['Uncertain'], ['Not assoc w R']),
    ('WHO = Assoc w R, ALL = Not assoc w R, Neutral:', ['Assoc w R'], not_R_or_neutral),
    ('WHO = Not assoc w R, Neutral, ALL = Assoc w R:', not_R_or_neutral, ['Assoc w R']),
]

# Count the rows of results_final that fall into each WHO/ALL grading combination.
for case_label, who_gradings, all_gradings in discrepancy_cases:
    case_mask = results_final[WHO_col].isin(who_gradings) & results_final[ALL_col].isin(all_gradings)
    print(case_label, len(results_final.loc[case_mask]))

Both Assoc w R: 589
Both Not assoc w R or Neutral: 61
WHO = Neutral, ALL = Uncertain: 28
WHO = Uncertain, ALL = Neutral: 53
WHO = Assoc w R, ALL = Uncertain: 15
WHO = Not assoc w R, ALL = Uncertain: 17
WHO = Uncertain, ALL = Assoc w R: 149
WHO = Uncertain, ALL = Not assoc w R: 55
WHO = Assoc w R, ALL = Not assoc w R, Neutral: 0
WHO = Not assoc w R, Neutral, ALL = Assoc w R: 0


# Results Section A

## Agreement between regression and SOLO gradings (Groups 1–2 and Groups 4–5)

In [105]:
# Groups 1-2 are the gradings whose label contains this (case-sensitive) substring;
# 'Not assoc w R' does not match because its "assoc" is lower-case.
search_string = 'Assoc w R'

regression_initial_is_R = results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains(search_string)
solo_initial_is_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains(search_string)

# Mutations in Groups 1-2 under both methods, before any grading rules are applied.
regression_Group12_agreement = len(results_final.loc[regression_initial_is_R & solo_initial_is_R])

# All mutations SOLO places in Groups 1-2 initially.
SOLO_Group12 = len(results_final.loc[solo_initial_is_R])

regression_Group12_agreement, SOLO_Group12, regression_Group12_agreement/SOLO_Group12

(452, 457, 0.9890590809628009)

In [106]:
# Same agreement calculation, but after grading rules have been applied to both methods.
search_string = 'Assoc w R'

regression_GR_is_R = results_final['REGRESSION + GRADING RULES'].str.contains(search_string)
solo_GR_is_R = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains(search_string)

# Mutations in Groups 1-2 under both rule-adjusted gradings.
regression_SOLO_GR_Group12_agreement = len(results_final.loc[regression_GR_is_R & solo_GR_is_R])

# All mutations in Groups 1-2 under the SOLO final grading.
SOLO_GR_Group12 = len(results_final.loc[solo_GR_is_R])

regression_SOLO_GR_Group12_agreement, SOLO_GR_Group12, regression_SOLO_GR_Group12_agreement/SOLO_GR_Group12

(1380, 1383, 0.9978308026030369)

In [2]:
# Groups 1 and 2: rows whose grading label contains 'Assoc w R', for each of the four grading schemes
regression_Groups12 = results_final[results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')]
regression_GR_Groups12 = results_final[results_final['REGRESSION + GRADING RULES'].str.contains('Assoc w R')]
SOLO_Groups12 = results_final[results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')]
SOLO_GR_Groups12 = results_final[results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')]

# Display names, in the same order as the frames in groups12_lst.
group_names = ['Regression', 'Regression + GR', 'SOLO', 'SOLO + GR']
groups12_lst = [regression_Groups12, regression_GR_Groups12, SOLO_Groups12, SOLO_GR_Groups12]

# Report the size of each Groups 1-2 set.
for position, name in enumerate(group_names):
    df = groups12_lst[position]
    print(f"{name}: {len(df)}")

Regression: 753
Regression + GR: 1639
SOLO: 457
SOLO + GR: 1383


In [3]:
# Absolute and relative increase in Groups 1-2 mutations going from SOLO (initial) to regression (final).
len(regression_Groups12) - len(SOLO_Groups12), (len(regression_Groups12) - len(SOLO_Groups12)) / len(SOLO_Groups12)

(296, 0.6477024070021882)

In [4]:
# Additional Groups 1-2 mutations gained when grading rules are applied on top of the regression gradings.
len(regression_GR_Groups12) - len(regression_Groups12)

886

In [5]:
# Silent variants (predicted_effect in silent_lst) that regression nevertheless places in Groups 1-2.
effect_is_silent = results_final['predicted_effect'].isin(silent_lst)
regression_final_is_R = results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')

silent_variants_assocR = results_final.loc[effect_is_silent & regression_final_is_R, 'mutation'].values
len(silent_variants_assocR), silent_variants_assocR

(1, array(['katG_c.12A>G'], dtype=object))

In [6]:
# Groups 4 and 5: rows whose grading label contains 'Not assoc w R', reported with and without silent variants
regression_Groups45 = results_final[results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')]
regression_GR_Groups45 = results_final[results_final['REGRESSION + GRADING RULES'].str.contains('Not assoc w R')]
SOLO_Groups45 = results_final[results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')]
SOLO_GR_Groups45 = results_final[results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')]

groups45_lst = [regression_Groups45, regression_GR_Groups45, SOLO_Groups45, SOLO_GR_Groups45]

# group_names (defined with the Groups 1-2 counts) lists the schemes in the same order as groups45_lst.
for position, name in enumerate(group_names):
    df = groups45_lst[position]
    non_silent_rows = df.loc[~df['predicted_effect'].isin(silent_lst)]
    print(f"{name}: {len(df)} ({len(non_silent_rows)} non-silent)")

Regression: 219 (160 non-silent)
Regression + GR: 5187 (224 non-silent)
SOLO: 297 (230 non-silent)
SOLO + GR: 5248 (230 non-silent)


## Which variants does regression grade that SOLO misses and that the grading rules then upgrade to interim?

In [7]:
# mutations that HAD to be upgraded to Groups 1 and 2 using rules:
# not 'Assoc w R' in the SOLO initial grading, but 'Assoc w R' in the SOLO final grading
solo_initial_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')
solo_final_R = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
SOLOmiss_GRupgrade_R = results_final.loc[~solo_initial_R & solo_final_R]

# cross-check against the Groups 1-2 totals computed earlier
assert len(SOLOmiss_GRupgrade_R) == len(SOLO_GR_Groups12) - len(SOLO_Groups12)

# of those rule-upgraded mutations, keep the ones regression already grades as 'Assoc w R' on its own
regression_already_R = SOLOmiss_GRupgrade_R['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
regression_Rassoc_SOLOmiss_GRupgrade_R = SOLOmiss_GRupgrade_R.loc[regression_already_R]
print(f"Regression upgrades {len(regression_Rassoc_SOLOmiss_GRupgrade_R)} R-assoc variants upgraded by grading rules.")

Regression upgrades 78 R-assoc variants upgraded by grading rules.


In [8]:
# mutations that HAD to be upgraded to Groups 4 and 5 using rules:
# not 'Not assoc w R' in the SOLO initial grading, but 'Not assoc w R' in the SOLO final grading
SOLOmiss_GRupgrade_NotR = results_final.loc[(~results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')) &
                                             (results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R'))
                                            ]

# matches with the Groups 4-5 total counts above too
assert len(SOLOmiss_GRupgrade_NotR) == len(SOLO_GR_Groups45) - len(SOLO_Groups45)

# of those rule-upgraded mutations, keep the ones regression already grades as 'Not assoc w R' on its own
regression_NotR_SOLOmiss_GRupgrade_NotR = SOLOmiss_GRupgrade_NotR.loc[SOLOmiss_GRupgrade_NotR['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')]
print(f"Regression upgrades {len(regression_NotR_SOLOmiss_GRupgrade_NotR)} Not R-assoc variants upgraded by grading rules.")

Regression upgrades 46 Not R-assoc variants upgraded by grading rules.


## Look into the cases where regression grades but SOLO misses

In [9]:
# attach the WHO catalogue columns to each (Drug, mutation) pair to get additional information
who_variants_renamed = who_variants.rename(columns={'drug': 'Drug', 'variant': 'mutation'})
regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results = pd.merge(
    regression_Rassoc_SOLOmiss_GRupgrade_R[['Drug', 'mutation']],
    who_variants_renamed,
)

# the merge must neither drop nor duplicate any variant
print(len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results))
assert len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results) == len(regression_Rassoc_SOLOmiss_GRupgrade_R)

# how the three gradings combine for these variants
grading_cols_to_compare = ['REGRESSION FINAL CONFIDENCE GRADING', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']
regression_Rassoc_SOLOmiss_GRupgrade_R[grading_cols_to_compare].value_counts()

78


REGRESSION FINAL CONFIDENCE GRADING  SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
1) Assoc w R                         3) Uncertain significance        2) Assoc w R - Interim           42
2) Assoc w R - Interim               3) Uncertain significance        2) Assoc w R - Interim           33
1) Assoc w R                         3) Uncertain significance        1) Assoc w R                      3
Name: count, dtype: int64

In [31]:
# How many of these variants have ALL_Present_SOLO_SR equal to 0, and how many have it below 5.
# NOTE(review): ALL_Present_SOLO_SR is presumably the number of SOLO isolates carrying the variant in the ALL dataset; confirm against the WHO catalogue.
len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR==0")), len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR < 5"))

(3, 27)

In [32]:
# For the variants with ALL_Present_SOLO_SR >= 5, tabulate the ALL_OR_SOLO_FE_sig flag.
# NOTE(review): presumably this flags a significant SOLO odds ratio by Fisher's exact test; confirm against the WHO catalogue.
regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR >= 5").ALL_OR_SOLO_FE_sig.value_counts()

ALL_OR_SOLO_FE_sig
False    49
True      2
Name: count, dtype: int64

In [35]:
# Show the variants with ALL_Present_SOLO_SR >= 5 and ALL_OR_SOLO_FE_sig True, with the WHO gradings and the extra criteria applied to them.
regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR >= 5 & ALL_OR_SOLO_FE_sig==True")[['Drug', 'mutation', 'INITIAL CONFIDENCE GRADING', 'FINAL CONFIDENCE GRADING', 'Additional grading criteria applied']]

Unnamed: 0,Drug,mutation,INITIAL CONFIDENCE GRADING,FINAL CONFIDENCE GRADING,Additional grading criteria applied
3,Clofazimine,Rv0678_p.Asp47fs,3) Uncertain significance,2) Assoc w R - Interim,Indel frameshift or premature stop codon (LoF)
42,Linezolid,rrl_n.2270G>T,3) Uncertain significance,2) Assoc w R - Interim,Selection evidence


## Save the mutations for which regression supersedes the need for grading rules to a supplementary table

In [23]:
# Stack the Assoc-w-R and Not-assoc-w-R sets where regression alone reaches the grading
# that SOLO only reaches through grading rules, keeping the identifying and grading columns.
supersede_cols = ['Drug', 'mutation', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING', 'REGRESSION FINAL CONFIDENCE GRADING']
supersede_combined = pd.concat([regression_Rassoc_SOLOmiss_GRupgrade_R, regression_NotR_SOLOmiss_GRupgrade_NotR])
regression_supersede_save = supersede_combined[supersede_cols].sort_values("Drug")

len(regression_supersede_save)

124

In [24]:
# Distribution of the SOLO initial gradings among the saved mutations.
regression_supersede_save['SOLO INITIAL CONFIDENCE GRADING'].value_counts()

SOLO INITIAL CONFIDENCE GRADING
3) Uncertain significance    124
Name: count, dtype: int64

In [25]:
# Distribution of the SOLO final gradings (after grading rules) among the saved mutations.
regression_supersede_save['SOLO FINAL CONFIDENCE GRADING'].value_counts()

SOLO FINAL CONFIDENCE GRADING
2) Assoc w R - Interim        75
4) Not assoc w R - Interim    46
1) Assoc w R                   3
Name: count, dtype: int64

In [27]:
# Regression gradings of the mutations that SOLO grades '4) Not assoc w R - Interim' after grading rules.
regression_supersede_save.loc[regression_supersede_save['SOLO FINAL CONFIDENCE GRADING']=='4) Not assoc w R - Interim']['REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
5) Not assoc w R              24
4) Not assoc w R - Interim    22
Name: count, dtype: int64

In [28]:
# Regression gradings of the mutations whose SOLO final grading contains 'Assoc w R' (Groups 1-2).
regression_supersede_save.loc[regression_supersede_save['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')]['REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
1) Assoc w R              45
2) Assoc w R - Interim    33
Name: count, dtype: int64

In [30]:
# Write the supplementary table to ../tables/ without the DataFrame index.
regression_supersede_save.to_csv("../tables/regression_supersede_save.csv", index=False)

# 15 Variants that are Assoc w R in WHO and Uncertain in ALL

In [22]:
# Variants graded 'Assoc w R' in the WHO dataset but 'Uncertain' in the ALL dataset.
# .copy() makes this an independent frame, so the column added below is written to it
# directly instead of to a slice of results_final (a SettingWithCopyWarning that the
# global warnings filter would otherwise hide).
WHO_AssocR_ALL_Uncertain = results_final.loc[(results_final['Initial confidence grading WHO dataset']=='Assoc w R') & (results_final['Initial confidence grading ALL dataset']=='Uncertain')].copy()

# How much the WHO_R_PPV_LB value exceeds the ALL_R_PPV_LB value for each variant.
WHO_AssocR_ALL_Uncertain['PPV_LB_diff_WHO_ALL'] = WHO_AssocR_ALL_Uncertain['WHO_R_PPV_LB'] - WHO_AssocR_ALL_Uncertain['ALL_R_PPV_LB']

len(WHO_AssocR_ALL_Uncertain)

15

In [25]:
# Final regression grading of the WHO 'Assoc w R' / ALL 'Uncertain' variants.
WHO_AssocR_ALL_Uncertain['REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
3) Uncertain significance    15
Name: count, dtype: int64

In [23]:
# How SOLO grades the same variants: initial versus final grading combinations.
WHO_AssocR_ALL_Uncertain[['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']].value_counts()

SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance        2) Assoc w R - Interim           5
                                 3) Uncertain significance        5
5) Not assoc w R                 4) Not assoc w R - Interim       2
1) Assoc w R                     2) Assoc w R - Interim           1
2) Assoc w R - Interim           2) Assoc w R - Interim           1
3) Uncertain significance        4) Not assoc w R - Interim       1
Name: count, dtype: int64

In [52]:
# Variants with a significant ALL_BH_pval: <= 0.05 for non-silent variants, <= 0.01 for silent ones.
WHO_AssocR_ALL_Uncertain.query("(ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)").shape

(11, 69)

In [54]:
# Same significance filter, restricted to variants with ALL_R_PPV_LB below 0.25.
WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25").shape

(7, 69)

In [69]:
# Sorted WHO-minus-ALL PPV lower-bound differences for the significant variants with ALL_R_PPV_LB < 0.25.
np.sort(WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25").PPV_LB_diff_WHO_ALL.values)

array([0.01842252, 0.06080397, 0.09590884, 0.15315811, 0.26316292,
       0.58097719, 0.59475568])

In [71]:
# Per-variant PPV and carrier-count columns for the same filtered variants, largest WHO-vs-ALL PPV lower-bound gap first.
WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25")[['Drug', 'mutation', 'WHO_R_PPV_LB', 'WHO_R_PPV', 'WHO_Present_R', 'WHO_Present_S', 'ALL_R_PPV_LB', 'ALL_R_PPV', 'ALL_Present_R', 'ALL_Present_S', 'PPV_LB_diff_WHO_ALL']].sort_values('PPV_LB_diff_WHO_ALL', ascending=False)

Unnamed: 0,Drug,mutation,WHO_R_PPV_LB,WHO_R_PPV,WHO_Present_R,WHO_Present_S,ALL_R_PPV_LB,ALL_R_PPV,ALL_Present_R,ALL_Present_S,PPV_LB_diff_WHO_ALL
1091,Bedaquiline,Rv0678_p.Met146Thr,0.715086,1.0,11.0,0.0,0.12033,0.229167,11.0,37.0,0.594756
1089,Bedaquiline,mmpS5_c.-74G>T,0.769425,0.914286,32.0,3.0,0.188448,0.264463,32.0,89.0,0.580977
9184,Ethionamide,ethA_p.Asp396fs,0.421277,0.857143,6.0,1.0,0.158114,1.0,2.0,0.0,0.263163
2757,Clofazimine,Rv0678_p.Ile67fs,0.260191,0.5,9.0,9.0,0.107032,0.212766,10.0,37.0,0.153158
12398,Kanamycin,whiB7_c.-116A>G,0.317366,0.439394,29.0,37.0,0.221457,0.288235,49.0,121.0,0.095909
19770,Streptomycin,gid_p.Val65fs,0.283582,0.8,4.0,1.0,0.222778,0.666667,4.0,2.0,0.060804
8808,Ethionamide,ethA_p.Trp391*,0.257131,0.47619,10.0,11.0,0.238708,0.4,14.0,21.0,0.018423


# Neutral Variants

# 1. ALL Neutral, WHO not Neutral

In [6]:
# Shorthand for the three grading columns this cell compares.
WHO_initial_grading = results_final['Initial confidence grading WHO dataset']
ALL_initial_grading = results_final['Initial confidence grading ALL dataset']
regression_final_grading = results_final['REGRESSION FINAL CONFIDENCE GRADING']

WHO_is_neutral = WHO_initial_grading == 'Neutral'
ALL_is_neutral = ALL_initial_grading == 'Neutral'

# Neutral in at least one dataset and graded 'Not assoc w R' (Groups 4-5) by regression.
regression_neutral = results_final.loc[(WHO_is_neutral | ALL_is_neutral) & regression_final_grading.str.contains('Not assoc w R')]

# Neutral in exactly one dataset, requiring the other dataset to have a (non-null) grading.
ALL_neutral_only = results_final.loc[~WHO_is_neutral & ALL_is_neutral & WHO_initial_grading.notna()]

WHO_neutral_only = results_final.loc[WHO_is_neutral & ~ALL_is_neutral & ALL_initial_grading.notna()]

# these should NOT be Neutral: ALL Neutral, but null in WHO, meaning they are absent from the WHO dataset
ALL_neutral_WHO_absent_but_graded = results_final.loc[WHO_initial_grading.isna() & ALL_is_neutral & (regression_final_grading != '3) Uncertain significance')]
assert len(ALL_neutral_WHO_absent_but_graded) == 0

print(f"{len(WHO_neutral_only)} variants Neutral in WHO only, {len(ALL_neutral_only)} variants Neutral in ALL only")

29 variants Neutral in WHO only, 53 variants Neutral in ALL only


In [7]:
# SOLO final gradings of the variants that are Neutral in the ALL dataset only.
ALL_neutral_only['SOLO FINAL CONFIDENCE GRADING'].value_counts()

SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance     35
4) Not assoc w R - Interim    15
5) Not assoc w R               3
Name: count, dtype: int64

# Do any Neutral Variants have Significant MIC associations?

In [8]:
# Load the MIC results and attach them by (Drug, mutation) to regression_neutral
# (variants Neutral in WHO or ALL that regression grades 'Not assoc w R').
MIC_results = pd.read_csv("../tables/MIC_results.csv")

regression_neutral_withMIC = regression_neutral.merge(MIC_results, on=['Drug', 'mutation'])
print(regression_neutral_withMIC.shape)

# Count the variants with a significant MIC association (MIC_BH_pval <= 0.05);
# any hits are inspected in the next cell.
print(len(regression_neutral_withMIC.query("MIC_BH_pval <= 0.05")))

(77, 68)
1


In [10]:
# Inspect the Neutral variant(s) with MIC_BH_pval <= 0.05: MIC coefficient next to the WHO and ALL regression statistics.
regression_neutral_withMIC.query("MIC_BH_pval <= 0.05")[['Drug', 'mutation', 'MIC_coef', 'MIC_BH_pval', 'WHO_Odds_Ratio', 'WHO_BH_pval', 'WHO_BH_neutral_pval', 'WHO_Present_SR', 'ALL_Odds_Ratio', 'ALL_BH_pval', 'ALL_BH_neutral_pval', 'ALL_Present_S', 'ALL_Present_R']]

Unnamed: 0,Drug,mutation,MIC_coef,MIC_BH_pval,WHO_Odds_Ratio,WHO_BH_pval,WHO_BH_neutral_pval,WHO_Present_SR,ALL_Odds_Ratio,ALL_BH_pval,ALL_BH_neutral_pval,ALL_Present_S,ALL_Present_R
46,Linezolid,rrl_n.2779A>G,0.026412,0.024923,0.984589,0.991,0.0,4.0,0.988435,0.989,0.0,6.0,0.0


# SOLO Not assoc w R, Regression Uncertain 

In [11]:
# Mutations that SOLO places in Groups 4-5 initially but regression leaves in Group 3 (Uncertain).
regression_is_uncertain = results_final['REGRESSION FINAL CONFIDENCE GRADING'] == '3) Uncertain significance'
solo_initial_is_notR = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
SOLO_notAssoc_regression_uncertain = results_final.loc[regression_is_uncertain & solo_initial_is_notR]

print(f"{len(SOLO_notAssoc_regression_uncertain)} mutations Groups 4-5 by SOLO, Group 3 by Regression")

233 mutations Groups 4-5 by SOLO, Group 3 by Regression


In [12]:
# Split on the Reason column: rows marked 'Not Graded' versus everything else.
reason_is_not_graded = SOLO_notAssoc_regression_uncertain['Reason'] == 'Not Graded'

SOLO_notAssoc_regression_uncertain_ungraded = SOLO_notAssoc_regression_uncertain.loc[reason_is_not_graded]

SOLO_notAssoc_regression_uncertain_graded = SOLO_notAssoc_regression_uncertain.loc[~reason_is_not_graded]

print(f"{len(SOLO_notAssoc_regression_uncertain_ungraded)}/{len(SOLO_notAssoc_regression_uncertain)} mutations were not graded")

54/233 mutations were not graded


In [13]:
# Graded mutations with ALL_Present_SR below 5.
SOLO_notAssoc_regression_uncertain_graded.query("ALL_Present_SR < 5").shape

(26, 66)

In [14]:
# Graded mutations with ALL_Present_SR below 10.
SOLO_notAssoc_regression_uncertain_graded.query("ALL_Present_SR < 10").shape

(32, 66)

In [15]:
# Graded mutations with ALL_Present_SR below 5 that still have ALL_BH_neutral_pval <= 0.05.
SOLO_notAssoc_regression_uncertain_graded.query("ALL_Present_SR < 5 & ALL_BH_neutral_pval <= 0.05").shape

(16, 66)

## 14 variants are marginally significant with OR < 1, p-value in (0.01, 0.05]

## all are synonymous variants

In [16]:
# Graded mutations with an odds ratio below 1 and BH p-value <= 0.05 in either the WHO or the ALL dataset.
SOLO_notAssoc_regression_uncertain_graded.query("(WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05)")[['Drug', 'mutation', 'WHO_Odds_Ratio', 'WHO_BH_pval', 'WHO_S_PPV_LB', 'WHO_Present_SR', 'ALL_Odds_Ratio', 'ALL_BH_pval', 'ALL_S_PPV_LB', 'ALL_Present_SR', 'Initial confidence grading WHO dataset', 'Initial confidence grading ALL dataset', 'REGRESSION FINAL CONFIDENCE GRADING']].shape

(14, 13)

In [17]:
# Predicted effects present among those OR < 1, BH p-value <= 0.05 mutations.
SOLO_notAssoc_regression_uncertain_graded.query("(WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05)").predicted_effect.unique()

array(['synonymous_variant'], dtype=object)

In [20]:
# 139 remaining that meet the SOLO Neutral criterion but don't meet regression criteria for grading:
# ALL_Present_SR >= 5 and not (OR < 1 with BH p-value <= 0.05) in either dataset.
# Columns are renamed to the WHO catalogue's 'drug' / 'variant' names so the merge below can match on them.
df_search = SOLO_notAssoc_regression_uncertain_graded.query("ALL_Present_SR >= 5 & ~((WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05))")[['mutation', 'Drug', 'WHO_Odds_Ratio', 'WHO_BH_pval', 'ALL_Odds_Ratio', 'ALL_BH_pval']].rename(columns={'mutation': 'variant', 'Drug': 'drug'})

# Inner merge on every column the two frames share (no `on=` is given).
# NOTE(review): the saved output of this cell ends at row index 137 (138 rows), one fewer than the 139 above;
# confirm whether one variant is missing from who_variants.
who_variants.merge(df_search)[['drug', 'variant', 'WHO_Odds_Ratio', 'WHO_BH_pval', 'ALL_Odds_Ratio', 'ALL_BH_pval', 'ALL_Present_S', 'ALL_Present_R', 'WHO_PPV_ub', 'ALL_PPV_ub',  'WHO_setB',
 'WHO_setC',
 'WHO_setD1',
 'WHO_setD2']]

Unnamed: 0,drug,variant,WHO_Odds_Ratio,WHO_BH_pval,ALL_Odds_Ratio,ALL_BH_pval,ALL_Present_S,ALL_Present_R,WHO_PPV_ub,ALL_PPV_ub,WHO_setB,WHO_setC,WHO_setD1,WHO_setD2
0,Amikacin,eis_c.-10G>A,1.031504,0.670638,1.072857,0.000000,369.0,22.0,8.30%,8.40%,1.0,1.0,0.0,0.0
1,Amikacin,eis_c.-12C>T,0.973076,0.752432,1.030509,0.723324,917.0,64.0,8.79%,8.25%,1.0,1.0,0.0,0.0
2,Amikacin,eis_p.Val163Ile,1.035268,0.752432,1.002809,0.925333,711.0,53.0,10.54%,8.98%,1.0,1.0,0.0,0.0
3,Amikacin,rrs_n.492C>T,0.992668,0.897806,0.970594,0.925333,785.0,12.0,3.73%,2.62%,1.0,1.0,0.0,0.0
4,Amikacin,rrs_n.514A>C,1.081648,0.124878,1.081290,0.029743,577.0,329.0,34.26%,39.54%,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,Streptomycin,Rv1258c_c.1029T>C,0.944848,0.207446,0.933563,0.094394,391.0,35.0,11.51%,11.24%,1.0,1.0,0.0,0.0
135,Streptomycin,Rv1258c_p.Glu194fs,1.012949,0.630768,0.977383,0.633541,3296.0,5120.0,63.23%,61.88%,1.0,1.0,0.0,0.0
136,Streptomycin,whiB7_c.-100T>C,1.031658,0.173571,1.029012,0.190362,135.0,84.0,34.48%,45.15%,1.0,1.0,0.0,0.0
137,Streptomycin,whiB7_c.-242G>C,0.997890,0.630768,0.979538,0.352047,93.0,10.0,19.15%,17.13%,0.0,0.0,0.0,1.0


In [71]:
# total: 139 (remaining) + 14 (OR < 1, BH p-value <= 0.05) + 26 (ALL_Present_SR < 5) + 54 (not graded)
# should equal the 233 mutations in Groups 4-5 by SOLO and Group 3 by regression.
139+14+26+54

233

# Not assoc w R by Literature

In [134]:
# Attach the regression results to the literature-based "not associated with R" variants.
# The inner merge uses every column name the two frames share (no `on=` is given).
# NOTE(review): this overwrites not_assocR_literature with the merged frame, so re-running the cell
# merges the already-merged table again; restart from the loading cell before re-running.
not_assocR_literature = not_assocR_literature.merge(results_final, how='inner')
print(len(not_assocR_literature))
print(not_assocR_literature['REGRESSION FINAL CONFIDENCE GRADING'].value_counts())

28
REGRESSION FINAL CONFIDENCE GRADING
3) Uncertain significance     27
4) Not assoc w R - Interim     1
Name: count, dtype: int64
