In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib_venn import venn2, venn3

# Default resolution (dots per inch) for every matplotlib figure created after this point.
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
import scipy.stats as st
import statsmodels
import statsmodels.api as sm
from functools import reduce

import glob, os, yaml, subprocess, itertools, sparse, sys, statsmodels, shutil
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Root directory of the per-drug model inputs; later cells read
# <analysis_dir>/<Drug>/genos_1.csv.gz and <analysis_dir>/<Drug>/phenos_binary.csv from it.
# The default is the original cluster location. Set the WHO_CATALOGUE_ANALYSIS_DIR
# environment variable to run the notebook elsewhere without editing this cell.
analysis_dir = os.environ.get(
    'WHO_CATALOGUE_ANALYSIS_DIR',
    '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue',
)

# Presumably one row per sample with its lineage call (inferred from the file name -- TODO confirm).
# The shape is printed as a quick check that the table loaded.
lineages = pd.read_csv("../lineages/combined_lineages_samples.csv")
print(lineages.shape)
import collections, warnings
# NOTE(review): this silences every warning for the rest of the session, so pandas
# chained-assignment and deprecation warnings will not be shown by any later cell.
warnings.filterwarnings("ignore")

# Put the "utils" directory that sits next to the current working directory on the import path.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "utils"))
# NOTE(review): star imports -- any name used below that is not defined in this notebook
# may come from one of these two modules.
from stats_utils import *
from data_utils import *

# Coll 2014 SNP scheme: strip the "lineage" prefix from the lineage labels, expose them
# under a "Lineage" column, and keep the part of `allele_change` after the slash as `nucleotide`.
coll2014 = pd.read_csv("../data/coll2014_SNP_scheme.tsv", sep="\t")
coll2014["#lineage"] = coll2014["#lineage"].str.replace("lineage", "")
coll2014 = coll2014.rename(columns={"#lineage": "Lineage"})
coll2014["nucleotide"] = [allele_change.split("/")[1] for allele_change in coll2014["allele_change"].values]

# Full drug name -> abbreviation. Insertion order is significant: the list of drugs
# used elsewhere in this notebook is taken from the keys of this dictionary.
_drug_abbr_pairs = [
    ("Delamanid", "DLM"),
    ("Bedaquiline", "BDQ"),
    ("Clofazimine", "CFZ"),
    ("Ethionamide", "ETO"),
    ("Linezolid", "LZD"),
    ("Moxifloxacin", "MXF"),
    ("Capreomycin", "CAP"),
    ("Amikacin", "AMK"),
    ("Pretomanid", "PMD"),
    ("Pyrazinamide", "PZA"),
    ("Kanamycin", "KAN"),
    ("Levofloxacin", "LFX"),
    ("Streptomycin", "STM"),
    ("Ethambutol", "EMB"),
    ("Isoniazid", "INH"),
    ("Rifampicin", "RIF"),
]
drug_abbr_dict = dict(_drug_abbr_pairs)

cc_df = pd.read_csv("../data/drug_CC.csv")

# The tier-1 catalogue file read below was produced once with the commented-out recipe
# that follows; it is kept only as a record of how the file was made.
# who_variants_V1 = pd.read_excel("../results/WHO-catalog-V1.xlsx", sheet_name='Mutation_catalogue')
# who_variants = pd.read_csv("../results/WHO-catalog-V2-full.csv", header=[2]).query("tier==1").reset_index(drop=True)
# del who_variants['mutation']

# for col in who_variants.columns[6:46]:
#     who_variants.rename(columns={col: f"ALL_{col}"}, inplace=True)

# for col in who_variants.columns[50:97]:
#     who_variants.rename(columns={col: f"WHO_{col.replace('.1', '')}"}, inplace=True)

# who_variants.to_csv("../results/WHO-catalog-V2-tier1.csv", index=False)

# The file's own 'mutation' column is dropped right after loading.
who_variants = pd.read_csv("../results/WHO-catalog-V2-tier1.csv").drop(columns=['mutation'])
RS_counts = pd.read_csv("../tables/Figure_3A_data.csv", index_col='Drug')

results_final = pd.read_csv("../results/Nov2024_Tier1.csv")

# First sheet of the literature workbook, with its columns renamed to match `results_final`.
_literature_renames = {'drug': 'Drug', 'variant': 'mutation'}
not_assocR_literature = pd.read_excel("../data/NotAwR by literature.xlsx", sheet_name=0)
not_assocR_literature = not_assocR_literature.rename(columns=_literature_renames)

drug_gene_mapping = pd.read_csv("../data/drug_gene_mapping.csv")

# Predicted effects that this notebook treats as silent.
silent_lst = ['synonymous_variant', 'initiator_codon_variant', 'stop_retained_variant']

drugs_lst = list(drug_abbr_dict)
sample_ids_mapping = pd.read_csv("../data/sample_ids_mapping_20220922.csv")

(52567, 10)


In [2]:
# Isoniazid genotype table, restricted to the four columns the following cells use.
_inh_genos_columns = ['sample_id', 'resolved_symbol', 'variant_category', 'variant_allele_frequency']
INH_genos = pd.read_csv(
    os.path.join(analysis_dir, 'Isoniazid', 'genos_1.csv.gz'),
    compression='gzip',
    usecols=_inh_genos_columns,
)

In [3]:
# Distinct samples that carry katG c.12A>G at an allele frequency above 0.75.
_katG_c12_rows = INH_genos.query("resolved_symbol=='katG' & variant_category=='c.12A>G' & variant_allele_frequency > 0.75")
samples_with_variant = _katG_c12_rows['sample_id'].unique()
len(samples_with_variant)

13

In [11]:
# Every other variant (allele frequency above 0.25) found in the katG c.12A>G samples,
# counted per (gene, variant) pair.
_katG_c12_cooccurring = INH_genos.query("sample_id in @samples_with_variant & ~(resolved_symbol=='katG' & variant_category=='c.12A>G') & variant_allele_frequency > 0.25")
_katG_c12_cooccurring[['resolved_symbol', 'variant_category']].value_counts()

resolved_symbol  variant_category
katG             p.Arg463Leu         13
inhA             c.-777C>T           10
ahpC             c.-88G>A             1
katG             p.Ser315Thr          1
Name: count, dtype: int64

In [10]:
# Odds ratios and gradings of katG c.12A>G and the variants that co-occur with it.
_inh_grading_columns = [
    'mutation',
    'WHO_Odds_Ratio',
    'ALL_Odds_Ratio',
    'SOLO INITIAL CONFIDENCE GRADING',
    'SOLO FINAL CONFIDENCE GRADING',
    'REGRESSION FINAL CONFIDENCE GRADING',
]
_inh_rows_of_interest = results_final.query("Drug=='Isoniazid' & mutation in ['katG_c.12A>G', 'katG_p.Arg463Leu', 'inhA_c.-777C>T', 'ahpC_c.-88G>A']")
_inh_rows_of_interest[_inh_grading_columns]

Unnamed: 0,mutation,WHO_Odds_Ratio,ALL_Odds_Ratio,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING,REGRESSION FINAL CONFIDENCE GRADING
10307,inhA_c.-777C>T,3.702257,3.843702,1) Assoc w R,1) Assoc w R,1) Assoc w R
10335,katG_c.12A>G,1.099477,1.089698,3) Uncertain significance,4) Not assoc w R - Interim,1) Assoc w R
12356,ahpC_c.-88G>A,0.928051,0.937116,5) Not assoc w R,5) Not assoc w R,3) Uncertain significance
12360,katG_p.Arg463Leu,0.813266,0.807385,5) Not assoc w R,5) Not assoc w R,5) Not assoc w R


# Overview of Regression Gradings

In [3]:
regression_grading_cols_lst = ['REGRESSION FINAL CONFIDENCE GRADING', 'REGRESSION + GRADING RULES']
solo_cols_lst = ['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']


def _grading_counts(grading_col):
    """Count the rows of `results_final` per label in `grading_col`.

    Returns a one-column frame, named after `grading_col` and indexed by 'Grading'.
    The renaming relies on `value_counts` calling its result 'count'.
    """
    counts = pd.DataFrame(results_final[grading_col].value_counts()).reset_index()
    counts = counts.rename(columns={grading_col: 'Grading', 'count': grading_col})
    return counts.set_index('Grading')


# One column per grading scheme, one row per grading label, rows sorted by label.
regression_totals_df = pd.concat(
    [_grading_counts(grading_col) for grading_col in regression_grading_cols_lst],
    axis=1,
).sort_index()
solo_totals_df = pd.concat(
    [_grading_counts(grading_col) for grading_col in solo_cols_lst],
    axis=1,
).sort_index()

In [4]:
# Regression and SOLO totals side by side, matched on the shared 'Grading' index.
pd.merge(regression_totals_df, solo_totals_df, left_index=True, right_index=True)

Unnamed: 0_level_0,REGRESSION FINAL CONFIDENCE GRADING,REGRESSION + GRADING RULES,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
Grading,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1) Assoc w R,587,587,311,253
2) Assoc w R - Interim,164,1049,146,1130
3) Uncertain significance,20614,14756,20835,14958
4) Not assoc w R - Interim,130,5103,19,4998
5) Not assoc w R,94,94,278,250


In [121]:
# Number of variants the regression grades into any group other than "3) Uncertain significance".
_graded_labels = ['1) Assoc w R', '2) Assoc w R - Interim', '4) Not assoc w R - Interim', '5) Not assoc w R']
_regression_graded_counts = regression_totals_df.loc[_graded_labels, 'REGRESSION FINAL CONFIDENCE GRADING']
_regression_graded_counts.values.sum()

975

In [122]:
# Number of variants the initial SOLO grading puts into any group other than "3) Uncertain significance".
_graded_labels = ['1) Assoc w R', '2) Assoc w R - Interim', '4) Not assoc w R - Interim', '5) Not assoc w R']
_solo_graded_counts = solo_totals_df.loc[_graded_labels, 'SOLO INITIAL CONFIDENCE GRADING']
_solo_graded_counts.values.sum()

754

In [124]:
# Hand-typed arithmetic on numbers copied from the outputs above:
#   20835 / 20614 = "3) Uncertain significance" counts under SOLO initial / regression final grading,
#   975 / 754     = graded (non-Uncertain) totals under regression final / SOLO initial grading.
# NOTE(review): these literals go stale if `results_final` changes.
20835-20614, 975-754, 221/754

(221, 221, 0.29310344827586204)

In [5]:
_uncertain_label = '3) Uncertain significance'

solo_uncertain = solo_totals_df.loc[_uncertain_label, 'SOLO INITIAL CONFIDENCE GRADING']
regression_uncertain = regression_totals_df.loc[_uncertain_label, 'REGRESSION FINAL CONFIDENCE GRADING']

# Everything the initial SOLO grading placed outside the Uncertain group.
solo_graded = solo_totals_df['SOLO INITIAL CONFIDENCE GRADING'].drop(labels=_uncertain_label).sum()

# How many fewer Uncertain mutations regression leaves, the number of SOLO-graded mutations,
# and the former as a fraction of the latter.
_fewer_uncertain = solo_uncertain - regression_uncertain
_fewer_uncertain, solo_graded, _fewer_uncertain / solo_graded

(221, 754, 0.29310344827586204)

In [6]:
# Rows whose grading differs between consecutive grading columns:
#   regression grading          vs. regression grading + LoF upgrade   -> changed by the LoF rule
#   regression + LoF upgrade    vs. regression + grading rules          -> changed by the remaining rules
regression_LoF_upgrades = results_final.loc[(results_final['REGRESSION FINAL CONFIDENCE GRADING'] != results_final['REGRESSION GRADING + LOF UPGRADE'])]
regression_GR_upgrades = results_final.loc[(results_final['REGRESSION GRADING + LOF UPGRADE'] != results_final['REGRESSION + GRADING RULES'])]

# The first two messages previously also said "grading rules", which made the LoF counts
# indistinguishable from the counts for the other rules in the printed output.
print(f'{len(regression_LoF_upgrades)} variants were upgraded by the LoF rule')
print(f'{len(regression_LoF_upgrades.query("predicted_effect not in @silent_lst"))} non-silent variants were upgraded by the LoF rule')
print(f'{len(regression_GR_upgrades)} variants were upgraded by the remaining grading rules')
print(f'{len(regression_GR_upgrades.query("predicted_effect not in @silent_lst"))} non-silent variants were upgraded by the remaining grading rules')

807 variants were upgraded by grading rules
807 non-silent variants were upgraded by grading rules
5051 variants were upgraded by grading rules
147 non-silent variants were upgraded by grading rules


# Number of Variants tested only in the ALL dataset or Ungraded

In [7]:
# A mutation counts as "tested" in a dataset when it has an odds ratio there.
_who_tested = results_final['WHO_Odds_Ratio'].notna()
_all_tested = results_final['ALL_Odds_Ratio'].notna()

ungraded_muts = results_final.loc[~_who_tested & ~_all_tested]
graded_muts = results_final.loc[_who_tested | _all_tested]
WHO_only = results_final.loc[_who_tested & ~_all_tested]
ALL_only = results_final.loc[~_who_tested & _all_tested]

# Tested in both datasets, and the subset that received the same initial grading in both.
WHO_ALL_graded = results_final.loc[_who_tested & _all_tested]
_same_initial_grading = WHO_ALL_graded['Initial confidence grading WHO dataset'] == WHO_ALL_graded['Initial confidence grading ALL dataset']
WHO_ALL_same_grading = WHO_ALL_graded.loc[_same_initial_grading]

print(f"{len(ungraded_muts)} ({np.round(len(ungraded_muts) / len(results_final) * 100, 1)})% mutations were not tested in either phenotypic dataset")
print(f"{len(WHO_only)}/{len(graded_muts)} ({np.round(len(WHO_only) / len(graded_muts) * 100, 1)})% mutations were only tested in the WHO dataset")
print(f"{len(ALL_only)}/{len(graded_muts)} ({np.round(len(ALL_only) / len(graded_muts) * 100, 1)})% mutations were only tested in the ALL dataset")
print(f"{len(WHO_ALL_same_grading)} ({np.round(len(WHO_ALL_same_grading) / len(WHO_ALL_graded) * 100, 1)})% mutations have the same grading in both datasets")

2388 (11.1)% mutations were not tested in either phenotypic dataset
0/19201 (0.0)% mutations were only tested in the WHO dataset
6710/19201 (34.9)% mutations were only tested in the ALL dataset
12177 (97.5)% mutations have the same grading in both datasets


In [8]:
# Consistency check: tested mutations minus the ALL-only ones, next to the number tested in both datasets.
_n_tested_not_ALL_only = len(graded_muts) - len(ALL_only)
_n_tested_in_both = len(WHO_ALL_graded)
_n_tested_not_ALL_only, _n_tested_in_both

(12491, 12491)

# Number of Variants that meet different Criteria for resolving WHO and ALL discrepancies

In [9]:
WHO_col = 'Initial confidence grading WHO dataset'
ALL_col = 'Initial confidence grading ALL dataset'


def _count_grading_pair(who_gradings, all_gradings):
    """Number of `results_final` rows whose WHO initial grading is one of `who_gradings`
    and whose ALL initial grading is one of `all_gradings`. Missing gradings never match."""
    who_match = results_final[WHO_col].isin(who_gradings)
    all_match = results_final[ALL_col].isin(all_gradings)
    return len(results_final.loc[who_match & all_match])


_not_R_or_neutral = ['Not assoc w R', 'Neutral']

# (printed label, accepted WHO gradings, accepted ALL gradings)
_grading_pair_checks = [
    ('Both Assoc w R:', ['Assoc w R'], ['Assoc w R']),
    ('Both Not assoc w R or Neutral:', _not_R_or_neutral, _not_R_or_neutral),
    ('WHO = Neutral, ALL = Uncertain:', ['Neutral'], ['Uncertain']),
    ('WHO = Uncertain, ALL = Neutral:', ['Uncertain'], ['Neutral']),
    ('WHO = Assoc w R, ALL = Uncertain:', ['Assoc w R'], ['Uncertain']),
    ('WHO = Not assoc w R, ALL = Uncertain:', ['Not assoc w R'], ['Uncertain']),
    ('WHO = Uncertain, ALL = Assoc w R:', ['Uncertain'], ['Assoc w R']),
    ('WHO = Uncertain, ALL = Not assoc w R:', ['Uncertain'], ['Not assoc w R']),
    ('WHO = Assoc w R, ALL = Not assoc w R, Neutral:', ['Assoc w R'], _not_R_or_neutral),
    ('WHO = Not assoc w R, Neutral, ALL = Assoc w R:', _not_R_or_neutral, ['Assoc w R']),
]

for _pair_label, _who_gradings, _all_gradings in _grading_pair_checks:
    print(_pair_label, _count_grading_pair(_who_gradings, _all_gradings))

Both Assoc w R: 587
Both Not assoc w R or Neutral: 66
WHO = Neutral, ALL = Uncertain: 28
WHO = Uncertain, ALL = Neutral: 53
WHO = Assoc w R, ALL = Uncertain: 12
WHO = Not assoc w R, ALL = Uncertain: 16
WHO = Uncertain, ALL = Assoc w R: 149
WHO = Uncertain, ALL = Not assoc w R: 56
WHO = Assoc w R, ALL = Not assoc w R, Neutral: 0
WHO = Not assoc w R, Neutral, ALL = Assoc w R: 0


In [10]:
# SOLO initial gradings of the variants that are Assoc w R in WHO but Uncertain in ALL.
_who_R_all_uncertain = (results_final[WHO_col] == 'Assoc w R') & (results_final[ALL_col] == 'Uncertain')
results_final.loc[_who_R_all_uncertain, 'SOLO INITIAL CONFIDENCE GRADING'].value_counts()

SOLO INITIAL CONFIDENCE GRADING
3) Uncertain significance    6
2) Assoc w R - Interim       3
5) Not assoc w R             2
1) Assoc w R                 1
Name: count, dtype: int64

In [11]:
# Odds ratio, p-value and PPV lower-bound columns for mmpS5_c.-74G>T (bedaquiline) in the WHO and ALL datasets.
_regression_stat_suffixes = ['Odds_Ratio', 'BH_pval', 'BH_LRT_pval', 'R_PPV_LB']
_mmpS5_stat_columns = [f'{dataset}_{suffix}' for dataset in ('WHO', 'ALL') for suffix in _regression_stat_suffixes]
results_final.query("Drug=='Bedaquiline' & mutation=='mmpS5_c.-74G>T'")[_mmpS5_stat_columns]

Unnamed: 0,WHO_Odds_Ratio,WHO_BH_pval,WHO_BH_LRT_pval,WHO_R_PPV_LB,ALL_Odds_Ratio,ALL_BH_pval,ALL_BH_LRT_pval,ALL_R_PPV_LB
1089,1.394573,0.0,1.7339e-12,0.763225,1.287033,0.0,6.073573e-36,0.182759


# Co-occurrence of mmpS5_c.-74G>T with other variants

In [32]:
_bdq_dir = os.path.join(analysis_dir, "Bedaquiline")

BDQ_genos = pd.read_csv(
    os.path.join(_bdq_dir, "genos_1.csv.gz"),
    compression='gzip',
    usecols=['sample_id', 'resolved_symbol', 'variant_category', 'variant_allele_frequency'],
)
df_phenos = pd.read_csv(os.path.join(_bdq_dir, "phenos_binary.csv"))

# Collapse gene and variant into one "<gene>_<variant>" key (the form `results_final.mutation` is
# queried with elsewhere in this notebook), then drop the two source columns.
BDQ_genos['mutation'] = BDQ_genos['resolved_symbol'] + '_' + BDQ_genos['variant_category']
BDQ_genos = BDQ_genos.drop(columns=['resolved_symbol', 'variant_category'])

In [33]:
# Distinct samples carrying mmpS5_c.-74G>T at an allele frequency above 0.75.
_mmpS5_rows = BDQ_genos.query("mutation=='mmpS5_c.-74G>T' & variant_allele_frequency > 0.75")
samples_with_variants = _mmpS5_rows['sample_id'].unique()
len(samples_with_variants)

131

In [34]:
# Of the samples carrying the variant, those with phenotype == 1 in the bedaquiline phenotype table.
_mmpS5_resistant_rows = df_phenos.query("sample_id in @samples_with_variants & phenotype==1")
BDQ_R_isolates_with_variant = _mmpS5_resistant_rows['sample_id'].values
len(BDQ_R_isolates_with_variant)

37

In [51]:
# Gradings of every bedaquiline mutation; merged onto the genotype rows below.
# The inner merge keeps only genotype rows whose mutation appears in `results_final`.
_bdq_gradings = results_final.query("Drug=='Bedaquiline'")[['mutation', 'REGRESSION FINAL CONFIDENCE GRADING', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']]

# Other variants at AF > 75% in the phenotype==1 isolates that carry mmpS5_c.-74G>T
_cooccurring_highAF = BDQ_genos.query("sample_id in @BDQ_R_isolates_with_variant & mutation!='mmpS5_c.-74G>T' & variant_allele_frequency > 0.75")
df_check = _cooccurring_highAF.merge(_bdq_gradings, on=['mutation'], how='inner')

# Same selection with the AF threshold lowered to 25%
# (original author's note: one variant -- Rv0678_p.Met23fs -- found only at AF < 75%)
_cooccurring_lowAF = BDQ_genos.query("sample_id in @BDQ_R_isolates_with_variant & mutation!='mmpS5_c.-74G>T' & variant_allele_frequency > 0.25")
df_check_lowAF = _cooccurring_lowAF.merge(_bdq_gradings, on=['mutation'], how='inner')

In [52]:
# Isolates that carry a co-occurring Group 1/2 mutation (SOLO with grading rules), and how many
# distinct such mutations there are. The match is case-sensitive, so 'Not assoc w R' labels are not included.
_df_check_assocR = df_check.loc[df_check['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')]
_df_check_assocR['sample_id'].nunique(), _df_check_assocR['mutation'].nunique()

(14, 8)

In [53]:
# Number of rows of `df_check` for each co-occurring mutation.
df_check['mutation'].value_counts()

mutation
mmpL5_p.Ile948Val    37
mmpL5_p.Thr794Ile    37
mmpL5_p.Asp767Asn    37
atpE_p.Ile66Met       3
Rv0678_p.Glu49fs      3
Rv0678_p.Asp47fs      3
Rv0678_p.Cys46Arg     2
atpE_p.Ala63Thr       1
Rv0678_p.Tyr145fs     1
Rv0678_p.Gln115*      1
Rv0678_p.Leu60Pro     1
Rv0678_p.Pro48fs      1
Rv0678_p.Thr69fs      1
Name: count, dtype: int64

In [42]:
# Distinct gene names in the bedaquiline genotype table: the text before the first underscore of each mutation.
_bdq_gene_names = BDQ_genos['mutation'].str.split('_').str[0]
_bdq_gene_names.unique()

array(['atpE', 'mmpS5', 'Rv0678', 'pepQ', 'mmpL5'], dtype=object)

In [54]:
# Isolates with at least one co-occurring variant other than the three mmpL5 variants
# that `df_check.mutation.value_counts()` shows in all 37 isolates.
_df_check_without_mmpL5_trio = df_check.query("mutation not in ['mmpL5_p.Ile948Val', 'mmpL5_p.Thr794Ile', 'mmpL5_p.Asp767Asn']")
_df_check_without_mmpL5_trio['sample_id'].nunique()

15

In [55]:
# Co-occurring variants (excluding the three mmpL5 variants) whose final SOLO grading is not Group 1/2.
# The mask is built on the filtered frame, so it shares that frame's index.
_df_check_without_mmpL5_trio = df_check.query("mutation not in ['mmpL5_p.Ile948Val', 'mmpL5_p.Thr794Ile', 'mmpL5_p.Asp767Asn']")
_final_solo_is_assocR = _df_check_without_mmpL5_trio['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_df_check_without_mmpL5_trio.loc[~_final_solo_is_assocR]

Unnamed: 0,sample_id,variant_allele_frequency,mutation,REGRESSION FINAL CONFIDENCE GRADING,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
3,823005,1.0,atpE_p.Ala63Thr,3) Uncertain significance,3) Uncertain significance,3) Uncertain significance
9,823005,1.0,Rv0678_p.Leu60Pro,3) Uncertain significance,3) Uncertain significance,3) Uncertain significance


In [56]:
# Same count as for `df_check`, with the allele-frequency threshold lowered to 25%.
_lowAF_without_mmpL5_trio = df_check_lowAF.query("mutation not in ['mmpL5_p.Ile948Val', 'mmpL5_p.Thr794Ile', 'mmpL5_p.Asp767Asn']")
_lowAF_without_mmpL5_trio['sample_id'].nunique()

21

In [58]:
# Low-AF co-occurring variants (excluding the three mmpL5 variants) whose final SOLO grading is not Group 1/2.
# The mask is built on the filtered frame, so it shares that frame's index.
_lowAF_without_mmpL5_trio = df_check_lowAF.query("mutation not in ['mmpL5_p.Ile948Val', 'mmpL5_p.Thr794Ile', 'mmpL5_p.Asp767Asn']")
_lowAF_final_solo_is_assocR = _lowAF_without_mmpL5_trio['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_lowAF_without_mmpL5_trio.loc[~_lowAF_final_solo_is_assocR]

Unnamed: 0,sample_id,variant_allele_frequency,mutation,REGRESSION FINAL CONFIDENCE GRADING,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
3,823005,1.0,atpE_p.Ala63Thr,3) Uncertain significance,3) Uncertain significance,3) Uncertain significance
12,823005,1.0,Rv0678_p.Leu60Pro,3) Uncertain significance,3) Uncertain significance,3) Uncertain significance
13,823198,0.36,Rv0678_p.Leu122Pro,3) Uncertain significance,3) Uncertain significance,3) Uncertain significance


# Results Section A

In [59]:
# Case-sensitive substring: matches Groups 1 and 2 but not the 'Not assoc w R' labels.
search_string = 'Assoc w R'

_regression_is_R = results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains(search_string)
_solo_initial_is_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains(search_string)

regression_Group12_agreement = len(results_final.loc[_regression_is_R & _solo_initial_is_R])
SOLO_Group12 = len(results_final.loc[_solo_initial_is_R])

# Agreement count, SOLO Group 1/2 count, and the fraction of SOLO Group 1/2 that regression also calls.
regression_Group12_agreement, SOLO_Group12, regression_Group12_agreement/SOLO_Group12

(450, 457, 0.9846827133479212)

In [60]:
# Same comparison as for the initial gradings, now with both gradings taken after the grading rules.
search_string = 'Assoc w R'

_regression_GR_is_R = results_final['REGRESSION + GRADING RULES'].str.contains(search_string)
_solo_final_is_R = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains(search_string)

regression_SOLO_GR_Group12_agreement = len(results_final.loc[_regression_GR_is_R & _solo_final_is_R])
SOLO_GR_Group12 = len(results_final.loc[_solo_final_is_R])

regression_SOLO_GR_Group12_agreement, SOLO_GR_Group12, regression_SOLO_GR_Group12_agreement/SOLO_GR_Group12

(1378, 1383, 0.9963846710050615)

In [61]:
# Rows with Reason 'LoF Upgrade' that were not Assoc w R by regression alone
# but are Assoc w R once the LoF rule is applied.
_lof_rows = results_final.query("Reason=='LoF Upgrade'")
_lof_not_R_before = ~_lof_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_lof_R_after = _lof_rows['REGRESSION GRADING + LOF UPGRADE'].str.contains('Assoc w R')
_lof_rows.loc[_lof_not_R_before & _lof_R_after].shape

(738, 66)

In [62]:
# Variants that only become Assoc w R once the grading rules are applied on top of regression.
_not_R_by_regression = ~results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_R_with_rules = results_final['REGRESSION + GRADING RULES'].str.contains('Assoc w R')
len(results_final.loc[_not_R_by_regression & _R_with_rules])

885

In [63]:
# Mutations that become Assoc w R through a rule other than the LoF rule.
# Rows with a missing Reason also pass the `!=` filter.
_non_lof_rows = results_final.query("Reason != 'LoF Upgrade'")
_non_lof_not_R_before = ~_non_lof_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_non_lof_R_with_rules = _non_lof_rows['REGRESSION + GRADING RULES'].str.contains('Assoc w R')
len(_non_lof_rows.loc[_non_lof_not_R_before & _non_lof_R_with_rules])

147

In [64]:
# Recorded Reason for each non-LoF mutation that becomes Assoc w R only after the grading rules.
_non_lof_rows = results_final.query("Reason != 'LoF Upgrade'")
_non_lof_upgraded = _non_lof_rows.loc[
    ~_non_lof_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
    & _non_lof_rows['REGRESSION + GRADING RULES'].str.contains('Assoc w R')
]
_non_lof_upgraded['Reason'].value_counts()

Reason
Both Uncertain            91
Not Graded                30
ALL Evidence Only         25
Downgrade to Uncertain     1
Name: count, dtype: int64

In [65]:
# Grading of those same non-LoF upgrades after the LoF step only, i.e. before the other rules are applied.
_non_lof_rows = results_final.query("Reason != 'LoF Upgrade'")
_non_lof_upgraded = _non_lof_rows.loc[
    ~_non_lof_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
    & _non_lof_rows['REGRESSION + GRADING RULES'].str.contains('Assoc w R')
]
_non_lof_upgraded['REGRESSION GRADING + LOF UPGRADE'].unique()

array(['3) Uncertain significance'], dtype=object)

In [66]:
# Before/after gradings of every row with Reason 'LoF Upgrade' (807 rows in the recorded output).
_lof_transition_columns = ['REGRESSION FINAL CONFIDENCE GRADING', 'REGRESSION GRADING + LOF UPGRADE']
_lof_rows = results_final.query("Reason=='LoF Upgrade'")
_lof_rows[_lof_transition_columns].value_counts()

REGRESSION FINAL CONFIDENCE GRADING  REGRESSION GRADING + LOF UPGRADE
3) Uncertain significance            2) Assoc w R - Interim              738
                                     4) Not assoc w R - Interim           69
Name: count, dtype: int64

In [67]:
# Drug/gene pairs of the LoF-rule rows that end up as "4) Not assoc w R - Interim".
_lof_rows = results_final.query("Reason=='LoF Upgrade'")
_lof_to_group4 = _lof_rows.loc[_lof_rows['REGRESSION GRADING + LOF UPGRADE'] == '4) Not assoc w R - Interim']
_lof_to_group4[['Drug', 'gene']].value_counts()

Drug          gene 
Kanamycin     eis      32
Bedaquiline   mmpL5    28
Streptomycin  whiB7     9
Name: count, dtype: int64

In [68]:
def _rows_assoc_R(grading_col):
    """Rows of `results_final` whose `grading_col` label contains 'Assoc w R' (Groups 1 and 2)."""
    return results_final.loc[results_final[grading_col].str.contains('Assoc w R')]


# Groups 1 and 2 under each of the four grading schemes
regression_Groups12 = _rows_assoc_R('REGRESSION FINAL CONFIDENCE GRADING')
regression_GR_Groups12 = _rows_assoc_R('REGRESSION + GRADING RULES')
SOLO_Groups12 = _rows_assoc_R('SOLO INITIAL CONFIDENCE GRADING')
SOLO_GR_Groups12 = _rows_assoc_R('SOLO FINAL CONFIDENCE GRADING')

groups12_lst = [regression_Groups12, regression_GR_Groups12, SOLO_Groups12, SOLO_GR_Groups12]
group_names = ['Regression', 'Regression + GR', 'SOLO', 'SOLO + GR']

for name, df in zip(group_names, groups12_lst):
    print(f"{name}: {len(df)}")

Regression: 751
Regression + GR: 1636
SOLO: 457
SOLO + GR: 1383


In [69]:
# Additional Group 1/2 variants found by regression compared with initial SOLO, absolute and relative to SOLO.
_group12_gain = len(regression_Groups12) - len(SOLO_Groups12)
_group12_gain, _group12_gain / len(SOLO_Groups12)

(294, 0.6433260393873085)

In [70]:
# Group 1/2 variants added to the regression gradings by the grading rules.
_n_group12_with_rules = len(regression_GR_Groups12)
_n_group12_regression = len(regression_Groups12)
_n_group12_with_rules - _n_group12_regression

885

In [71]:
# Silent variants (predicted effect in `silent_lst`) that regression nevertheless grades as Assoc w R.
_is_silent = results_final['predicted_effect'].isin(silent_lst)
_regression_is_R = results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
silent_variants_assocR = results_final.loc[_is_silent & _regression_is_R, 'mutation'].values
len(silent_variants_assocR), silent_variants_assocR

(1, array(['katG_c.12A>G'], dtype=object))

In [72]:
def _rows_not_assoc_R(grading_col):
    """Rows of `results_final` whose `grading_col` label contains 'Not assoc w R' (Groups 4 and 5)."""
    return results_final.loc[results_final[grading_col].str.contains('Not assoc w R')]


# Groups 4 and 5 under each grading scheme; the loop reports totals with and without silent variants
regression_Groups45 = _rows_not_assoc_R('REGRESSION FINAL CONFIDENCE GRADING')
regression_GR_Groups45 = _rows_not_assoc_R('REGRESSION + GRADING RULES')
SOLO_Groups45 = _rows_not_assoc_R('SOLO INITIAL CONFIDENCE GRADING')
SOLO_GR_Groups45 = _rows_not_assoc_R('SOLO FINAL CONFIDENCE GRADING')

groups45_lst = [regression_Groups45, regression_GR_Groups45, SOLO_Groups45, SOLO_GR_Groups45]

# `group_names` is the list of scheme names defined alongside the Group 1/2 frames.
for name, df in zip(group_names, groups45_lst):
    print(f"{name}: {len(df)} ({len(df.loc[~df['predicted_effect'].isin(silent_lst)])} non-silent)")

Regression: 224 (161 non-silent)
Regression + GR: 5197 (230 non-silent)
SOLO: 297 (230 non-silent)
SOLO + GR: 5248 (230 non-silent)


## Which variants does regression grade that SOLO misses and that the grading rules then upgrade to interim?

In [73]:
# Mutations that SOLO alone did not place in Groups 1/2 and that only reach them through the grading rules
_solo_initial_is_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_solo_final_is_R = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
SOLOmiss_GRupgrade_R = results_final.loc[~_solo_initial_is_R & _solo_final_is_R]

# Must equal the difference between the Group 1/2 totals computed earlier
assert len(SOLOmiss_GRupgrade_R) == len(SOLO_GR_Groups12) - len(SOLO_Groups12)

# ...and, of those, the ones that regression grades as Assoc w R without any grading rule
_regression_is_R = SOLOmiss_GRupgrade_R['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
regression_Rassoc_SOLOmiss_GRupgrade_R = SOLOmiss_GRupgrade_R.loc[_regression_is_R]
print(f"Regression upgrades {len(regression_Rassoc_SOLOmiss_GRupgrade_R)} R-assoc variants upgraded by grading rules.")

Regression upgrades 79 R-assoc variants upgraded by grading rules.


In [74]:
# Mutations that SOLO alone did not place in Groups 4/5 and that only reach them through the grading rules
_solo_initial_is_notR = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
_solo_final_is_notR = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
SOLOmiss_GRupgrade_NotR = results_final.loc[~_solo_initial_is_notR & _solo_final_is_notR]

# Must equal the difference between the Group 4/5 totals computed earlier
assert len(SOLOmiss_GRupgrade_NotR) == len(SOLO_GR_Groups45) - len(SOLO_Groups45)

# ...and, of those, the ones that regression grades as Not assoc w R without any grading rule
_regression_is_notR = SOLOmiss_GRupgrade_NotR['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
regression_NotR_SOLOmiss_GRupgrade_NotR = SOLOmiss_GRupgrade_NotR.loc[_regression_is_notR]
print(f"Regression upgrades {len(regression_NotR_SOLOmiss_GRupgrade_NotR)} Not R-assoc variants upgraded by grading rules.")

Regression upgrades 46 Not R-assoc variants upgraded by grading rules.


## Look into the cases where regression grades but SOLO misses

In [75]:
# Attach the WHO catalogue columns to each (Drug, mutation) pair. The catalogue's 'drug' and
# 'variant' columns are renamed first so the merge keys line up with `results_final`.
_who_catalogue = who_variants.rename(columns={'drug': 'Drug', 'variant': 'mutation'})
_regression_R_keys = regression_Rassoc_SOLOmiss_GRupgrade_R[['Drug', 'mutation']]
regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results = _regression_R_keys.merge(_who_catalogue)

# The merge must neither drop nor duplicate any variant
print(len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results))
assert len(regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results) == len(regression_Rassoc_SOLOmiss_GRupgrade_R)

_grading_trio = ['REGRESSION FINAL CONFIDENCE GRADING', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']
regression_Rassoc_SOLOmiss_GRupgrade_R[_grading_trio].value_counts()

79


REGRESSION FINAL CONFIDENCE GRADING  SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
1) Assoc w R                         3) Uncertain significance        2) Assoc w R - Interim           42
2) Assoc w R - Interim               3) Uncertain significance        2) Assoc w R - Interim           34
1) Assoc w R                         3) Uncertain significance        1) Assoc w R                      3
Name: count, dtype: int64

In [76]:
# Variants with ALL_Present_SOLO_SR equal to 0, and variants with ALL_Present_SOLO_SR below 5.
_solo_results = regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results
_n_zero_solo_SR = len(_solo_results.query("ALL_Present_SOLO_SR==0"))
_n_below5_solo_SR = len(_solo_results.query("ALL_Present_SOLO_SR < 5"))
_n_zero_solo_SR, _n_below5_solo_SR

(2, 27)

In [77]:
# Among variants with ALL_Present_SOLO_SR of at least 5: counts of each ALL_OR_SOLO_FE_sig value.
_enough_solo_SR = regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR >= 5")
_enough_solo_SR['ALL_OR_SOLO_FE_sig'].value_counts()

ALL_OR_SOLO_FE_sig
False    50
True      2
Name: count, dtype: int64

In [78]:
# The variants with ALL_Present_SOLO_SR >= 5 and ALL_OR_SOLO_FE_sig set, with their WHO catalogue gradings.
_catalogue_grading_columns = ['Drug', 'mutation', 'INITIAL CONFIDENCE GRADING', 'FINAL CONFIDENCE GRADING', 'Additional grading criteria applied']
_solo_FE_significant = regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results.query("ALL_Present_SOLO_SR >= 5 & ALL_OR_SOLO_FE_sig==True")
_solo_FE_significant[_catalogue_grading_columns]

Unnamed: 0,Drug,mutation,INITIAL CONFIDENCE GRADING,FINAL CONFIDENCE GRADING,Additional grading criteria applied
3,Clofazimine,Rv0678_p.Asp47fs,3) Uncertain significance,2) Assoc w R - Interim,Indel frameshift or premature stop codon (LoF)
43,Linezolid,rrl_n.2270G>T,3) Uncertain significance,2) Assoc w R - Interim,Selection evidence


In [79]:
# Fractions of the regression-only R-assoc variants that
#   (a) have ALL_Present_SOLO_SR below 5, and
#   (b) have ALL_Present_SOLO_SR of at least 5 but ALL_OR_SOLO_FE_sig False.
# Derived from the frame rather than typed in as 27/79 and 50/79, so the values follow the data.
_solo_results = regression_Rassoc_SOLOmiss_GRupgrade_R_SOLO_results
_n_below5_solo_SR = len(_solo_results.query("ALL_Present_SOLO_SR < 5"))
_n_enough_SR_not_sig = len(_solo_results.query("ALL_Present_SOLO_SR >= 5 & ALL_OR_SOLO_FE_sig==False"))
_n_below5_solo_SR / len(_solo_results), _n_enough_SR_not_sig / len(_solo_results)

(0.34177215189873417, 0.6329113924050633)

## Save the mutations for which regression supersedes the need for grading rules to a supplementary table

In [80]:
# Stack the R-assoc and Not-R-assoc variants that regression grades on its own, keep the
# identifying and grading columns, and order the rows by drug.
_supersede_columns = ['Drug', 'mutation', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING', 'REGRESSION FINAL CONFIDENCE GRADING']
_supersede_stacked = pd.concat([regression_Rassoc_SOLOmiss_GRupgrade_R, regression_NotR_SOLOmiss_GRupgrade_NotR])
regression_supersede_save = _supersede_stacked[_supersede_columns].sort_values("Drug")

len(regression_supersede_save)

125

In [81]:
# Initial SOLO grading of the saved variants.
_supersede_solo_initial = regression_supersede_save['SOLO INITIAL CONFIDENCE GRADING']
_supersede_solo_initial.value_counts()

SOLO INITIAL CONFIDENCE GRADING
3) Uncertain significance    125
Name: count, dtype: int64

In [82]:
# Final SOLO grading (after grading rules) of the saved variants.
_supersede_solo_final = regression_supersede_save['SOLO FINAL CONFIDENCE GRADING']
_supersede_solo_final.value_counts()

SOLO FINAL CONFIDENCE GRADING
2) Assoc w R - Interim        76
4) Not assoc w R - Interim    46
1) Assoc w R                   3
Name: count, dtype: int64

In [83]:
# Regression grading of the saved variants whose final SOLO grading is "4) Not assoc w R - Interim".
_solo_final_is_group4 = regression_supersede_save['SOLO FINAL CONFIDENCE GRADING'] == '4) Not assoc w R - Interim'
regression_supersede_save.loc[_solo_final_is_group4, 'REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
5) Not assoc w R              24
4) Not assoc w R - Interim    22
Name: count, dtype: int64

In [84]:
# Regression grading of the saved variants whose final SOLO grading is Group 1 or 2.
_solo_final_is_R = regression_supersede_save['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
regression_supersede_save.loc[_solo_final_is_R, 'REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
1) Assoc w R              45
2) Assoc w R - Interim    34
Name: count, dtype: int64

In [85]:
# Write the table to the tables directory, without the row index.
regression_supersede_save.to_csv(
    path_or_buf="../tables/regression_supersede_save.csv",
    index=False,
)

# Results Section D

## RIF mutations newly graded by regression (there are many newly graded INH mutations, so the other notebook focuses on the ahpC ones)

In [96]:
# Rifampicin variants that SOLO (with grading rules) leaves Uncertain but regression grades Assoc w R.
_rif_rows = results_final.query("Drug=='Rifampicin'")
_rif_solo_uncertain = _rif_rows['SOLO FINAL CONFIDENCE GRADING'] == '3) Uncertain significance'
_rif_regression_R = _rif_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_rif_newly_graded = _rif_rows.loc[_rif_solo_uncertain & _rif_regression_R]

# NOTE(review): `MIC_results` is not defined in the visible part of this notebook; presumably
# it comes from a star import or another cell -- confirm it exists on a fresh kernel.
# The filter is written as "not significant" rather than "p > 0.05" so that mutations with
# no MIC result (NaN after the left merge) are kept.
new_RIF_variants_no_MIC = _rif_newly_graded.merge(MIC_results, how='left').query("~(MIC_BH_pval <= 0.05)")

len(new_RIF_variants_no_MIC)

5

In [108]:
# Shape of the same rifampicin selection before the MIC filter is applied.
_rif_rows = results_final.query("Drug=='Rifampicin'")
_rif_newly_graded_mask = (
    (_rif_rows['SOLO FINAL CONFIDENCE GRADING'] == '3) Uncertain significance')
    & _rif_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
)
_rif_rows.loc[_rif_newly_graded_mask].shape

(5, 66)

In [99]:
# Isoniazid variants that SOLO (with grading rules) leaves Uncertain but regression grades Assoc w R.
_inh_rows = results_final.query("Drug=='Isoniazid'")
_inh_solo_uncertain = _inh_rows['SOLO FINAL CONFIDENCE GRADING'] == '3) Uncertain significance'
_inh_regression_R = _inh_rows['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')
_inh_newly_graded = _inh_rows.loc[_inh_solo_uncertain & _inh_regression_R]

# NOTE(review): `MIC_results` is not defined in the visible part of this notebook -- confirm its origin.
# Rows without a significant MIC coefficient are kept, including those with no MIC result at all.
new_INH_variants_no_MIC = _inh_newly_graded.merge(MIC_results, how='left').query("~(MIC_BH_pval <= 0.05)")

len(new_INH_variants_no_MIC)

30

In [110]:
# Breakdown of the newly graded isoniazid variants by gene and predicted effect.
_gene_effect_columns = ['gene', 'predicted_effect']
new_INH_variants_no_MIC.loc[:, _gene_effect_columns].value_counts()

gene  predicted_effect     
katG  missense_variant         20
ahpC  upstream_gene_variant     5
inhA  upstream_gene_variant     2
ahpC  missense_variant          1
inhA  missense_variant          1
katG  upstream_gene_variant     1
Name: count, dtype: int64

In [103]:
# Smallest and largest ALL-dataset odds ratio among the newly graded isoniazid variants.
_new_INH_odds_ratios = new_INH_variants_no_MIC['ALL_Odds_Ratio']
_new_INH_odds_ratios.min(), _new_INH_odds_ratios.max()

(1.039131671360849, 1.165863136905473)

In [98]:
# The newly graded rifampicin variants with their ALL-dataset odds ratios.
new_RIF_variants_no_MIC.loc[:, ['mutation', 'ALL_Odds_Ratio']]

Unnamed: 0,mutation,ALL_Odds_Ratio
0,rpoB_p.Ile491Leu,1.112158
1,rpoB_p.Phe424Val,1.091647
2,rpoB_p.Ser493Leu,1.088842
3,rpoB_p.Ile491Met,1.072449
4,rpoB_p.Ile491Thr,1.068486


# 15 Variants that are Assoc w R in WHO and Uncertain in ALL

In [22]:
# Variants graded Assoc w R in the WHO dataset but Uncertain in the ALL dataset.
# `.copy()` makes this an independent frame: a column is added below, and assigning onto a
# `.loc` slice of `results_final` is a chained assignment (its warning is hidden by the global filter).
WHO_AssocR_ALL_Uncertain = results_final.loc[(results_final['Initial confidence grading WHO dataset']=='Assoc w R') & (results_final['Initial confidence grading ALL dataset']=='Uncertain')].copy()

# How much higher the PPV lower bound is in the WHO dataset than in the ALL dataset.
WHO_AssocR_ALL_Uncertain['PPV_LB_diff_WHO_ALL'] = WHO_AssocR_ALL_Uncertain['WHO_R_PPV_LB'] - WHO_AssocR_ALL_Uncertain['ALL_R_PPV_LB']

len(WHO_AssocR_ALL_Uncertain)

15

In [25]:
# Final regression grading of the WHO-Assoc-w-R / ALL-Uncertain variants.
_discordant_regression_grading = WHO_AssocR_ALL_Uncertain['REGRESSION FINAL CONFIDENCE GRADING']
_discordant_regression_grading.value_counts()

REGRESSION FINAL CONFIDENCE GRADING
3) Uncertain significance    15
Name: count, dtype: int64

In [23]:
# Initial vs. final SOLO grading of the same variants, counted per combination.
_solo_grading_columns = ['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']
WHO_AssocR_ALL_Uncertain.loc[:, _solo_grading_columns].value_counts()

SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance        2) Assoc w R - Interim           5
                                 3) Uncertain significance        5
5) Not assoc w R                 4) Not assoc w R - Interim       2
1) Assoc w R                     2) Assoc w R - Interim           1
2) Assoc w R - Interim           2) Assoc w R - Interim           1
3) Uncertain significance        4) Not assoc w R - Interim       1
Name: count, dtype: int64

In [52]:
# Variants significant in the ALL dataset: BH p-value <= 0.05 for non-silent variants, <= 0.01 for silent ones.
_discordant_ALL_significant = WHO_AssocR_ALL_Uncertain.query("(ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)")
_discordant_ALL_significant.shape

(11, 69)

In [54]:
# As above, additionally requiring an ALL-dataset PPV lower bound below 0.25.
_discordant_significant_lowPPV = WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25")
_discordant_significant_lowPPV.shape

(7, 69)

In [69]:
# WHO-minus-ALL PPV lower-bound differences of the significant, low-PPV variants, in ascending order.
_discordant_significant_lowPPV = WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25")
np.sort(_discordant_significant_lowPPV['PPV_LB_diff_WHO_ALL'].values)

array([0.01842252, 0.06080397, 0.09590884, 0.15315811, 0.26316292,
       0.58097719, 0.59475568])

In [71]:
# PPV columns and isolate counts in each dataset for the significant, low-PPV variants,
# with the largest WHO-vs-ALL difference first.
_ppv_table_columns = [
    'Drug', 'mutation',
    'WHO_R_PPV_LB', 'WHO_R_PPV', 'WHO_Present_R', 'WHO_Present_S',
    'ALL_R_PPV_LB', 'ALL_R_PPV', 'ALL_Present_R', 'ALL_Present_S',
    'PPV_LB_diff_WHO_ALL',
]
_discordant_significant_lowPPV = WHO_AssocR_ALL_Uncertain.query("((ALL_BH_pval <= 0.05 & predicted_effect not in @silent_lst) | (ALL_BH_pval <= 0.01 & predicted_effect in @silent_lst)) & ALL_R_PPV_LB < 0.25")
_discordant_significant_lowPPV[_ppv_table_columns].sort_values('PPV_LB_diff_WHO_ALL', ascending=False)

Unnamed: 0,Drug,mutation,WHO_R_PPV_LB,WHO_R_PPV,WHO_Present_R,WHO_Present_S,ALL_R_PPV_LB,ALL_R_PPV,ALL_Present_R,ALL_Present_S,PPV_LB_diff_WHO_ALL
1091,Bedaquiline,Rv0678_p.Met146Thr,0.715086,1.0,11.0,0.0,0.12033,0.229167,11.0,37.0,0.594756
1089,Bedaquiline,mmpS5_c.-74G>T,0.769425,0.914286,32.0,3.0,0.188448,0.264463,32.0,89.0,0.580977
9184,Ethionamide,ethA_p.Asp396fs,0.421277,0.857143,6.0,1.0,0.158114,1.0,2.0,0.0,0.263163
2757,Clofazimine,Rv0678_p.Ile67fs,0.260191,0.5,9.0,9.0,0.107032,0.212766,10.0,37.0,0.153158
12398,Kanamycin,whiB7_c.-116A>G,0.317366,0.439394,29.0,37.0,0.221457,0.288235,49.0,121.0,0.095909
19770,Streptomycin,gid_p.Val65fs,0.283582,0.8,4.0,1.0,0.222778,0.666667,4.0,2.0,0.060804
8808,Ethionamide,ethA_p.Trp391*,0.257131,0.47619,10.0,11.0,0.238708,0.4,14.0,21.0,0.018423


# Neutral Variants

# 1. ALL Neutral, WHO not Neutral

In [132]:
# Boolean masks on the initial grade assigned in each dataset.
who_initial_grade = results_final['Initial confidence grading WHO dataset']
all_initial_grade = results_final['Initial confidence grading ALL dataset']
regression_final_grade = results_final['REGRESSION FINAL CONFIDENCE GRADING']

who_initial_neutral = who_initial_grade.eq('Neutral')
all_initial_neutral = all_initial_grade.eq('Neutral')

# Neutral in at least one dataset AND a final regression grade containing "Not assoc w R".
regression_neutral = results_final.loc[
    (who_initial_neutral | all_initial_neutral)
    & regression_final_grade.str.contains('Not assoc w R')
]

# Neutral in ALL but not in WHO; rows with no WHO grade at all are dropped.
ALL_neutral_only = (
    results_final
    .loc[who_initial_grade.ne('Neutral') & all_initial_neutral]
    .dropna(subset='Initial confidence grading WHO dataset')
)

# Neutral in WHO but not in ALL; rows with no ALL grade at all are dropped.
WHO_neutral_only = (
    results_final
    .loc[who_initial_neutral & all_initial_grade.ne('Neutral')]
    .dropna(subset='Initial confidence grading ALL dataset')
)

# these should NOT be Neutral: ALL Neutral, but null in WHO, meaning they are absent from the WHO dataset.
# Check that every such variant ends up with the Uncertain final grade.
all_neutral_missing_from_who_not_uncertain = results_final.loc[
    who_initial_grade.isna()
    & all_initial_neutral
    & regression_final_grade.ne('3) Uncertain significance')
]
assert len(all_neutral_missing_from_who_not_uncertain) == 0

print(f"{len(WHO_neutral_only)} variants Neutral in WHO only, {len(ALL_neutral_only)} variants Neutral in ALL only")

28 variants Neutral in WHO only, 53 variants Neutral in ALL only


In [133]:
# SOLO final grades of the variants that are Neutral in the ALL dataset only.
ALL_neutral_only.loc[:, 'SOLO FINAL CONFIDENCE GRADING'].value_counts()

SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance     33
4) Not assoc w R - Interim    16
5) Not assoc w R               4
Name: count, dtype: int64

# Do any Neutral Variants have Significant MIC associations?

## Not necessarily problematic: the MIC coefficient could reflect an association within the susceptible range

In [2]:
# Needs `regression_neutral` from the Neutral Variants cell above; the NameError
# recorded under this cell comes from running it before that name was defined.
MIC_results = pd.read_csv("../tables/MIC_results.csv")

# Attach the MIC regression results to the neutral variants, matching on drug and mutation.
regression_neutral_withMIC = pd.merge(regression_neutral, MIC_results, on=['Drug', 'mutation'])
print(regression_neutral_withMIC.shape)

# Number of neutral variants whose MIC association has a BH p-value <= 0.05
# (the following cell lists the matching rows).
print(len(regression_neutral_withMIC.query("MIC_BH_pval <= 0.05")))

NameError: name 'regression_neutral' is not defined

In [135]:
# Neutral variants with a significant MIC association (BH p-value <= 0.05),
# shown next to their WHO / ALL binary-regression statistics.
(
    regression_neutral_withMIC
    .query("MIC_BH_pval <= 0.05")
    .loc[:, [
        'Drug', 'mutation', 'MIC_coef', 'MIC_BH_pval',
        'WHO_Odds_Ratio', 'WHO_BH_pval', 'WHO_BH_neutral_pval', 'WHO_Present_SR',
        'ALL_Odds_Ratio', 'ALL_BH_pval', 'ALL_BH_neutral_pval', 'ALL_Present_S', 'ALL_Present_R',
    ]]
)

Unnamed: 0,Drug,mutation,MIC_coef,MIC_BH_pval,WHO_Odds_Ratio,WHO_BH_pval,WHO_BH_neutral_pval,WHO_Present_SR,ALL_Odds_Ratio,ALL_BH_pval,ALL_BH_neutral_pval,ALL_Present_S,ALL_Present_R
7,Clofazimine,Rv1979c_c.-129A>G,0.042054,0.0,1.003998,0.97375,0.048553,4868.0,1.025454,0.882918,0.751148,13818.0,617.0
55,Linezolid,rrl_n.2779A>G,0.026713,0.014727,0.984404,0.993,0.0,4.0,0.987573,0.989,0.0,6.0,0.0


# SOLO Not assoc w R, Regression Uncertain 

In [136]:
# Variants whose SOLO initial grade contains "Not assoc w R" but whose final
# regression grade is Uncertain.
regression_is_uncertain = results_final['REGRESSION FINAL CONFIDENCE GRADING'].eq('3) Uncertain significance')
solo_initial_not_assoc = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')

SOLO_notAssoc_regression_uncertain = results_final.loc[regression_is_uncertain & solo_initial_not_assoc]

print(f"{len(SOLO_notAssoc_regression_uncertain)} mutations Groups 4-5 by SOLO, Group 3 by Regression")

228 mutations Groups 4-5 by SOLO, Group 3 by Regression


In [137]:
# Split on the Reason column: rows marked 'Not Graded' versus everything else.
SOLO_notAssoc_regression_uncertain_ungraded = (
    SOLO_notAssoc_regression_uncertain
    .query("Reason=='Not Graded'")
)

SOLO_notAssoc_regression_uncertain_graded = (
    SOLO_notAssoc_regression_uncertain
    .query("Reason != 'Not Graded'")
)

print(f"{len(SOLO_notAssoc_regression_uncertain_ungraded)}/{len(SOLO_notAssoc_regression_uncertain)} mutations were not graded")

54/228 mutations were not graded


In [138]:
# Graded variants with ALL_Present_SR below 5.
(
    SOLO_notAssoc_regression_uncertain_graded
    .query("ALL_Present_SR < 5")
    .shape
)

(26, 67)

In [139]:
# Graded variants with ALL_Present_SR below 10.
(
    SOLO_notAssoc_regression_uncertain_graded
    .query("ALL_Present_SR < 10")
    .shape
)

(32, 67)

In [140]:
# Graded variants with ALL_Present_SR below 5 that also have ALL_BH_neutral_pval <= 0.05.
(
    SOLO_notAssoc_regression_uncertain_graded
    .query("ALL_Present_SR < 5 & ALL_BH_neutral_pval <= 0.05")
    .shape
)

(16, 67)

## 7 variants are marginally significant with OR < 1, p-value in (0.01, 0.05]

## all are synonymous variants

In [141]:
# Graded variants with OR < 1 and BH p-value <= 0.05 in the WHO or the ALL dataset;
# only the shape of the selected columns is displayed.
(
    SOLO_notAssoc_regression_uncertain_graded
    .query("(WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05)")
    .loc[:, [
        'Drug', 'mutation',
        'WHO_Odds_Ratio', 'WHO_BH_pval', 'WHO_S_PPV_LB', 'WHO_Present_SR',
        'ALL_Odds_Ratio', 'ALL_BH_pval', 'ALL_S_PPV_LB', 'ALL_Present_SR',
        'Initial confidence grading WHO dataset',
        'Initial confidence grading ALL dataset',
        'REGRESSION FINAL CONFIDENCE GRADING',
    ]]
    .shape
)

(7, 13)

In [142]:
# Distinct predicted effects among the variants with OR < 1 and BH p-value <= 0.05
# in either dataset.
(
    SOLO_notAssoc_regression_uncertain_graded
    .query("(WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05)")
    ['predicted_effect']
    .unique()
)

array(['synonymous_variant'], dtype=object)

In [143]:
# Remaining variants that meet the SOLO Neutral criterion but don't meet regression criteria
# for grading: ALL_Present_SR >= 5 and NOT (OR < 1 with BH p-value <= 0.05) in either dataset.
# NOTE(review): an earlier comment here said 139, the displayed merge has rows 0-139 (140 rows),
# and the tally in the next cell uses 141 -- confirm which count is intended.
df_search = (
    SOLO_notAssoc_regression_uncertain_graded
    .query("ALL_Present_SR >= 5 & ~((WHO_Odds_Ratio < 1 & WHO_BH_pval <= 0.05) | (ALL_Odds_Ratio < 1 & ALL_BH_pval <= 0.05))")
    .loc[:, ['mutation', 'Drug', 'WHO_Odds_Ratio', 'WHO_BH_pval', 'ALL_Odds_Ratio', 'ALL_BH_pval']]
    .rename(columns={'mutation': 'variant', 'Drug': 'drug'})
)

# Join onto who_variants (on the columns the two frames share) and show the
# regression statistics next to the who_variants columns selected below.
who_variants.merge(df_search).loc[:, [
    'drug', 'variant',
    'WHO_Odds_Ratio', 'WHO_BH_pval', 'ALL_Odds_Ratio', 'ALL_BH_pval',
    'ALL_Present_S', 'ALL_Present_R',
    'WHO_PPV_ub', 'ALL_PPV_ub',
    'WHO_setB',
    'WHO_setC',
    'WHO_setD1',
    'WHO_setD2',
]]

Unnamed: 0,drug,variant,WHO_Odds_Ratio,WHO_BH_pval,ALL_Odds_Ratio,ALL_BH_pval,ALL_Present_S,ALL_Present_R,WHO_PPV_ub,ALL_PPV_ub,WHO_setB,WHO_setC,WHO_setD1,WHO_setD2
0,Amikacin,eis_c.-10G>A,1.031197,0.583333,1.074510,0.011377,369.0,22.0,8.30%,8.40%,1.0,1.0,0.0,0.0
1,Amikacin,eis_c.-12C>T,0.970624,0.742642,1.031827,0.665283,917.0,64.0,8.79%,8.25%,1.0,1.0,0.0,0.0
2,Amikacin,eis_p.Val163Ile,1.018560,0.896000,1.022127,0.924328,711.0,53.0,10.54%,8.98%,1.0,1.0,0.0,0.0
3,Amikacin,rrs_n.492C>T,0.982933,0.896000,0.958186,0.924328,785.0,12.0,3.73%,2.62%,1.0,1.0,0.0,0.0
4,Amikacin,rrs_n.514A>C,1.086014,0.090435,1.079965,0.022032,577.0,329.0,34.26%,39.54%,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Streptomycin,rpsL_c.-125G>C,0.987502,0.571079,1.005987,0.631913,84.0,40.0,20.35%,41.24%,1.0,1.0,0.0,0.0
137,Streptomycin,rrs_n.492C>T,0.963921,0.410257,1.007857,0.631913,571.0,81.0,13.42%,15.20%,0.0,0.0,0.0,0.0
138,Streptomycin,Rv1258c_p.Glu194fs,1.005222,0.634835,0.955778,0.408522,3296.0,5120.0,63.23%,61.88%,1.0,1.0,0.0,0.0
139,Streptomycin,whiB7_c.-100T>C,1.016152,0.500737,1.018207,0.439115,135.0,84.0,34.48%,45.15%,1.0,1.0,0.0,0.0


In [144]:
# total: the four groups should add up to the 228 variants that are
# Groups 4-5 by SOLO but Group 3 by regression
(
    141    # remaining graded variants (presumably the df_search group above -- confirm)
    + 7    # OR < 1 with BH p-value <= 0.05 in either dataset
    + 26   # ALL_Present_SR < 5
    + 54   # not graded
)

228

# Not assoc w R by Literature

In [145]:
# Keep only the literature "Not assoc w R" variants that also appear in results_final
# (inner join on the columns the two frames share).
# NOTE: this overwrites not_assocR_literature with the merged frame, so re-running
# the cell merges the already-merged frame again.
not_assocR_literature = pd.merge(not_assocR_literature, results_final, how='inner')

print(len(not_assocR_literature))
print(not_assocR_literature['REGRESSION FINAL CONFIDENCE GRADING'].value_counts())

28
REGRESSION FINAL CONFIDENCE GRADING
3) Uncertain significance     27
4) Not assoc w R - Interim     1
Name: count, dtype: int64


In [155]:
# Display not_assocR_literature (already merged with results_final when the cells are run in order).
not_assocR_literature

Unnamed: 0,Drug,mutation,gene,predicted_effect,WHO_Odds_Ratio,WHO_pval,WHO_BH_pval,WHO_neutral_pval,WHO_BH_neutral_pval,WHO_LRT_pval,...,ALL_Spec_LB,ALL_Spec_UB,Initial confidence grading ALL dataset,REGRESSION FINAL CONFIDENCE GRADING,Reason,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING,REGRESSION GRADING + LOF UPGRADE,REGRESSION + GRADING RULES,variant_category
0,Amikacin,whiB7_c.-178C>T,whiB7,upstream_gene_variant,0.958454,0.246,0.742642,0.67,0.871545,0.7246318,...,0.993913,0.995868,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,c.-178C>T
1,Bedaquiline,mmpL5_p.Ile948Val,mmpL5,missense_variant,0.948618,0.119,0.210974,0.753,1.0,0.2755944,...,0.003131,0.005432,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Ile948Val
2,Bedaquiline,mmpL5_p.Thr794Ile,mmpL5,missense_variant,1.145619,0.0,0.0,1.0,1.0,0.1317292,...,0.464525,0.481791,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Thr794Ile
3,Bedaquiline,mmpS5_c.-74G>T,mmpS5,upstream_gene_variant,1.394573,0.0,0.0,1.0,1.0,3.124144e-14,...,0.991557,0.994482,Uncertain,3) Uncertain significance,Downgrade to Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,c.-74G>T
4,Ethambutol,embA_p.Ala813Gly,embA,missense_variant,1.019164,0.019,0.18091,0.948,1.0,0.9031229,...,0.997956,0.998817,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Ala813Gly
5,Ethambutol,embA_p.Pro639Ser,embA,missense_variant,1.00805,0.241,0.839147,0.096,0.174206,0.7817524,...,0.997859,0.998742,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Pro639Ser
6,Ethambutol,embA_p.Val468Ala,embA,missense_variant,1.03491,0.298,0.839147,0.461,0.615681,0.6293708,...,0.997859,0.998742,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Val468Ala
7,Ethambutol,embB_c.660G>A,embB,synonymous_variant,1.012574,0.371,0.839097,0.226,0.377979,1.0,...,0.996805,0.997904,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,c.660G>A
8,Ethambutol,embB_p.Gly156Cys,embB,missense_variant,1.019164,0.019,0.18091,0.948,1.0,0.9031229,...,0.997956,0.998817,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Gly156Cys
9,Ethambutol,embB_p.Ser1054Pro,embB,missense_variant,0.984675,0.236,0.839147,0.562,0.714417,0.6390246,...,0.998379,0.999135,Uncertain,3) Uncertain significance,Both Uncertain,5) Not assoc w R,4) Not assoc w R - Interim,3) Uncertain significance,3) Uncertain significance,p.Ser1054Pro


# Variants Graded Neutral by Regression

In [151]:
# Variants graded Neutral in the WHO or the ALL dataset whose final regression
# grade is not Uncertain.
#
# The two dataset conditions are OR-ed inside their own parentheses before the
# final-grade filter is applied. Without that grouping, `&` binds tighter than
# `|`, so the expression reads as  WHO-Neutral | (ALL-Neutral & not-Uncertain)
# and WHO-Neutral rows would bypass the final-grade filter.
neutral_variants = results_final.loc[
    (
        (results_final['Initial confidence grading WHO dataset'] == 'Neutral')
        | (results_final['Initial confidence grading ALL dataset'] == 'Neutral')
    )
    & (results_final['REGRESSION FINAL CONFIDENCE GRADING'] != '3) Uncertain significance')
]

len(neutral_variants)

91

In [154]:
# SOLO final grades of the regression-neutral variants.
# gyrA_p.Arg252Leu is also from the literature
neutral_variants.loc[:, 'SOLO FINAL CONFIDENCE GRADING'].value_counts()

SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance     48
4) Not assoc w R - Interim    36
5) Not assoc w R               7
Name: count, dtype: int64

In [156]:
# Display the neutral_variants frame (notebook repr truncates long frames).
neutral_variants

Unnamed: 0,Drug,gene,mutation,predicted_effect,WHO_Odds_Ratio,WHO_pval,WHO_BH_pval,WHO_neutral_pval,WHO_BH_neutral_pval,WHO_LRT_pval,...,ALL_Spec_LB,ALL_Spec_UB,Initial confidence grading ALL dataset,REGRESSION FINAL CONFIDENCE GRADING,Reason,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING,REGRESSION GRADING + LOF UPGRADE,REGRESSION + GRADING RULES,variant_category
84,Amikacin,rrs,rrs_n.1077dupT,non_coding_transcript_exon_variant,1.024186,0.143,0.486809,0.193,0.346966,0.208234,...,0.999518,0.999949,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,n.1077dupT
89,Amikacin,rrs,rrs_n.517C>T,non_coding_transcript_exon_variant,1.008540,0.424,0.896000,0.149,0.287229,0.604213,...,0.976974,0.980880,Neutral,4) Not assoc w R - Interim,ALL Neutral only,5) Not assoc w R,5) Not assoc w R,4) Not assoc w R - Interim,4) Not assoc w R - Interim,n.517C>T
667,Amikacin,eis,eis_c.42G>C,synonymous_variant,1.000056,0.434,0.898000,0.002,0.004861,0.908762,...,0.998206,0.999200,Uncertain,5) Not assoc w R,WHO Neutral only,3) Uncertain significance,4) Not assoc w R - Interim,5) Not assoc w R,5) Not assoc w R,c.42G>C
1271,Bedaquiline,mmpL5,mmpL5_p.Leu219Phe,missense_variant,1.062927,0.000,0.000000,1.000,1.000000,0.515114,...,0.999324,0.999952,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,p.Leu219Phe
1447,Bedaquiline,mmpL5,mmpL5_c.2079C>G,synonymous_variant,0.969590,0.365,0.485293,0.457,1.000000,0.677829,...,0.999099,0.999875,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,4) Not assoc w R - Interim,c.2079C>G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20253,Streptomycin,rrs,rrs_n.1012A>G,non_coding_transcript_exon_variant,1.006574,0.322,0.633719,0.354,0.722562,0.703999,...,0.998820,0.999716,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,n.1012A>G
20257,Streptomycin,gid,gid_p.Ala10Val,missense_variant,0.990109,0.315,0.633719,0.387,0.780489,0.742519,...,0.999098,0.999848,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,p.Ala10Val
20356,Streptomycin,whiB7,whiB7_c.-215delC,upstream_gene_variant,1.008081,0.165,0.465448,0.350,0.716796,1.000000,...,0.999501,0.999983,Neutral,4) Not assoc w R - Interim,ALL Neutral only,3) Uncertain significance,3) Uncertain significance,4) Not assoc w R - Interim,4) Not assoc w R - Interim,c.-215delC
20447,Streptomycin,rpsL,rpsL_c.117C>T,synonymous_variant,0.999750,0.508,0.633367,0.014,0.034712,0.964704,...,0.996940,0.998534,Uncertain,5) Not assoc w R,WHO Neutral only,3) Uncertain significance,4) Not assoc w R - Interim,5) Not assoc w R,5) Not assoc w R,c.117C>T
