In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
import scipy.stats as st
import statsmodels
import statsmodels.api as sm
from functools import reduce

import glob, os, yaml, subprocess, itertools, sparse, sys, statsmodels, shutil
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Root directory of the analysis data; joined with per-drug sub-paths further down the notebook.
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

# Combined lineage table; its shape is printed as a quick load check.
lineages = pd.read_csv("../lineages/combined_lineages_samples.csv")
print(lineages.shape)
import collections, warnings
# NOTE(review): this silences every warning for the rest of the session, including any that would flag real problems.
warnings.filterwarnings("ignore")

# Put the "utils" directory that sits next to the current working directory on the import path
# so the two project modules below can be imported.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "utils"))
# NOTE(review): star imports hide where helper names come from; add_grading_rules_regression
# (called later in this cell) is presumably defined in one of these two modules — confirm.
from stats_utils import *
from data_utils import *

# Lineage SNP scheme: rename the "#lineage" column to "Lineage" and strip the literal
# "lineage" text from its values.
coll2014 = pd.read_csv("../data/coll2014_SNP_scheme.tsv", sep="\t").rename(columns={"#lineage": "Lineage"})
coll2014["Lineage"] = coll2014["Lineage"].str.replace("lineage", "")

# The part after the first '/' in allele_change is stored as the nucleotide.
coll2014['nucleotide'] = coll2014['allele_change'].map(lambda change: change.split('/')[1]).tolist()

# Full drug name -> abbreviation. Insertion order is kept because drugs_lst is built from the keys.
drug_abbr_dict = dict(
    Delamanid="DLM",
    Bedaquiline="BDQ",
    Clofazimine="CFZ",
    Ethionamide="ETO",
    Linezolid="LZD",
    Moxifloxacin="MXF",
    Capreomycin="CAP",
    Amikacin="AMK",
    Pretomanid="PMD",
    Pyrazinamide="PZA",
    Kanamycin="KAN",
    Levofloxacin="LFX",
    Streptomycin="STM",
    Ethambutol="EMB",
    Isoniazid="INH",
    Rifampicin="RIF",
)

cc_df = pd.read_csv("../data/drug_CC.csv")

# who_variants_V1 = pd.read_excel("../results/WHO-catalog-V1.xlsx", sheet_name='Mutation_catalogue')
# WHO catalogue V2 restricted to tier 1 rows; its own 'mutation' column is dropped, which lets
# later cells rename 'variant' to 'mutation' without a name clash.
who_variants = (
    pd.read_csv("../results/WHO-catalog-V2.csv", header=[2])
    .query("tier==1")
    .reset_index(drop=True)
    .drop(columns='mutation')
)

results_final = pd.read_csv("../results/Regression_Final_April2024_Tier1.csv")

# Literature list, with column names aligned to results_final for merging.
not_assocR_literature = pd.read_excel("../data/NotAwR by literature.xlsx", sheet_name=0)
not_assocR_literature = not_assocR_literature.rename(columns={'drug': 'Drug', 'variant': 'mutation'})

drug_gene_mapping = pd.read_csv("../data/drug_gene_mapping.csv")

# predicted_effect values treated as silent variants in the cells below.
silent_lst = ['synonymous_variant', 'initiator_codon_variant', 'stop_retained_variant']

drugs_lst = list(drug_abbr_dict)
sample_ids_mapping = pd.read_csv("../data/sample_ids_mapping_20220922.csv")

# add regression + grading rules gradings
results_final = add_grading_rules_regression(results_final)

(52567, 10)


# Overview of Regression Gradings

In [2]:
regression_grading_cols_lst = ['REGRESSION FINAL CONFIDENCE GRADING', 'REGRESSION + GRADING RULES']
solo_cols_lst = ['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']

# Row order used for the regression table.
regression_grading_order = ['Assoc w R', 'Assoc w R - Interim', 'Uncertain', 'Assoc w S - Interim', 'Assoc w S', 'Neutral']

# One single-column frame of counts per grading column, indexed by 'Grading',
# then placed side by side.
regression_totals_df = pd.concat(
    [
        results_final[col].value_counts().rename_axis('Grading').to_frame(col)
        for col in regression_grading_cols_lst
    ],
    axis=1,
).loc[regression_grading_order]

solo_totals_df = pd.concat(
    [
        results_final[col].value_counts().rename_axis('Grading').to_frame(col)
        for col in solo_cols_lst
    ],
    axis=1,
).sort_index()

In [3]:
# Display the per-grading counts for the two regression grading columns.
regression_totals_df

Unnamed: 0_level_0,REGRESSION FINAL CONFIDENCE GRADING,REGRESSION + GRADING RULES
Grading,Unnamed: 1_level_1,Unnamed: 2_level_1
Assoc w R,783,641
Assoc w R - Interim,336,1200
Uncertain,20225,14599
Assoc w S - Interim,21,5000
Assoc w S,14,14
Neutral,210,135


In [34]:
# Display the per-grading counts for the SOLO initial and final grading columns.
solo_totals_df

Unnamed: 0_level_0,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
Grading,Unnamed: 1_level_1,Unnamed: 2_level_1
1) Assoc w R,311,253
2) Assoc w R - Interim,146,1130
3) Uncertain significance,20835,14958
4) Not assoc w R - Interim,19,4998
5) Not assoc w R,278,250


In [38]:
# Counts of the combined regression + grading-rules grading, excluding silent variants.
(
    results_final
    .loc[~results_final['predicted_effect'].isin(silent_lst), 'REGRESSION + GRADING RULES']
    .value_counts()
)

REGRESSION + GRADING RULES
Uncertain              14545
Assoc w R - Interim     1200
Assoc w R                641
Neutral                  125
Assoc w S - Interim       46
Assoc w S                 14
Name: count, dtype: int64

In [17]:
# For every grading column: per-drug counts of non-LoF variants graded "Assoc w R"
# (including interim) for the four drugs named in the query.
rassoc_counts_by_col = []

for col in regression_grading_cols_lst + solo_cols_lst:
    is_R_assoc = results_final[col].str.contains('Assoc w R')
    non_lof_subset = results_final.loc[is_R_assoc].query("predicted_effect != 'LoF' & Drug in ['Bedaquiline', 'Clofazimine', 'Delamanid', 'Linezolid']")
    per_drug_counts = non_lof_subset['Drug'].value_counts().to_frame().rename(columns={'count': col})
    rassoc_counts_by_col.append(per_drug_counts)

new_drugs_Rassoc_df = pd.concat(rassoc_counts_by_col, axis=1)

In [18]:
# Display the per-drug "Assoc w R" counts, one column per grading scheme.
new_drugs_Rassoc_df

Unnamed: 0_level_0,REGRESSION FINAL CONFIDENCE GRADING,REGRESSION + GRADING RULES,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
Drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bedaquiline,39,95,23,86
Clofazimine,27,74,2,57
Delamanid,3,25,1,24
Linezolid,3,8,2,8


# Results Section A

In [19]:
# Groups 1 and 2
# Number of variants whose grading contains "Assoc w R", per grading scheme.
for label, grading_col in (
    ("Regression", 'REGRESSION FINAL CONFIDENCE GRADING'),
    ("DA", 'SOLO INITIAL CONFIDENCE GRADING'),
    ("SOLO", 'SOLO FINAL CONFIDENCE GRADING'),
):
    n_R_assoc = len(results_final.loc[results_final[grading_col].str.contains('Assoc w R')])
    print(f"{label}: {n_R_assoc}")

Regression: 1119
DA: 457
SOLO: 1383


In [36]:
# Derive both counts from the data instead of retyping numbers printed by an earlier cell,
# so the comparison cannot silently go stale when the results file changes.
n_regression_R = len(results_final.loc[results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')])
n_DA_R = len(results_final.loc[results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')])

# (absolute increase of regression over DA, increase relative to the DA count)
n_regression_R - n_DA_R, (n_regression_R - n_DA_R) / n_DA_R

(662, 1.4485776805251642)

In [15]:
# Groups 1 and 2, without silent variants
not_silent = ~results_final['predicted_effect'].isin(silent_lst)

for label, grading_col in (
    ("Regression", 'REGRESSION FINAL CONFIDENCE GRADING'),
    ("DA", 'SOLO INITIAL CONFIDENCE GRADING'),
    ("SOLO", 'SOLO FINAL CONFIDENCE GRADING'),
):
    n_R_assoc = len(results_final.loc[not_silent & results_final[grading_col].str.contains('Assoc w R')])
    print(f"{label}: {n_R_assoc}")

Regression: 1118
DA: 457
SOLO: 1383


In [2]:
# Silent variants that the regression grading nevertheless calls "Assoc w R" (incl. interim).
is_silent = results_final['predicted_effect'].isin(silent_lst)
regression_R_assoc = results_final['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')

silent_variants_assocR = results_final.loc[is_silent & regression_R_assoc, 'mutation'].values
len(silent_variants_assocR), silent_variants_assocR

(1, array(['katG_c.12A>G'], dtype=object))

In [None]:
# Isoniazid phenotypes and genotypes, both read from the drug's folder under analysis_dir.
isoniazid_dir = os.path.join(analysis_dir, "Isoniazid")

INH_phenos = pd.read_csv(os.path.join(isoniazid_dir, "phenos_binary.csv"))

# Only the columns needed below are read from the genotype file.
INH_genos = pd.read_csv(
    os.path.join(isoniazid_dir, "genos_1.csv.gz"),
    compression='gzip',
    usecols=['sample_id', 'resolved_symbol', 'variant_category', 'variant_allele_frequency'],
)

# INH_genos['mutation'] = INH_genos['resolved_symbol'] + '_' + INH_genos['variant_category']

In [None]:
mutation = 'katG_c.12A>G' #'mmpL5_c.2889G>A'
# Split on the first underscore only. HGVS range notation (e.g. "c.1_2del") puts underscores
# inside the variant part, and an unbounded split would then fail the two-name unpacking.
gene, variant = mutation.split('_', 1)

# Samples carrying this variant at an allele frequency above 0.75.
samples_with_mutation = INH_genos.query("resolved_symbol==@gene & variant_category==@variant & variant_allele_frequency > 0.75").sample_id.values
len(samples_with_mutation)

In [None]:
# Number of samples carrying the variant, and the sum of their 'phenotype' values.
phenos_with_mutation = INH_phenos.loc[INH_phenos['sample_id'].isin(samples_with_mutation), 'phenotype']
len(samples_with_mutation), phenos_with_mutation.sum()

In [None]:
# Other variants (allele frequency > 0.75) found in the samples that carry the variant of interest.
co_occurring_variants = INH_genos.query("~(resolved_symbol==@gene & variant_category==@variant) & variant_allele_frequency > 0.75 & sample_id in @samples_with_mutation")

# INH_genos is loaded without a 'mutation' column, so build the "<gene>_<variant>" names here
# instead of reading a column that does not exist (which raised AttributeError).
muts_lst = (co_occurring_variants['resolved_symbol'] + '_' + co_occurring_variants['variant_category']).values

# WHO catalogue SOLO counts and gradings for those co-occurring variants.
who_variants.query("drug=='Isoniazid' & variant in @muts_lst")[['drug', 'variant', 'Present_SOLO_R', 'Present_SOLO_S', 'INITIAL CONFIDENCE GRADING', 'FINAL CONFIDENCE GRADING']]

In [5]:
# (label, grading column, pattern) for each printed count, in output order.
grading_checks = (
    # Groups 4-6
    ("Regression", 'REGRESSION FINAL CONFIDENCE GRADING', 'Assoc w S|Neutral'),
    ("DA", 'SOLO INITIAL CONFIDENCE GRADING', 'Not assoc w R'),
    ("SOLO", 'SOLO FINAL CONFIDENCE GRADING', 'Not assoc w R'),
    # Groups 4-5
    ("Regression", 'REGRESSION FINAL CONFIDENCE GRADING', 'Assoc w S'),
)

for label, grading_col, pattern in grading_checks:
    n_matching = len(results_final.loc[results_final[grading_col].str.contains(pattern)])
    print(f"{label}: {n_matching}")

Regression: 245
DA: 297
SOLO: 5248
Regression: 35


In [23]:
# Groups 4-6, without silent variants
not_silent = ~results_final['predicted_effect'].isin(silent_lst)

for label, grading_col, pattern in (
    ("Regression", 'REGRESSION FINAL CONFIDENCE GRADING', 'Assoc w S|Neutral'),
    ("DA", 'SOLO INITIAL CONFIDENCE GRADING', 'Not assoc w R'),
    ("SOLO", 'SOLO FINAL CONFIDENCE GRADING', 'Not assoc w R'),
):
    n_matching = len(results_final.loc[not_silent & results_final[grading_col].str.contains(pattern)])
    print(f"{label}: {n_matching}")

Regression: 170
DA: 230
SOLO: 230


In [24]:
# Variants the initial SOLO grading calls "Assoc w R" but regression leaves "Uncertain".
regression_uncertain = results_final['REGRESSION FINAL CONFIDENCE GRADING'].eq('Uncertain')
da_R_assoc = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')

results_final.loc[
    regression_uncertain & da_R_assoc,
    ['Drug', 'mutation', 'WHO_Present_R', 'WHO_Present_S', 'Initial confidence grading WHO dataset', 'Initial confidence grading ALL dataset'],
]

Unnamed: 0,Drug,mutation,WHO_Present_R,WHO_Present_S,Initial confidence grading WHO dataset,Initial confidence grading ALL dataset
17266,Pyrazinamide,pncA_p.Asp49fs,1.0,1.0,Uncertain,Assoc w R - Interim
17430,Pyrazinamide,pncA_p.Thr22fs,2.0,0.0,Assoc w R - Interim,Uncertain


## Which R- and S-associated variants does regression find, DA misses, and SOLO upgrades to interim?

In [25]:
# mutations that HAD to be upgraded to Groups 1 and 2 using rules:
# not "Assoc w R" in the initial SOLO grading, but "Assoc w R" in the final SOLO grading
solo_initial_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Assoc w R')
solo_final_R = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')

DAmiss_SOLOupgrade_R = results_final.loc[~solo_initial_R & solo_final_R]

# Sanity check against the totals: final-minus-initial "Assoc w R" counts, computed from the
# data rather than retyped from an earlier output so it cannot go stale.
print(len(DAmiss_SOLOupgrade_R), int(solo_final_R.sum()) - int(solo_initial_R.sum()))

# Of those upgraded variants, the ones regression also grades "Assoc w R" (incl. interim).
regression_Rassoc_DAmiss_SOLOupgrade_R = DAmiss_SOLOupgrade_R.loc[DAmiss_SOLOupgrade_R['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('Assoc w R')]
print(len(regression_Rassoc_DAmiss_SOLOupgrade_R))

926 926
202


In [26]:
# mutations that HAD to be moved into the "Not assoc w R" SOLO groups
# ("4) Not assoc w R - Interim" and "5) Not assoc w R") using rules:
# not "Not assoc w R" in the initial SOLO grading, but "Not assoc w R" in the final SOLO grading
solo_initial_notR = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
solo_final_notR = results_final['SOLO FINAL CONFIDENCE GRADING'].str.contains('Not assoc w R')

DAmiss_SOLOupgrade_S = results_final.loc[~solo_initial_notR & solo_final_notR]

# Sanity check against the totals: final-minus-initial "Not assoc w R" counts, computed from
# the data rather than retyped from an earlier output so it cannot go stale.
print(len(DAmiss_SOLOupgrade_S), int(solo_final_notR.sum()) - int(solo_initial_notR.sum()))

# Of those variants, the ones regression grades "Assoc w S" (incl. interim) or "Neutral".
# NOTE(review): the variable name says "Rassoc" but the filter is S-associated/Neutral; the name
# is kept because later cells may reference it.
regression_Rassoc_DAmiss_SOLOupgrade_S = DAmiss_SOLOupgrade_S.loc[DAmiss_SOLOupgrade_S['REGRESSION FINAL CONFIDENCE GRADING'].str.contains('|'.join(['Assoc w S', 'Neutral']))]
print(len(regression_Rassoc_DAmiss_SOLOupgrade_S))

4951 4951
64


## Look into the cases where regression grades a variant as R-associated but the initial SOLO grading (DA) misses it

In [42]:
# How often each (regression, SOLO initial, SOLO final) grading combination occurs.
regression_Rassoc_DAmiss_SOLOupgrade_R.value_counts(
    subset=['REGRESSION FINAL CONFIDENCE GRADING', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']
)

REGRESSION FINAL CONFIDENCE GRADING  SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
Assoc w R                            3) Uncertain significance        2) Assoc w R - Interim           103
Assoc w R - Interim                  3) Uncertain significance        2) Assoc w R - Interim            96
Assoc w R                            3) Uncertain significance        1) Assoc w R                       3
Name: count, dtype: int64

In [43]:
# Attach the WHO catalogue columns to each (Drug, mutation) pair; the catalogue's
# 'drug'/'variant' columns are renamed so both frames share the join keys.
who_variants_renamed = who_variants.rename(columns={'drug': 'Drug', 'variant': 'mutation'})
regression_Rassoc_DAmiss_SOLOupgrade_R_SOLO_results = (
    regression_Rassoc_DAmiss_SOLOupgrade_R[['Drug', 'mutation']]
    .merge(who_variants_renamed, on=['Drug', 'mutation'])
)

len(regression_Rassoc_DAmiss_SOLOupgrade_R_SOLO_results)

202

In [45]:
# (all rows, rows with Present_SOLO_SR == 0, rows with Present_SOLO_SR < 5)
solo_results = regression_Rassoc_DAmiss_SOLOupgrade_R_SOLO_results
n_total = len(solo_results)
n_absent_solo = len(solo_results.loc[solo_results['Present_SOLO_SR'] == 0])
n_below_five = len(solo_results.loc[solo_results['Present_SOLO_SR'] < 5])

n_total, n_absent_solo, n_below_five

(202, 3, 126)

In [47]:
# Fraction of these variants with Present_SOLO_SR < 5, computed from the frame itself
# instead of retyping the two counts printed by the previous cell.
solo_results = regression_Rassoc_DAmiss_SOLOupgrade_R_SOLO_results
len(solo_results.query("Present_SOLO_SR < 5")) / len(solo_results)

0.6237623762376238

In [46]:
# Rows with Present_SOLO_SR >= 5: how many there are, and the breakdown of OR_SOLO_FE_sig.
solo_at_least_five = regression_Rassoc_DAmiss_SOLOupgrade_R_SOLO_results.query("Present_SOLO_SR >= 5")
len(solo_at_least_five), solo_at_least_five.OR_SOLO_FE_sig.value_counts()

(76,
 OR_SOLO_FE_sig
 False    71
 True      5
 Name: count, dtype: int64)

# Neutral Variants

In [61]:
# 28 Tier 1 variants
# Step 1: keep every literature variant (right join) together with its regression results.
literature_with_regression = results_final.merge(not_assocR_literature, on=['Drug', 'mutation'], how='right')

# Step 2: add the WHO catalogue presence counts for the same (Drug, mutation) pairs.
who_presence_counts = who_variants.rename(columns={'drug': 'Drug', 'variant': 'mutation'})[['Drug', 'mutation', 'Present_SOLO_S', 'Present_SOLO_R', 'Present_S', 'Present_R']]
neutral_literature_regression_results = literature_with_regression.merge(who_presence_counts, on=['Drug', 'mutation'])

neutral_literature_regression_results['REGRESSION FINAL CONFIDENCE GRADING'].value_counts()

REGRESSION FINAL CONFIDENCE GRADING
Uncertain    19
Neutral       8
Assoc w R     1
Name: count, dtype: int64

In [42]:
# Export the literature variants that regression grades "Neutral", ordered by drug and then by
# descending ALL_Odds_Ratio.
regression_neutral = neutral_literature_regression_results.loc[
    neutral_literature_regression_results['REGRESSION FINAL CONFIDENCE GRADING'] == 'Neutral'
]
regression_neutral_sorted = regression_neutral.sort_values(["Drug", "ALL_Odds_Ratio"], ascending=[True, False])

export_columns = ['Drug', 'mutation', 'WHO_Odds_Ratio', 'ALL_Odds_Ratio', 'Present_S', 'Present_R']
regression_neutral_sorted[export_columns].to_csv("../results/neutral_vars_OR.csv", index=False)

In [50]:
# Literature variants ordered by drug, then by regression grading in descending order.
overview_columns = ['Drug', 'mutation', 'WHO_Odds_Ratio', 'WHO_BH_neutral_pval', 'WHO_BH_LRT_neutral_pval', 'SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']

(
    neutral_literature_regression_results
    .sort_values(["Drug", "REGRESSION FINAL CONFIDENCE GRADING"], ascending=[True, False])
    [overview_columns]
)

Unnamed: 0,Drug,mutation,WHO_Odds_Ratio,WHO_BH_neutral_pval,WHO_BH_LRT_neutral_pval,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING
0,Amikacin,whiB7_c.-178C>T,0.950986,0.884856,0.585905,5) Not assoc w R,4) Not assoc w R - Interim
1,Bedaquiline,mmpL5_p.Ile948Val,0.936788,1.0,1.0,5) Not assoc w R,4) Not assoc w R - Interim
2,Bedaquiline,mmpL5_p.Thr794Ile,1.096449,1.0,1.0,5) Not assoc w R,4) Not assoc w R - Interim
3,Bedaquiline,mmpS5_c.-74G>T,1.40935,1.0,1.0,5) Not assoc w R,4) Not assoc w R - Interim
4,Ethambutol,embA_p.Ala813Gly,1.015761,1.0,0.0,5) Not assoc w R,4) Not assoc w R - Interim
6,Ethambutol,embA_p.Val468Ala,1.03959,0.645634,0.460355,5) Not assoc w R,4) Not assoc w R - Interim
8,Ethambutol,embB_p.Gly156Cys,1.015761,1.0,0.0,5) Not assoc w R,4) Not assoc w R - Interim
9,Ethambutol,embB_p.Ser1054Pro,0.975779,0.974543,0.698541,5) Not assoc w R,4) Not assoc w R - Interim
10,Ethambutol,embB_p.Val668Ile,0.988786,0.575934,0.168534,5) Not assoc w R,4) Not assoc w R - Interim
11,Ethambutol,embC_c.-1188C>T,1.030252,0.605246,0.0,5) Not assoc w R,4) Not assoc w R - Interim


In [62]:
# Cross-tabulated counts of the initial gradings from the WHO and ALL datasets.
neutral_literature_regression_results.value_counts(
    subset=['Initial confidence grading WHO dataset', 'Initial confidence grading ALL dataset']
)

Initial confidence grading WHO dataset  Initial confidence grading ALL dataset
Uncertain                               Uncertain                                 10
Neutral                                 Neutral                                    4
                                        Uncertain                                  4
                                        Assoc w R - Interim                        3
Uncertain                               Neutral                                    3
Assoc w R                               Assoc w R - Interim                        1
Assoc w R - Interim                     Neutral                                    1
                                        Uncertain                                  1
Assoc w S - Interim                     Uncertain                                  1
Name: count, dtype: int64

In [63]:
# Literature variants graded "Uncertain" on the WHO dataset but "Neutral" on the ALL dataset.
who_uncertain = neutral_literature_regression_results['Initial confidence grading WHO dataset'].eq('Uncertain')
all_neutral = neutral_literature_regression_results['Initial confidence grading ALL dataset'].eq('Neutral')

neutral_literature_regression_results.loc[who_uncertain & all_neutral]

Unnamed: 0,mutation,predicted_effect,WHO_Odds_Ratio,WHO_pval,WHO_BH_pval,WHO_neutral_pval,WHO_BH_neutral_pval,WHO_LRT_pval,WHO_BH_LRT_pval,WHO_LRT_neutral_pval,...,REGRESSION FINAL CONFIDENCE GRADING,Reason,Drug,SOLO INITIAL CONFIDENCE GRADING,SOLO FINAL CONFIDENCE GRADING,REGRESSION + GRADING RULES,Present_SOLO_S,Present_SOLO_R,Present_S,Present_R
6,embA_p.Val468Ala,missense_variant,1.03959,0.309,0.837759,0.492,0.645634,0.653341,1.0,0.346659,...,Uncertain,Neutral in ALL,Ethambutol,5) Not assoc w R,4) Not assoc w R - Interim,Assoc w S - Interim,0.0,0.0,58.0,16.0
16,katG_c.-354C>T,upstream_gene_variant,0.994494,0.366,0.55618,0.276,0.583555,0.853434,0.886362,0.146566,...,Uncertain,Neutral in ALL,Isoniazid,5) Not assoc w R,4) Not assoc w R - Interim,Assoc w S - Interim,1.0,0.0,76.0,4.0
20,whiB7_c.-178C>T,upstream_gene_variant,0.973385,0.222,0.562626,0.602,0.861483,0.844293,1.0,0.155707,...,Uncertain,Neutral in ALL,Kanamycin,5) Not assoc w R,4) Not assoc w R - Interim,Assoc w S - Interim,91.0,1.0,115.0,4.0


In [71]:
# Size of the set graded "Uncertain" on the WHO dataset but "Neutral" on the ALL dataset.
who_uncertain = results_final['Initial confidence grading WHO dataset'].eq('Uncertain')
all_neutral = results_final['Initial confidence grading ALL dataset'].eq('Neutral')

results_final.loc[who_uncertain & all_neutral].shape

(153, 62)

In [67]:
# SOLO initial/final grading combinations among the WHO-"Uncertain" / ALL-"Neutral" variants.
who_uncertain = results_final['Initial confidence grading WHO dataset'].eq('Uncertain')
all_neutral = results_final['Initial confidence grading ALL dataset'].eq('Neutral')

results_final.loc[who_uncertain & all_neutral].value_counts(
    subset=['SOLO INITIAL CONFIDENCE GRADING', 'SOLO FINAL CONFIDENCE GRADING']
)

SOLO INITIAL CONFIDENCE GRADING  SOLO FINAL CONFIDENCE GRADING
3) Uncertain significance        3) Uncertain significance        102
                                 4) Not assoc w R - Interim        23
5) Not assoc w R                 5) Not assoc w R                  22
3) Uncertain significance        2) Assoc w R - Interim             3
5) Not assoc w R                 4) Not assoc w R - Interim         3
Name: count, dtype: int64

In [76]:
# WHO-"Uncertain" / ALL-"Neutral" variants that the final SOLO grading calls
# "2) Assoc w R - Interim"; the filter is built once and reused for the list and the display.
who_uncertain = results_final['Initial confidence grading WHO dataset'].eq('Uncertain')
all_neutral = results_final['Initial confidence grading ALL dataset'].eq('Neutral')
solo_final_R_interim = results_final['SOLO FINAL CONFIDENCE GRADING'].eq('2) Assoc w R - Interim')

neutral_but_solo_R_interim = results_final.loc[who_uncertain & all_neutral & solo_final_R_interim]
search_lst = neutral_but_solo_R_interim.mutation.values

neutral_but_solo_R_interim

8925     1.036471
18590    1.001678
18601    0.998989
Name: ALL_Odds_Ratio, dtype: float64

In [77]:
# WHO catalogue presence counts and odds ratios for the variants collected in search_lst.
who_variants.loc[
    who_variants['variant'].isin(search_lst),
    ['drug', 'variant', 'Present_R', 'Present_S', 'OR', 'OR_SOLO'],
]

Unnamed: 0,drug,variant,Present_R,Present_S,OR,OR_SOLO
8981,Ethionamide,ethA_p.Arg463fs,83.0,124.0,2.0049,inf
18783,Rifampicin,rpoB_p.Asn437Asp,15.0,6.0,4.5439,
19173,Rifampicin,rpoB_p.Lys446Gln,5.0,1.0,9.0839,


# Supplementary Results Section B

In [26]:
# Variants the initial SOLO grading calls "Not assoc w R" (incl. interim) that regression
# leaves "Uncertain".
da_not_assoc_R = results_final['SOLO INITIAL CONFIDENCE GRADING'].str.contains('Not assoc w R')
regression_uncertain = results_final['REGRESSION FINAL CONFIDENCE GRADING'].eq('Uncertain')
minor_missed = results_final.loc[da_not_assoc_R & regression_uncertain]

print(minor_missed.shape)

# Add the WHO catalogue presence counts; the join keys are spelled out explicitly.
who_presence_counts = who_variants.rename(columns={'drug': 'Drug', 'variant': 'mutation'})[['Drug', 'mutation', 'Present_SOLO_S', 'Present_SOLO_R', 'Present_S', 'Present_R']]
minor_missed = minor_missed.merge(who_presence_counts, on=['Drug', 'mutation'])

(245, 62)


In [27]:
# Breakdown of the initial SOLO grading among the minor_missed variants.
minor_missed['SOLO INITIAL CONFIDENCE GRADING'].value_counts()

SOLO INITIAL CONFIDENCE GRADING
5) Not assoc w R              229
4) Not assoc w R - Interim     16
Name: count, dtype: int64

In [8]:
# Size of the minor_missed subset whose Reason is 'Not Graded'.
minor_missed.loc[minor_missed['Reason'] == 'Not Graded'].shape

(54, 62)

In [10]:
# Counts of each Reason among minor_missed variants, leaving out 'Not Graded'.
minor_missed.loc[minor_missed['Reason'] != 'Not Graded', 'Reason'].value_counts()

Reason
Silent variant downgrade    56
WHO ALL Same Grading        51
Insufficient evidence       32
WHO Uncertain               22
ALL Evidence Only           16
Both Interim                14
Name: count, dtype: int64

In [22]:
# Initial WHO-dataset grading among minor_missed variants, leaving out 'Not Graded'.
minor_missed.loc[
    minor_missed['Reason'] != 'Not Graded',
    'Initial confidence grading WHO dataset',
].value_counts()

Initial confidence grading WHO dataset
Uncertain              133
Assoc w S - Interim     27
Neutral                 12
Assoc w R - Interim      3
Name: count, dtype: int64

In [31]:
# WHO catalogue presence counts for the minor_missed variants, leaving out 'Not Graded'.
minor_missed.loc[
    minor_missed['Reason'] != 'Not Graded',
    ['Drug', 'mutation', 'Present_SOLO_S', 'Present_SOLO_R', 'Present_S', 'Present_R'],
]

Unnamed: 0,Drug,mutation,Present_SOLO_S,Present_SOLO_R,Present_S,Present_R
0,Amikacin,rrs_n.514A>C,328.0,8.0,577.0,329.0
1,Amikacin,eis_c.-10G>A,310.0,18.0,369.0,22.0
2,Amikacin,eis_c.-12C>T,657.0,22.0,917.0,64.0
3,Amikacin,whiB7_p.Gly64fs,84.0,1.0,98.0,2.0
4,Amikacin,eis_p.Val163Ile,635.0,13.0,711.0,53.0
...,...,...,...,...,...,...
239,Streptomycin,gid_p.Tyr195His,0.0,0.0,220.0,149.0
240,Streptomycin,Rv1258c_p.Gly363Val,0.0,0.0,56.0,1.0
241,Streptomycin,gid_c.615A>G,,,6745.0,6224.0
242,Streptomycin,gid_c.330G>T,,,1744.0,366.0


In [33]:
# (graded rows, graded rows with more SOLO-S than SOLO-R isolates, all rows with more SOLO-S than SOLO-R)
graded_minor_missed = minor_missed.query("Reason != 'Not Graded'")
graded_more_solo_S = graded_minor_missed.query("Present_SOLO_S > Present_SOLO_R")
all_more_solo_S = minor_missed.query("Present_SOLO_S > Present_SOLO_R")

len(graded_minor_missed), len(graded_more_solo_S), len(all_more_solo_S)

(191, 77, 77)

In [19]:
# WHO vs ALL initial gradings for graded minor_missed variants that are not "Neutral" on the
# ALL dataset. The second filter is built from the already-filtered frame.
graded_minor_missed = minor_missed.query("Reason!='Not Graded'")
not_all_neutral = graded_minor_missed['Initial confidence grading ALL dataset'] != 'Neutral'

graded_minor_missed.loc[not_all_neutral].value_counts(
    subset=['Initial confidence grading WHO dataset', 'Initial confidence grading ALL dataset']
)

Initial confidence grading WHO dataset  Initial confidence grading ALL dataset
Uncertain                               Uncertain                                 91
Assoc w S - Interim                     Assoc w S - Interim                       19
Uncertain                               Assoc w S - Interim                       10
Assoc w S - Interim                     Uncertain                                  8
Neutral                                 Assoc w S - Interim                        8
Uncertain                               Assoc w R - Interim                        7
Neutral                                 Assoc w R - Interim                        4
Assoc w R - Interim                     Assoc w R - Interim                        1
                                        Uncertain                                  1
Name: count, dtype: int64

In [21]:
# minor_missed variants graded "Assoc w S" (incl. interim) on either the ALL or the WHO dataset.
all_assoc_S = minor_missed['Initial confidence grading ALL dataset'].str.contains('Assoc w S')
who_assoc_S = minor_missed['Initial confidence grading WHO dataset'].str.contains('Assoc w S')

minor_missed.loc[all_assoc_S | who_assoc_S].shape

(45, 62)

In [None]:
# TODO: the filter for this cell was never written. An empty query expression raises
# ValueError, so show a preview of the frame until the intended condition is filled in.
minor_missed.head()