In [1]:
import pandas
import numpy
import copy

from statsmodels.stats.contingency_tables import mcnemar

pandas.options.display.max_rows = 200

In [2]:
drug_names_table = pandas.read_csv("dat/drugs/drug_names_lookup.csv")
drug_names_table.set_index("DRUG", inplace=True)
drug_names_lookup = {}
for idx, row in drug_names_table.iterrows():
    drug_names_lookup[idx] = row.DRUG_NAME.capitalize()
drug_names_lookup

{'AMI': 'Amikacin',
 'BDQ': 'Bedaquiline',
 'CAP': 'Capreomycin',
 'CFZ': 'Clofazimine',
 'DLM': 'Delamanid',
 'EMB': 'Ethambutol',
 'ETH': 'Ethionamide',
 'INH': 'Isoniazid',
 'KAN': 'Kanamycin',
 'LEV': 'Levofloxacin',
 'LZD': 'Linezolid',
 'MXF': 'Moxifloxacin',
 'PZA': 'Pyrazinamide',
 'RFB': 'Rifabutin',
 'RIF': 'Rifampicin',
 'STM': 'Streptomycin'}

Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [3]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


In [4]:
PHENOTYPES = pandas.read_csv('dat/PHENOTYPES.csv')
PHENOTYPES = PHENOTYPES[((PHENOTYPES.DRUG=='BDQ') & (PHENOTYPES.PHENOTYPE_METHOD=='MGIT')) | ((PHENOTYPES.DRUG=='LZD') & (PHENOTYPES.PHENOTYPE_METHOD=='UKMYC')) | ((PHENOTYPES.DRUG=='PZA') & (PHENOTYPES.PHENOTYPE_METHOD!='MGIT')) | ((~PHENOTYPES.DRUG.isin(['BDQ','LZD'])))]
PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PHENOTYPES.rename(columns={'BINARY_PHENOTYPE': 'pDST'}, inplace=True)
PHENOTYPES.sort_index(inplace=True)

predictions = pandas.read_csv('dat/PREDICTIONS.csv')
predictions = predictions[predictions.SET=='nulls+minors']
predictions.drop(columns=['SET'], inplace=True)
predictions.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
predictions.rename(columns={'PREDICTION': 'gnomonicus'}, inplace=True)
predictions = predictions[predictions.index.isin(PHENOTYPES.index)]

tbprofiler = pandas.read_csv('dat/tbprofiler_PREDICTIONS.csv')
tbprofiler.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
tbprofiler = tbprofiler[tbprofiler.index.isin(PHENOTYPES.index)]
tbprofiler.rename(columns={'PREDICTION': 'TB-Profiler'}, inplace=True)

assert len(predictions) == len(tbprofiler)

comparison = predictions.join(tbprofiler, how='inner')
comparison = comparison.join(PHENOTYPES[['pDST']], how='inner')
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,gnomonicus,TB-Profiler,pDST
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERR13286038,BDQ,R,R,R
ERR13286039,BDQ,R,R,R
ERR13286042,BDQ,S,S,S
ERR13286043,BDQ,S,S,S
ERR13286045,BDQ,S,S,R
...,...,...,...,...
SRR1165546,CAP,S,S,S
SRR1165572,PZA,S,S,S
SRR1165572,CAP,F,S,S
SRR1165601,PZA,R,R,R


## Table 3

First let's list the number of samples in each category of prediction for both tools.

In [5]:
table3 = pandas.crosstab(comparison['gnomonicus'], comparison['TB-Profiler'], margins=True)
table3

TB-Profiler,R,S,U,All
gnomonicus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,4,23,0,27
R,3705,43,0,3748
S,20,8932,23,8975
U,14,965,92,1071
All,3743,9963,115,13821


In [6]:
print(table3.to_latex(column_format='r|rrr', multirow=True))

\begin{tabular}{r|rrr}
\toprule
TB-Profiler & R & S & U & All \\
gnomonicus &  &  &  &  \\
\midrule
F & 4 & 23 & 0 & 27 \\
R & 3705 & 43 & 0 & 3748 \\
S & 20 & 8932 & 23 & 8975 \\
U & 14 & 965 & 92 & 1071 \\
All & 3743 & 9963 & 115 & 13821 \\
\bottomrule
\end{tabular}



Now only consider those that are phenotypically resistant

In [7]:
a = comparison[comparison['pDST']=='R']
table3_r = pandas.crosstab(a['gnomonicus'], a['TB-Profiler'], margins=True)
table3_r

TB-Profiler,R,S,U,All
gnomonicus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,3,5,0,8
R,3297,33,0,3330
S,18,853,10,881
U,11,205,33,249
All,3329,1096,43,4468


In [8]:
print(table3_r.to_latex(column_format='r|rrr', multirow=True))

\begin{tabular}{r|rrr}
\toprule
TB-Profiler & R & S & U & All \\
gnomonicus &  &  &  &  \\
\midrule
F & 3 & 5 & 0 & 8 \\
R & 3297 & 33 & 0 & 3330 \\
S & 18 & 853 & 10 & 881 \\
U & 11 & 205 & 33 & 249 \\
All & 3329 & 1096 & 43 & 4468 \\
\bottomrule
\end{tabular}



Looking in more detail at the true discrepants we find that gnomonicus gets 33+2 correct and TB-Profiler gets 18+10 correct.

In [9]:
a = comparison[(comparison['TB-Profiler']=='R') & (comparison['gnomonicus']=='S')]
a.pDST.value_counts()

pDST
R    18
S     2
Name: count, dtype: int64

In [10]:
a = comparison[(comparison['TB-Profiler']=='S') & (comparison['gnomonicus']=='R')]
a.pDST.value_counts()

pDST
R    33
S    10
Name: count, dtype: int64

## Table 4

Now let's break out Table 3 by drug for the true discrepancies only. We don't report the McNemar as the RS,SR sample numbers are too small to be meaningful.

In [11]:
rows = []

comparison.reset_index(inplace=True)

for drug in who_drugs:
    foo = comparison[comparison.DRUG==drug]
    foo = foo[foo.gnomonicus.isin(['R', 'S']) & (foo['TB-Profiler'].isin(['R', 'S']))]
    table = pandas.crosstab(foo['gnomonicus'], foo['TB-Profiler'])
    results = mcnemar(table)
    drug_name = drug_names_lookup[drug]
    total = table['R']['R'], table['S']['R'], table['R']['S'], table['S']['S']
    rows.append([drug_name, table['R']['R'], table['S']['R'], table['R']['S'], table['S']['S'], "%.3f" % results.pvalue])

comparison_statistics = pandas.DataFrame(rows, columns=['DRUG', 'RR', 'RS', 'SR', 'SS', 'McNemar p-value']).set_index('DRUG')
# comparison_statistics.to_csv('dat/tbprofiler_STATISTICS.csv')
comparison_statistics

Unnamed: 0_level_0,RR,RS,SR,SS,McNemar p-value
DRUG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Isoniazid,483,2,1,467,1.0
Rifampicin,480,0,1,497,1.0
Pyrazinamide,256,3,0,351,0.25
Ethambutol,360,0,2,576,0.5
Bedaquiline,268,5,3,456,0.727
Linezolid,34,1,0,882,1.0
Moxifloxacin,290,1,4,632,0.375
Levofloxacin,290,1,4,636,0.375
Clofazimine,64,2,3,878,1.0
Delamanid,18,1,0,739,1.0


In [12]:
print(comparison_statistics.to_latex(column_format='r|rrrr|r', multirow=True))

\begin{tabular}{r|rrrr|r}
\toprule
 & RR & RS & SR & SS & McNemar p-value \\
DRUG &  &  &  &  &  \\
\midrule
Isoniazid & 483 & 2 & 1 & 467 & 1.000 \\
Rifampicin & 480 & 0 & 1 & 497 & 1.000 \\
Pyrazinamide & 256 & 3 & 0 & 351 & 0.250 \\
Ethambutol & 360 & 0 & 2 & 576 & 0.500 \\
Bedaquiline & 268 & 5 & 3 & 456 & 0.727 \\
Linezolid & 34 & 1 & 0 & 882 & 1.000 \\
Moxifloxacin & 290 & 1 & 4 & 632 & 0.375 \\
Levofloxacin & 290 & 1 & 4 & 636 & 0.375 \\
Clofazimine & 64 & 2 & 3 & 878 & 1.000 \\
Delamanid & 18 & 1 & 0 & 739 & 1.000 \\
Amikacin & 213 & 5 & 0 & 719 & 0.062 \\
Streptomycin & 224 & 6 & 0 & 313 & 0.031 \\
Ethionamide & 304 & 4 & 0 & 498 & 0.125 \\
Kanamycin & 226 & 5 & 0 & 704 & 0.062 \\
Capreomycin & 195 & 7 & 2 & 584 & 0.180 \\
\bottomrule
\end{tabular}



## Table S3

Let's prepare a detailed breakdown by drug of the predictions by both tools and how they compare to the pDST result.

In [13]:
detailed_table = copy.deepcopy(comparison)
detailed_table['joint'] = detailed_table['gnomonicus'] + detailed_table['TB-Profiler']
from pandas.api.types import CategoricalDtype
cat_type = CategoricalDtype(categories=["RR", "RS", 'FR','FS',"SR", 'SS', 'SU', 'UR', 'US', 'UU' ], ordered=True)

detailed_table.DRUG = detailed_table.DRUG.astype('category')
detailed_table.DRUG = detailed_table.DRUG.cat.set_categories(who_drugs)
detailed_table.sort_values('DRUG', inplace=True)

detailed_table['joint'] = detailed_table['joint'].astype(cat_type)
detailed_table.reset_index(inplace=True)
detailed_table['DRUG'] = detailed_table['DRUG'].map(drug_names_lookup)
detailed_table

Unnamed: 0,index,ENA_RUN_ACCESSION,DRUG,gnomonicus,TB-Profiler,pDST,joint
0,2992,ERR3287789,Isoniazid,R,R,R,RR
1,13152,ERR8976047,Isoniazid,S,S,S,SS
2,9381,ERR4829698,Isoniazid,S,S,S,SS
3,9392,ERR4829738,Isoniazid,R,R,R,RR
4,3227,ERR4796360,Isoniazid,R,R,R,RR
...,...,...,...,...,...,...,...
13816,2540,ERR2516329,Capreomycin,R,R,R,RR
13817,2538,ERR2516308,Capreomycin,S,S,S,SS
13818,2536,ERR2516291,Capreomycin,R,R,S,RR
13819,4471,ERR4797772,Capreomycin,S,S,S,SS


In [14]:
table_s3 = pandas.crosstab([detailed_table['DRUG'],detailed_table['pDST']], detailed_table['joint'])
print(table_s3.to_latex(column_format='r|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr', multirow=True))

\begin{tabular}{r|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr|rrr}
\toprule
 & joint & RR & RS & FR & FS & SR & SS & SU & UR & US & UU \\
DRUG & pDST &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{Isoniazid} & R & 458 & 1 & 3 & 0 & 1 & 24 & 1 & 0 & 8 & 3 \\
 & S & 25 & 1 & 0 & 1 & 0 & 443 & 0 & 0 & 24 & 5 \\
\cline{1-12}
\multirow[t]{2}{*}{Rifampicin} & R & 453 & 0 & 0 & 0 & 1 & 20 & 0 & 0 & 1 & 0 \\
 & S & 27 & 0 & 0 & 1 & 0 & 477 & 0 & 0 & 18 & 0 \\
\cline{1-12}
\multirow[t]{2}{*}{Pyrazinamide} & R & 247 & 2 & 0 & 0 & 0 & 28 & 0 & 1 & 7 & 6 \\
 & S & 9 & 1 & 0 & 0 & 0 & 323 & 0 & 0 & 0 & 3 \\
\cline{1-12}
\multirow[t]{2}{*}{Ethambutol} & R & 249 & 0 & 0 & 1 & 1 & 22 & 3 & 0 & 4 & 7 \\
 & S & 111 & 0 & 0 & 3 & 1 & 554 & 9 & 0 & 23 & 10 \\
\cline{1-12}
\multirow[t]{2}{*}{Bedaquiline} & R & 261 & 3 & 0 & 0 & 3 & 90 & 4 & 0 & 33 & 0 \\
 & S & 7 & 2 & 0 & 0 & 0 & 366 & 0 & 0 & 5 & 0 \\
\cline{1-12}
\multirow[t]{2}{*}{Linezolid} & R & 32 & 1 & 0 & 0 & 0 & 76 & 0 & 0 & 8 & 0 \\
 & S &

## Table S4

In [15]:
discrepant_samples = comparison[(comparison.gnomonicus.isin(['R','S'])) & (comparison['TB-Profiler'].isin(['R','S'])) & (comparison.gnomonicus!=comparison['TB-Profiler'])]
discrepant_samples.reset_index(inplace=True)
discrepant_samples.sort_values(['DRUG','gnomonicus'], inplace=True)
discrepant_samples.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
discrepant_samples

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  discrepant_samples.sort_values(['DRUG','gnomonicus'], inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,index,gnomonicus,TB-Profiler,pDST
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ERR4797034,AMI,3683,R,S,R
ERR4797106,AMI,3764,R,S,R
ERR4797640,AMI,4348,R,S,R
ERR4828998,AMI,8914,R,S,R
ERR4830157,AMI,9587,R,S,R
ERR13286130,BDQ,79,R,S,S
ERR13289278,BDQ,355,R,S,S
ERR9992658,BDQ,13449,R,S,R
ERR9992730,BDQ,13487,R,S,R
ERR9993193,BDQ,13705,R,S,R


In [16]:
EFFECTS = pandas.read_csv('dat/EFFECTS.csv')
EFFECTS = EFFECTS[(EFFECTS.SET=='nulls+minors') & (EFFECTS.PREDICTION=='R')]
EFFECTS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
EFFECTS
discrepants_gnomonicus_R = discrepant_samples[discrepant_samples.gnomonicus=='R'].join(EFFECTS[['GENE','MUTATION','PREDICTION']], how='left')
def shorten_deletions(row):
    if 'del' in row.MUTATION:
        cols = row.MUTATION.split('_')
        return cols[0] + '_del_' + str(len(cols[2]))
    else:
        return row.MUTATION
discrepants_gnomonicus_R.reset_index(inplace=True)
discrepants_gnomonicus_R.set_index(['ENA_RUN_ACCESSION', 'DRUG','pDST'], inplace=True)
discrepants_gnomonicus_R['MUTATION'] = discrepants_gnomonicus_R.apply(shorten_deletions, axis=1)
discrepants_gnomonicus_R = discrepants_gnomonicus_R[['GENE', 'MUTATION']]
discrepants_gnomonicus_R.rename(columns={'GENE': 'g_GENE', 'MUTATION': 'g_MUTATION'}, inplace=True)
discrepants_gnomonicus_R

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,g_GENE,g_MUTATION
ENA_RUN_ACCESSION,DRUG,pDST,Unnamed: 3_level_1,Unnamed: 4_level_1
ERR4797034,AMI,R,rrs,a1401g:3
ERR4797106,AMI,R,rrs,a1401g
ERR4797640,AMI,R,rrs,a1401g:4
ERR4828998,AMI,R,rrs,a1401g
ERR4830157,AMI,R,rrs,a1401g:9
ERR13286130,BDQ,S,Rv0678,494_ins_ct:16
ERR13289278,BDQ,S,Rv0678,141_ins_c:11
ERR9992658,BDQ,R,Rv0678,383_del_1
ERR9992730,BDQ,R,Rv0678,66_del_76
ERR9993193,BDQ,R,Rv0678,138_ins_g:9


In [17]:
EFFECTS = pandas.read_csv('dat/tbprofiler_EFFECTS.csv')
EFFECTS = EFFECTS[(EFFECTS.PREDICTION=='R')]
EFFECTS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
discrepants_tbprofiler_R = discrepant_samples[discrepant_samples['TB-Profiler']=='R'].join(EFFECTS[['GENE','MUTATION','PREDICTION']], how='left')
discrepants_tbprofiler_R.reset_index(inplace=True)
discrepants_tbprofiler_R.set_index(['ENA_RUN_ACCESSION', 'DRUG','pDST'], inplace=True)
discrepants_tbprofiler_R.rename(columns={'GENE': 't_GENE', 'MUTATION': 't_MUTATION'}, inplace=True)
discrepants_tbprofiler_R[['t_GENE', 't_MUTATION']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t_GENE,t_MUTATION
ENA_RUN_ACCESSION,DRUG,pDST,Unnamed: 3_level_1,Unnamed: 4_level_1
ERR9992895,BDQ,R,mmpR5,c.198delG
ERR9992913,BDQ,R,mmpR5,c.198delG
ERR9993051,BDQ,R,mmpR5,c.198delG
SRR1163087,CAP,R,rrs,n.1401A>G
SRR1165525,CAP,R,rrs,n.1401A>G
ERR4831746,CFZ,R,mmpR5,c.198delG
ERR4831769,CFZ,S,mmpR5,c.198delG
ERR8975663,CFZ,R,mmpR5,c.421_425delGATCTinsA
ERR2510311,EMB,R,embB,p.Met306Leu
ERR4829977,EMB,S,embB,p.Gly406Asp


In [18]:
table_s4 = discrepants_gnomonicus_R.join(discrepants_tbprofiler_R[['t_GENE', 't_MUTATION']], how='outer')
table_s4.replace(numpy.nan, '', inplace=True)
table_s4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,g_GENE,g_MUTATION,t_GENE,t_MUTATION
ENA_RUN_ACCESSION,DRUG,pDST,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ERR13286130,BDQ,S,Rv0678,494_ins_ct:16,,
ERR13289278,BDQ,S,Rv0678,141_ins_c:11,,
ERR2510311,EMB,R,,,embB,p.Met306Leu
ERR2510328,LEV,R,,,gyrA,p.Asp94Asn
ERR2510328,LEV,R,,,gyrA,p.Asp94Tyr
ERR2510328,MXF,R,,,gyrA,p.Asp94Asn
ERR2510328,MXF,R,,,gyrA,p.Asp94Tyr
ERR2510548,INH,R,,,katG,p.Ser315Thr
ERR2510548,INH,R,,,katG,p.Ser315Asn
ERR2510654,STM,R,rrs,c517t:4,,


In [19]:
print(table_s4.to_latex(column_format='rlr|rr|rr', multirow=True))

\begin{tabular}{rlr|rr|rr}
\toprule
 &  &  & g_GENE & g_MUTATION & t_GENE & t_MUTATION \\
ENA_RUN_ACCESSION & DRUG & pDST &  &  &  &  \\
\midrule
ERR13286130 & BDQ & S & Rv0678 & 494_ins_ct:16 &  &  \\
\cline{1-7} \cline{2-7}
ERR13289278 & BDQ & S & Rv0678 & 141_ins_c:11 &  &  \\
\cline{1-7} \cline{2-7}
ERR2510311 & EMB & R &  &  & embB & p.Met306Leu \\
\cline{1-7} \cline{2-7}
\multirow[t]{4}{*}{ERR2510328} & \multirow[t]{2}{*}{LEV} & R &  &  & gyrA & p.Asp94Asn \\
 &  & R &  &  & gyrA & p.Asp94Tyr \\
\cline{2-7}
 & \multirow[t]{2}{*}{MXF} & R &  &  & gyrA & p.Asp94Asn \\
 &  & R &  &  & gyrA & p.Asp94Tyr \\
\cline{1-7} \cline{2-7}
\multirow[t]{2}{*}{ERR2510548} & \multirow[t]{2}{*}{INH} & R &  &  & katG & p.Ser315Thr \\
 &  & R &  &  & katG & p.Ser315Asn \\
\cline{1-7} \cline{2-7}
ERR2510654 & STM & R & rrs & c517t:4 &  &  \\
\cline{1-7} \cline{2-7}
ERR2510725 & CAP & R & rrs & a1401g &  &  \\
\cline{1-7} \cline{2-7}
ERR2510733 & CAP & S & rrs & a1401g &  &  \\
\cline{1-7} \cline{2-7}