In [2]:
import pandas, numpy, copy


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from src.utils import plot_truthtables, plot_growth_boxplot, plot_dilution_boxplot

import matplotlib
matplotlib.rcParams.update({'font.size': 7})

%load_ext autoreload
%autoreload 2

In [3]:
savefig=True

Read in the large `RESULTS` table created in the previous notebook

In [4]:
results = pandas.read_csv('dat/RESULTS.csv')
results[:3]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrap-0,ALL,0.944882,,0.895197,,0.909091,,,,,,,,
1,basic,INH,UKMYC,bootstrap-0,HIGH,0.979592,,0.944915,,0.948617,,,,,,,,
2,basic,INH,UKMYC,bootstrap-1,ALL,0.964844,,0.96875,,0.972441,,,,,,,,


In [5]:
drug_names_table = pandas.read_csv("dat/drugs/drug_names_lookup.csv")
drug_names_table.set_index("DRUG", inplace=True)
drug_names_lookup = {}
for idx, row in drug_names_table.iterrows():
    drug_names_lookup[idx] = row.DRUG_NAME.capitalize()
drug_names_lookup

{'AMI': 'Amikacin',
 'BDQ': 'Bedaquiline',
 'CAP': 'Capreomycin',
 'CFZ': 'Clofazimine',
 'DLM': 'Delamanid',
 'EMB': 'Ethambutol',
 'ETH': 'Ethionamide',
 'INH': 'Isoniazid',
 'KAN': 'Kanamycin',
 'LEV': 'Levofloxacin',
 'LZD': 'Linezolid',
 'MXF': 'Moxifloxacin',
 'PZA': 'Pyrazinamide',
 'RFB': 'Rifabutin',
 'RIF': 'Rifampicin',
 'STM': 'Streptomycin'}

Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [6]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


In [7]:
df = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL')]

# reverse the order of the table so the drugs are in the correct order from top to bottom in the plot
df = df.iloc[::-1]

df[:15]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
5406,tbprofiler,CAP,MGIT,bootstrapped50,ALL,95.201627,0.506388,89.875687,0.427619,74.568939,0.975568,,,,,,,
5352,tbprofiler,KAN,UKMYC,bootstrapped50,ALL,95.848834,0.521734,90.42656,0.38431,74.849755,0.906583,,,,,,,
5248,tbprofiler,ETH,UKMYC,bootstrapped50,ALL,68.452941,0.901986,90.14701,0.384757,78.251962,0.787475,,,,,,,
5146,tbprofiler,STM,MGIT,bootstrapped50,ALL,91.987601,0.468212,89.219826,0.526204,83.706286,0.643615,,,,,,,
5092,tbprofiler,AMI,UKMYC,bootstrapped50,ALL,97.936783,0.377808,89.784762,0.43264,72.386043,0.984924,,,,,,,
4988,tbprofiler,DLM,UKMYC,bootstrapped50,ALL,95.630281,1.949323,87.046666,0.409457,11.30204,1.105476,,,,,,,
4884,tbprofiler,CFZ,UKMYC,bootstrapped50,ALL,62.176401,2.332136,73.986428,0.518565,15.308787,0.782172,,,,,,,
4780,tbprofiler,LEV,UKMYC,bootstrapped50,ALL,91.820965,0.609063,92.390199,0.279696,83.667677,0.668867,,,,,,,
4676,tbprofiler,MXF,UKMYC,bootstrapped50,ALL,85.75605,0.959165,94.691045,0.317253,87.295731,0.741683,,,,,,,
4572,tbprofiler,LZD,UKMYC,bootstrapped50,ALL,94.367663,1.682743,91.282773,0.375608,29.01004,1.474896,,,,,,,


Now also produce a table for the UKMYC drugs with HIGH confidence MIC measurements. Because four drugs will not have values we have to manually insert the rows to ensure the graphs work.


In [8]:
df2 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='nulls+minors')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['nulls+minor', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df2.columns)
df2 = pandas.concat([df2, df3])

df2.drug = df2.drug.astype('category')
df2.drug = df2.drug.cat.set_categories(who_drugs)
df2.sort_values('drug', inplace=True)
df2 = df2.iloc[::-1]
df2['set'] = 'nulls+minor+high'
df2

# And again for tbprofiler, so we have _just_ high quality results too
df4 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='tbprofiler')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['tbprofiler', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df4.columns)
df4 = pandas.concat([df4, df3])

df4.drug = df4.drug.astype('category')
df4.drug = df4.drug.cat.set_categories(who_drugs)
df4.sort_values('drug', inplace=True)
df4 = df4.iloc[::-1]
df4['set'] = 'tbprofiler+high'
df4

  df2 = pandas.concat([df2, df3])
  df4 = pandas.concat([df4, df3])


Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
3,tbprofiler+high,CAP,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
5353,tbprofiler+high,KAN,UKMYC,bootstrapped50,HIGH,96.209138,0.494473,93.601508,0.356409,83.783041,0.852305,,,,,,,
5249,tbprofiler+high,ETH,UKMYC,bootstrapped50,HIGH,69.082742,1.125784,91.803011,0.364159,82.433325,0.907601,,,,,,,
2,tbprofiler+high,STM,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
5093,tbprofiler+high,AMI,UKMYC,bootstrapped50,HIGH,97.727768,0.407523,92.765749,0.381687,81.901791,1.007373,,,,,,,
4989,tbprofiler+high,DLM,UKMYC,bootstrapped50,HIGH,100.0,0.0,93.045304,0.331268,20.252971,1.713067,,,,,,,
4885,tbprofiler+high,CFZ,UKMYC,bootstrapped50,HIGH,33.74662,3.042795,89.481445,0.36423,13.127144,1.140586,,,,,,,
4781,tbprofiler+high,LEV,UKMYC,bootstrapped50,HIGH,92.881791,0.621038,94.515939,0.398998,88.668137,0.750139,,,,,,,
4677,tbprofiler+high,MXF,UKMYC,bootstrapped50,HIGH,86.906317,0.673083,95.729101,0.316557,91.188263,0.654171,,,,,,,
4573,tbprofiler+high,LZD,UKMYC,bootstrapped50,HIGH,96.053094,1.189539,94.592865,0.290237,42.806432,1.936738,,,,,,,


In [9]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table, df2, df4])
table = table[['set', 'drug', 'sensitivity',
       'sensitivity_sem', 'specificity', 'specificity_sem']]

for col in ['sensitivity', 'sensitivity_sem', 'specificity', 'specificity_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['sensitivity', 'specificity']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'sensitivity',
        'specificity', ]]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,basic,nulls+minors,nulls+minors,nulls+minor+high,nulls+minor+high,tbprofiler,tbprofiler,tbprofiler+high,tbprofiler+high
Unnamed: 0_level_1,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Isoniazid,95.2 ±0.4,93.2 ±0.5,94.5 ±0.4,94.4 ±0.4,95.7 ±0.4,96.0 ±0.3,94.9 ±0.4,92.7 ±0.4,95.6 ±0.4,95.6 ±0.3
Rifampicin,95.4 ±0.4,94.4 ±0.4,94.5 ±0.4,95.8 ±0.3,95.6 ±0.3,96.5 ±0.4,94.5 ±0.5,95.4 ±0.4,95.9 ±0.4,96.7 ±0.4
Pyrazinamide,96.4 ±0.5,89.0 ±0.5,95.7 ±0.5,91.5 ±0.5,,,96.7 ±0.3,90.9 ±0.5,,
Ethambutol,69.3 ±1.0,95.0 ±0.4,68.4 ±1.0,95.5 ±0.3,67.3 ±1.0,95.9 ±0.4,68.7 ±0.9,95.2 ±0.4,67.3 ±0.9,96.5 ±0.3
Bedaquiline,96.8 ±0.5,65.4 ±0.7,97.2 ±0.4,67.6 ±0.7,,,97.7 ±0.3,74.6 ±0.7,,
Linezolid,97.0 ±1.2,91.0 ±0.4,94.3 ±1.9,91.7 ±0.4,95.6 ±1.3,94.8 ±0.3,94.4 ±1.7,91.3 ±0.4,96.1 ±1.2,94.6 ±0.3
Moxifloxacin,86.0 ±0.9,92.8 ±0.4,84.9 ±0.9,94.4 ±0.3,86.2 ±0.8,95.9 ±0.3,85.8 ±1.0,94.7 ±0.3,86.9 ±0.7,95.7 ±0.3
Levofloxacin,92.4 ±0.7,91.1 ±0.5,91.7 ±0.7,93.0 ±0.4,92.6 ±0.5,95.0 ±0.3,91.8 ±0.6,92.4 ±0.3,92.9 ±0.6,94.5 ±0.4
Clofazimine,62.8 ±3.3,73.4 ±0.5,62.4 ±3.1,73.5 ±0.6,21.4 ±3.4,89.5 ±0.3,62.2 ±2.3,74.0 ±0.5,33.7 ±3.0,89.5 ±0.4
Delamanid,95.5 ±1.8,88.3 ±0.5,95.5 ±1.8,88.3 ±0.5,100.0 ±0.0,94.2 ±0.3,95.6 ±1.9,87.0 ±0.4,100.0 ±0.0,93.0 ±0.3


In [10]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & \multicolumn{2}{r}{basic} & \multicolumn{2}{r}{nulls+minors} & \multicolumn{2}{r}{nulls+minor+high} & \multicolumn{2}{r}{tbprofiler} & \multicolumn{2}{r}{tbprofiler+high} \\
 & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity \\
drug &  &  &  &  &  &  &  &  &  &  \\
\midrule
Isoniazid & 95.2 ±0.4 & 93.2 ±0.5 & 94.5 ±0.4 & 94.4 ±0.4 & 95.7 ±0.4 & 96.0 ±0.3 & 94.9 ±0.4 & 92.7 ±0.4 & 95.6 ±0.4 & 95.6 ±0.3 \\
Rifampicin & 95.4 ±0.4 & 94.4 ±0.4 & 94.5 ±0.4 & 95.8 ±0.3 & 95.6 ±0.3 & 96.5 ±0.4 & 94.5 ±0.5 & 95.4 ±0.4 & 95.9 ±0.4 & 96.7 ±0.4 \\
Pyrazinamide & 96.4 ±0.5 & 89.0 ±0.5 & 95.7 ±0.5 & 91.5 ±0.5 &  &  & 96.7 ±0.3 & 90.9 ±0.5 &  &  \\
Ethambutol & 69.3 ±1.0 & 95.0 ±0.4 & 68.4 ±1.0 & 95.5 ±0.3 & 67.3 ±1.0 & 95.9 ±0.4 & 68.7 ±0.9 & 95.2 ±0.4 & 67.3 ±0.9 & 96.5 ±0.3 \\
Bedaquiline & 96.8 ±0.5 & 65.4 ±0.7 & 97.2 ±0.4 & 67.6 ±0.7 &  &  & 97.7 ±0.3 & 74.6 ±0.7 &

In [11]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table, df2, df4])
table = table[['set', 'drug', 'PPV',
       'PPV_sem']]

for col in ['PPV', 'PPV_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['PPV']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'PPV']]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,nulls+minors,nulls+minor+high,tbprofiler,tbprofiler+high
Unnamed: 0_level_1,PPV,PPV,PPV,PPV,PPV
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Isoniazid,93.3 ±0.4,94.7 ±0.4,96.3 ±0.3,92.8 ±0.4,95.8 ±0.3
Rifampicin,93.9 ±0.4,95.6 ±0.4,96.7 ±0.3,95.1 ±0.4,96.7 ±0.3
Pyrazinamide,85.7 ±0.7,89.3 ±0.6,,88.6 ±0.6,
Ethambutol,88.8 ±0.8,90.1 ±0.7,92.1 ±0.6,89.8 ±0.7,93.0 ±0.5
Bedaquiline,45.4 ±1.0,50.7 ±1.0,,67.6 ±0.8,
Linezolid,24.3 ±1.9,30.4 ±2.1,44.9 ±2.0,29.0 ±1.5,42.8 ±1.9
Moxifloxacin,83.3 ±0.9,87.4 ±0.7,92.0 ±0.6,87.3 ±0.7,91.2 ±0.7
Levofloxacin,81.2 ±0.9,85.6 ±0.7,90.1 ±0.7,83.7 ±0.7,88.7 ±0.8
Clofazimine,8.0 ±0.7,8.8 ±0.7,5.5 ±0.8,15.3 ±0.8,13.1 ±1.1
Delamanid,16.6 ±1.5,16.6 ±1.5,28.9 ±2.5,11.3 ±1.1,20.3 ±1.7


In [12]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & basic & nulls+minors & nulls+minor+high & tbprofiler & tbprofiler+high \\
 & PPV & PPV & PPV & PPV & PPV \\
drug &  &  &  &  &  \\
\midrule
Isoniazid & 93.3 ±0.4 & 94.7 ±0.4 & 96.3 ±0.3 & 92.8 ±0.4 & 95.8 ±0.3 \\
Rifampicin & 93.9 ±0.4 & 95.6 ±0.4 & 96.7 ±0.3 & 95.1 ±0.4 & 96.7 ±0.3 \\
Pyrazinamide & 85.7 ±0.7 & 89.3 ±0.6 &  & 88.6 ±0.6 &  \\
Ethambutol & 88.8 ±0.8 & 90.1 ±0.7 & 92.1 ±0.6 & 89.8 ±0.7 & 93.0 ±0.5 \\
Bedaquiline & 45.4 ±1.0 & 50.7 ±1.0 &  & 67.6 ±0.8 &  \\
Linezolid & 24.3 ±1.9 & 30.4 ±2.1 & 44.9 ±2.0 & 29.0 ±1.5 & 42.8 ±1.9 \\
Moxifloxacin & 83.3 ±0.9 & 87.4 ±0.7 & 92.0 ±0.6 & 87.3 ±0.7 & 91.2 ±0.7 \\
Levofloxacin & 81.2 ±0.9 & 85.6 ±0.7 & 90.1 ±0.7 & 83.7 ±0.7 & 88.7 ±0.8 \\
Clofazimine & 8.0 ±0.7 & 8.8 ±0.7 & 5.5 ±0.8 & 15.3 ±0.8 & 13.1 ±1.1 \\
Delamanid & 16.6 ±1.5 & 16.6 ±1.5 & 28.9 ±2.5 & 11.3 ±1.1 & 20.3 ±1.7 \\
Amikacin & 73.4 ±0.9 & 76.1 ±1.0 & 84.8 ±0.7 & 72.4 ±1.0 & 81.9 ±1.0 \\
Streptomycin & 89.7 ±0.6 & 90.7 ±0.6 & 

In [13]:
colours = {'sensitivity': ['#990000', '#d7301f', '#ef6548'], 'specificity': ['#034e7b', '#0570b0','#3690c0'], 'PPV': ['#005a32', '#238443','#41ab5d']}


for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='basic']))
    axes.set_yticks(y, df[results.set=='basic']['drug'])
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.3, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)
    subset = who[[metric]]
    subset.columns = ['x']
    y=0
    for idx,row in subset.iterrows():
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color='#cccccc', fontweight='light')
        y+=1

    subset = df[results.set=='basic'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='heavy') #, backgroundcolor='white')
        y+=1

    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.5)

    
    fig.savefig('pdf/fig-results-main-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]


In [14]:
# Version of the above, but swapping internal comparisons for tbprofiler
colours = {'sensitivity': ['#d7301f', '#b5514a', '#2a30de', '#0570b0'], 'specificity': ['#d7301f', '#b5514a', '#2a30de', '#0570b0'], 'PPV': ['#d7301f', '#b5514a', '#2a30de', '#0570b0']}

for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='nulls+minors']))
    axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.2, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)

    # nulls+minors
    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='bold')
        y+=1

    # tbprofiler
    subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    # nulls+minors+high
    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1
    
    # tbprofiler+high
    subset = df4[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.6, y-0.4], color=colours[metric][3], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.6), 2*row.e, 0.2, fc=colours[metric][3], alpha=0.2))
        axes.text(row.x+2, y-0.5, "%.1f" % row.x, ha="left", va='center', color=colours[metric][3], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.5)

    
    fig.savefig('pdf/fig-results-main-tbprofiler-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='nulls+minors']))
  axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]
  y=numpy.arange(len(df[results.set=='nulls+minors']))
  axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]
  y=numpy.arange(len(df[results.set=='nulls+minors']))
  axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]


In [15]:
for set in ['basic', 'nulls', 'nulls+minors', "tbprofiler"]:

    df = results[(results.set==set)]

    plot_truthtables(df, ['ALL','HIGH'], filestem=f'table-{set}-', savefig=savefig)


In [16]:
df

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
4056,tbprofiler,INH,UKMYC,bootstrap-0,ALL,0.949791,,0.901575,,0.900794,,,,,,,,
4057,tbprofiler,INH,UKMYC,bootstrap-0,HIGH,0.938697,,0.965957,,0.968379,,,,,,,,
4058,tbprofiler,INH,UKMYC,bootstrap-1,ALL,0.940928,,0.930233,,0.925311,,,,,,,,
4059,tbprofiler,INH,UKMYC,bootstrap-1,HIGH,0.930147,,0.955556,,0.961977,,,,,,,,
4060,tbprofiler,INH,UKMYC,bootstrap-10,ALL,0.944664,,0.945607,,0.948413,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5403,tbprofiler,CAP,MGIT,bootstrap-7,ALL,0.948276,,0.908136,,0.758621,,,,,,,,
5404,tbprofiler,CAP,MGIT,bootstrap-8,ALL,0.975410,,0.872679,,0.712575,,,,,,,,
5405,tbprofiler,CAP,MGIT,bootstrap-9,ALL,0.962121,,0.871935,,0.729885,,,,,,,,
5406,tbprofiler,CAP,MGIT,bootstrapped50,ALL,95.201627,0.506388,89.875687,0.427619,74.568939,0.975568,,,,,,,


In [17]:
UKMYC_PHENOTYPES = pandas.read_csv('dat/UKMYC_1000_phenotypes.csv')
UKMYC_PHENOTYPES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_SAMPLES = pandas.read_csv('dat/UKMYC_1000_samples.csv')
UKMYC_SAMPLES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_PHENOTYPES = UKMYC_PHENOTYPES.join(UKMYC_SAMPLES[['POS_AVG_GROWTH']])
UKMYC_PHENOTYPES.reset_index(inplace=True)
UKMYC_PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_PHENOTYPES[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68
ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68
ERR4810791,EMB,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>8,9,44.68


In [18]:
PREDICTIONS = pandas.read_csv('dat/PREDICTIONS.csv')
tbprofiler = pandas.read_csv('dat/tbprofiler_PREDICTIONS.csv')
tbprofiler["SET"] = "tbprofiler"
PREDICTIONS = PREDICTIONS._append(tbprofiler, ignore_index=True)
print(PREDICTIONS)
PREDICTIONS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PREDICTIONS[:3]

               SET ENA_RUN_ACCESSION DRUG PREDICTION
0            basic       ERR13286038  INH          R
1            basic       ERR13286038  RIF          R
2            basic       ERR13286038  PZA          R
3            basic       ERR13286038  EMB          U
4            basic       ERR13286038  BDQ          S
...            ...               ...  ...        ...
159775  tbprofiler        ERR2516031  AMI          S
159776  tbprofiler        ERR2516031  KAN          S
159777  tbprofiler        ERR2516031  CAP          S
159778  tbprofiler        ERR2516031  CFZ          S
159779  tbprofiler        ERR2516031  ETH          R

[159780 rows x 4 columns]


Unnamed: 0_level_0,Unnamed: 1_level_0,SET,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR13286038,INH,basic,R
ERR13286038,RIF,basic,R
ERR13286038,PZA,basic,R


In [19]:
UKMYC_RESULTS = UKMYC_PHENOTYPES.join(PREDICTIONS)
UKMYC_RESULTS.reset_index(inplace=True)
UKMYC_RESULTS[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R


In [20]:
def define_outcome(row):
    if row.PREDICTION in ['S', 'U']:
        if row.BINARY_PHENOTYPE == 'R':
            return '(S+U)R'
        else:
            return '(S+U)S'
    else:
        return row.PREDICTION+row.BINARY_PHENOTYPE

UKMYC_RESULTS['OUTCOME'] = UKMYC_RESULTS.apply(define_outcome, axis=1)
UKMYC_RESULTS.OUTCOME.value_counts()

OUTCOME
(S+U)S    33105
RR         9525
(S+U)R     3873
RS         1477
FS           18
FR            2
Name: count, dtype: int64

In [21]:
DISCREPANCY_SET = UKMYC_RESULTS[(UKMYC_RESULTS.PHENOTYPE_METHOD=='UKMYC') & (UKMYC_RESULTS.OUTCOME.str[0]!='F')]
DISCREPANCY_SET[:6]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR
3,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,tbprofiler,R,RR
4,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,basic,R,RR
5,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,nulls,R,RR


In [22]:
for set in ['basic', 'nulls', 'nulls+minors', "tbprofiler"]:
    for drug in DISCREPANCY_SET.DRUG.unique():
        for quality in ['HIGH','ALL']:
            if quality=='HIGH':
                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
            else:

                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug)])
            plot_growth_boxplot(df, filename=f'growth-{set}-{drug}-{quality}.pdf', savefig=savefig)

In [23]:
for set in ['basic', 'nulls', 'nulls+minors', "tbprofiler"]:
    for platedesign in ['UKMYC5', 'UKMYC6']:
        for drug in DISCREPANCY_SET.DRUG.unique():
            for quality in ['HIGH','ALL']:
                if quality=='HIGH':
                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
                else:

                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug)])
                if df.empty:
                    continue
                plot_dilution_boxplot(df, filename=f'mic-{set}-{drug}-{platedesign}-{quality}.pdf', savefig=savefig)

  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCES