In [1]:
import pandas, numpy, copy


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from src.utils import plot_truthtables, plot_growth_boxplot, plot_dilution_boxplot

import matplotlib
matplotlib.rcParams.update({'font.size': 7})

%load_ext autoreload
%autoreload 2

  from scipy.stats import sem


In [2]:
savefig=True

Read in the large `RESULTS` table created in the previous notebook

In [3]:
results = pandas.read_csv('dat/RESULTS.csv')
results[:3]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrap-0,ALL,88.803089,,95.020747,,95.041322,,,,,,,,
1,basic,INH,UKMYC,bootstrap-0,HIGH,93.172691,,96.414343,,96.26556,,,,,,,,
2,basic,INH,UKMYC,bootstrap-1,ALL,91.828794,,94.650206,,94.779116,,,,,,,,


In [4]:
drug_names_table = pandas.read_csv("dat/drugs/drug_names_lookup.csv")
drug_names_table.set_index("DRUG", inplace=True)
drug_names_lookup = {}
for idx, row in drug_names_table.iterrows():
    drug_names_lookup[idx] = row.DRUG_NAME.capitalize()
drug_names_lookup

{'AMI': 'Amikacin',
 'BDQ': 'Bedaquiline',
 'CAP': 'Capreomycin',
 'CFZ': 'Clofazimine',
 'DLM': 'Delamanid',
 'EMB': 'Ethambutol',
 'ETH': 'Ethionamide',
 'INH': 'Isoniazid',
 'KAN': 'Kanamycin',
 'LEV': 'Levofloxacin',
 'LZD': 'Linezolid',
 'MXF': 'Moxifloxacin',
 'PZA': 'Pyrazinamide',
 'RFB': 'Rifabutin',
 'RIF': 'Rifampicin',
 'STM': 'Streptomycin'}

Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [5]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


In [6]:
df = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL')]

# reverse the order of the table so the drugs are in the correct order from top to bottom in the plot
df = df.iloc[::-1]

df[:15]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
5406,tbprofiler,CAP,MGIT,bootstrapped50,ALL,73.778591,0.858111,98.356579,0.165979,95.263516,0.46716,,,,,,,
5352,tbprofiler,KAN,UKMYC,bootstrapped50,ALL,75.792708,1.219868,98.773814,0.144725,96.108923,0.498343,,,,,,,
5248,tbprofiler,ETH,UKMYC,bootstrapped50,ALL,76.163428,0.972133,85.782603,0.515811,68.869456,0.951987,,,,,,,
5146,tbprofiler,STM,MGIT,bootstrapped50,ALL,81.65828,0.755183,94.734582,0.370543,91.492364,0.564204,,,,,,,
5092,tbprofiler,AMI,UKMYC,bootstrapped50,ALL,73.524728,1.340422,99.452694,0.110497,98.156461,0.381987,,,,,,,
4988,tbprofiler,DLM,UKMYC,bootstrapped50,ALL,12.136589,1.173966,99.884504,0.048697,93.673138,3.049446,,,,,,,
4884,tbprofiler,CFZ,UKMYC,bootstrapped50,ALL,15.681539,0.954221,96.579447,0.304725,64.659869,2.417524,,,,,,,
4780,tbprofiler,LEV,UKMYC,bootstrapped50,ALL,84.375245,0.899532,96.322791,0.282159,91.641336,0.683472,,,,,,,
4676,tbprofiler,MXF,UKMYC,bootstrapped50,ALL,87.803368,0.769765,93.905796,0.327253,85.483244,0.859514,,,,,,,
4572,tbprofiler,LZD,UKMYC,bootstrapped50,ALL,28.336685,2.127962,99.754229,0.066905,93.981433,1.690401,,,,,,,


Now also produce a table for the UKMYC drugs with HIGH confidence MIC measurements. Because four drugs will not have values we have to manually insert the rows to ensure the graphs work.


In [7]:
df2 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='nulls+minors')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['nulls+minor', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df2.columns)
df2 = pandas.concat([df2, df3])

df2.drug = df2.drug.astype('category')
df2.drug = df2.drug.cat.set_categories(who_drugs)
df2.sort_values('drug', inplace=True)
df2 = df2.iloc[::-1]
df2['set'] = 'nulls+minor+high'
df2

# And again for tbprofiler, so we have _just_ high quality results too
df4 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='tbprofiler')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['tbprofiler', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df4.columns)
df4 = pandas.concat([df4, df3])

df4.drug = df4.drug.astype('category')
df4.drug = df4.drug.cat.set_categories(who_drugs)
df4.sort_values('drug', inplace=True)
df4 = df4.iloc[::-1]
df4['set'] = 'tbprofiler+high'
df4

  df2 = pandas.concat([df2, df3])
  df4 = pandas.concat([df4, df3])


Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
3,tbprofiler+high,CAP,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
5353,tbprofiler+high,KAN,UKMYC,bootstrapped50,HIGH,83.385287,0.859529,98.420985,0.143129,95.705886,0.413357,,,,,,,
5249,tbprofiler+high,ETH,UKMYC,bootstrapped50,HIGH,80.543146,0.735785,85.274274,0.478749,69.510668,0.879077,,,,,,,
2,tbprofiler+high,STM,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
5093,tbprofiler+high,AMI,UKMYC,bootstrapped50,HIGH,81.694152,0.882003,99.321261,0.138669,98.085858,0.395483,,,,,,,
4989,tbprofiler+high,DLM,UKMYC,bootstrapped50,HIGH,20.301021,1.961048,100.0,0.0,100.0,0.0,,,,,,,
4885,tbprofiler+high,CFZ,UKMYC,bootstrapped50,HIGH,12.672378,1.454851,96.675759,0.18544,32.664167,2.828255,,,,,,,
4781,tbprofiler+high,LEV,UKMYC,bootstrapped50,HIGH,88.617751,0.6861,96.560885,0.279071,92.59963,0.587145,,,,,,,
4677,tbprofiler+high,MXF,UKMYC,bootstrapped50,HIGH,91.67503,0.798313,93.171644,0.399497,86.377684,0.800475,,,,,,,
4573,tbprofiler+high,LZD,UKMYC,bootstrapped50,HIGH,43.372744,2.233722,99.824869,0.051939,95.891402,1.275161,,,,,,,


In [8]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table, df2, df4])
table = table[['set', 'drug', 'sensitivity',
       'sensitivity_sem', 'specificity', 'specificity_sem']]

for col in ['sensitivity', 'sensitivity_sem', 'specificity', 'specificity_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['sensitivity', 'specificity']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'sensitivity',
        'specificity', ]]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,basic,nulls+minors,nulls+minors,nulls+minor+high,nulls+minor+high,tbprofiler,tbprofiler,tbprofiler+high,tbprofiler+high
Unnamed: 0_level_1,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Isoniazid,91.4 ±0.5,95.3 ±0.4,92.8 ±0.5,94.4 ±0.4,95.6 ±0.4,95.1 ±0.4,92.8 ±0.4,94.7 ±0.4,95.6 ±0.4,95.4 ±0.4
Rifampicin,94.3 ±0.5,96.0 ±0.3,95.7 ±0.4,95.0 ±0.4,96.4 ±0.3,95.3 ±0.4,95.9 ±0.4,95.0 ±0.4,96.7 ±0.3,95.3 ±0.4
Pyrazinamide,81.7 ±0.7,97.4 ±0.3,85.8 ±0.7,97.0 ±0.4,,,85.5 ±0.7,97.4 ±0.3,,
Ethambutol,85.5 ±0.8,84.7 ±0.6,87.3 ±0.8,83.7 ±0.6,90.5 ±0.6,81.5 ±0.6,87.5 ±0.8,83.6 ±0.6,91.0 ±0.6,81.4 ±0.6
Bedaquiline,40.7 ±1.0,98.6 ±0.2,66.5 ±0.9,97.8 ±0.3,,,66.4 ±0.9,98.4 ±0.2,,
Linezolid,22.4 ±1.9,99.9 ±0.1,29.2 ±2.1,99.8 ±0.1,45.0 ±2.2,99.8 ±0.1,28.3 ±2.1,99.8 ±0.1,43.4 ±2.2,99.8 ±0.1
Moxifloxacin,81.4 ±0.8,94.6 ±0.3,86.7 ±0.7,93.9 ±0.3,90.5 ±0.7,93.2 ±0.4,87.8 ±0.8,93.9 ±0.3,91.7 ±0.8,93.2 ±0.4
Levofloxacin,78.1 ±1.0,96.8 ±0.3,83.4 ±0.9,96.3 ±0.3,87.5 ±0.7,96.6 ±0.3,84.4 ±0.9,96.3 ±0.3,88.6 ±0.7,96.6 ±0.3
Clofazimine,7.4 ±0.7,97.8 ±0.2,14.8 ±0.9,96.4 ±0.3,12.7 ±1.5,96.5 ±0.2,15.7 ±1.0,96.6 ±0.3,12.7 ±1.5,96.7 ±0.2
Delamanid,12.1 ±1.2,99.9 ±0.0,12.9 ±1.2,99.9 ±0.0,21.4 ±2.0,100.0 ±0.0,12.1 ±1.2,99.9 ±0.0,20.3 ±2.0,100.0 ±0.0


In [9]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & \multicolumn{2}{r}{basic} & \multicolumn{2}{r}{nulls+minors} & \multicolumn{2}{r}{nulls+minor+high} & \multicolumn{2}{r}{tbprofiler} & \multicolumn{2}{r}{tbprofiler+high} \\
 & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity \\
drug &  &  &  &  &  &  &  &  &  &  \\
\midrule
Isoniazid & 91.4 ±0.5 & 95.3 ±0.4 & 92.8 ±0.5 & 94.4 ±0.4 & 95.6 ±0.4 & 95.1 ±0.4 & 92.8 ±0.4 & 94.7 ±0.4 & 95.6 ±0.4 & 95.4 ±0.4 \\
Rifampicin & 94.3 ±0.5 & 96.0 ±0.3 & 95.7 ±0.4 & 95.0 ±0.4 & 96.4 ±0.3 & 95.3 ±0.4 & 95.9 ±0.4 & 95.0 ±0.4 & 96.7 ±0.3 & 95.3 ±0.4 \\
Pyrazinamide & 81.7 ±0.7 & 97.4 ±0.3 & 85.8 ±0.7 & 97.0 ±0.4 &  &  & 85.5 ±0.7 & 97.4 ±0.3 &  &  \\
Ethambutol & 85.5 ±0.8 & 84.7 ±0.6 & 87.3 ±0.8 & 83.7 ±0.6 & 90.5 ±0.6 & 81.5 ±0.6 & 87.5 ±0.8 & 83.6 ±0.6 & 91.0 ±0.6 & 81.4 ±0.6 \\
Bedaquiline & 40.7 ±1.0 & 98.6 ±0.2 & 66.5 ±0.9 & 97.8 ±0.3 &  &  & 66.4 ±0.9 & 98.4 ±0.2 &

In [10]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table, df2, df4])
table = table[['set', 'drug', 'PPV',
       'PPV_sem']]

for col in ['PPV', 'PPV_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['PPV']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'PPV']]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,nulls+minors,nulls+minor+high,tbprofiler,tbprofiler+high
Unnamed: 0_level_1,PPV,PPV,PPV,PPV,PPV
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Isoniazid,95.2 ±0.4,94.4 ±0.4,95.3 ±0.3,94.7 ±0.4,95.6 ±0.3
Rifampicin,95.6 ±0.4,94.7 ±0.4,95.5 ±0.4,94.7 ±0.4,95.5 ±0.4
Pyrazinamide,96.4 ±0.5,96.1 ±0.5,,96.6 ±0.4,
Ethambutol,69.3 ±1.0,68.5 ±1.0,67.4 ±0.9,68.5 ±1.0,67.3 ±0.9
Bedaquiline,96.8 ±0.5,97.0 ±0.3,,97.7 ±0.3,
Linezolid,95.9 ±1.8,94.2 ±1.6,96.1 ±1.2,94.0 ±1.7,95.9 ±1.3
Moxifloxacin,86.0 ±0.9,85.3 ±0.9,86.2 ±0.8,85.5 ±0.9,86.4 ±0.8
Levofloxacin,92.0 ±0.7,91.6 ±0.7,92.5 ±0.6,91.6 ±0.7,92.6 ±0.6
Clofazimine,57.9 ±3.4,61.8 ±2.4,31.6 ±2.8,64.7 ±2.4,32.7 ±2.8
Delamanid,93.7 ±3.0,94.0 ±2.9,100.0 ±0.0,93.7 ±3.0,100.0 ±0.0


In [11]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & basic & nulls+minors & nulls+minor+high & tbprofiler & tbprofiler+high \\
 & PPV & PPV & PPV & PPV & PPV \\
drug &  &  &  &  &  \\
\midrule
Isoniazid & 95.2 ±0.4 & 94.4 ±0.4 & 95.3 ±0.3 & 94.7 ±0.4 & 95.6 ±0.3 \\
Rifampicin & 95.6 ±0.4 & 94.7 ±0.4 & 95.5 ±0.4 & 94.7 ±0.4 & 95.5 ±0.4 \\
Pyrazinamide & 96.4 ±0.5 & 96.1 ±0.5 &  & 96.6 ±0.4 &  \\
Ethambutol & 69.3 ±1.0 & 68.5 ±1.0 & 67.4 ±0.9 & 68.5 ±1.0 & 67.3 ±0.9 \\
Bedaquiline & 96.8 ±0.5 & 97.0 ±0.3 &  & 97.7 ±0.3 &  \\
Linezolid & 95.9 ±1.8 & 94.2 ±1.6 & 96.1 ±1.2 & 94.0 ±1.7 & 95.9 ±1.3 \\
Moxifloxacin & 86.0 ±0.9 & 85.3 ±0.9 & 86.2 ±0.8 & 85.5 ±0.9 & 86.4 ±0.8 \\
Levofloxacin & 92.0 ±0.7 & 91.6 ±0.7 & 92.5 ±0.6 & 91.6 ±0.7 & 92.6 ±0.6 \\
Clofazimine & 57.9 ±3.4 & 61.8 ±2.4 & 31.6 ±2.8 & 64.7 ±2.4 & 32.7 ±2.8 \\
Delamanid & 93.7 ±3.0 & 94.0 ±2.9 & 100.0 ±0.0 & 93.7 ±3.0 & 100.0 ±0.0 \\
Amikacin & 98.6 ±0.3 & 98.2 ±0.4 & 98.1 ±0.4 & 98.2 ±0.4 & 98.1 ±0.4 \\
Streptomycin & 92.8 ±0.6 & 91.6 ±0

In [12]:
colours = {'sensitivity': [ '#ef6548', '#d7301f', '#990000'], 'specificity': ['#3690c0', '#0570b0', '#034e7b'], 'PPV': ['#41ab5d', '#238443','#005a32']}


for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.set_xticks([])
    axes.grid(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='basic']))
    axes.set_yticks(y, df[results.set=='basic']['drug'])
    # axes.tick_params(axis="y", direction='out', length=8)
    axes.set_ylim(-0.3, 14.5)
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.3, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)
    subset = who[[metric]]
    subset.columns = ['x']
    y=0
    for idx,row in subset.iterrows():
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color='#cccccc', fontweight='light')
        y+=1

    subset = df[results.set=='basic'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='heavy') #, backgroundcolor='white')
        y+=1

    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    

    
    fig.savefig('pdf/fig-results-main-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx

In [13]:
# Version of the above, but swapping internal comparisons for tbprofiler
colours = {'sensitivity': ['#d7301f', '#990000', '#969696', '#525252'], 'specificity': ['#0570b0', '#034e7b', '#969696', '#525252'], 'PPV': ['#238443', '#005a32', '#969696', '#525252']}

for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.set_xticks([])
    axes.grid(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='nulls+minors']))
    axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])

    # nulls+minors
    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y+0.3, row.x, 0.2,  xerr=row.e, color=colours[metric][0], ecolor=colours[metric][0], edgecolor=colours[metric][0], linewidth=1, alpha=0.7)
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='bold')
        y+=1

    # tbprofiler
    subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y+0.1, row.x, 0.2,  xerr=row.e, color='white', ecolor=colours[metric][2], edgecolor=colours[metric][2], linewidth=1, alpha=0.7)
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    # nulls+minors+high
    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y-0.1, row.x, 0.2,  xerr=row.e, color=colours[metric][1], ecolor=colours[metric][1], edgecolor=colours[metric][1], linewidth=1, alpha=1)
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1
    
    # tbprofiler+high
    subset = df4[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y-0.3, row.x, 0.2,  xerr=row.e, color='white', ecolor=colours[metric][3], edgecolor=colours[metric][3], linewidth=1, alpha=1)
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][3], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.5)
    axes.set_xlim(0, 100)

    
    fig.savefig('pdf/fig-results-main-tbprofiler-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='nulls+minors']))
  axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
  subset = df[results.set=='tbprofiler'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
  y=numpy.arange(len(df[results.set=='nulls+minors']))
  axes.set_yticks(y, df[results.set=='nulls+minors']['drug'])
  subset = df[results.se

In [14]:
for set in ['basic', 'nulls', 'nulls+minors', "tbprofiler"]:

    df = results[(results.set==set)]

    plot_truthtables(df, ['ALL','HIGH'], filestem=f'table-{set}-', savefig=savefig)


In [15]:
df

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
4056,tbprofiler,INH,UKMYC,bootstrap-0,ALL,89.961390,,95.020747,,95.102041,,,,,,,,
4057,tbprofiler,INH,UKMYC,bootstrap-0,HIGH,94.779116,,95.219124,,95.161290,,,,,,,,
4058,tbprofiler,INH,UKMYC,bootstrap-1,ALL,92.996109,,93.004115,,93.359375,,,,,,,,
4059,tbprofiler,INH,UKMYC,bootstrap-1,HIGH,96.774194,,96.031746,,96.000000,,,,,,,,
4060,tbprofiler,INH,UKMYC,bootstrap-10,ALL,94.979079,,94.636015,,94.190871,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5403,tbprofiler,CAP,MGIT,bootstrap-7,ALL,76.250000,,98.823529,,96.825397,,,,,,,,
5404,tbprofiler,CAP,MGIT,bootstrap-8,ALL,70.068027,,98.866856,,96.261682,,,,,,,,
5405,tbprofiler,CAP,MGIT,bootstrap-9,ALL,67.816092,,98.466258,,95.934959,,,,,,,,
5406,tbprofiler,CAP,MGIT,bootstrapped50,ALL,73.778591,0.858111,98.356579,0.165979,95.263516,0.46716,,,,,,,


In [16]:
UKMYC_PHENOTYPES = pandas.read_csv('dat/UKMYC_1000_phenotypes.csv')
UKMYC_PHENOTYPES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_SAMPLES = pandas.read_csv('dat/UKMYC_1000_samples.csv')
UKMYC_SAMPLES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_PHENOTYPES = UKMYC_PHENOTYPES.join(UKMYC_SAMPLES[['POS_AVG_GROWTH']])
UKMYC_PHENOTYPES.reset_index(inplace=True)
UKMYC_PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_PHENOTYPES[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68
ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68
ERR4810791,EMB,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>8,9,44.68


In [17]:
PREDICTIONS = pandas.read_csv('dat/PREDICTIONS.csv')
tbprofiler = pandas.read_csv('dat/tbprofiler_PREDICTIONS.csv')
tbprofiler["SET"] = "tbprofiler"
PREDICTIONS = PREDICTIONS._append(tbprofiler, ignore_index=True)
print(PREDICTIONS)
PREDICTIONS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PREDICTIONS[:3]

               SET ENA_RUN_ACCESSION DRUG PREDICTION
0            basic       ERR13286038  INH          R
1            basic       ERR13286038  RIF          R
2            basic       ERR13286038  PZA          R
3            basic       ERR13286038  EMB          U
4            basic       ERR13286038  BDQ          S
...            ...               ...  ...        ...
159475  tbprofiler        SRR1165601  AMI          S
159476  tbprofiler        SRR1165601  KAN          S
159477  tbprofiler        SRR1165601  CAP          S
159478  tbprofiler        SRR1165601  CFZ          S
159479  tbprofiler        SRR1165601  ETH          S

[159480 rows x 4 columns]


Unnamed: 0_level_0,Unnamed: 1_level_0,SET,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR13286038,INH,basic,R
ERR13286038,RIF,basic,R
ERR13286038,PZA,basic,R


In [18]:
UKMYC_RESULTS = UKMYC_PHENOTYPES.join(PREDICTIONS,how='inner')
UKMYC_RESULTS.reset_index(inplace=True)
UKMYC_RESULTS[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R


In [19]:
def define_outcome(row):
    if row.PREDICTION in ['S', 'U']:
        if row.BINARY_PHENOTYPE == 'R':
            return '(S+U)R'
        else:
            return '(S+U)S'
    else:
        return str(row.PREDICTION)+str(row.BINARY_PHENOTYPE)

UKMYC_RESULTS['OUTCOME'] = UKMYC_RESULTS.apply(define_outcome, axis=1)
UKMYC_RESULTS.OUTCOME.value_counts()

OUTCOME
(S+U)S    33009
RR         9546
(S+U)R     3818
RS         1505
FS           18
FR            8
Name: count, dtype: int64

In [20]:
DISCREPANCY_SET = UKMYC_RESULTS[(UKMYC_RESULTS.PHENOTYPE_METHOD=='UKMYC') & (UKMYC_RESULTS.OUTCOME.str[0]!='F')]
DISCREPANCY_SET[:6]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR
3,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,tbprofiler,R,RR
4,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,basic,R,RR
5,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,nulls,R,RR


In [21]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for drug in DISCREPANCY_SET.DRUG.unique():
        for quality in ['HIGH','ALL']:
            if quality=='HIGH':
                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
            else:

                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug)])
            plot_growth_boxplot(df, filename=f'growth-{set}-{drug}-{quality}.pdf', savefig=savefig)

In [22]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for platedesign in ['UKMYC5', 'UKMYC6']:
        for drug in DISCREPANCY_SET.DRUG.unique():
            for quality in ['HIGH','ALL']:
                if quality=='HIGH':
                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
                else:

                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug)])
                if df.empty:
                    continue
                plot_dilution_boxplot(df, filename=f'mic-{set}-{drug}-{platedesign}-{quality}', savefig=savefig)

  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCES