In [1]:
import pandas
import numpy
import copy
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from statsmodels.stats.contingency_tables import mcnemar

from src.utils import plot_truthtables, plot_growth_boxplot, plot_dilution_boxplot

matplotlib.rcParams.update({'font.size': 7})

%load_ext autoreload
%autoreload 2

  import scipy.linalg


In [2]:
savefig=False

Read in the large `RESULTS` table created in the previous notebook

In [3]:
results = pandas.read_csv('dat/RESULTS.csv')
results[:3]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrap-0,ALL,88.803089,,95.020747,,95.041322,,,,,,,,
1,basic,INH,UKMYC,bootstrap-0,HIGH,93.172691,,96.414343,,96.26556,,,,,,,,
2,basic,INH,UKMYC,bootstrap-1,ALL,91.828794,,94.650206,,94.779116,,,,,,,,


In [4]:
drug_names_table = pandas.read_csv("dat/drugs/drug_names_lookup.csv")
drug_names_table.set_index("DRUG", inplace=True)
drug_names_lookup = {}
for idx, row in drug_names_table.iterrows():
    drug_names_lookup[idx] = row.DRUG_NAME.capitalize()
drug_names_lookup

{'AMI': 'Amikacin',
 'BDQ': 'Bedaquiline',
 'CAP': 'Capreomycin',
 'CFZ': 'Clofazimine',
 'DLM': 'Delamanid',
 'EMB': 'Ethambutol',
 'ETH': 'Ethionamide',
 'INH': 'Isoniazid',
 'KAN': 'Kanamycin',
 'LEV': 'Levofloxacin',
 'LZD': 'Linezolid',
 'MXF': 'Moxifloxacin',
 'PZA': 'Pyrazinamide',
 'RFB': 'Rifabutin',
 'RIF': 'Rifampicin',
 'STM': 'Streptomycin'}

Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [5]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


Now also produce a table for the UKMYC drugs with HIGH confidence MIC measurements. Because four drugs will not have values we have to manually insert the rows to ensure the graphs work.


In [6]:
summary = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL')]

# reverse the order of the table so the drugs are in the correct order from top to bottom in the plot
summary = summary.iloc[::-1]


df2 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='nulls+minors')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['nulls+minor', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df2.columns)
df2 = pandas.concat([df2, df3])

df2.drug = df2.drug.astype('category')
df2.drug = df2.drug.cat.set_categories(who_drugs)
df2.sort_values('drug', inplace=True)
df2 = df2.iloc[::-1]
df2['set'] = 'nulls+minors+high'
df2

# # And again for tbprofiler, so we have _just_ high quality results too
# df4 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='tbprofiler')]

# rows = []
# for i in ['PZA', 'BDQ', 'STM', 'CAP']:
#     row = ['tbprofiler', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
#     rows.append(row)

# df3 = pandas.DataFrame(rows, columns=df4.columns)
# df4 = pandas.concat([df4, df3])

# df4.drug = df4.drug.astype('category')
# df4.drug = df4.drug.cat.set_categories(who_drugs)
# df4.sort_values('drug', inplace=True)
# df4 = df4.iloc[::-1]
# df4['set'] = 'tbprofiler+high'
# df4

summary = pandas.concat([summary, df2])
summary.set.value_counts()

  df2 = pandas.concat([df2, df3])


set
tbprofiler           15
nulls+minors         15
nulls                15
basic                15
nulls+minors+high    15
Name: count, dtype: int64

In [7]:
table = summary[summary.set!='nulls']
table = table[['set', 'drug', 'sensitivity',
       'sensitivity_sem', 'specificity', 'specificity_sem']]

for col in ['sensitivity', 'sensitivity_sem', 'specificity', 'specificity_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['sensitivity', 'specificity']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'sensitivity',
        'specificity', ]]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,basic,nulls+minors,nulls+minors,tbprofiler,tbprofiler,nan,nan
Unnamed: 0_level_1,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Isoniazid,91.4 ±0.5,95.3 ±0.4,92.8 ±0.5,94.4 ±0.4,92.8 ±0.4,94.7 ±0.4,95.6 ±0.4,95.1 ±0.4
Rifampicin,94.3 ±0.5,96.0 ±0.3,95.7 ±0.4,95.0 ±0.4,95.9 ±0.4,95.0 ±0.4,96.4 ±0.3,95.3 ±0.4
Pyrazinamide,81.7 ±0.7,97.4 ±0.3,85.8 ±0.7,97.0 ±0.4,85.5 ±0.7,97.4 ±0.3,,
Ethambutol,85.5 ±0.8,84.7 ±0.6,87.3 ±0.8,83.7 ±0.6,87.5 ±0.8,83.6 ±0.6,90.5 ±0.6,81.5 ±0.6
Bedaquiline,40.7 ±1.0,98.6 ±0.2,66.5 ±0.9,97.8 ±0.3,66.4 ±0.9,98.4 ±0.2,,
Linezolid,22.4 ±1.9,99.9 ±0.1,29.2 ±2.1,99.8 ±0.1,28.3 ±2.1,99.8 ±0.1,45.0 ±2.2,99.8 ±0.1
Moxifloxacin,81.4 ±0.8,94.6 ±0.3,86.7 ±0.7,93.9 ±0.3,87.8 ±0.8,93.9 ±0.3,90.5 ±0.7,93.2 ±0.4
Levofloxacin,78.1 ±1.0,96.8 ±0.3,83.4 ±0.9,96.3 ±0.3,84.4 ±0.9,96.3 ±0.3,87.5 ±0.7,96.6 ±0.3
Clofazimine,7.4 ±0.7,97.8 ±0.2,14.8 ±0.9,96.4 ±0.3,15.7 ±1.0,96.6 ±0.3,12.7 ±1.5,96.5 ±0.2
Delamanid,12.1 ±1.2,99.9 ±0.0,12.9 ±1.2,99.9 ±0.0,12.1 ±1.2,99.9 ±0.0,21.4 ±2.0,100.0 ±0.0


In [8]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & \multicolumn{2}{r}{basic} & \multicolumn{2}{r}{nulls+minors} & \multicolumn{2}{r}{tbprofiler} & \multicolumn{2}{r}{NaN} \\
 & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity \\
drug &  &  &  &  &  &  &  &  \\
\midrule
Isoniazid & 91.4 ±0.5 & 95.3 ±0.4 & 92.8 ±0.5 & 94.4 ±0.4 & 92.8 ±0.4 & 94.7 ±0.4 & 95.6 ±0.4 & 95.1 ±0.4 \\
Rifampicin & 94.3 ±0.5 & 96.0 ±0.3 & 95.7 ±0.4 & 95.0 ±0.4 & 95.9 ±0.4 & 95.0 ±0.4 & 96.4 ±0.3 & 95.3 ±0.4 \\
Pyrazinamide & 81.7 ±0.7 & 97.4 ±0.3 & 85.8 ±0.7 & 97.0 ±0.4 & 85.5 ±0.7 & 97.4 ±0.3 &  &  \\
Ethambutol & 85.5 ±0.8 & 84.7 ±0.6 & 87.3 ±0.8 & 83.7 ±0.6 & 87.5 ±0.8 & 83.6 ±0.6 & 90.5 ±0.6 & 81.5 ±0.6 \\
Bedaquiline & 40.7 ±1.0 & 98.6 ±0.2 & 66.5 ±0.9 & 97.8 ±0.3 & 66.4 ±0.9 & 98.4 ±0.2 &  &  \\
Linezolid & 22.4 ±1.9 & 99.9 ±0.1 & 29.2 ±2.1 & 99.8 ±0.1 & 28.3 ±2.1 & 99.8 ±0.1 & 45.0 ±2.2 & 99.8 ±0.1 \\
Moxifloxacin & 81.4 ±0.8 & 94.6 ±0.3 & 86.7 ±0.7 & 9

In [9]:
table = summary[summary.set!='nulls']
table = table[['set', 'drug', 'PPV',
       'PPV_sem']]

for col in ['PPV', 'PPV_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['PPV']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'PPV']]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high', "tbprofiler", "tbprofiler+high"])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,nulls+minors,tbprofiler,nan
Unnamed: 0_level_1,PPV,PPV,PPV,PPV
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Isoniazid,95.2 ±0.4,94.4 ±0.4,94.7 ±0.4,95.3 ±0.3
Rifampicin,95.6 ±0.4,94.7 ±0.4,94.7 ±0.4,95.5 ±0.4
Pyrazinamide,96.4 ±0.5,96.1 ±0.5,96.6 ±0.4,
Ethambutol,69.3 ±1.0,68.5 ±1.0,68.5 ±1.0,67.4 ±0.9
Bedaquiline,96.8 ±0.5,97.0 ±0.3,97.7 ±0.3,
Linezolid,95.9 ±1.8,94.2 ±1.6,94.0 ±1.7,96.1 ±1.2
Moxifloxacin,86.0 ±0.9,85.3 ±0.9,85.5 ±0.9,86.2 ±0.8
Levofloxacin,92.0 ±0.7,91.6 ±0.7,91.6 ±0.7,92.5 ±0.6
Clofazimine,57.9 ±3.4,61.8 ±2.4,64.7 ±2.4,31.6 ±2.8
Delamanid,93.7 ±3.0,94.0 ±2.9,93.7 ±3.0,100.0 ±0.0


In [10]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & basic & nulls+minors & tbprofiler & NaN \\
 & PPV & PPV & PPV & PPV \\
drug &  &  &  &  \\
\midrule
Isoniazid & 95.2 ±0.4 & 94.4 ±0.4 & 94.7 ±0.4 & 95.3 ±0.3 \\
Rifampicin & 95.6 ±0.4 & 94.7 ±0.4 & 94.7 ±0.4 & 95.5 ±0.4 \\
Pyrazinamide & 96.4 ±0.5 & 96.1 ±0.5 & 96.6 ±0.4 &  \\
Ethambutol & 69.3 ±1.0 & 68.5 ±1.0 & 68.5 ±1.0 & 67.4 ±0.9 \\
Bedaquiline & 96.8 ±0.5 & 97.0 ±0.3 & 97.7 ±0.3 &  \\
Linezolid & 95.9 ±1.8 & 94.2 ±1.6 & 94.0 ±1.7 & 96.1 ±1.2 \\
Moxifloxacin & 86.0 ±0.9 & 85.3 ±0.9 & 85.5 ±0.9 & 86.2 ±0.8 \\
Levofloxacin & 92.0 ±0.7 & 91.6 ±0.7 & 91.6 ±0.7 & 92.5 ±0.6 \\
Clofazimine & 57.9 ±3.4 & 61.8 ±2.4 & 64.7 ±2.4 & 31.6 ±2.8 \\
Delamanid & 93.7 ±3.0 & 94.0 ±2.9 & 93.7 ±3.0 & 100.0 ±0.0 \\
Amikacin & 98.6 ±0.3 & 98.2 ±0.4 & 98.2 ±0.4 & 98.1 ±0.4 \\
Streptomycin & 92.8 ±0.6 & 91.6 ±0.6 & 91.5 ±0.6 &  \\
Ethionamide & 69.8 ±0.9 & 68.3 ±0.9 & 68.9 ±1.0 & 68.9 ±0.9 \\
Kanamycin & 96.8 ±0.4 & 96.2 ±0.5 & 96.1 ±0.5 & 95.8 ±0.4 \\
Capreomyci

In [11]:
colours = {'sensitivity': [ '#ef6548', '#d7301f', '#990000'], 'specificity': ['#3690c0', '#0570b0', '#034e7b'], 'PPV': ['#41ab5d', '#238443','#005a32']}


for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.set_xticks([])
    axes.grid(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(summary[summary.set=='basic']))
    axes.set_yticks(y, summary[summary.set=='basic']['drug'])
    axes.set_ylim(-0.3, 14.5)
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.3, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)
    subset = who[[metric]]
    subset.columns = ['x']
    y=0
    for idx,row in subset.iterrows():
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color='#cccccc', fontweight='light')
        y+=1

    subset = summary[summary.set=='basic'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='heavy') #, backgroundcolor='white')
        y+=1

    subset = summary[summary.set=='nulls+minors'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    subset = summary[summary.set=='nulls+minors+high'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    fig.savefig('pdf/fig-results-main-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


In [12]:
for set in ['basic', 'nulls', 'nulls+minors', "tbprofiler"]:

    df = results[(results.set==set)]

    plot_truthtables(df, ['ALL','HIGH'], filestem=f'table-{set}-', savefig=savefig)


In [13]:
UKMYC_PHENOTYPES = pandas.read_csv('dat/UKMYC_1000_phenotypes.csv')
UKMYC_PHENOTYPES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_SAMPLES = pandas.read_csv('dat/UKMYC_1000_samples.csv')
UKMYC_SAMPLES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_PHENOTYPES = UKMYC_PHENOTYPES.join(UKMYC_SAMPLES[['POS_AVG_GROWTH']])
UKMYC_PHENOTYPES.reset_index(inplace=True)
UKMYC_PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_PHENOTYPES[:3]

PREDICTIONS = pandas.read_csv('dat/PREDICTIONS.csv')
tbprofiler = pandas.read_csv('dat/tbprofiler_PREDICTIONS.csv')
tbprofiler["SET"] = "tbprofiler"
PREDICTIONS = PREDICTIONS._append(tbprofiler, ignore_index=True)
PREDICTIONS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PREDICTIONS[:3]

UKMYC_RESULTS = UKMYC_PHENOTYPES.join(PREDICTIONS,how='inner')
UKMYC_RESULTS.reset_index(inplace=True)
UKMYC_RESULTS[:3]

def define_outcome(row):
    if row.PREDICTION in ['S', 'U']:
        if row.BINARY_PHENOTYPE == 'R':
            return '(S+U)R'
        else:
            return '(S+U)S'
    else:
        return str(row.PREDICTION)+str(row.BINARY_PHENOTYPE)

UKMYC_RESULTS['OUTCOME'] = UKMYC_RESULTS.apply(define_outcome, axis=1)
UKMYC_RESULTS.OUTCOME.value_counts()

OUTCOME
(S+U)S    33009
RR         9546
(S+U)R     3818
RS         1505
FS           18
FR            8
Name: count, dtype: int64

In [14]:
DISCREPANCY_SET = UKMYC_RESULTS[(UKMYC_RESULTS.PHENOTYPE_METHOD=='UKMYC') & (UKMYC_RESULTS.OUTCOME.str[0]!='F')]
DISCREPANCY_SET[:6]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR
3,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,tbprofiler,R,RR
4,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,basic,R,RR
5,ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68,nulls,R,RR


In [15]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for drug in DISCREPANCY_SET.DRUG.unique():
        for quality in ['HIGH','ALL']:
            if quality=='HIGH':
                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
            else:

                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug)])
            plot_growth_boxplot(df, filename=f'growth-{set}-{drug}-{quality}.pdf', savefig=savefig)

In [16]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for platedesign in ['UKMYC5', 'UKMYC6']:
        for drug in DISCREPANCY_SET.DRUG.unique():
            for quality in ['HIGH','ALL']:
                if quality=='HIGH':
                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
                else:

                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug)])
                if df.empty:
                    continue
                plot_dilution_boxplot(df, filename=f'mic-{set}-{drug}-{platedesign}-{quality}', savefig=savefig)

  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCES

In [17]:
# Version of the above, but swapping internal comparisons for tbprofiler
colours = {'sensitivity': ['#d7301f', '#969696'], 'specificity': ['#0570b0', '#969696'], 'PPV': ['#238443', '#969696']}

for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 4.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.set_xticks([])
    axes.grid(False)
    # axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(summary[summary.set=='nulls+minors']))
    axes.set_yticks(y, summary[summary.set=='nulls+minors']['drug'])

    # nulls+minors
    subset = summary[summary.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y+0.3, row.x, 0.4,  xerr=row.e, color=colours[metric][0], ecolor=colours[metric][0], edgecolor=colours[metric][0], linewidth=1, alpha=0.5)
        axes.text(row.x+3, y+0.25, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='bold')
        y+=1

    # tbprofiler
    subset = summary[summary.set=='tbprofiler'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.barh(y-0.1, row.x, 0.4,  xerr=row.e, color=colours[metric][1], ecolor=colours[metric][1], edgecolor=colours[metric][1], linewidth=1, alpha=0.5)
        axes.text(row.x+3, y-0.15, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.7)
    axes.set_xlim(0, 100)

    fig.savefig('pdf/fig-results-main-tbprofiler-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

In [18]:
PHENOTYPES = pandas.read_csv('dat/PHENOTYPES.csv')
PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PHENOTYPES.sort_index(inplace=True)

predictions = pandas.read_csv('dat/PREDICTIONS.csv')
predictions = predictions[predictions.SET=='nulls+minors']
predictions.drop(columns=['SET'], inplace=True)
predictions.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
predictions.rename(columns={'PREDICTION': 'GPAS'}, inplace=True)
predictions = predictions[predictions.index.isin(PHENOTYPES.index)]

tbprofiler = pandas.read_csv('dat/tbprofiler_PREDICTIONS.csv')
tbprofiler.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
tbprofiler = tbprofiler[tbprofiler.index.isin(PHENOTYPES.index)]
tbprofiler.rename(columns={'PREDICTION': 'TBProfiler'}, inplace=True)

assert len(predictions) == len(tbprofiler)

comparison = predictions.join(tbprofiler, how='inner')
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,GPAS,TBProfiler
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR13286038,BDQ,R,R
ERR13286038,LZD,S,S
ERR13286039,BDQ,R,R
ERR13286039,LZD,S,S
ERR13286042,BDQ,S,S
...,...,...,...
SRR1165572,LZD,S,S
SRR1165572,CAP,F,S
SRR1165601,PZA,R,R
SRR1165601,LZD,U,S


In [19]:
pandas.crosstab(comparison['GPAS'], comparison['TBProfiler'])

TBProfiler,R,S,U
GPAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,4,25,0
R,3848,45,0
S,23,10457,23
U,14,1065,92


In [20]:
comparison.reset_index(inplace=True)
foo = comparison[(~comparison.GPAS.isin(['R','S'])) | (~comparison.TBProfiler.isin(['R','S']))]
comparison.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
pandas.crosstab([foo['DRUG'], foo['GPAS']], foo['TBProfiler'])

Unnamed: 0_level_0,TBProfiler,R,S,U
DRUG,GPAS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AMI,F,0,1,0
AMI,U,0,60,0
BDQ,S,0,0,4
BDQ,U,0,86,0
CAP,F,0,6,0
CAP,U,0,31,3
CFZ,U,0,51,0
DLM,U,0,240,0
EMB,F,0,4,0
EMB,S,0,0,12


In [21]:
rows = []

for drug in who_drugs:
    foo = comparison.loc[(slice(None), drug),:]
    foo = foo[foo.GPAS.isin(['R', 'S']) & (foo.TBProfiler.isin(['R', 'S']))]
    table = pandas.crosstab(foo['GPAS'], foo['TBProfiler'])
    results = mcnemar(table)
    rows.append([drug, table['R']['R'], table['S']['R'], table['R']['S'], table['S']['S'], results.pvalue])

comparison_statistics = pandas.DataFrame(rows, columns=['DRUG', 'RR', 'RS', 'SR', 'SS', 'pvalue']).set_index('DRUG')
comparison_statistics.to_csv('dat/tbprofiler_STATISTICS.csv')
comparison_statistics

Unnamed: 0_level_0,RR,RS,SR,SS,pvalue
DRUG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
INH,483,2,1,467,1.0
RIF,480,0,1,497,1.0
PZA,256,3,0,351,0.25
EMB,360,0,2,576,0.5
BDQ,334,6,6,1336,1.0
LZD,111,2,0,1527,0.5
MXF,290,1,4,632,0.375
LEV,290,1,4,636,0.375
CFZ,64,2,3,878,1.0
DLM,18,1,0,739,1.0
