In [1]:
import pandas, numpy, copy


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from src.utils import plot_truthtables, plot_growth_boxplot, plot_dilution_boxplot

import matplotlib
matplotlib.rcParams.update({'font.size': 7})

%load_ext autoreload
%autoreload 2

In [2]:
savefig=False

Read in the large `RESULTS` table created in the previous notebook

In [3]:
results = pandas.read_csv('dat/RESULTS.csv')
results[:3]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrapped50,ALL,91.240875,0.480167,95.39515,0.367757,95.206597,0.370676,,,,,,,
1,basic,INH,UKMYC,bootstrapped50,HIGH,93.746796,0.336085,96.624866,0.352979,96.618858,0.342746,,,,,,,
2,basic,INH,UKMYC,entire,ALL,91.2,0.0,95.6,0.0,95.39749,0.0,456.0,33.0,11.0,22.0,448.0,30.0,1000.0


In [4]:
drug_names_table = pandas.read_csv("dat/drugs/drug_names_lookup.csv")
drug_names_table.set_index("DRUG", inplace=True)
drug_names_lookup = {}
for idx, row in drug_names_table.iterrows():
    drug_names_lookup[idx] = row.DRUG_NAME.capitalize()
drug_names_lookup

{'AMI': 'Amikacin',
 'BDQ': 'Bedaquiline',
 'CAP': 'Capreomycin',
 'CFZ': 'Clofazimine',
 'DLM': 'Delamanid',
 'EMB': 'Ethambutol',
 'ETH': 'Ethionamide',
 'INH': 'Isoniazid',
 'KAN': 'Kanamycin',
 'LEV': 'Levofloxacin',
 'LZD': 'Linezolid',
 'MXF': 'Moxifloxacin',
 'PZA': 'Pyrazinamide',
 'RFB': 'Rifabutin',
 'RIF': 'Rifampicin',
 'STM': 'Streptomycin'}

Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [5]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


In [6]:
df = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL')]

# reverse the order of the table so the drugs are in the correct order from top to bottom in the plot
df = df.iloc[::-1]

df[:15]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
154,nulls+minors,CAP,MGIT,bootstrapped50,ALL,74.983621,0.941786,98.295397,0.200364,94.980289,0.593964,,,,,,,
150,nulls+minors,KAN,UKMYC,bootstrapped50,ALL,76.749387,1.047989,98.699913,0.182592,96.031846,0.551728,,,,,,,
146,nulls+minors,ETH,UKMYC,bootstrapped50,ALL,72.242845,1.038901,87.01596,0.512836,69.356946,0.993347,,,,,,,
144,nulls+minors,STM,MGIT,bootstrapped50,ALL,80.735208,0.769336,95.267691,0.366638,92.251254,0.571979,,,,,,,
140,nulls+minors,AMI,UKMYC,bootstrapped50,ALL,74.26731,1.010874,99.400998,0.107511,98.067741,0.344641,,,,,,,
136,nulls+minors,DLM,UKMYC,bootstrapped50,ALL,12.298169,1.115095,99.88839,0.049223,95.484431,1.849704,,,,,,,
132,nulls+minors,CFZ,UKMYC,bootstrapped50,ALL,8.079825,0.598057,98.008384,0.210772,62.36175,3.068161,,,,,,,
128,nulls+minors,LEV,UKMYC,bootstrapped50,ALL,82.316749,0.749602,96.431502,0.287389,91.743096,0.691066,,,,,,,
124,nulls+minors,MXF,UKMYC,bootstrapped50,ALL,85.765151,0.780545,93.824019,0.356465,84.916448,0.897091,,,,,,,
120,nulls+minors,LZD,UKMYC,bootstrapped50,ALL,28.114421,1.933896,99.770307,0.074055,94.310212,1.85737,,,,,,,


Now also produce a table for the UKMYC drugs with HIGH confidence MIC measurements. Because four drugs will not have values we have to manually insert the rows to ensure the graphs work.


In [7]:
df2 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='nulls+minors')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['nulls+minor', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df2.columns)
df2 = pandas.concat([df2, df3])

df2.drug = df2.drug.astype('category')
df2.drug = df2.drug.cat.set_categories(who_drugs)
df2.sort_values('drug', inplace=True)
df2 = df2.iloc[::-1]
df2['set'] = 'nulls+minor+high'
df2

  df2 = pandas.concat([df2, df3])


Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
3,nulls+minor+high,CAP,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
151,nulls+minor+high,KAN,UKMYC,bootstrapped50,HIGH,84.298262,0.713334,98.322163,0.199675,95.369505,0.574964,,,,,,,
147,nulls+minor+high,ETH,UKMYC,bootstrapped50,HIGH,75.425695,0.999098,86.010415,0.532746,69.54641,1.094366,,,,,,,
2,nulls+minor+high,STM,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
141,nulls+minor+high,AMI,UKMYC,bootstrapped50,HIGH,83.340398,0.765663,99.293077,0.103791,98.014958,0.316433,,,,,,,
137,nulls+minor+high,DLM,UKMYC,bootstrapped50,HIGH,21.673766,1.971608,100.0,0.0,100.0,0.0,,,,,,,
133,nulls+minor+high,CFZ,UKMYC,bootstrapped50,HIGH,4.894349,0.743395,97.639388,0.217028,21.421913,3.400427,,,,,,,
129,nulls+minor+high,LEV,UKMYC,bootstrapped50,HIGH,87.264057,0.674621,96.563175,0.266662,92.600837,0.541725,,,,,,,
125,nulls+minor+high,MXF,UKMYC,bootstrapped50,HIGH,90.003945,0.548099,93.280991,0.376534,86.225812,0.761897,,,,,,,
121,nulls+minor+high,LZD,UKMYC,bootstrapped50,HIGH,42.467444,1.895582,99.816012,0.049593,95.592543,1.267028,,,,,,,


In [8]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table,df2])
table = table[['set', 'drug', 'sensitivity',
       'sensitivity_sem', 'specificity', 'specificity_sem']]

for col in ['sensitivity', 'sensitivity_sem', 'specificity', 'specificity_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['sensitivity', 'specificity']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'sensitivity',
        'specificity', ]]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high'])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,basic,nulls+minors,nulls+minors,nulls+minor+high,nulls+minor+high
Unnamed: 0_level_1,sensitivity,specificity,sensitivity,specificity,sensitivity,specificity
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Isoniazid,91.2 ±0.5,95.4 ±0.4,92.5 ±0.5,94.6 ±0.4,95.2 ±0.4,95.6 ±0.4
Rifampicin,93.7 ±0.4,96.0 ±0.3,95.3 ±0.4,94.9 ±0.3,96.7 ±0.3,95.5 ±0.3
Pyrazinamide,81.7 ±0.7,97.4 ±0.3,85.1 ±0.7,96.7 ±0.4,,
Ethambutol,85.5 ±0.9,84.9 ±0.5,86.7 ±0.8,84.1 ±0.5,88.7 ±0.8,81.4 ±0.6
Bedaquiline,41.2 ±1.0,98.6 ±0.2,46.1 ±1.0,98.6 ±0.2,,
Linezolid,22.5 ±1.8,99.9 ±0.0,28.1 ±1.9,99.8 ±0.1,42.5 ±1.9,99.8 ±0.0
Moxifloxacin,81.2 ±1.0,94.6 ±0.3,85.8 ±0.8,93.8 ±0.4,90.0 ±0.5,93.3 ±0.4
Levofloxacin,77.6 ±0.9,96.9 ±0.3,82.3 ±0.7,96.4 ±0.3,87.3 ±0.7,96.6 ±0.3
Clofazimine,7.3 ±0.6,98.2 ±0.2,8.1 ±0.6,98.0 ±0.2,4.9 ±0.7,97.6 ±0.2
Delamanid,12.3 ±1.1,99.9 ±0.0,12.3 ±1.1,99.9 ±0.0,21.7 ±2.0,100.0 ±0.0


In [9]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & \multicolumn{2}{r}{basic} & \multicolumn{2}{r}{nulls+minors} & \multicolumn{2}{r}{nulls+minor+high} \\
 & sensitivity & specificity & sensitivity & specificity & sensitivity & specificity \\
drug &  &  &  &  &  &  \\
\midrule
Isoniazid & 91.2 ±0.5 & 95.4 ±0.4 & 92.5 ±0.5 & 94.6 ±0.4 & 95.2 ±0.4 & 95.6 ±0.4 \\
Rifampicin & 93.7 ±0.4 & 96.0 ±0.3 & 95.3 ±0.4 & 94.9 ±0.3 & 96.7 ±0.3 & 95.5 ±0.3 \\
Pyrazinamide & 81.7 ±0.7 & 97.4 ±0.3 & 85.1 ±0.7 & 96.7 ±0.4 &  &  \\
Ethambutol & 85.5 ±0.9 & 84.9 ±0.5 & 86.7 ±0.8 & 84.1 ±0.5 & 88.7 ±0.8 & 81.4 ±0.6 \\
Bedaquiline & 41.2 ±1.0 & 98.6 ±0.2 & 46.1 ±1.0 & 98.6 ±0.2 &  &  \\
Linezolid & 22.5 ±1.8 & 99.9 ±0.0 & 28.1 ±1.9 & 99.8 ±0.1 & 42.5 ±1.9 & 99.8 ±0.0 \\
Moxifloxacin & 81.2 ±1.0 & 94.6 ±0.3 & 85.8 ±0.8 & 93.8 ±0.4 & 90.0 ±0.5 & 93.3 ±0.4 \\
Levofloxacin & 77.6 ±0.9 & 96.9 ±0.3 & 82.3 ±0.7 & 96.4 ±0.3 & 87.3 ±0.7 & 96.6 ±0.3 \\
Clofazimine & 7.3 ±0.6 & 98.2 ±0.2 & 8.1 ±0.6 & 98.0 ±0.2 & 4.9 ±0.7 & 97.

In [10]:
table = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL') & (results.set!='nulls')]
table = pandas.concat([table,df2])
table = table[['set', 'drug', 'PPV',
       'PPV_sem']]

for col in ['PPV', 'PPV_sem']:
    table[col] = table[col].map('{:,.1f}'.format)
for col in ['PPV']:
    table[col] = table[col] + ' ±' + table[col+'_sem']
table = table[['set','drug', 'PPV']]
table.drug = table.drug.astype('category')
table.drug = table.drug.cat.set_categories(who_drugs)
table.set = table.set.astype('category')
table.set = table.set.cat.set_categories(['basic', 'nulls+minors', 'nulls+minor+high'])
table.set_index(['set','drug'], inplace=True)
table = table.unstack(level=0)
table.columns = table.columns.swaplevel(0,1)
table.sort_index(axis=1, level=0, inplace=True)
table.replace('nan ±nan', '', inplace=True)
table.rename(drug_names_lookup, inplace=True)
table

set,basic,nulls+minors,nulls+minor+high
Unnamed: 0_level_1,PPV,PPV,PPV
drug,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Isoniazid,95.2 ±0.4,94.5 ±0.4,95.7 ±0.4
Rifampicin,95.4 ±0.4,94.5 ±0.4,95.6 ±0.3
Pyrazinamide,96.4 ±0.5,95.7 ±0.5,
Ethambutol,69.3 ±1.0,68.4 ±1.0,67.3 ±1.0
Bedaquiline,96.8 ±0.5,97.2 ±0.4,
Linezolid,97.0 ±1.2,94.3 ±1.9,95.6 ±1.3
Moxifloxacin,86.0 ±0.9,84.9 ±0.9,86.2 ±0.8
Levofloxacin,92.4 ±0.7,91.7 ±0.7,92.6 ±0.5
Clofazimine,62.8 ±3.3,62.4 ±3.1,21.4 ±3.4
Delamanid,95.5 ±1.8,95.5 ±1.8,100.0 ±0.0


In [11]:
print(table.to_latex(column_format='r|rr|rr|rr', multirow=True))

\begin{tabular}{r|rr|rr|rr}
\toprule
set & basic & nulls+minors & nulls+minor+high \\
 & PPV & PPV & PPV \\
drug &  &  &  \\
\midrule
Isoniazid & 95.2 ±0.4 & 94.5 ±0.4 & 95.7 ±0.4 \\
Rifampicin & 95.4 ±0.4 & 94.5 ±0.4 & 95.6 ±0.3 \\
Pyrazinamide & 96.4 ±0.5 & 95.7 ±0.5 &  \\
Ethambutol & 69.3 ±1.0 & 68.4 ±1.0 & 67.3 ±1.0 \\
Bedaquiline & 96.8 ±0.5 & 97.2 ±0.4 &  \\
Linezolid & 97.0 ±1.2 & 94.3 ±1.9 & 95.6 ±1.3 \\
Moxifloxacin & 86.0 ±0.9 & 84.9 ±0.9 & 86.2 ±0.8 \\
Levofloxacin & 92.4 ±0.7 & 91.7 ±0.7 & 92.6 ±0.5 \\
Clofazimine & 62.8 ±3.3 & 62.4 ±3.1 & 21.4 ±3.4 \\
Delamanid & 95.5 ±1.8 & 95.5 ±1.8 & 100.0 ±0.0 \\
Amikacin & 98.6 ±0.3 & 98.1 ±0.3 & 98.0 ±0.3 \\
Streptomycin & 92.7 ±0.6 & 92.3 ±0.6 &  \\
Ethionamide & 69.8 ±1.0 & 69.4 ±1.0 & 69.5 ±1.1 \\
Kanamycin & 97.0 ±0.5 & 96.0 ±0.6 & 95.4 ±0.6 \\
Capreomycin & 94.3 ±0.7 & 95.0 ±0.6 &  \\
\bottomrule
\end{tabular}



In [12]:
colours = {'sensitivity': ['#990000', '#d7301f', '#ef6548'], 'specificity': ['#034e7b', '#0570b0','#3690c0'], 'PPV': ['#005a32', '#238443','#41ab5d']}


for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='basic']))
    axes.set_yticks(y, df[results.set=='basic']['drug'])
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.3, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)
    subset = who[[metric]]
    subset.columns = ['x']
    y=0
    for idx,row in subset.iterrows():
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color='#cccccc', fontweight='light')
        y+=1

    subset = df[results.set=='basic'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='heavy') #, backgroundcolor='white')
        y+=1

    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.5)

    
    fig.savefig('pdf/fig-results-main-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx

In [13]:
for set in ['basic', 'nulls', 'nulls+minors']:

    df = results[(results.set==set)]

    plot_truthtables(df, ['ALL','HIGH'], filestem=f'table-{set}-', savefig=savefig)


In [14]:
df

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
104,nulls+minors,INH,UKMYC,bootstrapped50,ALL,92.533301,0.45206,94.636048,0.372694,94.533908,0.377959,,,,,,,
105,nulls+minors,INH,UKMYC,bootstrapped50,HIGH,95.194281,0.360159,95.629751,0.383949,95.725686,0.374093,,,,,,,
106,nulls+minors,INH,UKMYC,entire,ALL,92.4,0.0,94.789579,0.0,94.672131,0.0,462.0,27.0,11.0,26.0,444.0,29.0,1000.0
107,nulls+minors,INH,UKMYC,entire,HIGH,95.217391,0.0,95.505618,0.0,95.633188,0.0,438.0,17.0,5.0,20.0,400.0,25.0,906.0
108,nulls+minors,RIF,UKMYC,bootstrapped50,ALL,95.346262,0.375695,94.947131,0.342044,94.481232,0.40729,,,,,,,
109,nulls+minors,RIF,UKMYC,bootstrapped50,HIGH,96.671807,0.324791,95.452993,0.348014,95.55643,0.317734,,,,,,,
110,nulls+minors,RIF,UKMYC,entire,ALL,95.378151,0.0,94.827586,0.0,94.386694,0.0,454.0,21.0,1.0,27.0,477.0,18.0,1000.0
111,nulls+minors,RIF,UKMYC,entire,HIGH,96.590909,0.0,95.671982,0.0,95.720721,0.0,425.0,15.0,0.0,19.0,405.0,15.0,880.0
112,nulls+minors,PZA,MGIT,bootstrapped50,ALL,85.094046,0.693634,96.705314,0.369636,95.740886,0.479617,,,,,,,
113,nulls+minors,PZA,MGIT,entire,ALL,84.879725,0.0,96.72619,0.0,95.736434,0.0,247.0,30.0,14.0,11.0,322.0,3.0,627.0


In [14]:
UKMYC_PHENOTYPES = pandas.read_csv('dat/UKMYC_1000_phenotypes.csv')
UKMYC_PHENOTYPES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_SAMPLES = pandas.read_csv('dat/UKMYC_1000_samples.csv')
UKMYC_SAMPLES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_PHENOTYPES = UKMYC_PHENOTYPES.join(UKMYC_SAMPLES[['POS_AVG_GROWTH']])
UKMYC_PHENOTYPES.reset_index(inplace=True)
UKMYC_PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_PHENOTYPES[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68
ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68
ERR4810791,EMB,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>8,9,44.68


In [15]:
PREDICTIONS = pandas.read_csv('dat/PREDICTIONS.csv')
PREDICTIONS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PREDICTIONS[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,SET,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR13286038,INH,basic,R
ERR13286038,RIF,basic,R
ERR13286038,PZA,basic,R


In [16]:
UKMYC_RESULTS = UKMYC_PHENOTYPES.join(PREDICTIONS)
UKMYC_RESULTS.reset_index(inplace=True)
UKMYC_RESULTS[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R


In [17]:
def define_outcome(row):
    if row.PREDICTION in ['S', 'U']:
        if row.BINARY_PHENOTYPE == 'R':
            return '(S+U)R'
        else:
            return '(S+U)S'
    else:
        return row.PREDICTION+row.BINARY_PHENOTYPE

UKMYC_RESULTS['OUTCOME'] = UKMYC_RESULTS.apply(define_outcome, axis=1)
UKMYC_RESULTS.OUTCOME.value_counts()

OUTCOME
(S+U)S    24867
RR         7073
(S+U)R     2975
RS         1065
FS           18
FR            2
Name: count, dtype: int64

In [18]:
DISCREPANCY_SET = UKMYC_RESULTS[(UKMYC_RESULTS.PHENOTYPE_METHOD=='UKMYC') & (UKMYC_RESULTS.OUTCOME.str[0]!='F')]
DISCREPANCY_SET[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR


In [19]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for drug in DISCREPANCY_SET.DRUG.unique():
        for quality in ['HIGH','ALL']:
            if quality=='HIGH':
                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
            else:

                df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.DRUG==drug)])
            plot_growth_boxplot(df, filename=f'growth-{set}-{drug}-{quality}.pdf', savefig=savefig)

In [20]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for platedesign in ['UKMYC5', 'UKMYC6']:
        for drug in DISCREPANCY_SET.DRUG.unique():
            for quality in ['HIGH','ALL']:
                if quality=='HIGH':
                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug) & (DISCREPANCY_SET.PHENOTYPE_QUALITY==quality)])
                else:

                    df = copy.deepcopy(DISCREPANCY_SET[(DISCREPANCY_SET.SET==set) & (DISCREPANCY_SET.PLATEDESIGN==platedesign) & (DISCREPANCY_SET.DRUG==drug)])
                plot_dilution_boxplot(df, filename=f'mic-{set}-{drug}-{platedesign}-{quality}.pdf', savefig=savefig)

  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCESSION"]].groupby(["OUTCOME", "MIC"]).count()
  df = df[["MIC", "OUTCOME", "ENA_RUN_ACCES