In [1]:
import pandas, numpy, copy


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from src.utils import plot_truthtables, plot_growth_boxplot, plot_dilution_boxplot

import matplotlib
matplotlib.rcParams.update({'font.size': 7})

%load_ext autoreload
%autoreload 2

In [65]:
savefig=True

Read in the large `RESULTS` table created in the previous notebook

In [2]:
results = pandas.read_csv('dat/RESULTS.csv')
results[:3]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrapped50,ALL,91.240875,0.480167,95.39515,0.367757,95.206597,0.370676,,,,,,,
1,basic,INH,UKMYC,bootstrapped50,HIGH,93.746796,0.336085,96.624866,0.352979,96.618858,0.342746,,,,,,,
2,basic,INH,UKMYC,entire,ALL,91.2,0.0,95.6,0.0,95.39749,0.0,456.0,33.0,11.0,22.0,448.0,30.0,1000.0


Now read in the list of drugs in the WHOv2 catalogue as well as the performance of the WHOv2 catalogue as reported in Annex 1 of the report

In [3]:
who_drugs = list(pandas.read_csv('dat/drugs/who2_drugs.csv').drug)

who = pandas.read_csv('dat/WHO2-Annex1-table.csv')
who = who[(who.catalogue=='WHO2') & (who.FRS==0.75)]

# reverse the order of the table so that the drugs are in the same order on all graphs
who = who.iloc[::-1]
who

Unnamed: 0,drug,catalogue,FRS,sensitivity,specificity,PPV,sensitivity_low,sensitivity_high,specificity_low,specificity_high,PPV_low,PPV_high
43,CAP,WHO2,0.75,66.2,97.8,80.1,64.1,68.2,97.6,98.1,78.1,81.9
40,KAN,WHO2,0.75,74.9,96.7,79.3,73.4,76.3,96.4,96.9,77.9,80.7
37,ETH,WHO2,0.75,74.8,85.9,63.9,73.6,76.0,85.3,86.4,62.7,65.1
34,STM,WHO2,0.75,79.7,94.1,89.9,78.9,80.5,93.7,94.4,89.3,90.5
31,AMI,WHO2,0.75,72.8,98.3,82.8,71.0,74.6,98.1,98.5,81.2,84.4
28,DLM,WHO2,0.75,14.7,99.9,72.5,10.6,19.7,99.8,99.9,58.3,84.1
25,CFZ,WHO2,0.75,17.0,98.7,38.1,14.2,20.0,98.5,98.9,32.6,43.8
22,LEV,WHO2,0.75,84.8,96.9,88.1,83.9,85.7,96.7,97.1,87.3,89.0
19,MXF,WHO2,0.75,85.7,93.5,74.0,84.6,86.8,93.2,93.9,72.7,75.2
16,LZD,WHO2,0.75,34.0,99.8,78.4,29.2,39.0,99.7,99.9,71.3,84.5


In [4]:
df = results[(results.dataset=='bootstrapped50') & (results.quality=='ALL')]

# reverse the order of the table so the drugs are in the correct order from top to bottom in the plot
df = df.iloc[::-1]
df[:15]

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
154,nulls+minors,CAP,MGIT,bootstrapped50,ALL,74.983621,0.941786,98.295397,0.200364,94.980289,0.593964,,,,,,,
150,nulls+minors,KAN,UKMYC,bootstrapped50,ALL,76.749387,1.047989,98.699913,0.182592,96.031846,0.551728,,,,,,,
146,nulls+minors,ETH,UKMYC,bootstrapped50,ALL,72.242845,1.038901,87.01596,0.512836,69.356946,0.993347,,,,,,,
144,nulls+minors,STM,MGIT,bootstrapped50,ALL,80.735208,0.769336,95.267691,0.366638,92.251254,0.571979,,,,,,,
140,nulls+minors,AMI,UKMYC,bootstrapped50,ALL,74.26731,1.010874,99.400998,0.107511,98.067741,0.344641,,,,,,,
136,nulls+minors,DLM,UKMYC,bootstrapped50,ALL,12.298169,1.115095,99.88839,0.049223,95.484431,1.849704,,,,,,,
132,nulls+minors,CFZ,UKMYC,bootstrapped50,ALL,8.079825,0.598057,98.008384,0.210772,62.36175,3.068161,,,,,,,
128,nulls+minors,LEV,UKMYC,bootstrapped50,ALL,82.316749,0.749602,96.431502,0.287389,91.743096,0.691066,,,,,,,
124,nulls+minors,MXF,UKMYC,bootstrapped50,ALL,85.765151,0.780545,93.824019,0.356465,84.916448,0.897091,,,,,,,
120,nulls+minors,LZD,UKMYC,bootstrapped50,ALL,28.114421,1.933896,99.770307,0.074055,94.310212,1.85737,,,,,,,


Now also produce a table for the UKMYC drugs with HIGH confidence MIC measurements. Because four drugs will not have values we have to manually insert the rows to ensure the graphs work.


In [5]:
df2 = results[(results.dataset=='bootstrapped50') & (results.quality=='HIGH') & (results.set=='nulls+minors')]

rows = []
for i in ['PZA', 'BDQ', 'STM', 'CAP']:
    row = ['nulls+minor', i, 'UKMYC', 'bootstrapped50', 'HIGH'] + [None for i in range(13)]
    rows.append(row)

df3 = pandas.DataFrame(rows, columns=df2.columns)
df2 = pandas.concat([df2, df3])

df2.drug = df2.drug.astype('category')
df2.drug = df2.drug.cat.set_categories(who_drugs)
df2.sort_values('drug', inplace=True)
df2 = df2.iloc[::-1]
df2

  df2 = pandas.concat([df2, df3])


Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
3,nulls+minor,CAP,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
151,nulls+minors,KAN,UKMYC,bootstrapped50,HIGH,84.298262,0.713334,98.322163,0.199675,95.369505,0.574964,,,,,,,
147,nulls+minors,ETH,UKMYC,bootstrapped50,HIGH,75.425695,0.999098,86.010415,0.532746,69.54641,1.094366,,,,,,,
2,nulls+minor,STM,UKMYC,bootstrapped50,HIGH,,,,,,,,,,,,,
141,nulls+minors,AMI,UKMYC,bootstrapped50,HIGH,83.340398,0.765663,99.293077,0.103791,98.014958,0.316433,,,,,,,
137,nulls+minors,DLM,UKMYC,bootstrapped50,HIGH,21.673766,1.971608,100.0,0.0,100.0,0.0,,,,,,,
133,nulls+minors,CFZ,UKMYC,bootstrapped50,HIGH,4.894349,0.743395,97.639388,0.217028,21.421913,3.400427,,,,,,,
129,nulls+minors,LEV,UKMYC,bootstrapped50,HIGH,87.264057,0.674621,96.563175,0.266662,92.600837,0.541725,,,,,,,
125,nulls+minors,MXF,UKMYC,bootstrapped50,HIGH,90.003945,0.548099,93.280991,0.376534,86.225812,0.761897,,,,,,,
121,nulls+minors,LZD,UKMYC,bootstrapped50,HIGH,42.467444,1.895582,99.816012,0.049593,95.592543,1.267028,,,,,,,


In [6]:
colours = {'sensitivity': ['#990000', '#d7301f', '#ef6548'], 'specificity': ['#034e7b', '#0570b0','#3690c0'], 'PPV': ['#005a32', '#238443','#41ab5d']}


for metric in ['sensitivity', 'specificity','PPV']:
    fig = plt.figure(figsize=(2.8, 8.5))
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)
    axes.spines["bottom"].set_visible(False)
    axes.get_xaxis().set_visible(False)
    axes.plot([100,100], [-0.5, 14.5], color='#cccccc', linewidth=0.5, linestyle='-')
    y=numpy.arange(len(df[results.set=='basic']))
    axes.set_yticks(y, df[results.set=='basic']['drug'])
    e = (
        who[metric + "_high"] - who[metric],
        who[metric] - who[metric + "_low"],
    )
    axes.barh(y+0.3, who[metric], 0.2, label=who[metric], color='#cccccc', edgecolor='white', linewidth=1, alpha=0.5)
    subset = who[[metric]]
    subset.columns = ['x']
    y=0
    for idx,row in subset.iterrows():
        axes.text(row.x+2, y+0.3, "%.1f" % row.x, ha="left", va='center', color='#cccccc', fontweight='light')
        y+=1

    subset = df[results.set=='basic'][[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y, y+0.2], color=colours[metric][0], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y), 2*row.e, 0.2, fc=colours[metric][0], alpha=0.2))
        axes.text(row.x+2, y+0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][0], fontweight='heavy') #, backgroundcolor='white')
        y+=1

    subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]    
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.2, y], color=colours[metric][1], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.2), 2*row.e, 0.2, fc=colours[metric][1], alpha=0.2))
        axes.text(row.x+2, y-0.1, "%.1f" % row.x, ha="left", va='center', color=colours[metric][1], fontweight='bold')
        y+=1

    subset = df2[[metric, metric+'_sem']]
    subset.columns = ['x', 'e']
    y=0
    for idx,row in subset.iterrows():
        axes.plot([row.x, row.x], [y-0.4, y-0.2], color=colours[metric][2], linewidth=1)
        axes.add_patch(Rectangle((row.x-row.e, y-0.4), 2*row.e, 0.2, fc=colours[metric][2], alpha=0.2))
        axes.text(row.x+2, y-0.3, "%.1f" % row.x, ha="left", va='center', color=colours[metric][2], fontweight='bold')
        y+=1

    axes.set_ylim(-0.3, 14.5)

    
    fig.savefig('pdf/fig-results-main-'+metric+'.pdf', bbox_inches='tight')
    plt.close()

  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
  y=numpy.arange(len(df[results.set=='basic']))
  axes.set_yticks(y, df[results.set=='basic']['drug'])
  subset = df[results.set=='basic'][[metric, metric+'_sem']]
  subset = df[results.set=='nulls+minors'][[metric, metric+'_sem']]
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx

In [7]:
results

Unnamed: 0,set,drug,method,dataset,quality,sensitivity,sensitivity_sem,specificity,specificity_sem,PPV,PPV_sem,RR,SR,UR,RS,SS,US,Total
0,basic,INH,UKMYC,bootstrapped50,ALL,91.240875,0.480167,95.395150,0.367757,95.206597,0.370676,,,,,,,
1,basic,INH,UKMYC,bootstrapped50,HIGH,93.746796,0.336085,96.624866,0.352979,96.618858,0.342746,,,,,,,
2,basic,INH,UKMYC,entire,ALL,91.200000,0.000000,95.600000,0.000000,95.397490,0.000000,456.0,33.0,11.0,22.0,448.0,30.0,1000.0
3,basic,INH,UKMYC,entire,HIGH,93.913043,0.000000,96.412556,0.000000,96.428571,0.000000,432.0,23.0,5.0,16.0,404.0,26.0,906.0
4,basic,RIF,UKMYC,bootstrapped50,ALL,93.729501,0.406094,95.957331,0.314197,95.445435,0.375315,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,nulls+minors,KAN,UKMYC,bootstrapped50,HIGH,84.298262,0.713334,98.322163,0.199675,95.369505,0.574964,,,,,,,
152,nulls+minors,KAN,UKMYC,entire,ALL,76.470588,0.000000,98.730606,0.000000,96.086957,0.000000,221.0,56.0,12.0,9.0,650.0,50.0,1000.0
153,nulls+minors,KAN,UKMYC,entire,HIGH,84.615385,0.000000,98.543689,0.000000,96.069869,0.000000,220.0,34.0,6.0,9.0,563.0,46.0,880.0
154,nulls+minors,CAP,MGIT,bootstrapped50,ALL,74.983621,0.941786,98.295397,0.200364,94.980289,0.593964,,,,,,,


In [8]:
for set in ['basic', 'nulls', 'nulls+minors']:

    df = results[(results.set==set)]

    plot_truthtables(df, ['ALL','HIGH'], filenstem=f'table-{set}-', savefig=savefig)


In [41]:
UKMYC_PHENOTYPES = pandas.read_csv('dat/UKMYC_1000_phenotypes.csv')
UKMYC_PHENOTYPES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_SAMPLES = pandas.read_csv('dat/UKMYC_1000_samples.csv')
UKMYC_SAMPLES.set_index('ENA_RUN_ACCESSION', inplace=True)

UKMYC_PHENOTYPES = UKMYC_PHENOTYPES.join(UKMYC_SAMPLES[['POS_AVG_GROWTH']])
UKMYC_PHENOTYPES.reset_index(inplace=True)
UKMYC_PHENOTYPES.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_PHENOTYPES[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68
ERR4810791,RIF,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>4,8,44.68
ERR4810791,EMB,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>8,9,44.68


In [42]:
PREDICTIONS = pandas.read_csv('dat/PREDICTIONS.csv')
PREDICTIONS.set_index(['ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
PREDICTIONS[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,SET,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR13286038,INH,basic,R
ERR13286038,RIF,basic,R
ERR13286038,PZA,basic,R


In [43]:
UKMYC_RESULTS = UKMYC_PHENOTYPES.join(PREDICTIONS)
UKMYC_RESULTS.reset_index(inplace=True)
# UKMYC_RESULTS.set_index(['SET', 'ENA_RUN_ACCESSION', 'DRUG'], inplace=True)
UKMYC_RESULTS[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R


In [44]:
def define_outcome(row):
    if row.PREDICTION in ['S', 'U']:
        if row.BINARY_PHENOTYPE == 'R':
            return '(S+U)R'
        else:
            return '(S+U)S'
    else:
        return row.PREDICTION+row.BINARY_PHENOTYPE

UKMYC_RESULTS['OUTCOME'] = UKMYC_RESULTS.apply(define_outcome, axis=1)
UKMYC_RESULTS.OUTCOME.value_counts()

OUTCOME
(S+U)S    24867
RR         7073
(S+U)R     2975
RS         1065
FS           18
FR            2
Name: count, dtype: int64

In [45]:
df = UKMYC_RESULTS[(UKMYC_RESULTS.PHENOTYPE_METHOD=='UKMYC') & (UKMYC_RESULTS.OUTCOME.str[0]!='F')]
df[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR


In [46]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for drug in df.DRUG.unique():
        for quality in ['HIGH','ALL']:
            if quality=='HIGH':
                foo = copy.deepcopy(df[(df.SET==set) & (df.DRUG==drug) & (df.PHENOTYPE_QUALITY==quality)])
            else:

                foo = copy.deepcopy(df[(df.SET==set) & (df.DRUG==drug)])
            plot_growth_boxplot(foo, filename=f'growth-{set}-{drug}-{quality}.pdf', savefig=savefig)

In [47]:
df[:3]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,UNIQUEID,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,PHENOTYPE_METHOD,PLATEDESIGN,MIC,DILUTION,POS_AVG_GROWTH,SET,PREDICTION,OUTCOME
0,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,basic,R,RR
1,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls,R,RR
2,ERR4810791,INH,site.02.subj.0068.lab.22A018.iso.1,R,HIGH,UKMYC,UKMYC5,>1.6,8,44.68,nulls+minors,R,RR


In [37]:
def add_jitter(row, maximum=0.2):
    random_float = (numpy.random.normal()) - 0.5
    return row["DILUTION"] + (maximum * random_float)


df["DILUTION_JITTERED"] = df.apply(add_jitter, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["DILUTION_JITTERED"] = df.apply(add_jitter, axis=1)


In [64]:
for set in ['basic', 'nulls', 'nulls+minors']:
    for platedesign in ['UKMYC5', 'UKMYC6']:
        for drug in df.DRUG.unique():
            for quality in ['HIGH','ALL']:
                if quality=='HIGH':
                    foo = copy.deepcopy(df[(df.SET==set) & (df.PLATEDESIGN==platedesign) & (df.DRUG==drug) & (df.PHENOTYPE_QUALITY==quality)])
                else:

                    foo = copy.deepcopy(df[(df.SET==set) & (df.PLATEDESIGN==platedesign) & (df.DRUG==drug)])
                plot_dilution_boxplot(foo, filename=f'mic-{set}-{drug}-{platedesign}-{quality}.pdf', savefig=savefig)

  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S": "#bbbbbb",
  "(S+U)S":