## Confusion Matrix Analysis of SpMAX4_Bh(s)

In [1]:
import pandas as pd
import numpy as np
import os

In [23]:
def calculate_confusion_matrix(dataframe):

    # New column to record position in confusion matrix: 
    # True positive(TP), false postive(FP), true negative(TN), false negative(FN) 
    dataframe.loc[:,"confusion_matrix"] = None

    spmax4_cat=dataframe.loc[:,"SpMAX4_Bh(s) Predicted Category"]
    exp_cat=dataframe.loc[:,"Experimental Category"]

    #count_[experimental_category]_[predicted_category]
    count_0_0=0 # TP
    count_0_1=0 # FP
    count_1_0=0 # FN
    count_1_1=0 # TP
    for i, row in enumerate(dataframe.iterrows()):
        i=i+1
        if exp_cat[i]==0:
            if spmax4_cat[i]==0:
                count_0_0=count_0_0+1
                dataframe.loc[i,"confusion_matrix"]="TN"
            else:
                count_0_1=count_0_1+1
                dataframe.loc[i,"confusion_matrix"]="FP"
        else:
            if spmax4_cat[i]==0:
                count_1_0=count_1_0+1
                dataframe.loc[i,"confusion_matrix"]="FN"
            else:
                count_1_1=count_1_1+1
                dataframe.loc[i,"confusion_matrix"]="TP"

    TP= count_1_1
    TN= count_0_0
    FN= count_1_0
    FP= count_0_1
            
    print "True Positive: ", TP
    print "True Negative: ", TN
    print "False Negative: ", FN
    print "False Positive: ", FP
    return(TP, TN, FN, FP)

def calculate_confusion_matrix_parameters(true_positive, true_negative, 
                                          false_negative, false_positive):

    TP=float(true_positive)
    TN=float(true_negative)
    FN=float(false_negative)
    FP=float(false_positive)

    #Accuracy
    ACC= (TP+TN)/(TP+TN+FP+FN)
    print "ACC: {0:.3f} (accuracy)".format(ACC) 

    #True positive rate = sensitivity
    TPR= TP/(TP+FN)
    print "TPR: {0:.3f} (sensitivity)".format(TPR) 

    #True negative rate = specificity
    TNR=TN/(FP+TN)
    print "TNR: {0:.3f} (specificity)".format(TNR) 

    #Positive predictive value (precision)
    PPV=TP/(TP+FP)
    print "PPV: {0:.3f} (precision)".format(PPV) 

    #False discovery rate
    FDR=1-PPV
    print "FDR: {0:.3f} (false discovery rate)".format(FDR) 

    #Negative predictive value
    NPV=TN/(TN+FN)
    print "NPV: {0:.3f}".format(NPV) 

    #False omission rate
    FOR=FN/(TN+FN)
    print "FOR: {0:.3f}".format(FOR) 

    #False negative rate
    FNR=FN/(TP+FP)
    print "FNR: {0:.3f}".format(FNR) 

    #False positive rate
    FPR=FP/(TN+FN)
    print "FPR: {0:.3f}".format(FPR) 

### 1. Training set (N = 16)
Experimental data of indocyanine nanoparticle(INP) formation of 16 drugs with IR783 was used as training set to decide on the decision threshold of SpMAX4_Bh(s) to predict INP formers and not formers. Drugs with SpMAX4_Bh(s) value equal or above 7.0 are predicted to form nanoparticles with IR783.
Experimental category of 1 indicates INP forming drugs and 0 indicates drugs that don't form INPs.

In [6]:
# Import experimental data and SpMAX4_Bh(s) values calculated by Dragon 6.
df_training_set=pd.DataFrame.from_csv('SpMAX4_training_set_N16_2decimal.csv', encoding='utf-8')

# Creating categorical values for SpMAX4_Bh(s). Threshold value is 7.0.
df_training_set.loc[:,"SpMAX4_Bh(s) Predicted Category"] = None

for i, row in enumerate(df_training_set.iterrows()):
    i=i+1
    spmax4=df_training_set.loc[i,"SpMAX4_Bh(s)"]
    if spmax4 >= 7.0:
        df_training_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=1
    else:
        df_training_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=0
        
df_training_set

Unnamed: 0,Drug Name,SpMAX4_Bh(s),Experimental Category,SpMAX4_Bh(s) Predicted Category
1,Silvesterol,4.97,0,0
2,Erlotinib,4.67,0,0
3,Idelalisib,4.83,0,0
4,Camptotecin,4.99,0,0
5,Binimetinib,6.46,0,0
6,Sunitinib,4.82,0,0
7,Taselisib,4.89,0,0
8,Lapatinib,5.12,0,0
9,Venetoclax,7.0,1,1
10,Docetaxel,7.43,1,1


In [24]:
TP, TN, FN, FP = calculate_confusion_matrix(df_training_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  8
True Negative:  8
False Negative:  0
False Positive:  0

ACC: 1.000 (accuracy)
TPR: 1.000 (sensitivity)
TNR: 1.000 (specificity)
PPV: 1.000 (precision)
FDR: 0.000 (false discovery rate)
NPV: 1.000
FOR: 0.000
FNR: 0.000
FPR: 0.000


### 2. Validation set (N = 36)

In [28]:
# Import experimental data and SpMAX4_Bh(s) values calculated by Dragon 6.
df_validation_set=pd.DataFrame.from_csv('SpMAX4_validation_set_N36_2decimal.csv', encoding='utf-8')

# Creating categorical values for SpMAX4_Bh(s). Threshold value is 7.0.
df_validation_set.loc[:,"SpMAX4_Bh(s) Predicted Category"] = None

for i, row in enumerate(df_validation_set.iterrows()):
    i=i+1
    spmax4=df_validation_set.loc[i,"SpMAX4_Bh(s)"]
    if spmax4 >= 7.0:
        df_validation_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=1
    else:
        df_validation_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=0
        
df_validation_set.head()

Unnamed: 0,Drug Name,SpMAX4_Bh(s),Experimental Category,SpMAX4_Bh(s) Predicted Category
1,Avagacestat,8,1,1
2,Dutasteride,8,1,1
3,Regorafenib,8,1,1
4,RO4929097,8,1,1
5,TAK-632,8,1,1


In [29]:
TP, TN, FN, FP = calculate_confusion_matrix(df_validation_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  18
True Negative:  18
False Negative:  0
False Positive:  0

ACC: 1.000 (accuracy)
TPR: 1.000 (sensitivity)
TNR: 1.000 (specificity)
PPV: 1.000 (precision)
FDR: 0.000 (false discovery rate)
NPV: 1.000
FOR: 0.000
FNR: 0.000
FPR: 0.000


### 3. All experimental data (N = 60)
Experimental dataset contains training and validation sets, and also 8 extra experimentally tested drugs. 

In [30]:
# Import experimental data and SpMAX4_Bh(s) values calculated by Dragon 6.
df_full_experimental_set=pd.DataFrame.from_csv('SpMAX4_full_experimental_set_N60_2decimal.csv', encoding='utf-8')

# Creating categorical values for SpMAX4_Bh(s). Threshold value is 7.0.
df_full_experimental_set.loc[:,"SpMAX4_Bh(s) Predicted Category"] = None

for i, row in enumerate(df_full_experimental_set.iterrows()):
    i=i+1
    spmax4=df_full_experimental_set.loc[i,"SpMAX4_Bh(s)"]
    if spmax4 >= 7.0:
        df_full_experimental_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=1
    else:
        df_full_experimental_set.loc[i,"SpMAX4_Bh(s) Predicted Category"]=0
        
df_full_experimental_set.head()

Unnamed: 0,Drug Name,SpMAX4_Bh(s),Experimental Category,SpMAX4_Bh(s) Analysis Group,NHISS Analysis Group,SpMAX4_Bh(s) Predicted Category
1,Venetoclax,7.0,1,training,training,1
2,Docetaxel,7.43,1,training,training,1
3,Paclitaxel,7.43,1,training,training,1
4,Trametinib,7.43,1,training,training,1
5,Enzalutamide,8.0,1,training,training,1


In [31]:
TP, TN, FN, FP = calculate_confusion_matrix(df_full_experimental_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  26
True Negative:  33
False Negative:  1
False Positive:  0

ACC: 0.983 (accuracy)
TPR: 0.963 (sensitivity)
TNR: 1.000 (specificity)
PPV: 1.000 (precision)
FDR: 0.000 (false discovery rate)
NPV: 0.971
FOR: 0.029
FNR: 0.038
FPR: 0.000
