## Confusion Matrix Analysis of NHISS

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def calculate_confusion_matrix(dataframe):

    # New column to record position in confusion matrix: 
    # True positive(TP), false postive(FP), true negative(TN), false negative(FN) 
    dataframe.loc[:,"confusion_matrix"] = None

    spmax4_cat=dataframe.loc[:,"NHISS Predicted Category"]
    exp_cat=dataframe.loc[:,"Experimental Category"]

    #count_[experimental_category]_[predicted_category]
    count_0_0=0 # TP
    count_0_1=0 # FP
    count_1_0=0 # FN
    count_1_1=0 # TP
    for i, row in enumerate(dataframe.iterrows()):
        i=i+1
        if exp_cat[i]==0:
            if spmax4_cat[i]==0:
                count_0_0=count_0_0+1
                dataframe.loc[i,"confusion_matrix"]="TN"
            else:
                count_0_1=count_0_1+1
                dataframe.loc[i,"confusion_matrix"]="FP"
        else:
            if spmax4_cat[i]==0:
                count_1_0=count_1_0+1
                dataframe.loc[i,"confusion_matrix"]="FN"
            else:
                count_1_1=count_1_1+1
                dataframe.loc[i,"confusion_matrix"]="TP"

    TP= count_1_1
    TN= count_0_0
    FN= count_1_0
    FP= count_0_1
            
    print "True Positive: ", TP
    print "True Negative: ", TN
    print "False Negative: ", FN
    print "False Positive: ", FP
    return(TP, TN, FN, FP)

def calculate_confusion_matrix_parameters(true_positive, true_negative, 
                                          false_negative, false_positive):

    TP=float(true_positive)
    TN=float(true_negative)
    FN=float(false_negative)
    FP=float(false_positive)

    #Accuracy
    ACC= (TP+TN)/(TP+TN+FP+FN)
    print "ACC: {0:.3f} (accuracy)".format(ACC) 

    #True positive rate = sensitivity
    TPR= TP/(TP+FN)
    print "TPR: {0:.3f} (sensitivity)".format(TPR) 

    #True negative rate = specificity
    TNR=TN/(FP+TN)
    print "TNR: {0:.3f} (specificity)".format(TNR) 

    #Positive predictive value (precision)
    PPV=TP/(TP+FP)
    print "PPV: {0:.3f} (precision)".format(PPV) 

    #False discovery rate
    FDR=1-PPV
    print "FDR: {0:.3f} (false discovery rate)".format(FDR) 

    #Negative predictive value
    NPV=TN/(TN+FN)
    print "NPV: {0:.3f}".format(NPV) 

    #False omission rate
    FOR=FN/(TN+FN)
    print "FOR: {0:.3f}".format(FOR) 

    #False negative rate
    FNR=FN/(TP+FP)
    print "FNR: {0:.3f}".format(FNR) 

    #False positive rate
    FPR=FP/(TN+FN)
    print "FPR: {0:.3f}".format(FPR) 

### 1. Training set (N = 52)
Experimental data of indocyanine nanoparticle(INP) formation of 52 drugs with IR783 was used as training set to decide on the decision threshold of NHISS to predict INP formers and not formers. All the data available at the time, training and validation sets of SpMAX4_Bh(s) descriptor, were combined together. Drugs with NHISS value equal or above 4 are predicted to form nanoparticles with IR783.
Experimental category of 1 indicates INP forming drugs and 0 indicates drugs that don't form INPs.

In [3]:
# Import experimental data and SpMAX4_Bh(s) values calculated by Dragon 6.
df_training_set=pd.DataFrame.from_csv('NHISS_training_set_N52_2decimal.csv', encoding='utf-8')

# Creating categorical values for SpMAX4_Bh(s). Threshold value is 7.0.
df_training_set.loc[:,"NHISS Predicted Category"] = None

for i, row in enumerate(df_training_set.iterrows()):
    i=i+1
    NHISS=df_training_set.loc[i,"NHISS"]
    if NHISS >= 4:
        df_training_set.loc[i,"NHISS Predicted Category"]=1
    else:
        df_training_set.loc[i,"NHISS Predicted Category"]=0
        
df_training_set

Unnamed: 0,Drug Name,SpMAX4_Bh(s),NHISS,Experimental Category,SpMAX4_Bh(s) Analysis Group,NHISS Analysis Group,NHISS Predicted Category
1,Venetoclax,7.0,5,1.0,training,training,1
2,Docetaxel,7.43,5,1.0,training,training,1
3,Paclitaxel,7.43,6,1.0,training,training,1
4,Trametinib,7.43,5,1.0,training,training,1
5,Enzalutamide,8.0,6,1.0,training,training,1
6,Sorafenib,7.43,5,1.0,training,training,1
7,Fulvestrant,8.0,7,1.0,training,training,1
8,Rapamycin,7.42,5,1.0,training,training,1
9,Silvesterol,4.97,1,0.0,training,training,0
10,Erlotinib,4.67,0,0.0,training,training,0


In [4]:
TP, TN, FN, FP = calculate_confusion_matrix(df_training_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  26
True Negative:  26
False Negative:  1
False Positive:  0

ACC: 0.981 (accuracy)
TPR: 0.963 (sensitivity)
TNR: 1.000 (specificity)
PPV: 1.000 (precision)
FDR: 0.000 (false discovery rate)
NPV: 0.963
FOR: 0.037
FNR: 0.038
FPR: 0.000


### 2. Validation set (N = 8)
Experiments of these 8 drugs were performed after QSNAP Model 1 analysis was completed and SpMAX4_Bh(s) and NHISS threshold values were already determined.

In [7]:
# Import experimental data and NHISS values.
df_validation_set=pd.DataFrame.from_csv('NHISS_validation_set_N8_2decimal.csv', encoding='utf-8')

# Creating categorical values for NHISS prediction. Threshold value is 4.
df_validation_set.loc[:,"NHISS Predicted Category"] = None

for i, row in enumerate(df_validation_set.iterrows()):
    i=i+1
    NHISS=df_validation_set.loc[i,"NHISS"]
    if NHISS >= 4:
        df_validation_set.loc[i,"NHISS Predicted Category"]=1
    else:
        df_validation_set.loc[i,"NHISS Predicted Category"]=0
        
df_validation_set.head()

Unnamed: 0,Drug Name,SpMAX4_Bh(s),NHISS,Experimental Category,SpMAX4_Bh(s) Analysis Group,NHISS Analysis Group,NHISS Predicted Category
1,Avasimibe,4.79,3,1,validation,validation,0
2,Mubritinib,5.17,3,0,validation,validation,0
3,Bithionol,4.94,0,0,validation,validation,0
4,Ospemifene,4.71,0,0,validation,validation,0
5,Probucol,4.59,0,0,validation,validation,0


In [8]:
TP, TN, FN, FP = calculate_confusion_matrix(df_validation_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  0
True Negative:  7
False Negative:  1
False Positive:  0

ACC: 0.875 (accuracy)
TPR: 0.000 (sensitivity)
TNR: 1.000 (specificity)


ZeroDivisionError: float division by zero

### 3. All experimental data (N = 60)
Experimental dataset contains training and validation sets.

In [9]:
# Import experimental data and SpMAX4_Bh(s) values calculated by Dragon 6.
df_full_experimental_set=pd.DataFrame.from_csv('NHISS_full_experimental_set_N60_2decimal.csv', encoding='utf-8')

# Creating categorical values for NHISS. Threshold value is 4.
df_full_experimental_set.loc[:,"NHISS Predicted Category"] = None

for i, row in enumerate(df_full_experimental_set.iterrows()):
    i=i+1
    NHISS=df_full_experimental_set.loc[i,"NHISS"]
    if NHISS >= 4:
        df_full_experimental_set.loc[i,"NHISS Predicted Category"]=1
    else:
        df_full_experimental_set.loc[i,"NHISS Predicted Category"]=0
        
df_full_experimental_set.head()

Unnamed: 0,Drug Name,SpMAX4_Bh(s),NHISS,Experimental Category,SpMAX4_Bh(s) Analysis Group,NHISS Analysis Group,NHISS Predicted Category
1,Venetoclax,7.0,5,1,training,training,1
2,Docetaxel,7.43,5,1,training,training,1
3,Paclitaxel,7.43,6,1,training,training,1
4,Trametinib,7.43,5,1,training,training,1
5,Enzalutamide,8.0,6,1,training,training,1


In [10]:
TP, TN, FN, FP = calculate_confusion_matrix(df_full_experimental_set); print ""
calculate_confusion_matrix_parameters(TP, TN, FN, FP)

True Positive:  26
True Negative:  33
False Negative:  2
False Positive:  0

ACC: 0.967 (accuracy)
TPR: 0.929 (sensitivity)
TNR: 1.000 (specificity)
PPV: 1.000 (precision)
FDR: 0.000 (false discovery rate)
NPV: 0.943
FOR: 0.057
FNR: 0.077
FPR: 0.000
