In [1]:
import numpy as np

In [2]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [3]:
metrics_to_calculate = ['auroc', 'aupr']

# Load the drug features

In [4]:
# drug_features = np.loadtxt('../autoML/network_inference_datasets/DPII/X1.txt')
drug_features = np.loadtxt('dpie_X1.txt')

In [5]:
drug_features

array([[1.        , 0.02575215, 0.02157473, ..., 0.02354165, 0.02012412,
        0.0206892 ],
       [0.02575215, 1.        , 0.01832524, ..., 0.02518895, 0.01770358,
        0.02352885],
       [0.02157473, 0.01832524, 1.        , ..., 0.01711643, 0.01601936,
        0.01656275],
       ...,
       [0.02354165, 0.02518895, 0.01711643, ..., 1.        , 0.01800374,
        0.01888761],
       [0.02012412, 0.01770358, 0.01601936, ..., 0.01800374, 1.        ,
        0.01459084],
       [0.0206892 , 0.02352885, 0.01656275, ..., 0.01888761, 0.01459084,
        1.        ]])

In [6]:
drug_features.shape

(664, 664)

# Load the interaction matrix

In [7]:
# interaction_matrix = np.loadtxt('../autoML/network_inference_datasets/DPII/Y.txt')
interaction_matrix = np.loadtxt('dpie_Y.txt')

In [8]:
interaction_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
metric_values_per_fold = {}
if 'auroc' in metrics_to_calculate:
    metric_values_per_fold['auroc_micro'] = []
    metric_values_per_fold['auroc_macro'] = []
if 'aupr' in metrics_to_calculate:
    metric_values_per_fold['aupr_micro'] = []
    metric_values_per_fold['aupr_macro'] = []

In [10]:
kf = KFold(n_splits=10, shuffle=False, random_state=42)



In [11]:
fold_counter = 0
for train_index, test_index in kf.split(drug_features):
    print('======================= Fold '+str(fold_counter)+' =======================')
    
    # split the dataset
    X_train, X_test = drug_features[train_index], drug_features[test_index]
    y_train, y_test = interaction_matrix[train_index], interaction_matrix[test_index]
    
    # define the oneVSrest classifier with the base classifier
    # you can comment-uncomment the 4 following lines to use the different classifiers
    # clf = OneVsRestClassifier(RandomForestClassifier())
    # clf = OneVsRestClassifier(LogisticRegression(random_state=0))
    clf = OneVsRestClassifier(MLPClassifier(random_state=1, hidden_layer_sizes=(256), solver='adam', learning_rate='adaptive', max_iter=300)) # binary relevance approach that uses a neural network as the base classifier (so it creates as many neural networks as there are labels)
    # clf = MLPClassifier(random_state=1, hidden_layer_sizes=(512, 512), solver='adam', learning_rate='adaptive', max_iter=300) # standard neural network
    
    # fit the classifier on the training set
    clf.fit(X_train, y_train)
    
    # generate probability predictions for every sample in the test set
    y_pred = clf.predict_proba(X_test)
    y_pred_binarized = clf.predict(X_test)
    
    print(str(y_pred.shape))
    
    # calculate the performance metrics on the test set
    if 'auroc' in metrics_to_calculate:
        metric_values_per_fold['auroc_micro'].append(roc_auc_score(y_test, y_pred, average='micro'))
        print('auroc_micro: '+str(roc_auc_score(y_test, y_pred, average='micro')))
        # This is not really important as we are only interested in the micro measures.
        # Nevertheless, I basically do the macro averaging by hand so that I can skip labels that have only samples with one class
        roc_auc_per_label = []
        for label_idx in range(interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                roc_auc_per_label.append(roc_auc_score(y_test[:, label_idx], y_pred[:, label_idx]))
        print(str(len(roc_auc_per_label))+' out of the '+str(y_test.shape[1])+' total labels have more than one classes present')
        
        metric_values_per_fold['auroc_macro'].append(np.mean(roc_auc_per_label))
        print('auroc_macro: '+str(np.mean(roc_auc_per_label)))
        
    print('                   |                    ')
    
    if 'aupr' in metrics_to_calculate:
        metric_values_per_fold['aupr_micro'].append(average_precision_score(y_test, y_pred, average='micro'))
        print('aupr_micro: '+str(roc_auc_score(y_test, y_pred, average='micro')))
        aupr_per_label = []
        for label_idx in range(interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                aupr_per_label.append(average_precision_score(y_test[:, label_idx], y_pred[:, label_idx]))
        
        metric_values_per_fold['aupr_macro'].append(np.mean(aupr_per_label))
        print('aupr_macro: '+str(np.mean(aupr_per_label)))
        
    print('                   |                    ')

    print(str(confusion_matrix(y_test.reshape(-1, 1), y_pred_binarized.reshape(-1, 1))))
    

    
    fold_counter += 1
    print('========================================================================')
    print('')





(67, 445)
auroc_micro: 0.9526313458429019
86 out of the 445 total labels have more than one classes present
auroc_macro: 0.9368268022732346
                   |                    
aupr_micro: 0.9526313458429019
aupr_macro: 0.8173954642068372
                   |                    
[[29648    14]
 [   59    94]]





(67, 445)
auroc_micro: 0.8460218785950018
137 out of the 445 total labels have more than one classes present
auroc_macro: 0.6281234381097517
                   |                    
aupr_micro: 0.8460218785950018
aupr_macro: 0.29652502828887933
                   |                    
[[29145    13]
 [  186   471]]





(67, 445)
auroc_micro: 0.9006191718244155
78 out of the 445 total labels have more than one classes present
auroc_macro: 0.8163953769115717
                   |                    
aupr_micro: 0.9006191718244155
aupr_macro: 0.5878635797398574
                   |                    
[[29517     4]
 [   78   216]]





(67, 445)
auroc_micro: 0.8384328867078739
116 out of the 445 total labels have more than one classes present
auroc_macro: 0.8644470862867841
                   |                    
aupr_micro: 0.8384328867078739
aupr_macro: 0.6382143398664216
                   |                    
[[29571     8]
 [  129   107]]





(66, 445)
auroc_micro: 0.6834288330118232
119 out of the 445 total labels have more than one classes present
auroc_macro: 0.7298580817916965
                   |                    
aupr_micro: 0.6834288330118232
aupr_macro: 0.43195913779908557
                   |                    
[[29125    11]
 [  122   112]]





(66, 445)
auroc_micro: 0.9095829871735122
95 out of the 445 total labels have more than one classes present
auroc_macro: 0.7901714372448295
                   |                    
aupr_micro: 0.9095829871735122
aupr_macro: 0.547865171046661
                   |                    
[[29021     9]
 [  116   224]]





(66, 445)
auroc_micro: 0.7595253502470706
55 out of the 445 total labels have more than one classes present
auroc_macro: 0.7395298942459114
                   |                    
aupr_micro: 0.7595253502470706
aupr_macro: 0.5113206448980533
                   |                    
[[29056    55]
 [   95   164]]





(66, 445)
auroc_micro: 0.6196026810356291
112 out of the 445 total labels have more than one classes present
auroc_macro: 0.720832167424912
                   |                    
aupr_micro: 0.6196026810356291
aupr_macro: 0.3809622557752065
                   |                    
[[29076    25]
 [  175    94]]





(66, 445)
auroc_micro: 0.7614936121766513
90 out of the 445 total labels have more than one classes present
auroc_macro: 0.7976754760737735
                   |                    
aupr_micro: 0.7614936121766513
aupr_macro: 0.5656379043451978
                   |                    
[[29166    10]
 [   87   107]]





(66, 445)
auroc_micro: 0.8759025043874211
101 out of the 445 total labels have more than one classes present
auroc_macro: 0.8715346073398558
                   |                    
aupr_micro: 0.8759025043874211
aupr_macro: 0.7336441686267535
                   |                    
[[29076     4]
 [  106   184]]



In [12]:
metric_values_per_fold

{'auroc_micro': [0.9526313458429019,
  0.8460218785950018,
  0.9006191718244155,
  0.8384328867078739,
  0.6834288330118232,
  0.9095829871735122,
  0.7595253502470706,
  0.6196026810356291,
  0.7614936121766513,
  0.8759025043874211],
 'auroc_macro': [0.9368268022732346,
  0.6281234381097517,
  0.8163953769115717,
  0.8644470862867841,
  0.7298580817916965,
  0.7901714372448295,
  0.7395298942459114,
  0.720832167424912,
  0.7976754760737735,
  0.8715346073398558],
 'aupr_micro': [0.7805228759047986,
  0.7661618144621092,
  0.8390794493099832,
  0.5419631250972821,
  0.549496572895033,
  0.7409853237065122,
  0.5738515365086105,
  0.42359771384809997,
  0.6184554554580336,
  0.7827850416001182],
 'aupr_macro': [0.8173954642068372,
  0.29652502828887933,
  0.5878635797398574,
  0.6382143398664216,
  0.43195913779908557,
  0.547865171046661,
  0.5113206448980533,
  0.3809622557752065,
  0.5656379043451978,
  0.7336441686267535]}

In [13]:
# calculate the mean and std for every metric measured during training and validation
for metric_name in metric_values_per_fold.keys():
    print(metric_name+': '+ str(np.mean(metric_values_per_fold[metric_name])) +' ('+ str(np.std(metric_values_per_fold[metric_name])) +')')
    print('')

auroc_micro: 0.8147241251002301 (0.10093078670622918)

auroc_macro: 0.7895394367702322 (0.08464892204316107)

aupr_micro: 0.6616898908790582 (0.13063270436452423)

aupr_macro: 0.5511387694592953 (0.1493084481476985)

