In [1]:
import numpy as np

In [2]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [3]:
metrics_to_calculate = ['auroc', 'aupr']

# Load the drug features

In [4]:
drug_features = np.loadtxt('/Users/mac/OneDrive/thesis/scripts/dataset/DPI_enzyme/drug_feature.txt')

In [5]:
drug_features

array([[1.        , 0.02575215, 0.02157473, ..., 0.02354165, 0.02012412,
        0.0206892 ],
       [0.02575215, 1.        , 0.01832524, ..., 0.02518895, 0.01770358,
        0.02352885],
       [0.02157473, 0.01832524, 1.        , ..., 0.01711643, 0.01601936,
        0.01656275],
       ...,
       [0.02354165, 0.02518895, 0.01711643, ..., 1.        , 0.01800374,
        0.01888761],
       [0.02012412, 0.01770358, 0.01601936, ..., 0.01800374, 1.        ,
        0.01459084],
       [0.0206892 , 0.02352885, 0.01656275, ..., 0.01888761, 0.01459084,
        1.        ]])

In [6]:
interaction_matrix = np.loadtxt('/Users/mac/OneDrive/thesis/scripts/dataset/DPI_enzyme/dpie_Y.txt')

In [7]:
interaction_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
metric_values_per_fold = {}
if 'auroc' in metrics_to_calculate:
    metric_values_per_fold['auroc_micro'] = []
    metric_values_per_fold['auroc_macro'] = []
if 'aupr' in metrics_to_calculate:
    metric_values_per_fold['aupr_micro'] = []
    metric_values_per_fold['aupr_macro'] = []

In [22]:
kf = KFold(n_splits=10, shuffle=True,random_state=42)

SyntaxError: invalid syntax (<ipython-input-22-7aea7040662e>, line 1)

In [23]:
fold_counter = 0
for train_index, test_index in kf.split(drug_features):
    print('======================= Fold '+str(fold_counter)+' =======================')
    
    # split the dataset
    X_train, X_test = drug_features[train_index], drug_features[test_index]
    y_train, y_test = interaction_matrix[train_index], interaction_matrix[test_index]
    
    # define the oneVSrest classifier with the base classifier
    clf = OneVsRestClassifier(RandomForestClassifier())
    
    # clf = OneVsRestClassifier(LogisticRegression(random_state=0))
    
    # fit the classifier on the training set
    clf.fit(X_train, y_train)
    
    # generate probability predictions for every sample in the test set
    y_pred = clf.predict_proba(X_test)
    
    print(str(y_pred.shape))
    
    # calculate the performance metrics on the test set
    if 'auroc' in metrics_to_calculate:
        metric_values_per_fold['auroc_micro'].append(roc_auc_score(y_test, y_pred, average='micro'))
        
        # This is not really important as we are only interested in the micro measures.
        # Nevertheless, I basically do the macro averaging by hand so that I can skip labels that have only samples with one class
        roc_auc_per_label = []
        for label_idx in range(interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                roc_auc_per_label.append(roc_auc_score(y_test[:, label_idx], y_pred[:, label_idx]))
        print(str(len(roc_auc_per_label))+' out of the '+str(y_test.shape[1])+' total labels has more than one classes present')
        
        metric_values_per_fold['auroc_macro'].append(np.mean(roc_auc_per_label))

        
    if 'aupr' in metrics_to_calculate:
        metric_values_per_fold['aupr_micro'].append(average_precision_score(y_test, y_pred, average='micro'))
        
        aupr_per_label = []
        for label_idx in range(interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                aupr_per_label.append(average_precision_score(y_test[:, label_idx], y_pred[:, label_idx]))
        
        metric_values_per_fold['aupr_macro'].append(np.mean(aupr_per_label))

    
    fold_counter += 1
    print('========================================================================')
    print('')

(67, 445)
86 out of the 445 total labels has more than one classes present

(67, 445)
137 out of the 445 total labels has more than one classes present

(67, 445)
78 out of the 445 total labels has more than one classes present

(67, 445)
116 out of the 445 total labels has more than one classes present

(66, 445)
119 out of the 445 total labels has more than one classes present

(66, 445)
95 out of the 445 total labels has more than one classes present

(66, 445)
55 out of the 445 total labels has more than one classes present

(66, 445)
112 out of the 445 total labels has more than one classes present

(66, 445)
90 out of the 445 total labels has more than one classes present

(66, 445)
101 out of the 445 total labels has more than one classes present



In [24]:
metric_values_per_fold

{'auroc_micro': [0.9178056043871273,
  0.9048886327423128,
  0.8105207578583873,
  0.8678018708297418,
  0.9262069739954057,
  0.899674992885821,
  0.9229693025138971,
  0.8271235301284579,
  0.9527973443573754,
  0.9000575719560606,
  0.9439448064753961,
  0.8470234547450133,
  0.9063987540749845,
  0.8061327149758676,
  0.6902393637618103,
  0.9019219468703774,
  0.9199736622532129,
  0.6003492643043348,
  0.7449108185233451,
  0.8872020110989898,
  0.9411171971092168,
  0.8889592816255487,
  0.9309642830066547,
  0.8566944539787447,
  0.7745198468015602,
  0.8960726732994266,
  0.8275798703643849,
  0.7406435655643101,
  0.8216494138665025,
  0.8958525233600532,
  0.936890381082197,
  0.8900742378452859,
  0.9291698726169102,
  0.8548822286310547,
  0.7689308201561084,
  0.8900985795627241,
  0.828722746606021,
  0.732367364577847,
  0.8276323888579512,
  0.9047549566001044,
  0.9454398643011921,
  0.8865494331361918,
  0.9281000703523169,
  0.8582449556230055,
  0.7692763116208339,

In [25]:
# calculate the mean and std for every metric measured during training and validation
for metric_name in metric_values_per_fold.keys():
    print(metric_name+': '+ str(np.mean(metric_values_per_fold[metric_name])) +' ('+ str(np.std(metric_values_per_fold[metric_name])) +')')
    print('')

auroc_micro: 0.8576068626723664 (0.07480293149117931)

auroc_macro: 0.7938695918951919 (0.08103710863859921)

aupr_micro: 0.6151273401689754 (0.12773729630136732)

aupr_macro: 0.5624406998257213 (0.15253879717886165)

