In [1]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

metrics_to_calculate = ['auroc', 'aupr']

gene_features = np.loadtxt('/Users/mac/OneDrive/thesis/scripts/dataset/ERN/X1.txt',delimiter=",")
gene_TF_interaction_matrix = np.loadtxt('/Users/mac/OneDrive/thesis/scripts/dataset/ERN/Y.txt',delimiter=",")
metric_values_per_fold = {}

In [2]:
np.shape(gene_features)

(1164, 445)

In [3]:
np.shape(gene_TF_interaction_matrix)

(1164, 154)

In [9]:
if 'auroc' in metrics_to_calculate:
    metric_values_per_fold['auroc_micro'] = []
    metric_values_per_fold['auroc_macro'] = []
if 'aupr' in metrics_to_calculate:
    metric_values_per_fold['aupr_micro'] = []
    metric_values_per_fold['aupr_macro'] = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_counter = 0
for train_index, test_index in kf.split(gene_features):
    print('======================= Fold '+str(fold_counter)+' =======================')
    
    # split the dataset
    X_train, X_test = gene_features[train_index], gene_features[test_index]
    y_train, y_test = gene_TF_interaction_matrix[train_index], gene_TF_interaction_matrix[test_index]
    
    # define the oneVSrest classifier with the base classifier
    clf = OneVsRestClassifier(RandomForestClassifier())
    
    #clf = OneVsRestClassifier(LogisticRegression(random_state=0))
    
    # fit the classifier on the training set
    clf.fit(X_train, y_train)
    
    # generate probability predictions for every sample in the test set
    y_pred = clf.predict_proba(X_test)
    
    print(str(y_pred.shape))
    
    # calculate the performance metrics on the test set
    if 'auroc' in metrics_to_calculate:
        metric_values_per_fold['auroc_micro'].append(roc_auc_score(y_test, y_pred, average='micro'))
        
        # This is not really important as we are only interested in the micro measures.
        # Nevertheless, I basically do the macro averaging by hand so that I can skip labels that have only samples with one class
        roc_auc_per_label = []
        for label_idx in range(gene_TF_interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                roc_auc_per_label.append(roc_auc_score(y_test[:, label_idx], y_pred[:, label_idx]))
        print(str(len(roc_auc_per_label))+' out of the '+str(y_test.shape[1])+' total labels has more than one classes present')
        
        metric_values_per_fold['auroc_macro'].append(np.mean(roc_auc_per_label))

        
    if 'aupr' in metrics_to_calculate:
        metric_values_per_fold['aupr_micro'].append(average_precision_score(y_test, y_pred, average='micro'))
        
        aupr_per_label = []
        for label_idx in range(gene_TF_interaction_matrix.shape[1]):
            if len(set(y_test[:, label_idx])) >= 2:
                aupr_per_label.append(average_precision_score(y_test[:, label_idx], y_pred[:, label_idx]))
        
        metric_values_per_fold['aupr_macro'].append(np.mean(aupr_per_label))

    
    fold_counter += 1
    print('========================================================================')
    print('')
    #calculate the mean and std for every metric measured during training and validation
for metric_name in metric_values_per_fold.keys():
    print(metric_name+': '+ str(np.mean(metric_values_per_fold[metric_name])) +' ('+ str(np.std(metric_values_per_fold[metric_name])) +')')
    print('')


(117, 154)
74 out of the 154 total labels has more than one classes present

(117, 154)
77 out of the 154 total labels has more than one classes present

(117, 154)
67 out of the 154 total labels has more than one classes present

(117, 154)
83 out of the 154 total labels has more than one classes present

(116, 154)
74 out of the 154 total labels has more than one classes present

(116, 154)
80 out of the 154 total labels has more than one classes present

(116, 154)
75 out of the 154 total labels has more than one classes present

(116, 154)
84 out of the 154 total labels has more than one classes present

(116, 154)
83 out of the 154 total labels has more than one classes present

(116, 154)
84 out of the 154 total labels has more than one classes present

auroc_micro: 0.9042554250191991 (0.011687330363991562)

auroc_macro: 0.7490821377444645 (0.033442111197105336)

aupr_micro: 0.5184487737147939 (0.027891080859796386)

aupr_macro: 0.34968235325748404 (0.03323237988967994)

