In [6]:
# Load packages
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
# Load datasets
# Name of the dataset folder under ./dataset/ that load_data() reads from.
dataname = 'dpie'

# define method
## 'random_forest', 'logistic_regression', 'mlp_per_target', 'mlp'
## (any other value makes the cross-validation cell raise ValueError)
method_name = 'mlp'

# define scale
## 'standard', 'minmax', or None to leave the features unscaled.
## The cross-validation cell only builds a scaler for the two named options, so
## None is the explicit way to say "no scaling" (previously the placeholder '111').
scale_type = None

In [8]:
def load_data(dataname):
    """Read the X1, X2 and Y text matrices of one dataset from ./dataset/<dataname>/.

    Two on-disk layouts are handled:
      * 'ERN' and 'SRN': files named X1.txt / X2.txt / Y.txt, comma-delimited.
      * anything else: files named <dataname>_X1.txt / _X2.txt / _Y.txt, parsed
        with numpy's default (whitespace) delimiter.

    Returns the arrays in the order (X1, X2, Y).
    """
    name = str(dataname)
    folder = './dataset/' + name + '/'

    if dataname in ['ERN', 'SRN']:
        prefix = folder
        read_options = {'delimiter': ","}
    else:
        prefix = folder + name + '_'
        read_options = {}

    # Files are read in the order X1, Y, X2, but returned as X1, X2, Y.
    X1 = np.loadtxt(prefix + 'X1.txt', **read_options)
    Y = np.loadtxt(prefix + 'Y.txt', **read_options)
    X2 = np.loadtxt(prefix + 'X2.txt', **read_options)
    return X1, X2, Y

In [9]:
# Load the X1, X2 and Y arrays of the dataset selected in the config cell.
# Y is what the cross-validation cell uses as the prediction targets.
X1, X2, Y = load_data(dataname)
# Quick sanity check: print the target matrix.
print(Y)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
# Values tried for KFold's `shuffle` argument; the cross-validation cell runs one
# full 10-fold evaluation per entry.
shuffled = [True, False]

In [12]:
# Evaluation setting. With 'C', X2 takes the place of X1 as the feature matrix and
# Y is transposed, so the other axis of Y is what gets split into folds and
# predicted. Any other value leaves X1 and Y as loaded.
# NOTE(review): the 'C' branch rebinds X1 and Y, so re-running this cell without
# reloading the data would transpose Y back again — run it once per load.
setting = 'B'
if setting == 'C':
    X1, Y = X2, np.transpose(Y)

In [13]:
# Run a 10-fold cross-validation once per entry of `shuffled`, printing per-fold
# diagnostics and, at the end of each run, the mean (std) of every metric.
for item in shuffled:

    # Per-fold scores for this run, keyed by '<metric>_<micro|macro>'.
    metrics_to_calculate = ['auroc', 'aupr']
    metric_values_per_fold = {}
    for metric in metrics_to_calculate:
        metric_values_per_fold[metric + '_micro'] = []
        metric_values_per_fold[metric + '_macro'] = []

    # random_state only matters when shuffling. Recent scikit-learn releases raise
    # a ValueError if it is set together with shuffle=False, so pass it only when
    # it is actually used.
    kf = KFold(n_splits=10, shuffle=item, random_state=42 if item else None)

    for fold_counter, (train_index, test_index) in enumerate(kf.split(X1)):
        print('======================= Fold '+str(fold_counter)+' =========================================')

        # split the dataset
        X_train, X_test = X1[train_index], X1[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # scaler: fitted on the training fold only and then applied to both folds,
        # so no statistics of the test fold leak into training.
        # Any scale_type other than 'standard' / 'minmax' means "no scaling".
        scaler = None
        if scale_type == 'standard':
            scaler = StandardScaler()
        elif scale_type == 'minmax':
            scaler = MinMaxScaler()
        if scaler is not None:
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        # define the classifier; every estimator gets a fixed seed so that a
        # re-run of the notebook reproduces the same numbers.
        if method_name == 'random_forest':
            clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))
        elif method_name == 'logistic_regression':
            clf = OneVsRestClassifier(LogisticRegression(random_state=0))
        elif method_name == 'mlp_per_target':
            # binary relevance approach that uses a neural network as the base
            # classifier (so it creates as many neural networks as there are labels)
            clf = OneVsRestClassifier(MLPClassifier(random_state=1, hidden_layer_sizes=(256,), solver='adam', learning_rate='adaptive', max_iter=300))
        elif method_name == 'mlp':
            # standard neural network trained on all labels at once
            clf = MLPClassifier(random_state=1, hidden_layer_sizes=(512,), solver='adam', learning_rate='adaptive', max_iter=300)
        else:
            raise ValueError("invalid method name given")

        # fit the classifier on the training set
        clf.fit(X_train, y_train)

        # generate probability predictions for every sample in the test set
        y_pred = clf.predict_proba(X_test)

        print(str(y_pred.shape))

        # Labels (columns of y_test; axis 0 is rows, axis 1 is columns) for which
        # both classes occur in this test fold. Per-label AUROC / AUPR are only
        # computed for these, so the macro averages are done by hand below.
        # The macro numbers are secondary; the micro measures are the ones of interest.
        labels_with_both_classes = [
            label_idx for label_idx in range(y_test.shape[1])
            if np.unique(y_test[:, label_idx]).size >= 2
        ]

        # calculate the performance metrics on the test set
        if 'auroc' in metrics_to_calculate:
            metric_values_per_fold['auroc_micro'].append(roc_auc_score(y_test, y_pred, average='micro'))

            roc_auc_per_label = [
                roc_auc_score(y_test[:, label_idx], y_pred[:, label_idx])
                for label_idx in labels_with_both_classes
            ]
            print(str(len(roc_auc_per_label))+' out of the '+str(y_test.shape[1])+' total labels has more than one classes present')

            # Explicit NaN instead of np.mean([]) (same value, no RuntimeWarning)
            # when no label of this fold has both classes.
            metric_values_per_fold['auroc_macro'].append(np.mean(roc_auc_per_label) if roc_auc_per_label else np.nan)

        if 'aupr' in metrics_to_calculate:
            metric_values_per_fold['aupr_micro'].append(average_precision_score(y_test, y_pred, average='micro'))

            aupr_per_label = [
                average_precision_score(y_test[:, label_idx], y_pred[:, label_idx])
                for label_idx in labels_with_both_classes
            ]

            metric_values_per_fold['aupr_macro'].append(np.mean(aupr_per_label) if aupr_per_label else np.nan)

        print('========================================================================')
        print('')

    # calculate the mean and std for every metric measured during training and validation
    print('setting' + str(setting), str(dataname), str(method_name), 'shuffle = ' + str(item))
    for metric_name, fold_values in metric_values_per_fold.items():
        print('%s: %.4f (%.4f)' % (metric_name, np.mean(fold_values), np.std(fold_values)))
        print('')





(21, 210)
72 out of the 210 total labels has more than one classes present





(21, 210)
82 out of the 210 total labels has more than one classes present





(21, 210)
96 out of the 210 total labels has more than one classes present





(21, 210)
42 out of the 210 total labels has more than one classes present





(20, 210)
76 out of the 210 total labels has more than one classes present





(20, 210)
88 out of the 210 total labels has more than one classes present





(20, 210)
99 out of the 210 total labels has more than one classes present





(20, 210)
89 out of the 210 total labels has more than one classes present





(20, 210)
72 out of the 210 total labels has more than one classes present





(20, 210)
53 out of the 210 total labels has more than one classes present

settingB dpii mlp shuffle = True
auroc_micro: 0.9327 (0.0332)

auroc_macro: 0.9151 (0.0433)

aupr_micro: 0.8303 (0.0494)

aupr_macro: 0.8308 (0.0614)





(21, 210)
52 out of the 210 total labels has more than one classes present





(21, 210)
39 out of the 210 total labels has more than one classes present





(21, 210)
36 out of the 210 total labels has more than one classes present





(21, 210)
45 out of the 210 total labels has more than one classes present





(20, 210)
32 out of the 210 total labels has more than one classes present





(20, 210)
42 out of the 210 total labels has more than one classes present





(20, 210)
50 out of the 210 total labels has more than one classes present





(20, 210)
64 out of the 210 total labels has more than one classes present





(20, 210)
35 out of the 210 total labels has more than one classes present

(20, 210)
65 out of the 210 total labels has more than one classes present

settingB dpii mlp shuffle = False
auroc_micro: 0.7850 (0.1375)

auroc_macro: 0.6989 (0.1682)

aupr_micro: 0.5828 (0.1806)

aupr_macro: 0.5870 (0.1837)



