In [7]:
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------
# Append the raw node features to the node embeddings before integration.
addRawFeat = True
# Prefix prepended to every data / result path.
base_path = ''
# Datatypes to concatenate node features from.
feature_networks_integration = [
    'clinical', 'cna', 'exp',
    'coe', 'met', 'mut',
]
# Datatypes to use networks from.
node_networks = [
    'clinical', 'cna', 'exp',
    'coe', 'met', 'mut',
]
# Machine Learning method to integrate node embeddings: 'MLP' or 'XGBoost' or 'RF' or 'SVM'.
int_method = 'MLP'

# --- hyperparameter tuning ---------------------------------------------------
# Learning rates to tune for GCN.
learning_rates = [0.01, 0.001, 0.0001]
# Hidden sizes to tune for GCN.
hid_sizes = [32, 64, 128, 256]
# Number of times the Machine Learning algorithm will be tuned for each combination.
xtimes = 50
# Number of times each evaluation metric will be repeated
# (for the standard deviation of the evaluation metrics).
xtimes2 = 3

# --- optional feature selection of node features ------------------------------
# One flag / one feature count per entry of `node_networks`.
feature_selection_per_network = [
    False, False, False,
    False, False, False,
]
top_features_per_network = [
    50, 50, 50,
    50, 50, 50,
]
optional_feat_selection = False
boruta_runs = 100
boruta_top_features = 50

# --- fixed training schedule -------------------------------------------------
max_epochs = 500
min_epochs = 200
patience = 30

# Seed meant to give the same results from the tool each time.
random_state = 404

# SUPREME run
print('SUPREME is setting up!')
from lib import module
import time
import os, itertools
import pickle
from sklearn.metrics import f1_score, accuracy_score
import statistics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, RandomizedSearchCV, GridSearchCV
import pandas as pd
import numpy as np
from torch_geometric.data import Data
import os
import torch
import argparse
from tqdm import tqdm
import errno
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# The rpy2 bridge and the R packages are only imported when Boruta-based feature
# selection is switched on, either for at least one network or for the integrated
# raw features. With every flag off, none of these dependencies is needed.
if ((True in feature_selection_per_network) or (optional_feat_selection == True)):
    import rpy2
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    # Load the R packages through rpy2. The embedded R snippets further down call
    # `require(rFerns)` / `require(Boruta)` themselves.
    # NOTE(review): pracma and dplyr presumably supply `Reshape` and `%>%` / `mutate`
    # used in those snippets - confirm; the import order is kept as-is on purpose.
    utils = importr('utils')
    rFerns = importr('rFerns')
    Boruta = importr('Boruta')
    pracma = importr('pracma')
    dplyr = importr('dplyr')
    import re  # used to strip backticks from the feature names returned by Boruta

# Name of the dataset folder located under <base_path>data/.
dataset_name = 'full_data'

# Fail early, with the offending path in the error, when the dataset folder is missing.
path = base_path + "data/" + dataset_name
if not os.path.exists(path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)

# Keep using the second GPU when the machine has one (the original setting), but
# fall back to the first GPU or to the CPU instead of failing later on the first
# tensor allocation when 'cuda:1' does not exist.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

SUPREME is setting up!


In [8]:
def train():
    """Run one optimisation step of the global `model` on the global `data`.

    The loss is `criterion` evaluated on the nodes selected by `data.train_mask`.
    Relies on the module-level `model`, `optimizer`, `criterion` and `data`.

    Returns:
        The second value returned by `model(data)` (the embedding), as produced
        by the forward pass that precedes the parameter update.
    """
    model.train()
    optimizer.zero_grad()

    logits, embedding = model(data)
    train_nodes = data.train_mask
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return embedding


def validate():
    """Evaluate the global `model` on the validation nodes of the global `data`.

    Runs a forward pass in eval mode with gradients disabled and computes
    `criterion` on the nodes selected by `data.valid_mask`.
    Relies on the module-level `model`, `criterion` and `data`.

    Returns:
        tuple: (validation loss, second value returned by `model(data)`, i.e. the embedding).
    """
    model.eval()
    with torch.no_grad():
        out, emb2 = model(data)
        # The class predictions (argmax) were computed here but never used; dropped.
        loss = criterion(out[data.valid_mask], data.y[data.valid_mask])
    return loss, emb2

# Loss shared by train() and validate().
criterion = torch.nn.CrossEntropyLoss()

# Input folder of the dataset and output folder of this run.
data_path_node =  base_path + 'data/' + dataset_name +'/'
run_name = 'SUPREME_'+  dataset_name + '_results'
save_path = base_path + run_name + '/'

# Create the result folder if needed (no separate existence check required).
os.makedirs(save_path, exist_ok=True)

# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
file = base_path + 'data/' + dataset_name +'/labels.pkl'
with open(file, 'rb') as f:
    labels = pickle.load(f)

# Reuse a stored (train+validation, test) split when one exists; otherwise draw a
# stratified 80/20 split. The split is seeded with `random_state` so that a fresh
# run reproduces the same test set (the seed was declared but never applied).
file = base_path + 'data/' + dataset_name + '/mask_values.pkl'
if os.path.exists(file):
    with open(file, 'rb') as f:
        train_valid_idx, test_idx = pickle.load(f)
    print('use pre-defined split')
else:
    train_valid_idx, test_idx = train_test_split(np.arange(len(labels)), test_size=0.20, shuffle=True,
                                                 stratify=labels, random_state=random_state)
    print('use random split')

# Reference time for the duration messages printed later.
start = time.time()

print('SUPREME is running..')
# Node feature generation - concatenating node features from all the input datatypes.
# For each datatype the feature table is loaded and, when feature selection is enabled
# for it and it has more columns than requested, reduced to the top features ranked by
# Boruta (run in R through rpy2). The per-datatype blocks are concatenated column-wise.
node_feature_blocks = []
for netw_idx, netw in enumerate(node_networks):
    file = base_path + 'data/' + dataset_name +'/'+ netw +'.pkl'
    # NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
    with open(file, 'rb') as f:
        feat = pickle.load(f)

    # Positional lookup (instead of node_networks.index) stays correct even if a
    # datatype name appears more than once in node_networks.
    top_n = top_features_per_network[netw_idx]

    if feature_selection_per_network[netw_idx] and top_n < feat.values.shape[1]:
        # Hand the feature matrix (flattened row by row) and all labels to R.
        # NOTE(review): confirm that Reshape(feat_matrix, n_rows) restores the same
        # row/column layout as this row-wise flattening.
        # NOTE(review): labels of every node, test nodes included, are passed to Boruta.
        feat_flat = [item for sublist in feat.values.tolist() for item in sublist]
        feat_temp = robjects.FloatVector(feat_flat)
        robjects.globalenv['feat_matrix'] = robjects.r('matrix')(feat_temp)
        robjects.globalenv['feat_x'] = robjects.IntVector(feat.shape)
        robjects.globalenv['labels_vector'] = robjects.IntVector(labels.tolist())
        robjects.globalenv['top'] = top_n
        robjects.globalenv['maxBorutaRuns'] = boruta_runs
        robjects.r('''
                require(rFerns)
                require(Boruta)
                labels_vector = as.factor(labels_vector)
                feat_matrix <- Reshape(feat_matrix, feat_x[1])
                feat_data = data.frame(feat_matrix)
                colnames(feat_data) <- 1:feat_x[2]
                feat_data <- feat_data %>%
                    mutate('Labels' = labels_vector)
                boruta.train <- Boruta(feat_data$Labels ~ ., data= feat_data, doTrace = 0, getImp=getImpFerns, holdHistory = T, maxRuns = maxBorutaRuns)
                thr = sort(attStats(boruta.train)$medianImp, decreasing = T)[top]
                boruta_signif = rownames(attStats(boruta.train)[attStats(boruta.train)$medianImp >= thr,])
                    ''')
        boruta_signif = robjects.globalenv['boruta_signif']
        # Clean the temporary objects out of the R global environment.
        robjects.r.rm("feat_matrix")
        robjects.r.rm("labels_vector")
        robjects.r.rm("feat_data")
        robjects.r.rm("boruta_signif")
        robjects.r.rm("thr")
        # R column names are 1-based and may be wrapped in backticks.
        selected_cols = [int(re.sub("`", "", index)) - 1 for index in boruta_signif]
        values = feat.values[:, selected_cols]
    else:
        # Selection disabled, or the table already has no more columns than requested.
        values = feat.values

    node_feature_blocks.append(torch.tensor(values, device=device).float())

# Single column-wise concatenation of all datatype blocks.
new_x = torch.cat(node_feature_blocks, dim=1)
    
# Node embedding generation using GCN for each input network.
# Every network is trained `xtimes2` times on a fresh stratified train/validation
# split of the train+validation nodes, with early stopping on the validation loss.
# The embedding of the last repetition is saved as pickle and CSV.
for n in range(len(node_networks)):
    netw_base = node_networks[n]
    # NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
    with open(data_path_node + 'edges_' + netw_base + '.pkl', 'rb') as f:
        edge_index = pickle.load(f)
    best_ValidLoss = np.inf  # `np.Inf` no longer exists in NumPy 2.0
    # NOTE(review): learning rate and hidden size are fixed here; the `learning_rates`
    # and `hid_sizes` grids from the options are not iterated in this loop.
    learning_rate = 0.001
    hid_size = 128
    av_valid_losses = list()

    # The edge tensors do not change between repetitions, so build them once:
    # first two columns -> (2, n_edges) edge list, third column -> edge weights.
    edge_index_tensor = torch.tensor(edge_index[edge_index.columns[0:2]].transpose().values, device=device).long()
    edge_attr_tensor = torch.tensor(edge_index[edge_index.columns[2]].transpose().values, device=device).float()

    for ii in range(xtimes2):
        data = Data(x=new_x, edge_index=edge_index_tensor, edge_attr=edge_attr_tensor, y=labels)
        X = data.x[train_valid_idx]
        y = data.y[train_valid_idx]
        rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=1)

        # Only the first of the four folds is used: 3/4 training, 1/4 validation.
        train_part, valid_part = next(rskf.split(X, y))
        train_idx = train_valid_idx[train_part]
        valid_idx = train_valid_idx[valid_part]

        # Boolean node masks, vectorised (previously a set was rebuilt for every node).
        node_ids = np.arange(data.x.shape[0])
        train_mask = np.isin(node_ids, train_idx)
        valid_mask = np.isin(node_ids, valid_idx)
        data.valid_mask = torch.tensor(valid_mask, device=device)
        data.train_mask = torch.tensor(train_mask, device=device)
        test_mask = np.isin(node_ids, test_idx)
        data.test_mask = torch.tensor(test_mask, device=device)

        in_size = data.x.shape[1]
        out_size = torch.unique(data.y).shape[0]
        # `model`, `optimizer` and `data` are read as globals by train() / validate().
        model = module.GCN(in_size=in_size, hid_size=hid_size, out_size=out_size)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        model = model.to(device)
        data = data.to(device)
        min_valid_loss = np.inf
        patience_count = 0

        for epoch in tqdm(range(max_epochs)):
            train()  # its return value was immediately overwritten before, so it is not kept
            this_valid_loss, emb = validate()

            # Keep the embedding of the epoch with the lowest validation loss.
            if this_valid_loss < min_valid_loss:
                min_valid_loss = this_valid_loss
                patience_count = 0
                this_emb = emb
            else:
                patience_count += 1

            # Early stopping is only allowed once `min_epochs` epochs have run.
            if epoch >= min_epochs and patience_count >= patience:
                break

        av_valid_losses.append(min_valid_loss.item())

    av_valid_loss = round(statistics.median(av_valid_losses), 3)

    if av_valid_loss < best_ValidLoss:
        best_ValidLoss = av_valid_loss
        best_emb_lr = learning_rate
        best_emb_hs = hid_size
        selected_emb = this_emb  # embedding from the last repetition

    # Store the embedding both as pickle (reloaded during integration) and as CSV.
    emb_file = save_path + 'Emb_' +  netw_base + '.pkl'
    with open(emb_file, 'wb') as f:
        pickle.dump(selected_emb, f)
    pd.DataFrame(selected_emb.cpu()).to_csv(emb_file[:-4] + '.csv')
    
start2 = time.time()
# The embedding loop trains every GCN with one fixed learning rate / hidden size,
# repeated `xtimes2` times, so the message reports that instead of the size of the
# (unused) learning_rates x hid_sizes grid.
print('It took ' + str(round(start2 - start, 1)) + ' seconds for node embedding generation (1 hyperparameter setting x ' + str(xtimes2) + ' repetitions for ' + str(len(node_networks)) + ' separate GCNs).')

print('SUPREME is integrating the embeddings..')
 

use random split
SUPREME is running..


 40%|████      | 201/500 [00:02<00:03, 90.45it/s] 
 40%|████      | 200/500 [00:01<00:01, 162.09it/s]
 49%|████▉     | 247/500 [00:01<00:01, 158.19it/s]
 40%|████      | 200/500 [00:01<00:02, 145.31it/s]
 40%|████      | 200/500 [00:01<00:01, 154.05it/s]
 40%|████      | 200/500 [00:01<00:01, 163.36it/s]
 40%|████      | 200/500 [00:01<00:01, 156.29it/s]
 40%|████      | 200/500 [00:01<00:01, 153.59it/s]
 40%|████      | 200/500 [00:01<00:01, 151.50it/s]
 40%|████      | 200/500 [00:01<00:01, 162.79it/s]
 40%|████      | 200/500 [00:01<00:01, 168.84it/s]
 40%|████      | 200/500 [00:01<00:02, 147.71it/s]
 40%|████      | 200/500 [00:01<00:01, 167.68it/s]
 40%|████      | 200/500 [00:01<00:01, 159.71it/s]
 40%|████      | 200/500 [00:01<00:01, 155.45it/s]
 40%|████      | 200/500 [00:01<00:01, 150.53it/s]
 40%|████      | 200/500 [00:01<00:01, 155.55it/s]
 40%|████      | 200/500 [00:01<00:01, 161.77it/s]


It took 28.2 seconds for node embedding generation (12 trials for 6 seperate GCNs).
SUPREME is integrating the embeddings..


In [9]:
# Running Machine Learning for each possible combination of input networks.
# Input for the Machine Learning algorithm is the concatenation of node embeddings
# (specific to each combination) and node features (if node feature integration is True).
addFeatures = []
t = range(len(node_networks))
# Every non-empty subset of network positions, ordered by subset size and then in
# the order produced by itertools.combinations.
trial_combs = [
    list(combo)
    for subset_size in range(1, len(t) + 1)
    for combo in itertools.combinations(t, subset_size)
]

def _median_pm_std(scores):
    """Format a list of scores as '<median>+-<standard deviation>', both rounded to 3 digits."""
    return str(round(statistics.median(scores), 3)) + '+-' + str(round(statistics.stdev(scores), 3))


# The raw node features are the same for every combination, so they are read from
# disk and concatenated once instead of once per combination.
if addRawFeat:
    addFeatures = feature_networks_integration
    raw_feature_blocks = []
    for netw in addFeatures:
        file = base_path + 'data/' + dataset_name +'/'+ netw +'.pkl'
        # NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
        with open(file, 'rb') as f:
            feat = pickle.load(f)
        raw_feature_blocks.append(torch.tensor(feat.values, device=device).float())
    allx = torch.cat(raw_feature_blocks, dim=1)

for trials in range(len(trial_combs)):
    node_networks2 = [node_networks[i] for i in trial_combs[trials]]

    # Column-wise concatenation of the saved embeddings of the networks in this combination.
    emb_parts = []
    for netw_base in node_networks2:
        emb_file = save_path + 'Emb_' +  netw_base + '.pkl'
        with open(emb_file, 'rb') as f:
            emb_parts.append(pickle.load(f))
    emb = torch.cat(emb_parts, dim=1)

    if addRawFeat:
        if optional_feat_selection == True:
            # Rank the raw features with Boruta (in R) and append only the top ones.
            # NOTE(review): labels of every node, test nodes included, are passed to Boruta.
            allx_flat = [item for sublist in allx.tolist() for item in sublist]
            allx_temp = robjects.FloatVector(allx_flat)
            robjects.globalenv['allx_matrix'] = robjects.r('matrix')(allx_temp)
            robjects.globalenv['allx_x'] = robjects.IntVector(allx.shape)
            robjects.globalenv['labels_vector'] = robjects.IntVector(labels.tolist())
            robjects.globalenv['top'] = boruta_top_features
            robjects.globalenv['maxBorutaRuns'] = boruta_runs
            robjects.r('''
                require(rFerns)
                require(Boruta)
                labels_vector = as.factor(labels_vector)
                allx_matrix <- Reshape(allx_matrix, allx_x[1])
                allx_data = data.frame(allx_matrix)
                colnames(allx_data) <- 1:allx_x[2]
                allx_data <- allx_data %>%
                    mutate('Labels' = labels_vector)
                boruta.train <- Boruta(allx_data$Labels ~ ., data= allx_data, doTrace = 0, getImp=getImpFerns, holdHistory = T, maxRuns = maxBorutaRuns)
                thr = sort(attStats(boruta.train)$medianImp, decreasing = T)[top]
                boruta_signif = rownames(attStats(boruta.train)[attStats(boruta.train)$medianImp >= thr,])
                    ''')
            boruta_signif = robjects.globalenv['boruta_signif']
            robjects.r.rm("allx_matrix")
            robjects.r.rm("labels_vector")
            robjects.r.rm("allx_data")
            robjects.r.rm("boruta_signif")
            robjects.r.rm("thr")
            # Move to host memory first: a tensor living on a GPU cannot be turned into
            # a NumPy array directly. R column names are 1-based and may carry backticks.
            allx_np = allx.cpu().numpy()
            selected_cols = [int(re.sub("`", "", index)) - 1 for index in boruta_signif]
            emb = torch.cat((emb, torch.tensor(allx_np[:, selected_cols], device=device)), dim=1)
            print('Top ' + str(boruta_top_features) + " features have been selected.")
        else:
            emb = torch.cat((emb, allx), dim=1)

    # Train on all train+validation nodes, evaluate on the held-out test nodes.
    data = Data(x=emb, y=labels)
    node_ids = np.arange(data.x.shape[0])
    train_mask = np.isin(node_ids, train_valid_idx)
    data.train_mask = torch.tensor(train_mask, device=device)
    test_mask = np.isin(node_ids, test_idx)
    data.test_mask = torch.tensor(test_mask, device=device)
    X_train = pd.DataFrame(data.x[data.train_mask].cpu().numpy())
    X_test = pd.DataFrame(data.x[data.test_mask].cpu().numpy())
    y_train = pd.DataFrame(data.y[data.train_mask].cpu().numpy()).values.ravel()
    y_test = pd.DataFrame(data.y[data.test_mask].cpu().numpy()).values.ravel()

    # Hyperparameter search (4-fold CV, macro-F1) followed by a fresh model that
    # uses the selected hyperparameters.
    if int_method == 'MLP':
        params = {'hidden_layer_sizes': [(16,), (32,),(64,),(128,),(256,),(512,), (32, 32), (64, 32), (128, 32), (256, 32), (512, 32)]}
        search = RandomizedSearchCV(estimator = MLPClassifier(solver = 'adam', activation = 'relu', early_stopping = True),
                                    return_train_score = True, scoring = 'f1_macro',
                                    param_distributions = params, cv = 4, n_iter = xtimes, verbose = 0)
        search.fit(X_train, y_train)
        model = MLPClassifier(solver = 'adam', activation = 'relu', early_stopping = True,
                              hidden_layer_sizes = search.best_params_['hidden_layer_sizes'])

    elif int_method == 'XGBoost':
        params = {'reg_alpha':range(0,6,1), 'reg_lambda':range(1,5,1),
                  'learning_rate':[0, 0.001, 0.01, 1]}
        # NOTE(review): `fit_params` is handed to the XGBClassifier constructor rather
        # than to fit(); confirm that early stopping is actually applied this way.
        fit_params = {'early_stopping_rounds': 10,
                     'eval_metric': 'mlogloss',
                     'eval_set': [(X_train, y_train)]}

        search = RandomizedSearchCV(estimator = XGBClassifier(use_label_encoder=False, n_estimators = 1000,
                                                                  fit_params = fit_params, objective="multi:softprob", eval_metric = "mlogloss",
                                                                  verbosity = 0), return_train_score = True, scoring = 'f1_macro',
                                        param_distributions = params, cv = 4, n_iter = xtimes, verbose = 0)

        search.fit(X_train, y_train)

        model = XGBClassifier(use_label_encoder=False, objective="multi:softprob", eval_metric = "mlogloss", verbosity = 0,
                              n_estimators = 1000, fit_params = fit_params,
                              reg_alpha = search.best_params_['reg_alpha'],
                              reg_lambda = search.best_params_['reg_lambda'],
                              learning_rate = search.best_params_['learning_rate'])

    elif int_method == 'RF':
        # Only the number of trees is tuned (an unused max_depth grid was removed).
        params = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]}
        search = RandomizedSearchCV(estimator = RandomForestClassifier(), return_train_score = True,
                                    scoring = 'f1_macro', param_distributions = params, cv=4,  n_iter = xtimes, verbose = 0)
        search.fit(X_train, y_train)
        model=RandomForestClassifier(n_estimators = search.best_params_['n_estimators'])

    elif int_method == 'SVM':
        params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001]}
        search = RandomizedSearchCV(SVC(), return_train_score = True,
                                    scoring = 'f1_macro', param_distributions = params, cv=4, n_iter = xtimes, verbose = 0)
        search.fit(X_train, y_train)
        model=SVC(C = search.best_params_['C'],
                  gamma = search.best_params_['gamma'])

    else:
        # Without this guard a typo in int_method would silently evaluate whatever
        # `model` was left over from an earlier step.
        raise ValueError("Unknown int_method " + repr(int_method) + "; expected 'MLP', 'XGBoost', 'RF' or 'SVM'.")

    av_result_acc = list()
    av_result_wf1 = list()
    av_result_mf1 = list()
    av_tr_result_acc = list()
    av_tr_result_wf1 = list()
    av_tr_result_mf1 = list()

    # Refit and score the selected model `xtimes2` times to estimate the spread of each metric.
    for ii in range(xtimes2):
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)
        y_pred = [round(value) for value in predictions]
        av_result_acc.append(round(accuracy_score(y_test, y_pred), 3))
        av_result_wf1.append(round(f1_score(y_test, y_pred, average='weighted'), 3))
        av_result_mf1.append(round(f1_score(y_test, y_pred, average='macro'), 3))
        tr_predictions = model.predict(X_train)
        tr_pred = [round(value) for value in tr_predictions]
        av_tr_result_acc.append(round(accuracy_score(y_train, tr_pred), 3))
        av_tr_result_wf1.append(round(f1_score(y_train, tr_pred, average='weighted'), 3))
        av_tr_result_mf1.append(round(f1_score(y_train, tr_pred, average='macro'), 3))

    # statistics.stdev needs at least two values: with a single repetition the one
    # result is duplicated, which yields a standard deviation of 0.
    if xtimes2 == 1:
        for scores in (av_result_acc, av_result_wf1, av_result_mf1,
                       av_tr_result_acc, av_tr_result_wf1, av_tr_result_mf1):
            scores.append(scores[0])

    result_acc = _median_pm_std(av_result_acc)
    result_wf1 = _median_pm_std(av_result_wf1)
    result_mf1 = _median_pm_std(av_result_mf1)
    tr_result_acc = _median_pm_std(av_tr_result_acc)
    tr_result_wf1 = _median_pm_std(av_tr_result_wf1)
    tr_result_mf1 = _median_pm_std(av_tr_result_mf1)

    print('Combination ' + str(trials) + ' ' + str(node_networks2) + ' >  selected parameters = ' + str(search.best_params_) +
      ', train accuracy = ' + str(tr_result_acc) + ', train weighted-f1 = ' + str(tr_result_wf1) +
      ', train macro-f1 = ' +str(tr_result_mf1) + ', test accuracy = ' + str(result_acc) +
      ', test weighted-f1 = ' + str(result_wf1) +', test macro-f1 = ' +str(result_mf1))


# Total wall-clock time, measured from the `start` timestamp taken before feature generation.
end = time.time()
print('It took {} seconds in total.'.format(round(end - start, 1)))
print('SUPREME is done.')

Combination 0 ['clinical'] >  selected parameters = {'hidden_layer_sizes': (64,)}, train accuracy = 0.984+-0.002, train weighted-f1 = 0.984+-0.002, train macro-f1 = 0.967+-0.011, test accuracy = 0.812+-0.011, test weighted-f1 = 0.811+-0.011, test macro-f1 = 0.667+-0.008
Combination 1 ['cna'] >  selected parameters = {'hidden_layer_sizes': (512, 32)}, train accuracy = 0.975+-0.009, train weighted-f1 = 0.975+-0.01, train macro-f1 = 0.962+-0.059, test accuracy = 0.79+-0.014, test weighted-f1 = 0.788+-0.015, test macro-f1 = 0.647+-0.108
Combination 2 ['exp'] >  selected parameters = {'hidden_layer_sizes': (32,)}, train accuracy = 0.966+-0.014, train weighted-f1 = 0.965+-0.014, train macro-f1 = 0.962+-0.029, test accuracy = 0.804+-0.01, test weighted-f1 = 0.802+-0.009, test macro-f1 = 0.653+-0.093
Combination 3 ['coe'] >  selected parameters = {'hidden_layer_sizes': (32, 32)}, train accuracy = 0.966+-0.018, train weighted-f1 = 0.966+-0.019, train macro-f1 = 0.938+-0.062, test accuracy = 0.7

Combination 30 ['clinical', 'met', 'mut'] >  selected parameters = {'hidden_layer_sizes': (256,)}, train accuracy = 0.979+-0.012, train weighted-f1 = 0.979+-0.012, train macro-f1 = 0.964+-0.031, test accuracy = 0.819+-0.015, test weighted-f1 = 0.815+-0.013, test macro-f1 = 0.658+-0.005
Combination 31 ['cna', 'exp', 'coe'] >  selected parameters = {'hidden_layer_sizes': (512,)}, train accuracy = 0.974+-0.01, train weighted-f1 = 0.974+-0.01, train macro-f1 = 0.958+-0.022, test accuracy = 0.797+-0.004, test weighted-f1 = 0.796+-0.001, test macro-f1 = 0.741+-0.079
Combination 32 ['cna', 'exp', 'met'] >  selected parameters = {'hidden_layer_sizes': (16,)}, train accuracy = 0.917+-0.036, train weighted-f1 = 0.918+-0.036, train macro-f1 = 0.86+-0.062, test accuracy = 0.801+-0.018, test weighted-f1 = 0.801+-0.019, test macro-f1 = 0.659+-0.074
Combination 33 ['cna', 'exp', 'mut'] >  selected parameters = {'hidden_layer_sizes': (32,)}, train accuracy = 0.974+-0.051, train weighted-f1 = 0.974+-0.

Combination 59 ['clinical', 'cna', 'coe', 'met', 'mut'] >  selected parameters = {'hidden_layer_sizes': (32, 32)}, train accuracy = 0.978+-0.002, train weighted-f1 = 0.978+-0.003, train macro-f1 = 0.961+-0.022, test accuracy = 0.826+-0.004, test weighted-f1 = 0.824+-0.003, test macro-f1 = 0.673+-0.004
Combination 60 ['clinical', 'exp', 'coe', 'met', 'mut'] >  selected parameters = {'hidden_layer_sizes': (128,)}, train accuracy = 0.985+-0.005, train weighted-f1 = 0.985+-0.005, train macro-f1 = 0.987+-0.005, test accuracy = 0.815+-0.006, test weighted-f1 = 0.815+-0.006, test macro-f1 = 0.667+-0.006
Combination 61 ['cna', 'exp', 'coe', 'met', 'mut'] >  selected parameters = {'hidden_layer_sizes': (32, 32)}, train accuracy = 0.921+-0.002, train weighted-f1 = 0.92+-0.003, train macro-f1 = 0.854+-0.06, test accuracy = 0.79+-0.018, test weighted-f1 = 0.788+-0.017, test macro-f1 = 0.653+-0.094
Combination 62 ['clinical', 'cna', 'exp', 'coe', 'met', 'mut'] >  selected parameters = {'hidden_laye

In [10]:
# Display the (train+validation, test) index arrays of the split used in this run.
train_valid_idx, test_idx

(array([1292, 1294, 1193, ...,  442,  321,  831]),
 array([ 817, 1049,  777,   17,  413,  836, 1140,  333, 1340,  894, 1094,
         217,  395,  904,  526,  561,  409,   69,  111, 1246, 1310, 1282,
         986,  696,  982,  547,   82,   18,  331,  211, 1335,  198,  197,
         370,  866, 1319,  403,  807,  337,   54,  458, 1251,  615,  186,
         858, 1204,  766, 1287,  162,  169,  202, 1304,  150,  312,  737,
         572,  677, 1007, 1168,  417,  775,  954,    6,  221,  160,  709,
         511,  863,  228,  580, 1347,  518, 1181,  489,  189, 1067,  440,
         854,  153,  405,   36,  471,  231,  539,  657,  268, 1136,  330,
         159,  315,  987, 1265,  711, 1029, 1351, 1207,  372, 1075, 1232,
          70,  924,  497,  896,  180, 1103,  885,  276, 1215,  181,  123,
         456, 1368, 1010, 1020,  769,  569, 1260,  250, 1255, 1302,  755,
         812,  675,  998,  981,  126,  512,  488,  394,  869,  452,  946,
        1150,  158,  151, 1356,   66,  277,  260, 1338,  963,

In [11]:
# Store the split of this run so that later runs pick it up as the pre-defined split.
# A split file that already exists is never overwritten.
file = ''.join([base_path, 'data/', dataset_name, '/mask_values.pkl'])
split_file_missing = not os.path.exists(file)
if split_file_missing:
    split_to_store = (train_valid_idx, test_idx)
    with open(file, 'wb') as f:
        pickle.dump(split_to_store, f)


In [12]:
# Read the stored split back (when the file exists) into separate names, so it can be
# compared against the split held in memory.
file = ''.join([base_path, 'data/', dataset_name, '/mask_values.pkl'])
if os.path.exists(file):
    with open(file, 'rb') as f:
        stored_split = pickle.load(f)
    new_train_valid_idx, new_test_idx = stored_split
    print('use pre-defined split')

use pre-defined split


In [13]:
# Display the train+validation indices read back from mask_values.pkl.
new_train_valid_idx

array([1292, 1294, 1193, ...,  442,  321,  831])