In [3]:
from lib import module
import time
import os, itertools
import pickle
from sklearn.metrics import f1_score, accuracy_score
import statistics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, RandomizedSearchCV, GridSearchCV
import pandas as pd
import numpy as np
from torch_geometric.data import Data
import os
import torch
import argparse
from tqdm import tqdm
import errno
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

base_path = ''
feature_networks_integration = ['clinical', 'cna', 'exp','coe','met','mut'] # datatypes to concatanate node features from
node_networks = ['clinical', 'cna', 'exp','coe','met','mut'] # datatypes to use networks from
learning_rate = 0.001

# optimize for optional feature selection of node features
feature_selection_per_network = [False, False, False,False, False, False]
top_features_per_network = [50, 50, 50,50,50,50]
optional_feat_selection = False
boruta_runs = 100
boruta_top_features = 50


# fixed
max_epochs = 500
min_epochs = 200
patience = 30

random_state = 404

# SUPREME run
print('SUPREME is setting up!')

dataset_name = 'full_data'

path = base_path + "data/" + dataset_name
if not os.path.exists(path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
        
device = torch.device('cuda:1')

# node_networks = ['exp','coe','met'] # datatypes to use networks from
modelname = 'GCN'

SUPREME is setting up!


In [4]:
data_path_node =  base_path + 'data/' + dataset_name +'/'
run_name = 'SUPREME_'+  dataset_name + '_results'
save_path = base_path + run_name + '/'

if not os.path.exists(base_path + run_name):
    os.makedirs(base_path + run_name + '/')

file = base_path + 'data/' + dataset_name +'/labels.pkl'
with open(file, 'rb') as f:
    labels = pickle.load(f)

file = base_path + 'data/' + dataset_name + '/mask_values.pkl'
if os.path.exists(file):
    with open(file, 'rb') as f:
        train_valid_idx, test_idx = pickle.load(f)
    print('use pre-defined split')
else:
    train_valid_idx, test_idx= train_test_split(np.arange(len(labels)), test_size=0.20, shuffle=True, stratify=labels)
    print('use random split')
start = time.time()


x_lists = []
for netw in node_networks:
    file = base_path + 'data/' + dataset_name +'/'+ netw +'.pkl'
    with open(file, 'rb') as f:
        feat = pickle.load(f)
        values = feat.values
        x_lists.append(values)
new_x = np.concatenate(x_lists,-1)
scaler = StandardScaler()
scaled_new_x = scaler.fit_transform(new_x)
new_x = torch.tensor(scaled_new_x,dtype=torch.float32).to(device)

print('data load done')

train_idx,valid_idx = train_test_split(train_valid_idx,test_size=0.2)
train_mask = np.array([i in set(train_idx) for i in range(new_x.shape[0])])
valid_mask = np.array([i in set(valid_idx) for i in range(new_x.shape[0])])
test_mask = np.array([i in set(test_idx) for i in range(new_x.shape[0])])
y_test = pd.DataFrame(labels[test_mask].cpu().numpy()).values.ravel()

in_size = new_x.shape[1]
out_size = torch.unique(labels).shape[0]

graphs = []
for n in range(len(node_networks)):
    netw_base = node_networks[n]
    with open(data_path_node + 'edges_' + netw_base + '.pkl', 'rb') as f:
        edge_index = pickle.load(f)
    data = Data(x=new_x, edge_index=torch.tensor(edge_index[edge_index.columns[0:2]].transpose().values, device=device).long(),
                edge_attr=torch.tensor(edge_index[edge_index.columns[2]].transpose().values, device=device).float(), y=labels) 
    data.valid_mask = torch.tensor(valid_mask, device=device)
    data.train_mask = torch.tensor(train_mask, device=device)
    data.test_mask = torch.tensor(test_mask, device=device)
    graphs.append(data)

use pre-defined split
data load done


In [5]:
hid_sizes = [16]
from lib import module
model = module.xGNN(in_size=in_size, hid_sizes=hid_sizes, out_size=out_size, nmb_networks=len(graphs),encoder='GSAT')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model = model.to(device)
data = data.to(device)


In [6]:
best_ValidLoss = np.Inf
av_valid_losses = list()    
min_valid_loss = np.Inf
patience_count = 0


def train():
    model.train()
    optimizer.zero_grad()
    out = model(graphs)
    
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()


def validate():
    model.eval()
    with torch.no_grad():
        out = model(graphs)
        pred = out.argmax(dim=1)
        loss = criterion(out[data.valid_mask], data.y[data.valid_mask])        
    return loss

def test():
    model.eval()
    with torch.no_grad():
        out = model(graphs)
        pred = out.argmax(dim=1)
        y_pred = pred[data.test_mask].cpu()
        return accuracy_score(y_pred, y_test)



criterion = torch.nn.CrossEntropyLoss()


for epoch in tqdm(range(max_epochs)):
    train()
    this_valid_loss = validate()

    if this_valid_loss < min_valid_loss:
        min_valid_loss = this_valid_loss
        patience_count = 0
    else:
        patience_count += 1

    if epoch >= min_epochs and patience_count >= patience:
        break

av_valid_losses.append(min_valid_loss.item())

av_valid_loss = round(statistics.median(av_valid_losses), 3)

if av_valid_loss < best_ValidLoss:
    best_ValidLoss = av_valid_loss
    test_acc = test()
    print('test acc',test_acc)

 40%|████      | 200/500 [00:06<00:10, 29.59it/s]

test acc 0.8007246376811594



