In [1]:
import os, torch
import numpy as np
import pandas as pd
from torch import nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
import os

# Make only CUDA device 0 visible to this process. This has to happen before
# CUDA is first initialised for it to take effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [3]:
# Sanity check: show the CUDA version reported by this PyTorch build.
print(torch.version.cuda)

11.7


In [4]:
# Sanity check: confirm PyTorch can see a usable CUDA device before training.
torch.cuda.is_available()

True

## Data Preparation

In [2]:
def preprocessing(path, tissue):
    """Load the pathway membership matrix, the expression matrix and the labels.

    Parameters
    ----------
    path : str
        Pathway database to load: "GO" or "KEGG".
    tissue : str
        Expression dataset to load; only "brain" is supported.

    Returns
    -------
    pathway_info : torch.FloatTensor
        Pathway CSV without its first column, transposed so that each row is
        one pathway.
    expression : numpy.ndarray
        Expression CSV without its first column, transposed so that each row
        is one sample and each column one gene, then min-max scaled per gene.
    status : numpy.ndarray
        Column vector of shape (n_samples, 1) holding the 0/1 label of every
        sample, in the column order of the expression file.

    Raises
    ------
    ValueError
        If ``path`` or ``tissue`` is not supported, or if the number of
        hard-coded labels does not match the number of loaded samples.
    """
    pathway_files = {
        "GO": "./data/pathway_go_bp.csv",
        "KEGG": "./data/pathway_kegg.csv",
    }
    expression_files = {
        "brain": "./data/brain_expression.csv",
    }
    # Per tissue: (number of leading samples labelled 0, number of trailing
    # samples labelled 1).
    label_counts = {
        "brain": (157, 310),
    }

    # Validate both arguments up front so an unsupported value fails with a
    # clear message before any file is read, instead of an UnboundLocalError
    # further down.
    if path not in pathway_files:
        raise ValueError(
            "Unsupported pathway database {!r}; expected one of {}".format(
                path, sorted(pathway_files)))
    if tissue not in expression_files:
        raise ValueError(
            "Unsupported tissue {!r}; expected one of {}".format(
                tissue, sorted(expression_files)))

    ### pathway datasets
    pathway = pd.read_csv(pathway_files[path], header=0)
    print(">> Pathway Data :",path)

    # Drop the first column and transpose so that rows are pathways.
    pathway_info = np.transpose(pathway.iloc[:,1:].values)
    pathway_info = torch.FloatTensor(pathway_info)
    print("pathway matrix shape : ",pathway_info.shape)
    print("num_pathway : ",pathway_info.shape[0])

    ### expression datasets
    print(">> Expression Data :",tissue)
    data = pd.read_csv(expression_files[tissue], header=0)

    # Drop the first column and transpose so that rows are samples.
    expression = np.transpose(data.iloc[:,1:].values)

    # NOTE(review): the scaler is fit on every sample before any train/test
    # split, so held-out folds influence the scaling. Consider fitting it
    # inside each fold instead.
    scaler = MinMaxScaler()
    expression = scaler.fit(expression).transform(expression)

    sample_dim = expression.shape[0]
    input_dim = expression.shape[1]

    #print dimension of sample and number of genes
    print("sample_dim : ",sample_dim)
    print("input_size (number of genes): ",input_dim)

    num_zero, num_one = label_counts[tissue]
    status = np.concatenate([np.zeros(num_zero), np.ones(num_one)]).reshape(-1, 1)
    if status.shape[0] != sample_dim:
        raise ValueError(
            "Label count ({}) does not match number of samples ({}) for tissue {!r}".format(
                status.shape[0], sample_dim, tissue))

    patient = data.iloc[:,1:].columns.values.tolist()
    # NOTE(review): this preview starts at index 1 and therefore skips the
    # first sample ID - confirm whether [0:5] was intended.
    print("patient list : ",patient[1:6])

    return pathway_info, expression, status

In [3]:
# Choose the pathway database and the tissue, then load all three inputs.
pathway, tissue = "GO", "brain"
loaded = preprocessing(pathway, tissue)
pathway_info, expression, status = loaded

>> Pathway Data : GO
pathway matrix shape :  torch.Size([4046, 8922])
num_pathway :  4046
>> Expression Data : brain
sample_dim :  467
input_size (number of genes):  8922
patient list :  ['GSM1424091', 'GSM1424092', 'GSM1424093', 'GSM1424094', 'GSM1424095']


## Training (check the device first!)

In [4]:
# Arguments handed to train_kfold: the data, the hyper-parameter grid and
# run metadata.
trainArgs = {
    'x_data': expression,
    'y_data': status,
    'pathway_info': pathway_info,
    # Wider grid kept for reference:
    #   'num_fc_list': [32, 64, 128]
    #   'lr_list': [0.0001, 0.0005, 0.001]
    'num_fc_list': [32],
    'lr_list': [0.0001],
    'device': '0',
    'seed': 0,
    'pathway': pathway,
    'tissue': tissue,
    # Path the result table is written to in a later cell.
    'filename': 'result.csv',
}

In [5]:
from train import *

In [6]:
# Build the k-fold trainer from the argument dict. `train_kfold` is presumably
# provided by the star import from the `train` module - verify there.
train = train_kfold(trainArgs)

In [7]:
# Run the cross-validation; the returned table is displayed, saved and
# summarised in the cells below.
result = train.kfold()

****************************************************************************
Fold 1 / 10
****************************************************************************
Validation AUC increased (0.000000 --> 0.891129).  Saving model ...
Validation AUC increased (0.891129 --> 0.893145).  Saving model ...
Validation AUC increased (0.893145 --> 0.907258).  Saving model ...
Validation AUC increased (0.907258 --> 0.915323).  Saving model ...
Validation AUC increased (0.915323 --> 0.921371).  Saving model ...
Validation AUC increased (0.921371 --> 0.923387).  Saving model ...
Validation AUC increased (0.923387 --> 0.925403).  Saving model ...
Validation AUC increased (0.925403 --> 0.931452).  Saving model ...
Validation AUC increased (0.931452 --> 0.941532).  Saving model ...
Validation AUC increased (0.941532 --> 0.949597).  Saving model ...
Validation AUC increased (0.949597 --> 0.953629).  Saving model ...
Validation AUC increased (0.953629 --> 0.955645).  Saving model ...
Validation AUC inc

In [8]:
# Display the per-fold validation and test metrics.
result

Unnamed: 0,hyperparam,Fold,Valid_AUC,Valid_Precision,Valid_Recall,Valid_F1,Test_AUC,Test_Precision,Test_Recall,Test_F1
0,lr:0.0001 / num_fc:32,0,0.967742,0.935484,0.935484,0.935484,0.971774,0.911765,1.0,0.953846
1,lr:0.0001 / num_fc:32,1,0.977823,1.0,0.903226,0.949153,0.917339,0.962963,0.83871,0.896552
2,lr:0.0001 / num_fc:32,2,0.96371,0.9375,0.967742,0.952381,0.931452,0.9375,0.967742,0.952381
3,lr:0.0001 / num_fc:32,3,0.97379,0.911765,1.0,0.953846,1.0,1.0,1.0,1.0
4,lr:0.0001 / num_fc:32,4,0.975806,0.966667,0.935484,0.95082,0.981855,0.9375,0.967742,0.952381
5,lr:0.0001 / num_fc:32,5,0.977823,0.9375,0.967742,0.952381,0.997984,0.96875,1.0,0.984127
6,lr:0.0001 / num_fc:32,6,0.943548,0.935484,0.935484,0.935484,0.945565,0.885714,1.0,0.939394
7,lr:0.0001 / num_fc:32,7,0.983871,0.911765,1.0,0.953846,0.982796,0.939394,1.0,0.96875
8,lr:0.0001 / num_fc:32,8,1.0,1.0,1.0,1.0,0.976344,0.909091,0.967742,0.9375
9,lr:0.0001 / num_fc:32,9,0.97379,0.909091,0.967742,0.9375,0.997849,0.96875,1.0,0.984127


In [13]:
# Write the result table to the configured file, overwriting any existing file.
result.to_csv(trainArgs['filename'], mode='w')

In [15]:
# For every fold keep the row(s) whose validation AUC equals the best
# validation AUC of that fold (ties keep all tied rows), then report the mean
# test AUC and F1 over the kept rows.
# The string alias 'max' replaces the Python builtin `max`, which newer pandas
# versions deprecate as a groupby transform argument.
idx = result.groupby(['Fold'])['Valid_AUC'].transform('max') == result['Valid_AUC']
result = result[idx]
print('****************************************************************************')
print(">>[Test Result] avg.auc : {:.4f}, f1 : {:.4f}\n".format(result['Test_AUC'].mean(), result['Test_F1'].mean()))

****************************************************************************
>>[Test Result] avg.auc : 0.9717, f1 : 0.9619



In [9]:
# Select, per fold, the hyper-parameter row(s) with the highest validation AUC
# and print the average test AUC and F1 of that selection.
# 'max' is passed as a string because passing the builtin `max` to a groupby
# transform is deprecated in newer pandas versions.
idx = result.groupby(['Fold'])['Valid_AUC'].transform('max') == result['Valid_AUC']
result = result[idx]
print('****************************************************************************')
print(">>[Test Result] avg.auc : {:.4f}, f1 : {:.4f}\n".format(result['Test_AUC'].mean(), result['Test_F1'].mean()))

****************************************************************************
>>[Test Result] avg.auc : 0.9703, f1 : 0.9569

