In [32]:
# import libraries
import torch
import os
import sys
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from sklearn.metrics import accuracy_score

In [30]:
# import modules
import encoding as enc
import model
import functions as func

ModuleNotFoundError: No module named 'esm'

In [33]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('There are %d GPU(s) avalable.' % torch.cuda.device_count())
else:
    print('No GPUs available. Using CPU instead.')
    device = torch.device('cpu')

No GPUs available. Using CPU instead.


In [34]:
#-------- Seeds --------#
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [35]:
#-------- Parameters --------#

# Just an example below, we can always change later
EPOCHS = 1000
MINI_BATCH_SIZE = 512
LEARNING_RATE = 0.0001
PATIENCE = EPOCHS // 10
N_HIDDEN_NEURONS = 128

# Can be converted to BCE or another
criterion = nn.MSELoss()

DATADIR = "/data/"
MATRICES = "/data/Matrices"


In [37]:
#-------- Unzip Train --------#

try:
    if len(os.listdir(TRAINDIR)) != 0:
        print("{} already exist.".format(TRAINDIR))
except:
    !unzip ../data/train.zip -d ../data/train

    
#-------- Unzip Validation --------#
try:
    if len(os.listdir(VALIDATIONDIR)) != 0:
        print("{} already exist.".format(VALIDATIONDIR))
except:
    !unzip ../data/validation.zip -d ../data/validation

'unzip' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [22]:
print("train:", os.listdir('../data/train'), '\n')
print("validation:",os.listdir('../data/validation'))

train: ['P2_labels.npz', 'P3_input.npz', 'P4_input.npz', 'P2_input.npz', '__MACOSX', 'P1_input.npz', 'P3_labels.npz', 'P4_labels.npz', 'P1_labels.npz'] 

validation: ['P4_input.npz', '__MACOSX', 'P4_labels.npz']


In [38]:
#-------- Import Dataset --------#

data_list = []
target_list = []

import glob
for fp in glob.glob("../data/train/*input.npz"):
    data = np.load(fp)["arr_0"]
    targets = np.load(fp.replace("input", "labels"))["arr_0"]
    data_list.append(data)
    target_list.append(targets)
    
#print(len(data_list))
#print(len(target_list))

X_train = np.concatenate(data_list[ :-1])
y_train = np.concatenate(target_list[:-1])
nsamples, nx, ny = X_train.shape
print("Training set shape:", nsamples,nx,ny)

X_val = np.concatenate(data_list[-1: ])
y_val = np.concatenate(target_list[-1: ])
nsamples, nx, ny = X_val.shape
print("val set shape:", nsamples,nx,ny)

p_neg = len(y_train[y_train == 1])/len(y_train)*100
print("Percent positive samples in train:", p_neg)

p_pos = len(y_val[y_val == 1])/len(y_val)*100
print("Percent positive samples in val:", p_pos)

# make the data set into one dataset that can go into dataloader
train_ds = []
for i in range(len(X_train)):
    train_ds.append([np.transpose(X_train[i]), y_train[i]])

val_ds = []
for i in range(len(X_val)):
    val_ds.append([np.transpose(X_val[i]), y_val[i]])

bat_size = 64
print("\nNOTE:\nSetting batch-size to", bat_size)
train_ldr = torch.utils.data.DataLoader(train_ds,batch_size=bat_size, shuffle=True)
val_ldr = torch.utils.data.DataLoader(val_ds,batch_size=bat_size, shuffle=True)


# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device (CPU/GPU):", device)
#device = torch.device("cpu")

Training set shape: 4174 420 54
val set shape: 1532 420 54
Percent positive samples in train: 24.96406324868232
Percent positive samples in val: 25.0

NOTE:
Setting batch-size to 64
Using device (CPU/GPU): cpu


In [39]:
#-------- Define network --------#

print("Initializing network")

# Hyperparameters
input_size = 420
num_classes = 1
learning_rate = 0.01

# Initialize network
from model import Net
net = Net(num_classes=num_classes).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)


Initializing network


In [29]:
#-------- Train network --------#

print("Training")

num_epochs = 5

train_acc, train_loss = [], []
valid_acc, valid_loss = [], []
losses = []
val_losses = []

for epoch in range(num_epochs):
    cur_loss = 0
    val_loss = 0
    
    net.train()
    train_preds, train_targs = [], [] 
    for batch_idx, (data, target) in enumerate(train_ldr):
        X_batch =  data.float().detach().requires_grad_(True)
        target_batch = torch.tensor(np.array(target), dtype = torch.float).unsqueeze(1)
        
        optimizer.zero_grad()
        output = net(X_batch)
        
        batch_loss = criterion(output, target_batch)
        batch_loss.backward()
        optimizer.step()
        
        preds = np.round(output.detach().cpu())
        train_targs += list(np.array(target_batch.cpu()))
        train_preds += list(preds.data.numpy().flatten())
        cur_loss += batch_loss.detach()

    losses.append(cur_loss / len(train_ldr.dataset))
        
    
    net.eval()
    ### Evaluate validation
    val_preds, val_targs = [], []
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_ldr): ###
            x_batch_val = data.float().detach()
            y_batch_val = target.float().detach().unsqueeze(1)
            
            output = net(x_batch_val)
            
            val_batch_loss = criterion(output, y_batch_val)
            
            preds = np.round(output.detach())
            val_preds += list(preds.data.numpy().flatten()) 
            val_targs += list(np.array(y_batch_val))
            val_loss += val_batch_loss.detach()
            
        val_losses.append(val_loss / len(val_ldr.dataset))
        print("\nEpoch:", epoch+1)
        
        train_acc_cur = accuracy_score(train_targs, train_preds)  
        valid_acc_cur = accuracy_score(val_targs, val_preds) 

        train_acc.append(train_acc_cur)
        valid_acc.append(valid_acc_cur)
        
        from sklearn.metrics import matthews_corrcoef
        print("Training loss:", losses[-1].item(), "Validation loss:", val_losses[-1].item(), end = "\n")
        print("MCC Train:", matthews_corrcoef(train_targs, train_preds), "MCC val:", matthews_corrcoef(val_targs, val_preds))
        
print('\nFinished Training ...')

Training

Epoch: 1
Training loss: 0.00769993057474494 Validation loss: 0.007803560234606266
MCC Train: 0.430046049926052 MCC val: 0.4381855638265855

Epoch: 2
Training loss: 0.007039184682071209 Validation loss: 0.00878058560192585
MCC Train: 0.5069319945480502 MCC val: 0.4078798191616063

Epoch: 3
Training loss: 0.006805058103054762 Validation loss: 0.007479188498109579
MCC Train: 0.5243772935095986 MCC val: 0.4913678541941079

Epoch: 4
Training loss: 0.006943217478692532 Validation loss: 0.007849294692277908
MCC Train: 0.4964261068738395 MCC val: 0.4728387880375803

Epoch: 5
Training loss: 0.006267018150538206 Validation loss: 0.009424594230949879
MCC Train: 0.567543437154431 MCC val: 0.33107035174996935

Finished Training ...
