In [1]:
#-------- Import Libraries --------#

import torch
import time
import os
import sys
import random
import pickle
import mlflow
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import date
from sklearn.metrics import matthews_corrcoef
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

In [2]:
#-------- Import Modules from project--------#
import encoding as enc
from model import Net, Net_thesis, Net_project
import functions as func

In [3]:
#-------- Set Device --------#

# Use a CUDA device when torch reports one, otherwise run on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

if use_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPUs available. Using CPU instead.')

No GPUs available. Using CPU instead.


In [4]:
#-------- Seeds --------#

seed_val = 42

# Seed every random number generator used by the notebook with the same value:
# Python's `random`, NumPy's legacy global RNG, and torch (CPU and all CUDA devices).
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Ask torch to use deterministic algorithm implementations.
torch.use_deterministic_algorithms(True)

In [5]:
#-------- Import Modules from project--------#
import encoding as enc
from model import Net_project
import functions as func


In [6]:
#-------- Import Dataset --------#

import glob


def load_partitions(pattern):
    """Load every input archive matching ``pattern`` together with its labels.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the ``*input.npz`` archives of one folder.

    Returns
    -------
    tuple of (list, list)
        The ``"arr_0"`` array of each input archive and the ``"arr_0"`` array
        of the corresponding labels archive, both in sorted path order.
    """
    inputs, labels = [], []
    # glob.glob returns paths in arbitrary, filesystem-dependent order. Later
    # cells pick train/validation/test partitions by list index, so the order
    # has to be fixed explicitly to keep the split reproducible.
    for fp in sorted(glob.glob(pattern)):
        # Context managers close the .npz archives once the arrays are read.
        with np.load(fp) as archive:
            inputs.append(archive["arr_0"])
        # The labels archive has the same path with "input" replaced by "labels".
        with np.load(fp.replace("input", "labels")) as archive:
            labels.append(archive["arr_0"])
    return inputs, labels


data_list = []
target_list = []

# Training partitions come first, followed by the validation partitions.
for pattern in ("../data/train/*input.npz", "../data/validation/*input.npz"):
    part_data, part_targets = load_partitions(pattern)
    data_list.extend(part_data)
    target_list.extend(part_targets)

print(len(data_list))
print(len(target_list))

data_partitions = len(data_list)


5
5


In [7]:
#-------- Select the network you would like to use -------#

# Architecture switches: CNN = only CNN, CNN_RNN = CNN + RNN.
# (Neither flag is read by any cell visible in this notebook.)
CNN, CNN_RNN = False, True

# Hyperparameters to fine-tune
embedding = "Baseline"
numHN, numFilter, dropOutRate = 32, 100, 0.1

##--- parameters fixed
# Data handling
keep_energy = True
cross_validation = False
bat_size = 128

# Model output and optimisation schedule
num_classes = 1
learning_rate = 0.001
epochs = 100
patience = 10

# Loss: binary cross-entropy computed on raw logits.
criterion = nn.BCEWithLogitsLoss()

In [8]:
#embedding of data

#create directory to fetch/store embedded files
embedding_dir= '../data/embeddedFiles/'
try:
    # exist_ok=True makes re-running the cell a no-op when the folder exists.
    os.makedirs(embedding_dir, exist_ok=True)
except OSError as err:
    # Still best effort (the Baseline path never reads this folder), but the
    # reason for the failure is reported instead of being silently dropped.
    print("could not create {}: {}".format(embedding_dir, err))


#try to fetch the embedded dataset if it already exists
if embedding == "Baseline":
    # Baseline uses the raw arrays directly: same list object, no copy.
    data_list_enc = data_list
else:
    embedded_path = embedding_dir + 'dataset-{}'.format(embedding)
    try:
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load embedding files produced by this project.
        with open(embedded_path, 'rb') as infile:
            data_list_enc = pickle.load(infile)

    #if no prior file, use encoder script to encode:
    except FileNotFoundError:
        # Only a missing file is reported as "not found"; a corrupt or
        # unreadable file now surfaces its real error instead.
        print("embedded file not found")
        sys.exit()

In [9]:
# Print the length of the encoded data at four nesting levels, always
# following the first element: the list itself, then [0], [0][0], [0][0][0].
nested = data_list_enc
print(len(nested))
for _ in range(3):
    nested = nested[0]
    print(len(nested))


5
1526
420
54


In [10]:
# Add energy terms from original dataset

# Bind the scratch names on every path (including keep_energy == False) so
# that later cells referring to them never hit a NameError.
energy_set = ''
pad = 0

# The Baseline data is used as loaded, so energy columns are appended only
# for the non-Baseline embeddings.
if keep_energy and embedding != "Baseline":
    for i in range(len(data_list_enc)):
        # Energy terms of partition i, taken from the original (un-embedded)
        # data. NOTE(review): presumably one 2-D array per sample — confirm
        # against func.extract_energy_terms.
        energy_set = func.extract_energy_terms(data_list[i])
        for j in range(len(energy_set)):
            # Zero-pad the first axis up to 420 rows so it can be joined
            # column-wise with the embedded sample. 420 is the per-sample
            # length printed for the encoded data; np.pad raises if a sample
            # is longer than that.
            pad = 420 - len(energy_set[j])
            energy_set[j] = np.pad(energy_set[j], ((0, pad), (0, 0)), 'constant')
            data_list_enc[i][j] = np.concatenate((data_list_enc[i][j], energy_set[j]), axis=1)

                
        

In [11]:
# Same four nested lengths as printed before the energy step, so the two
# outputs can be compared: the list, then [0], [0][0] and [0][0][0].
level = data_list_enc
print(len(level))
for _ in range(3):
    level = level[0]
    print(len(level))

5
1526
420
54


In [12]:
import gc

# Drop the names that are no longer needed, then run the garbage collector.
# Popping from the namespace (instead of `del`) keeps this cell re-runnable:
# a second run, or a run where one of the names was never bound, is a no-op
# rather than a NameError.
# With the Baseline embedding data_list_enc is the same object as data_list,
# so only the name is removed and the arrays stay alive.
for _name in ("data_list", "energy_set", "pad"):
    globals().pop(_name, None)
gc.collect()

200

In [13]:
# Partition indices used for each split: 0-2 train, 3 validation, 4+ test.
train_parts, valid_parts, test_parts = slice(0, 3), slice(3, 4), slice(4, None)

X_train = np.concatenate(data_list_enc[train_parts])
y_train = np.concatenate(target_list[train_parts])
nsamples, nx, ny = X_train.shape
print("Training set shape:", *X_train.shape)

X_valid = np.concatenate(data_list_enc[valid_parts])
y_valid = np.concatenate(target_list[valid_parts])
nsamples, nx, ny = X_valid.shape
print("Validation set shape:", *X_valid.shape)


X_test = np.concatenate(data_list_enc[test_parts])
y_test = np.concatenate(target_list[test_parts])
nsamples, nx, ny = X_test.shape
print("Test set shape:", *X_test.shape)

# features and residues (index every column / every row of a sample);
# nx and ny hold the test-set dimensions at this point.
features = list(range(ny))
residues = list(range(nx))
n_features = len(features)
input_size = len(residues)


def _to_pairs(X, y, columns):
    """Return ``[transposed sample, target]`` pairs for every sample in ``X``.

    Each sample is restricted to ``columns`` on its second axis and then
    transposed, so the selected columns become the leading axis.
    """
    return [[np.transpose(X[k][:, columns]), y[k]] for k in range(len(X))]


# Dataloader
train_ds = _to_pairs(X_train, y_train, features)
val_ds = _to_pairs(X_valid, y_valid, features)
test_ds = _to_pairs(X_test, y_test, features)


DataLoader = torch.utils.data.DataLoader
train_ldr = DataLoader(train_ds, batch_size=bat_size, shuffle=True)
val_ldr = DataLoader(val_ds, batch_size=bat_size, shuffle=True)
test_ldr = DataLoader(test_ds, batch_size=bat_size, shuffle=True)

Training set shape: 4174 420 54
Validation set shape: 1532 420 54
Test set shape: 1207 420 54


In [14]:

###############################
###    CNN+RNN (thesis)     ###
###############################
# NOTE(review): the banner says "thesis" but the network built below is
# Net_project — confirm which one is intended.
start = time.time()

if cross_validation:
    # This cell has no cross-validation path. Failing here is clearer than
    # falling through to the result prints, which would either raise a
    # NameError or silently show values left over from an earlier run.
    raise NotImplementedError("cross_validation=True is not implemented in this notebook")

print("Parameters:")
for label, value in (("cross_validation", cross_validation),
                     ("embedding", embedding),
                     ("numHN", numHN),
                     ("numFilter", numFilter),
                     ("dropOutRate", dropOutRate),
                     ("keep_energy", keep_energy),
                     ("num_classes", num_classes),
                     ("learning_rate", learning_rate),
                     ("bat_size", bat_size),
                     ("patience", patience),
                     ("criterion", criterion)):
    print(label, value)
print("\n\n")

#-------- Train --------#

print("Train")
# Dataloader: each item is [sample with features as the leading axis, target].
train_ds = [[np.transpose(X_train[i][:, features]), y_train[i]] for i in range(len(X_train))]
val_ds = [[np.transpose(X_valid[i][:, features]), y_valid[i]] for i in range(len(X_valid))]
# NOTE(review): the training loader uses the whole training set as a single
# batch, not bat_size (which is what the parameter print above reports) —
# confirm this is intentional.
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=len(train_ds), shuffle=True)
val_ldr = torch.utils.data.DataLoader(val_ds, batch_size=bat_size, shuffle=True)

# Initialize network
net = Net_project(num_classes=num_classes,
                  n_features=n_features,
                  numHN=numHN,
                  numFilter=numFilter,
                  dropOutRate=dropOutRate).to(device)

optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                       weight_decay=0.0005,
                       amsgrad=True,)

# test_ldr is the loader created in the data-split cell.
(train_acc, train_losses, train_auc,
 valid_acc, valid_losses, valid_auc,
 val_preds, val_targs,
 test_preds, test_targs,
 test_loss, test_acc, test_auc) = func.train_project(
    net, optimizer, train_ldr, val_ldr, test_ldr, X_valid, epochs, criterion, patience)

print("Done in", round((time.time()-start)/60,2), "mins." )

print("test_loss, test_acc, test_auc:")
print(test_loss.item(), ",", test_acc[0], ",", test_auc[0])


Parameters:
cross_validation False
embedding Baseline
numHN 32
numFilter 100
dropOutRate 0.1
keep_energy True
num_classes 1
learning_rate 0.001
bat_size 128
patience 10
criterion BCEWithLogitsLoss()



Train
Epoch 0  	 Train loss: 0.00018 	 Validation loss: 0.00582
Epoch 10  	 Train loss: 0.00013 	 Validation loss: 0.00451
Epoch 20  	 Train loss: 0.00013 	 Validation loss: 0.00415
Epoch 30  	 Train loss: 0.00012 	 Validation loss: 0.00404
Epoch 40  	 Train loss: 0.00012 	 Validation loss: 0.00388
Epoch 50  	 Train loss: 0.00011 	 Validation loss: 0.00362
Epoch 60  	 Train loss: 0.00010 	 Validation loss: 0.00350
Epoch 70  	 Train loss: 0.00009 	 Validation loss: 0.00341
Epoch 80  	 Train loss: 0.00007 	 Validation loss: 0.00346
Early stopping

0
1
2
3
4
5
6
7
8
9
Done in 11.16 mins.
test_loss, test_acc, test_auc:
0.38532382249832153 , 0.875 , 0.8142857142857143


In [17]:
#metrics
# TODO: the two lines below are the intended sklearn-based metrics; they need
# a `results` table that this notebook never builds.
#AUC = roc_auc_score(results['target'], results['pred'])
#MCC = matthews_corrcoef(results['target'], results['pred'])
print("AUC: ", test_auc[0])
# test_acc is the accuracy value returned by func.train_project; no Matthews
# correlation coefficient is computed in this notebook, so it is labelled as
# accuracy rather than MCC.
print("Accuracy: ", test_acc[0])

AUC:  0.8142857142857143
MCC:  0.875


In [23]:
#storing values
import mlflow
import mlflow.sklearn

# Record the tuned hyperparameters and the test metrics of this run in MLflow.
with mlflow.start_run():
    mlflow.log_param('embedding', embedding)
    mlflow.log_param('Hidden Neurons', numHN)
    mlflow.log_param('filters CNN', numFilter)
    mlflow.log_param('Dropout rate', dropOutRate)
    mlflow.log_metric('AUC', test_auc[0])
    # test_acc[0] is an accuracy value, not a Matthews correlation
    # coefficient, so it is logged under a name that says so.
    mlflow.log_metric('Accuracy', test_acc[0])
    #ADD ARTIFACTS (PLOTS)

#### 