In [1]:
#-------- Import Libraries --------#

import torch
import time
import os
import sys
import random
import pickle
import mlflow
import gc
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import date
from sklearn.metrics import matthews_corrcoef
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

In [2]:
#-------- Import Modules from project--------#
import encoding as enc
from model import Net, Net_thesis, Net_project
import functions as func

In [3]:
#-------- Set Device --------#

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPUs available. Using CPU instead.')
    device = torch.device('cpu')

No GPUs available. Using CPU instead.


In [4]:
#-------- Seeds --------#

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

torch.use_deterministic_algorithms(True)

In [5]:
#-------- Import target Dataset --------#             

target_list = []

import glob
for index in range(4):
    for fp in glob.glob("../data/train/*{}*labels.npz".format(index+1)):
        print("Read file", fp)
        targets = np.load(fp)["arr_0"]
        target_list.append(targets[0:500])
    
for fp in glob.glob("../data/validation/*5*labels.npz"):
    print("Read file", fp)
    targets = np.load(fp)["arr_0"]
    target_list.append(targets[0:500])
    
print("\n")
print("target_list", len(target_list))
print("\n")

data_partitions = len(target_list)
for i in range(len(target_list)):
    print("Size of file", i+1, len(target_list[i]))

del targets
gc.collect()

Read file ../data/train\P1_labels.npz
Read file ../data/train\P2_labels.npz
Read file ../data/train\P3_labels.npz
Read file ../data/train\P4_labels.npz
Read file ../data/validation\P5_labels.npz


target_list 5


Size of file 1 500
Size of file 2 500
Size of file 3 500
Size of file 4 500
Size of file 5 500


33

In [6]:
#-------- Select the network you would like to use -------#

CNN = False # ONLY CNN
CNN_RNN = True # CNN + RNN

# Hyperparameters to fine-tune
embedding = "esm_1b"
numHN=32
numFilter=100
dropOutRate=0.1
learning_rate=0.001
weight_decay = 0.0001

#for ml-flow
name_experiment = "PCA"


##--- parameters fixed
keep_energy=True
bat_size = 128
num_classes=1
epochs = 100
patience=10
criterion = nn.BCEWithLogitsLoss()

In [10]:
# getting embedding of data

#create directory to fetch/store embedded
embedding_dir= '../data/embeddedFiles/'
data_list_enc = list()

try:
    os.mkdir(embedding_dir)
except:
    pass

#try to fecth if already exist
if embedding == "Baseline":   
    pass
else:
    try:
        for i in range(5):
            #infile = open(embedding_dir + 'esm-energies-file-updated-{}.pkl'.format(i+1), 'rb')
            infile = open(embedding_dir + 'dataset-esm-1b_{}.pkl'.format(i), 'rb')
            enc = pickle.load(infile)
            data_list_enc.append(enc[0:100])
            print("File", i+1, "is uploaded. Size:", len(enc))
            infile.close()        
        '''
        infile = open(embedding_dir+'dataset-{}'.format(embedding), 'rb')
        data_list_enc =  pickle.load(infile)
        infile.close()
        '''
    #if no prior file, embedding code needs to be run before:
    except:
        print("Embedded file not found")

#del enc
gc.collect()        

File 1 is uploaded. Size: 1530
File 2 is uploaded. Size: 1170
File 3 is uploaded. Size: 1480
File 4 is uploaded. Size: 1540
File 5 is uploaded. Size: 1210


0

In [11]:
#-------- Apply PCA to the dataset -------#

PCA_do = True
pca_dir = '../data/PCA_models/'

try:
    os.mkdir(pca_dir)
except:
    pass

if PCA_do:
    matrix_train = np.empty((0, len(data_list_enc[0][0][0]))); total_observations = 0
    for partition in range(3):
        n_obser, partition_matrix = func.prepare_data_pca(data_list_enc[partition])
        matrix_train = np.append(matrix_train, partition_matrix, axis = 0)
        total_observations += n_obser

    var_vector, number_components, model, fitted_train = func.run_PCA(matrix_train)
    # and now this model will be applied to validation dataset by running command model.transfrom(X_val)
    
    try:
        infile = open(pca_dir+'model_{}.pkl'.format(embedding), 'rb')
        pickle.load(infile)
        infile.close()

    #if no prior file, save the model as a pickle:
    except:
        with open(pca_dir + 'model_{}.pkl'.format(embedding), 'wb') as f:
            pickle.dump(model, f)    

In [12]:
#-------- Get train, validation and test datasets with PCA applied -------#

X_train = func.back_to_original_size(fitted_train, [total_observations, len(data_list_enc[0][0]), number_components])
y_train = np.concatenate(target_list[0:3])
nsamples, nx, ny = X_train.shape
print("Training set shape:", nsamples,nx,ny)

n_obser, matrix_valid = func.prepare_data_pca(data_list_enc[3])
X_valid = model.transform(matrix_valid)
X_valid = func.back_to_original_size(X_valid, [n_obser, len(data_list_enc[0][0]), number_components])
y_valid = target_list[3]
nsamples, nx, ny = X_valid.shape
print("Validation set shape:", nsamples,nx,ny)

n_obser, matrix_test = func.prepare_data_pca(data_list_enc[4])
X_test = model.transform(matrix_test)
X_test = func.back_to_original_size(X_test, [n_obser, len(data_list_enc[0][0]), number_components])
y_test = target_list[4]
nsamples, nx, ny = X_test.shape
print("Test set shape:", nsamples,nx,ny) 

# features and residues
features = list(range(ny))
residues = list(range(nx)) 
n_features = len(features)
input_size = len(residues)

del data_list_enc
gc.collect()

# Dataloader
train_ds = []
for i in range(len(X_train)):
    train_ds.append([np.transpose(X_train[i][:,features]), y_train[i]])
val_ds = []
for i in range(len(X_valid)):
    val_ds.append([np.transpose(X_valid[i][:,features]), y_valid[i]])
test_ds = []
for i in range(len(X_test)):
    test_ds.append([np.transpose(X_test[i][:,features]), y_test[i]])
    
del X_train, X_test, y_train, y_test 
gc.collect()

train_ldr = torch.utils.data.DataLoader(train_ds,batch_size=bat_size, shuffle=True)
val_ldr = torch.utils.data.DataLoader(val_ds,batch_size=bat_size, shuffle=True)
test_ldr = torch.utils.data.DataLoader(test_ds,batch_size=len(test_ds), shuffle=True)


Training set shape: 300 420 20
Validation set shape: 100 420 20
Test set shape: 100 420 20


In [13]:
###############################
###    CNN+RNN (thesis)     ###
###############################
start = time.time()

print("Parameters:")
print("embedding", embedding)
print("numHN", numHN)
print("numFilter", numFilter)
print("dropOutRate", dropOutRate)
print("keep_energy", keep_energy)
print("num_classes", num_classes)
print("learning_rate", learning_rate)
print("bat_size", bat_size)
print("patience", patience)
print("criterion", criterion)
print("\n\n")

#-------- Train --------#

# Initialize network
net = Net_project(num_classes=num_classes, 
            n_features=n_features, 
            numHN=numHN, 
            numFilter=numFilter,
            dropOutRate=dropOutRate).to(device)

optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                        weight_decay=weight_decay,
                        amsgrad=True,)

train_acc, train_losses, train_auc, valid_acc, valid_losses, valid_auc, val_preds, val_targs, test_preds, test_targs, test_loss, test_acc, test_auc = func.train_project(net, optimizer, train_ldr, val_ldr, test_ldr, X_valid, epochs, criterion, patience)


print("Done in", round((time.time()-start)/60,2), "mins." )

print("test_loss, test_acc, test_auc:")
print(test_loss.item(), ",", test_acc[0], ",", test_auc[0])

Parameters:
embedding esm_1b
numHN 32
numFilter 100
dropOutRate 0.1
keep_energy True
num_classes 1
learning_rate 0.001
bat_size 128
patience 10
criterion BCEWithLogitsLoss()





ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [14]:
#-------- Performance --------#

epoch = np.arange(1,len(train_losses)+1)
plt.figure()
plt.plot(epoch, train_losses, 'r', epoch, valid_losses, 'b')
plt.legend(['Train Loss','Validation Loss'])
plt.xlabel('Epoch'), plt.ylabel('Loss')

epoch = np.arange(1,len(train_auc)+1)
plt.figure()
plt.plot(epoch, train_auc, 'r', epoch, valid_auc, 'b')
plt.legend(['Train AUC','Validation AUC'])
plt.xlabel('Epoch'), plt.ylabel('AUC')

epoch = np.arange(1,len(train_acc)+1)
plt.figure()
plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')
plt.legend(['Train Accuracy','Validation Accuracy'])
plt.xlabel('Epoch'), plt.ylabel('Acc')
plt.show()


#-------- Save results --------#

results_dir = '../results'

try:
    os.mkdir(results_dir)
except:
    pass

plots_dir = '../results/plots'

try:
    os.mkdir(plots_dir)
except:
    pass

results = pd.DataFrame(list(zip( (int(x) for x in test_targs), (int(x) for x in test_preds))),columns =['target', 'pred'])


results.to_csv('../results/PCA_emb_{}_HN_{}_nFilt_{}_do_{}_energ_{}.csv'.format(embedding,numHN,numFilter,int(dropOutRate*10), keep_energy), index=False)


#-------- Performance Evaluation --------#
# The results change every time we train, we should check why (maybe we missed something or did wrong with the seeds?)

print("Number of principal components for PCA", number_components)
print("AUC: ", roc_auc_score(results['target'], results['pred']))
print("MCC: ", matthews_corrcoef(results['target'], results['pred']))

confusion_matrix = pd.crosstab(results['target'], results['pred'], rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='g')
plt.show()

# Plot roc curve

fpr, tpr, thres = roc_curve(results['target'], results['pred'])
print('AUC: {:.3f}'.format(roc_auc_score(results['target'], results['pred'])))

print( len([i for i, (a, b) in enumerate(zip(results['pred'], results['target'])) if a != b]))

plt.figure(figsize=(8,6))

# roc curve
plt.plot(fpr, tpr, "b", label='ROC Curve')
plt.plot([0,1],[0,1], "k--", label='Random Guess')
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc="best")
plt.title("ROC curve")


plt.savefig('../results/plots/PCA_emb_{}_HN_{}_nFilt_{}_do_{}_energ_{}_ROC.png'.format(embedding,numHN,numFilter,int(dropOutRate*10), keep_energy))
plt.show()

AUC = roc_auc_score(results['target'], results['pred'])
MCC = matthews_corrcoef(results['target'], results['pred'])
ACC = accuracy_score(results['target'], results['pred'])
print("AUC: ", AUC)
print("MCC: ", MCC)
print("ACC: ", ACC)

NameError: name 'train_losses' is not defined

In [15]:
#storing values
import mlflow
import mlflow.sklearn

experiment_id = mlflow.set_experiment(name_experiment)
experiment = mlflow.get_experiment(experiment_id)

print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))

with mlflow.start_run():
    mlflow.log_param('embedding', embedding) 
    mlflow.log_param('Number principal components', number_components)
    mlflow.log_param('Hidden Neurons', numHN)
    mlflow.log_param('filters CNN', numFilter)
    mlflow.log_param('Dropout rate', dropOutRate)
    mlflow.log_param('learning rate', learning_rate)
    mlflow.log_param('Weight decay', weight_decay)
    mlflow.log_metric('test AUC', AUC)
    mlflow.log_metric('test MCC', MCC)
    mlflow.log_metric('test ACC', ACC)
    
    mlflow.log_metric('train ACC', train_acc[-1])
    mlflow.log_metric('train AUC', train_auc[-1])
    mlflow.log_metric('valid ACC', valid_acc[-1])
    mlflow.log_metric('valid AUC', valid_auc[-1])

from csv import writer


print("\n")
print('embedding', embedding)
print('principal components',number_components) 
print('Hidden Neurons', numHN)
print('filters CNN', numFilter)
print('Dropout rate', dropOutRate)
print('learning rate', learning_rate)
print('Weight decay', weight_decay)

print('test AUC', AUC)
print('test MCC', MCC)
print('test ACC', ACC)

print('train ACC', train_acc[-1])
print('train AUC', train_auc[-1])
print('valid ACC', valid_acc[-1])
print('valid AUC', valid_auc[-1])


#List = ['embedding', 'numHN', 'numFilter', 'dropOutRate', 'learning_rate', 'weight_decay', 'AUC', 'MCC', 'ACC', 'train_acc', 'train_auc', 'valid_acc', 'valid_auc' ]
List = [embedding, number_components, numHN, numFilter, dropOutRate, learning_rate, weight_decay, AUC, MCC, ACC, train_acc[-1], train_auc[-1], valid_acc[-1], valid_auc[-1] ]

with open('../results/PCA_results.csv', 'a') as f_object:
    writer_object = writer(f_object)
  
    writer_object.writerow(List)
  
    f_object.close()

Name: Default
Experiment_id: 0
Artifact Location: file:///home/shannara/DL_02456/dayana_run/scripts/mlruns/0


NameError: name 'AUC' is not defined