In [1]:
#-------- Import Libraries --------#
import torch
import os
import sys
import random
import pickle
import mlflow
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import date
from sklearn.metrics import matthews_corrcoef
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

In [2]:
#-------- Import Modules from project--------#
import encoding as enc
from model import Net, Net_thesis, Net_project
import functions as func

In [3]:
#-------- Set Device --------#

# Pick the compute device once: CUDA when PyTorch reports one, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPUs available. Using CPU instead.')

No GPUs available. Using CPU instead.


In [3]:
#-------- Seeds --------#

# One seed shared by every random number generator this notebook uses.
seed_val = 42

# Seed Python, NumPy, and PyTorch (CPU and all CUDA devices) in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

# Request deterministic implementations from PyTorch.
torch.use_deterministic_algorithms(True)

In [4]:
#-------- Directories --------#

# Filesystem locations used by this notebook.
# NOTE(review): DATADIR and MATRICES are absolute paths ('/data/...') while
# TRAINDIR and VALIDATIONDIR are relative to the notebook ('../data/...').
# Confirm the mix is intentional before relying on DATADIR / MATRICES.
DATADIR = '/data/'
TRAINDIR = '../data/train'
VALIDATIONDIR = '../data/validation'
MATRICES = '/data/Matrices'


In [6]:
#-------- Unzip Train / Validation --------#

def _ensure_unzipped(target_dir, archive_path):
    """
    Extract ``archive_path`` into ``target_dir`` unless it is already populated.

    Parameters
    ----------
    target_dir : str
        Directory that should contain the extracted files.
    archive_path : str
        Zip archive to extract when ``target_dir`` is missing or empty.

    Raises
    ------
    FileNotFoundError
        If the archive has to be extracted but does not exist.
    """
    # Local import keeps this cell self-contained; zipfile is standard library
    # and replaces the shell-only `!unzip` call.
    import zipfile

    # A directory that exists but is empty still needs extracting, so check
    # both conditions explicitly instead of relying on a bare `except`.
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print("{} already unzipped.".format(target_dir))
        return

    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(target_dir)


_ensure_unzipped(TRAINDIR, '../data/train.zip')
_ensure_unzipped(VALIDATIONDIR, '../data/validation.zip')

print('Train directory:\n\n', '\n'.join(str(p) for p in os.listdir(TRAINDIR)), '\n\n')
print('Validation directory:\n\n','\n'.join(str(p) for p in os.listdir(VALIDATIONDIR)))

../data/train already unzipped.
../data/validation already unzipped.
Train directory:

 P1_input.npz
P1_labels.npz
P2_input.npz
P2_labels.npz
P3_input.npz
P3_labels.npz
P4_input.npz
P4_labels.npz
__MACOSX 


Validation directory:

 P5_input.npz
P5_labels.npz
__MACOSX


## Functions

In [14]:
def prepare_data_pca(embedded_list):
    """
    Flatten a nested (observation, residue, feature) embedding into the
    2D (observation * residue, feature) matrix used for the PCA analysis.

    Returns the number of observations together with the flattened
    NumPy matrix.
    """
    n_observations = len(embedded_list)
    first_observation = embedded_list[0]
    n_rows = n_observations * len(first_observation)
    n_features = len(first_observation[0])

    stacked = torch.tensor(embedded_list)
    return n_observations, stacked.reshape(n_rows, n_features).numpy()

In [15]:
from sklearn.decomposition import PCA
def run_PCA(data, variance_required = 0.9, max_components = 100):
    """
    Run PCA and keep the minimum number of components whose cumulative
    explained-variance ratio exceeds ``variance_required``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Matrix the PCA is fitted on.
    variance_required : float
        Cumulative explained-variance ratio that must be exceeded.
    max_components : int
        Upper bound on the number of components explored. It is clamped to
        ``min(n_samples, n_features)``, the most PCA can produce.

    Returns
    -------
    variances : numpy.ndarray
        Cumulative explained-variance ratio of the exploratory model.
    optimal_components : int
        Number of components kept. Falls back to the (clamped)
        ``max_components`` when the threshold is never exceeded.
    reduced_model : PCA
        PCA fitted on ``data`` with ``optimal_components`` components.
    fitted_data : numpy.ndarray
        ``data`` projected onto the kept components.
    """
    # PCA cannot return more components than min(n_samples, n_features).
    max_components = min([max_components, *np.shape(data)])

    # Only the explained variance is needed here, so fit without transforming.
    first_model = PCA(n_components = max_components)
    first_model.fit(data)

    variances = first_model.explained_variance_ratio_.cumsum()
    reached = variances > variance_required
    if reached.any():
        # argmax yields the 0-based index of the first component that crosses
        # the threshold; the number of components needed is that index + 1.
        optimal_components = int(np.argmax(reached)) + 1
    else:
        # argmax would return 0 here (all False); keep every explored
        # component instead of asking for an empty model.
        optimal_components = int(max_components)

    reduced_model = PCA(n_components = optimal_components)
    fitted_data = reduced_model.fit_transform(data)

    return variances, optimal_components, reduced_model, fitted_data

In [26]:
def back_to_tensor_size(matrix, final_size):
    """
    Fold a 2D matrix back into a 3D tensor.

    ``final_size`` supplies the three target dimensions in order. The
    result is a torch tensor; append ``.numpy()`` to the call if a matrix
    is needed instead.
    """
    flat = torch.tensor(matrix)
    dim_0, dim_1, dim_2 = final_size[0], final_size[1], final_size[2]
    return flat.reshape(dim_0, dim_1, dim_2)

## ESM-1b embedding

In [42]:
# work on esm-1b for PCA
embedding_dir = '../data/embeddedFiles/'
embedding = 'esm-1b'

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
data_list_embedded = []
for i in range(3):
    print("Loading partition", i + 1)
    # The context manager closes the file even if unpickling raises.
    with open(embedding_dir + 'dataset-{}_{}.pkl'.format(embedding, i), 'rb') as infile:
        data_list_embedded.append(pickle.load(infile))

Writing parititon 1
Writing parititon 2
Writing parititon 3


In [30]:
# doing PCA on our data
# Start from an empty (0, n_features) matrix and stack each partition's rows under it.
n_features = len(data_list_embedded[0][0][0])
final_matrix = np.empty((0, n_features))
total_observations = 0
for partition in range(3):
    n_obser, partition_matrix = prepare_data_pca(data_list_embedded[partition])
    final_matrix = np.concatenate((final_matrix, partition_matrix), axis = 0)
    total_observations += n_obser
    print(final_matrix.shape)

(642600, 1280)


In [31]:
# Fit PCA on the stacked training matrix; `model` is the reduced PCA returned by run_PCA.
var_vector, number_components, model, fitted_train = run_PCA(final_matrix)
# and now this model will be applied to the validation dataset by running model.transform(X_val)

In [27]:
# Fold the PCA output back into (observations, residues, components).
esm_1b_shape = [total_observations, len(data_list_embedded[0][0]), number_components]
esm_1b_tensor = back_to_tensor_size(fitted_train, esm_1b_shape)

## ASM embedding

In [29]:
# now for the ASM embedding
embedding = "esm_ASM"
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(embedding_dir + 'dataset-{}.pkl'.format(embedding), 'rb') as infile:
    data_ASM_embedded = pickle.load(infile)

In [38]:
# doing PCA on our data
# NOTE(review): the saved output of this cell ends in a scikit-learn
# "Expected 2D array, got 1D array" ValueError — re-run on a fresh kernel
# to confirm whether it still reproduces with the definitions in this notebook.

# Start from an empty (0, n_features) matrix and stack each partition's rows under it.
n_features_ASM = len(data_ASM_embedded[0][0][0])
esm_ASM_matrix = np.empty((0, n_features_ASM))
total_observations_ASM = 0
for partition in range(3):
    n_obser, partition_matrix = prepare_data_pca(data_ASM_embedded[partition])
    esm_ASM_matrix = np.concatenate((esm_ASM_matrix, partition_matrix), axis = 0)
    print(esm_ASM_matrix.shape)
    total_observations_ASM += n_obser

var_vector_ASM, number_components_ASM, model_ASM, fitted_train_ASM = run_PCA(esm_ASM_matrix)

# Fold the PCA output back into (observations, residues, components).
esm_ASM_shape = [total_observations_ASM, len(data_ASM_embedded[0][0]), number_components_ASM]
esm_ASM_tensor = back_to_tensor_size(fitted_train_ASM, esm_ASM_shape)

(640920, 768)
(1131480, 768)
(1753080, 768)


ValueError: Expected 2D array, got 1D array instead:
array=[-9.02885675e-01 -2.57112563e-01  6.08861864e-01  2.04494047e+00
 -3.94119620e-01  9.94090736e-02  2.52101451e-01  6.98611081e-01
  3.76258284e-01 -5.51021755e-01  8.91607761e+00 -5.86801358e-02
  9.33266699e-01  9.89569664e-01 -6.80327177e-01 -2.65359372e-01
  2.93442547e-01  6.01338243e+00 -6.66363299e-01  9.93139893e-02
 -3.47648501e-01  7.80296326e-02 -1.91798612e-01  9.09580290e-02
 -6.40300870e-01  1.98617592e-01  2.25818455e-02 -3.57028067e-01
  9.09937501e-01  1.53998449e-01  9.24888730e-01 -8.34516764e-01
  6.48360133e-01 -9.47537303e-01 -8.57919455e-01 -5.88815957e-02
 -3.13379735e-01 -1.62997097e-01  1.88168511e-02 -3.53987664e-01
  2.41288334e-01  3.81852649e-02  8.40624571e-02 -3.54742318e-01
  6.85155094e-02 -6.12714648e-01  1.19219303e+00  1.58587798e-01
  1.65693223e-01 -7.23241508e-01 -6.88836396e-01 -7.11757183e-01
 -1.36612788e-01 -5.34582973e-01 -6.69512868e-01  9.60549563e-02
  8.13812986e-02  3.12858343e-01  3.04789156e-01  7.25705504e-01
  3.52237165e-01 -1.14449072e+00  6.93998456e-01  3.59774470e-01
 -2.78251767e-01 -5.77069297e-02 -8.50223720e-01 -6.54085219e-01
 -5.23171648e-02 -1.90561339e-01 -5.96117556e-01 -1.15835369e+00
 -2.63218433e-01 -1.38693750e-01 -2.74057418e-01 -1.57886016e+00
  5.94832540e-01 -1.13483563e-01 -9.37102497e-01  6.59110308e-01
  8.50147545e-01  1.14740086e+00 -4.61544693e-01 -6.97037041e-01
  4.38047677e-01 -4.22108978e-01 -1.20991027e+00  6.34583056e-01
  6.17849112e-01  5.75993001e-01 -2.14729637e-01 -1.00883150e+00
 -5.71342051e-01  4.04209763e-01  5.07387996e-01 -1.07317209e+00
  1.26116872e-01 -5.55910110e-01  8.25295806e-01  3.16127837e-01
  7.99035013e-01 -1.02503121e+00  3.02135348e-01  5.10988712e-01
 -1.52404070e-01 -5.93031704e-01 -8.59170914e-01 -7.22388446e-01
 -3.74322683e-01  5.60144544e-01  3.88273239e-01  6.41512945e-02
  3.87083799e-01  3.18212599e-01  6.54648781e-01  9.49580967e-01
 -4.79991883e-01 -3.78314674e-01 -6.69026077e-01  5.07539630e-01
  5.58363974e-01  6.10046327e-01  1.04332006e+00 -2.23200977e-01
  2.96536863e-01 -3.66250992e-01  3.90968233e-01 -4.09408599e-01
 -1.64060384e-01 -1.00703742e-02 -2.35904500e-01 -1.26892924e-01
  2.22327530e-01  4.51127291e-01 -2.99266651e-02 -8.33652377e-01
  6.02642119e-01  4.29069042e-01  1.20092022e+00 -5.94303071e-01
 -3.89334917e-01  5.07649243e-01  4.99270618e-01  7.63054669e-01
  4.35495466e-01  6.58851862e-02  1.25423893e-01  1.68119818e-01
 -1.19473469e+00 -1.00753999e+00  2.40029836e+00  8.25602636e-02
  3.89022119e-02 -1.27775431e+00 -2.70413280e-01 -8.13749611e-01
 -3.47007781e-01 -5.08746445e-01 -5.04218757e-01  7.05984175e-01
  1.58689082e-01  9.55744147e-01 -3.19437623e-01 -4.18812186e-01
  4.58175391e-01 -9.92199481e-01 -2.02232212e-01 -5.48516095e-01
  1.99032739e-01 -7.78192937e-01 -1.27655059e-01 -1.07149333e-02
 -3.89844000e-01 -1.13327324e-01  1.56309709e-01  5.47949553e-01
 -5.25455996e-02  4.97746229e-01 -3.75310928e-01  7.53138065e-02
 -5.23312330e-01  4.91372943e-01 -4.93025452e-01 -5.70688367e-01
 -1.23422325e+00 -4.84240144e-01  8.33903432e-01  9.61766317e-02
 -2.79634893e-02 -4.66910899e-01 -4.56721067e-01 -2.79616207e-01
  9.58745062e-01  8.31003070e-01  1.21827257e+00 -1.23468459e+00
  6.93531513e-01 -5.50003350e-01  1.25656679e-01 -5.92041075e-01
 -7.12977350e-01 -5.44759512e-01  8.39151204e-01 -1.19309820e-01
 -7.00775012e-02  6.07856512e-01  7.06647430e-03 -4.69840348e-01
 -1.02639186e+00 -2.92110085e-01 -4.70470935e-02 -7.15685263e-02
 -2.34326079e-01 -5.77808321e-01 -4.15413678e-01 -1.29656136e-01
  3.91914338e-01 -1.16444409e+00 -1.07649669e-01  5.16621888e-01
  4.14568961e-01 -4.04986829e-01 -4.82718319e-01  8.32399964e-01
 -2.09694743e-01  4.72377121e-01 -1.86547071e-01  1.44137233e-01
 -4.22315300e-01 -1.41877905e-02 -3.69351536e-01 -5.90266287e-01
  8.99816692e-01 -3.84669632e-01  1.08252242e-01  7.27240264e-01
 -2.23562837e-01  4.15522695e-01 -6.25060141e-01 -8.46031964e-01
 -7.24191844e-01 -3.44117820e-01  5.94466865e-01  1.14101738e-01
  1.27219296e+00  1.04426408e+00 -9.47031379e-01  1.25036931e+00
  1.28975868e-01 -3.78465056e-01  1.13127041e+00  9.63945165e-02
 -2.72325337e-01  1.01345077e-01 -8.40715945e-01  1.79890049e+00
 -1.92257002e-01  2.62250006e-02  5.83135664e-01 -7.54982114e-01
  6.05804240e-03 -1.03979044e-01  3.64105910e-01  1.50709063e-01
  2.38318443e-01  5.71839809e-01  3.25785726e-02  6.11548722e-01
  4.50985909e-01  8.30407262e-01 -1.09692442e+00 -6.32050276e-01
  2.35225767e-01 -4.71102178e-01 -8.25894535e-01 -2.83923596e-01
 -1.03264618e+00  2.32648551e-02  5.97364724e-01  2.05368787e-01
  7.74463177e-01  5.12213588e-01 -9.08727050e-01  5.30178070e-01
 -2.12565452e-01 -3.07163626e-01 -1.72918648e-01  8.28434825e-01
  1.93709016e+00  1.34050846e+00  3.63259554e-01 -1.06039512e+00
  1.11871076e+00 -9.66169834e-01 -2.03677982e-01 -7.35416561e-02
 -4.28083599e-01  3.52681398e-01  4.16324943e-01 -1.15676594e+00
 -1.50522220e+00 -6.39837563e-01  3.32226992e-01 -7.02021539e-01
 -7.70009696e-01 -5.14599085e-01 -9.94507670e-01  2.65039206e-01
  9.65320468e-02 -4.94636267e-01  5.82577288e-01  8.35793674e-01
 -1.59914955e-01  8.57062995e-01 -8.41009712e+00 -1.02281475e+00
 -1.14033234e+00  7.22446918e-01  6.78035021e-01 -1.41545832e-01
 -9.71423090e-01 -1.06646764e+00  7.27939785e-01  4.92117733e-01
 -2.35639483e-01  3.72070193e-01  1.06819117e+00 -1.53099430e+00
 -4.59859259e-02  2.64247566e-01 -2.44550675e-01 -1.29412103e+00
  5.61128497e-01 -1.17760432e+00  2.18559861e-01  9.36684608e-01
 -2.76418567e-01 -3.19235563e-01  1.88955843e+00  9.57882330e-02
  1.10181987e+00  3.47045869e-01  8.17973197e-01  6.58346713e-01
  1.83619037e-01  7.43841946e-01 -8.95085186e-03  5.16946018e-01
  1.88375592e-01  1.39209002e-01 -5.32584310e-01 -9.02436674e-01
  2.56335437e-01  1.02629364e+00  5.95592380e-01 -2.82185435e-01
  4.87526983e-01 -1.09334242e+00  4.68636811e-01 -4.41559106e-01
 -6.56665206e-01  1.22460568e+00 -1.51432264e+00  3.47073376e-01
  3.27053934e-01  1.59161463e-01 -5.96065342e-01  1.07270014e+00
  1.56993294e+00 -9.37576830e-01  1.41677350e-01  7.82844782e-01
 -7.16059566e-01 -2.27070749e-01  4.54378873e-01  4.58735257e-01
  8.99610579e-01  3.62259507e-01 -9.77231741e-01 -1.17536807e+00
  3.62675279e-01  2.04154998e-01  9.32809830e-01 -1.13918149e+00
  4.81794059e-01  4.51127112e-01 -1.27773106e+00 -8.27492297e-01
  6.59462154e-01 -8.18142354e-01 -5.25000215e-01 -1.18006244e-01
  9.21670437e-01 -8.38334203e-01  5.34698844e-01 -4.20434922e-01
  3.72021168e-01  2.19827399e-01 -1.70563012e-02  1.40123814e-01
 -1.06812334e+00 -1.17410138e-01 -6.15628898e-01 -1.52920270e+00
 -2.11953253e-01 -7.75602520e-01 -2.16700837e-01  5.34494579e-01
 -3.55724841e-02 -2.39982799e-01  1.62404329e-01 -2.45806172e-01
  3.17600876e-01 -3.77413258e-02  7.35003233e-01 -2.24573880e-01
 -3.73848349e-01 -3.10607910e-01 -4.42101955e-02 -9.29644287e-01
 -1.13013673e+00  5.10601029e-02 -3.67085636e-02  4.88502800e-01
  2.02178746e-01 -1.22555904e-01  2.57196128e-01  5.53266287e-01
 -6.28706098e-01  1.24940801e+00 -3.75499725e-01 -4.27009761e-02
  1.92263782e-01  9.23163414e-01 -9.98407483e-01 -5.94857693e-01
 -4.96962219e-02  8.31319630e-01 -9.73160088e-01  4.41137552e-01
 -6.76390767e-01 -6.03610456e-01 -4.64066714e-01 -1.17077053e+00
  4.12063032e-01  1.24821401e+00 -1.53495109e+00  3.48688096e-01
  8.77518356e-02  1.42368758e+00 -3.21217589e-02 -3.65412176e-01
  3.92661631e-01  1.08082116e+00  9.14116919e-01 -5.66164613e-01
 -3.17208916e-01 -6.78258598e-01  1.00061905e+00 -7.00126946e-01
 -1.78365529e-01  6.38015568e-01 -1.59198594e+00  1.06318548e-01
 -4.03292596e-01 -1.47221565e-01 -7.97887027e-01  1.56575489e+00
 -6.84408724e-01 -8.33916485e-01 -1.10718027e-01 -2.34254494e-01
  8.95827234e-01 -3.48999441e-01 -7.70807385e-01  3.94041002e-01
 -6.99472189e-01  5.24433672e-01  7.02645004e-01 -4.22748148e-01
  2.82472581e-01 -1.24320030e+00  3.13760042e-01 -9.32465851e-01
  8.78335416e-01  4.07813931e+00  6.49227977e-01  1.53536767e-01
 -1.02247667e+00 -1.50432467e+00 -5.45857847e-03 -1.21252790e-01
  6.26207054e-01  1.21305788e+00 -1.50094140e+00  7.71231800e-02
 -1.15787351e+00 -4.14393619e-02  5.93673766e-01 -6.28326535e-01
 -1.60349801e-01 -3.37929368e-01 -3.36342990e-01 -5.09841144e-01
  1.18808353e+00  3.39330494e-01 -2.36829191e-01  3.22838783e-01
 -2.15404749e-01 -1.00876808e+00  1.32305503e+00 -4.21708405e-01
  3.45118642e-02  3.66365463e-01 -5.22590540e-02 -1.05979836e+00
 -2.86259651e-01 -4.85222071e-01  1.90553069e-01  3.04198086e-01
 -6.91793740e-01  1.58466235e-01  4.38291669e-01  9.40898418e-01
 -2.60351360e-01  1.26956415e+00  7.31395543e-01 -3.56075764e-01
  3.62131536e-01 -9.04499233e-01  1.35857558e+00  1.24078655e+00
  1.51617497e-01  6.84322894e-01  1.42760348e+00 -3.13159198e-01
  2.59320050e-01  1.83683615e-02  6.29426688e-02  1.45132184e-01
 -1.22693293e-02 -8.79069149e-01  7.09633589e-01 -3.28780189e-02
  4.95006770e-01 -3.55894268e-01 -6.81749344e-01 -2.62179106e-01
  3.11728537e-01  4.93297040e-01 -4.60570902e-01 -9.72406447e-01
  4.64849383e-01  4.90398593e-02  1.20757997e-01 -3.86027396e-01
  3.99361670e-01  5.90868816e-02 -9.15040612e-01 -1.88934386e-01
  5.36990643e-01  2.59037971e-01 -8.07852894e-02 -1.69225836e+00
  6.49118900e-01 -2.28798658e-01 -6.43583775e-01 -3.83058786e-02
  1.49895072e-01 -7.25560367e-01 -1.31883502e+00 -1.75936595e-01
 -3.62513810e-01 -1.49394482e-01 -6.22077703e-01 -7.06710756e-01
 -4.24531907e-01  1.06615520e+00  2.22250111e-02 -8.15321803e-01
  2.46833473e-01 -6.18204474e-03 -2.28139833e-01  5.70601165e-01
  5.90372205e-01 -3.20230365e-01 -5.25513530e-01  1.99156031e-01
 -1.39999285e-01  3.27235788e-01 -2.78662175e-01 -1.06093693e+00
 -5.20791233e-01 -3.19078892e-01  1.11506358e-01  2.36783147e-01
  6.27185702e-02  6.99535072e-01  3.53554934e-01  5.62476873e-01
  1.07440555e+00 -4.93301749e-01 -2.57460102e-02  1.52998760e-01
  4.27074403e-01 -7.53739536e-01 -1.06075525e+00  3.80531490e-01
  4.80943471e-01  1.98702633e-01  2.85610318e-01  5.07839993e-02
  4.67251599e-01  4.59725261e-02  9.61571336e-01 -3.21236163e-01
 -1.06638975e-01  4.48092282e-01  5.03710508e-01 -1.32513428e+00
  9.49946404e-01 -1.23784900e+00  2.37384439e-03 -2.60342896e-01
  5.57053089e-01  3.83131772e-01  6.45175755e-01  6.44745290e-01
 -2.67108172e-01 -2.26449028e-01  2.74211854e-01  9.96693730e-01
  7.88437128e-01 -6.10007234e-02  5.99255621e-01  1.91668823e-01
 -2.18488276e-01  2.78902590e-01  1.05678546e+00  3.72985184e-01
  9.76215601e-01 -2.72455394e-01 -5.60558736e-01  2.97697127e-01
  2.28190705e-01  1.22641608e-01 -6.29107893e-01 -3.62217098e-01
 -5.76749444e-01  3.89705807e-01  5.50880432e-02 -2.80460745e-01
  6.40749708e-02  7.44460762e-01  2.66652107e-01  1.08489072e+00
  2.13331401e-01  2.79970944e-01  1.45746148e+00 -9.72874522e-01
 -2.46427611e-01  6.38134837e-01  2.42268518e-01 -2.69827247e-01
 -1.32559747e-01  6.00429714e-01  5.30968726e-01  2.87710547e-01
 -4.55076993e-01  1.00547338e+00  6.22559547e-01 -2.18202636e-01
 -1.85483086e+00  3.79099011e-01 -5.12390956e-02 -2.60300130e-01
 -3.12334836e-01  5.96231520e-02  9.03849185e-01  8.75706613e-01
 -1.89183936e-01  4.17752653e-01 -2.47213215e-01 -6.86338544e-01
  5.56243777e-01 -1.63477823e-01  2.52487138e-02 -4.18893367e-01
 -1.93849742e-01  1.29687858e+00  7.45943844e-01 -7.78919235e-02
 -4.98386025e-02 -3.58973294e-01  4.48347658e-01 -1.72175348e-01
  2.56847590e-01  6.10598743e-01 -1.27591395e+00  4.48672026e-01
 -5.33771694e-01  8.54255795e-01 -9.82223868e-01 -1.86485901e-01
 -5.13136327e-01  1.06957686e+00 -3.75542641e-01  7.34213710e-01
 -1.29435398e-02 -4.66458142e-01 -1.57279640e-01 -1.33646810e+00
  2.49753758e-01  1.02730501e+00  1.29970396e+00 -1.79631084e-01
 -1.78066060e-01 -9.97601748e-02  1.45249581e+00 -1.33077872e+00
  2.03067929e-01 -5.76364517e-01  1.55709043e-01  6.53919041e-01
  9.66904938e-01  5.31799436e-01  1.14253986e+00  1.69194624e-01
  7.26992428e-01 -8.44909132e-01 -1.33965266e+00  1.33417040e-01
  3.83030266e-01 -5.87780714e-01 -2.79000819e-01 -1.85759887e-01
 -2.80934647e-02 -5.91772124e-02 -2.80452549e-01  8.24826509e-02
 -2.67031014e-01  1.45427394e+00  1.39315093e+00 -3.81184042e-01
  1.83732793e-01 -1.99668825e-01 -6.53849185e-01  6.22759819e-01
 -5.70567131e-01 -6.96975887e-01 -1.20153919e-01 -6.14278436e-01
 -5.96600294e-01 -1.11369893e-01 -6.61035538e-01  4.24851239e-01
 -1.31509924e+00 -4.47400361e-01 -6.48909926e-01  7.19022214e-01
 -1.55780226e-01 -7.74746895e-01 -5.99145234e-01  5.00549555e-01
  3.44083518e-01 -2.33866051e-01 -1.32876480e+00 -1.52961636e+00].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.