# Package imports

In [2]:
import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn import metrics
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import time
import os

import h5py


import matplotlib.pyplot as plt
%matplotlib inline


# Hyperparameters configuration

In [3]:

args = {
    'epoch_num': 200,     # Number of times that the model will see the entire dataset during training 
    'lr': 5e-5,           # Learning rate: controls the size of the step the optimizer takes towards the local minimum during training.
    'weight_decay': 5e-4, # L2 penalty: helps prevent overfitting by adding a penalty to the value of the model's weights
    'num_workers': 3, # Número de threads do dataloader.
    'num_classes' : 10, 
    'batch_size': 20,     # Tamanho do batch.
}


# Definition of the default hardware used (must be GPU??)

In [4]:
if torch.cuda.is_available():
    args['device'] = torch.device('cuda')
else:
    args['device'] = torch.device('cpu')

print(args['device'])

cpu


# Dataset

In [5]:
# Open the HDF5 file for reading
with h5py.File('split30_prott5.h5', 'r') as f:

# Load the embeddings (keys in the HDF5 file) dataset into a variable
    embeddings = list(f.keys())
    #print(embeddings)
    
# Get the length of the embeddings (number of items in the list)
    embeddings_length = len(embeddings)
    print("Number of embeddings:", embeddings_length)
     
#Print the first 10 embeddings
    for i in range(min(10, embeddings_length)):
        print(f"Embedding {i+1}:", embeddings[i])


Number of embeddings: 9204
Embedding 1: A0A024RBG1
Embedding 2: A0A024SMV2
Embedding 3: A0A060S684
Embedding 4: A0A075TXZ3
Embedding 5: A0A077K8G3
Embedding 6: A0A077YBL0
Embedding 7: A0A096ZEC9
Embedding 8: A0A096ZED0
Embedding 9: A0A0A1GNF2
Embedding 10: A0A0B0QJN8


# Train and Test splits

In [6]:
torch.manual_seed(1)
indices = torch.randperm(len(embeddings)).tolist() #list called indice with random separation of the embeddings 

train_size = int(0.8*len(embeddings)) #training dataset will be 80% 

embeddings_train = [embeddings[i] for i in indices[:train_size]]  # Select the first 80% of embeddings
embeddings_test = [embeddings[i] for i in indices[train_size:]]   # Select the last 20% of embeddings

#print(len(embeddings_train), len(embeddings_test))

# Save the training and test datasets in separate variables (lists)
train_data = embeddings_train
test_data = embeddings_test

#print(embeddings_train)
#print(embeddings_test)

# Transform list train/data into tensor

In [7]:
#Transform the list into a tensor with float data type
tensor_train = torch.tensor(train_data, dtype=torch.float32)
tensor_test = torch.tensor(test_data, dtype=torch.float32)

print(tensor_train)
print(tensor_test)


ValueError: too many dimensions 'str'

# MLP

In [12]:
class ECclassifier(nn.Module):

  def __init__(self, input_size, hidden_size, out_size):
    super(ECclassifier, self).__init__()

    self.hidden  = nn.Linear(input_size, hidden_size)
    self.relu    = nn.ReLU()
    self.out     = nn.Linear(hidden_size, out_size)
    self.softmax = nn.Softmax()

  def forward(self, X):
    
    feature = self.relu(self.hidden(X))
    output  = self.softmax(self.out(feature))

    return output

input_size  = train_data.shape[1]
hidden_size = 32
out_size    = 7

net = ECclassifier(input_size, hidden_size, out_size).to(device) #cast na GPU 
     

AttributeError: 'list' object has no attribute 'shape'

In [6]:
Fluxo de Treinamento

SyntaxError: invalid syntax (2724958817.py, line 1)

Fluxo de Validacao

In [51]:
'''
----> Função de perda de problema de classificação: 

y'= {0.12; 0.09; 0.25; 0.14}
y= {0; 0; 1; 0} ----> dog 

- Cross Entropy (entropia cruzada) / Log Loss

def CrossEntropy (yHat , y):
    if y == 1:
        return -log(yHat)
    else: 
        return -log(1 - yHat)


        ----> Medir loss ao longo das iteracoes para ver se modelo ta treinando bem ou nao 
'''

SyntaxError: unterminated string literal (detected at line 3) (2443592535.py, line 3)

In [None]:
def read_data(path_to_enzyme_esm2: str, path_to_enzyme_splitX_csv: str):
    """
    Reads in esm2 and splitX.csv
    :param path_to_enzyme_esm2: Absolute path to enzyme_esm2_splitX
    :param path_to_enzyme_splitX_csv: Absolute path to enzyme_splitX.csv
    :return: A dataframe (representing splitX.csv), headers of esm2 (list), esm2 embeddings (list)
    """

    headers = []
    embeddings = []

    with h5py.File(path_to_enzyme_esm2) as hdf_handle:
        for header, emb in hdf_handle.items():
            headers.append(header)
            embeddings.append(np.array(list(emb)))
    print("LOG: ESM2 DONE")
    
    enzyme_csv = pd.read_csv(path_to_enzyme_splitX_csv, header=0, sep=',')
    print("LOG: CSV DONE")
    print("Enzymes in SplitX.csv:", len(enzyme_csv))
    
    # TODO: Inorder to apply this line of code, I also have to find the corresponding embeddings and 
    #  drop these as well...
    # enzyme_csv = filter_unwanted_seqs(enzyme_csv, True)

    # control
    print("Embeddings:", len(embeddings))
    print("Embedding headers:", len(headers))

    return[enzyme_csv, headers, embeddings]

def add_labels(enzymes: pd.DataFrame, entry_ids: list, esm2_emb: list):
    """
    Takes in the output of read_data(), grabs the corresponding ec number out of splitX.csv (enzymes) and adds this label 
    to our esm2 embeddings
    :param enzymes: Dataframe of splitX.csv
    :param entry_ids: Headers of esm2 emeddings
    :param esm2_emb: Embeddings
    :return: A list containing tupels of (header, label, embedding)
    """

    header_labels = {} # dict with → header(id): label (1-7)
    missing_headers = set() # set containing missing headers

    header_to_ec_mapping = {}
    for entry, ec_number in zip(enzymes["Entry"], enzymes["EC number"]):
        header_to_ec_mapping[entry] = ec_number[0]  # Extract the first character

    for header in entry_ids:
        number = enzymes.loc[enzymes["Entry"] == header,"EC number"]
        try:
            ec = int(number.iat[0][0])-1 # since we always start counting from 0
            header_labels[header] = ec
        except IndexError:
            missing_headers.add(header)

    header_label_tuples = [(header, header_labels[header]) for header in entry_ids if header not in missing_headers]

    # Create a new list of embeddings with labels
    embeddings_with_labels = []

    for header, emb in zip(entry_ids, esm2_emb):
        curr_label = next((label for h, label in header_label_tuples if h == header), None)
        if curr_label is not None:
            embeddings_with_labels.append((header, curr_label, emb))

    return embeddings_with_labels