# Deep Learning Model
We are aiming to predict the Lip_norm parameter based on the binary positions and full embedding of a peptide.

## Dataset preparation
Read the dataframes with the embeddings and binary positions, and then format them.

In [1]:
import json
import pickle
import pandas as pd

# Load the first embeddings shard.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('data/embeddings_test_1-003.pkl', 'rb') as handle:
    embeddings = pickle.load(handle)

# The remaining shards (data/embeddings_test_2.pkl, data/embeddings_test_3.pkl,
# data/embeddings_test_4.pkl) are currently not loaded; only the first shard is used.

# Index the table by protein ID and discard the raw sequence column.
embeddings = embeddings.set_index('Uniprot_ID').drop(columns=['full_sequence'])

In [2]:
# Preview the embeddings table, now indexed by Uniprot_ID
embeddings

Unnamed: 0_level_0,full_embedding
Uniprot_ID,Unnamed: 1_level_1
P15703,"[[0.10351670533418655, -0.04619598016142845, 0..."
P38174,"[[0.017678968608379364, -0.011685207486152649,..."
P26637,"[[0.04360557720065117, 0.039450906217098236, 0..."
P06169,"[[0.06486686319112778, -0.018988797441124916, ..."
P00359,"[[0.05060335993766785, -0.002109627239406109, ..."
...,...
P40991,"[[0.05226866900920868, -0.04464501142501831, 0..."
P53633,"[[0.044984687119722366, -0.01916404627263546, ..."
Q01217,"[[0.08891955018043518, -0.039793942123651505, ..."
Q12109,"[[0.059575989842414856, -0.019570698961615562,..."


In [3]:
import numpy as np

# Drop the first and the last row of every per-protein embedding matrix
# (presumably the special start/end tokens — TODO confirm against the embedding model).
trimmed_embeddings = []
for embedding in embeddings['full_embedding']:
    trimmed_embeddings.append(np.array(embedding)[1:-1, :])

# Re-wrap as a single-column frame that reuses the index of `embeddings`.
trimmed_embeddings_df = pd.DataFrame(
    data={'embeddings': trimmed_embeddings},
    index=embeddings.index,
)

In [4]:
# Preview the trimmed embeddings (first and last row of each matrix removed)
trimmed_embeddings_df

Unnamed: 0_level_0,embeddings
Uniprot_ID,Unnamed: 1_level_1
P15703,"[[-0.0014747204259037971, -0.04175397753715515..."
P38174,"[[0.09441959857940674, 0.05087737366557121, -0..."
P26637,"[[0.07177527993917465, 0.05565423518419266, 0...."
P06169,"[[0.05485401302576065, 0.05323461815714836, -0..."
P00359,"[[0.0364735871553421, -0.02804701030254364, -0..."
...,...
P40991,"[[0.10278788208961487, 0.003414804581552744, -..."
P53633,"[[0.05683088302612305, -0.03139161318540573, -..."
Q01217,"[[0.13987456262111664, -0.090286985039711, -0...."
Q12109,"[[0.10665954649448395, 0.044686008244752884, -..."


In [5]:
# Keep only the proteins whose trimmed embedding has fewer than 1000 rows
row_counts = trimmed_embeddings_df['embeddings'].apply(lambda matrix: matrix.shape[0])
trimmed_size_embeddings_df = trimmed_embeddings_df[row_counts < 1000]

In [6]:
# Preview the embeddings that passed the row-count filter
trimmed_size_embeddings_df

Unnamed: 0_level_0,embeddings
Uniprot_ID,Unnamed: 1_level_1
P15703,"[[-0.0014747204259037971, -0.04175397753715515..."
P38174,"[[0.09441959857940674, 0.05087737366557121, -0..."
P06169,"[[0.05485401302576065, 0.05323461815714836, -0..."
P00359,"[[0.0364735871553421, -0.02804701030254364, -0..."
P37292,"[[0.1292525678873062, 0.10085765272378922, -0...."
...,...
P40991,"[[0.10278788208961487, 0.003414804581552744, -..."
P53633,"[[0.05683088302612305, -0.03139161318540573, -..."
Q01217,"[[0.13987456262111664, -0.090286985039711, -0...."
Q12109,"[[0.10665954649448395, 0.044686008244752884, -..."


In [7]:
max_length = 1000

# Zero-pad every embedding at the end of its first axis so that all matrices
# end up with exactly max_length rows; the second axis is left untouched.
padded_embeddings = []
for embedding in trimmed_size_embeddings_df['embeddings']:
    rows_to_add = max_length - embedding.shape[0]
    padded_embeddings.append(
        np.pad(embedding, ((0, rows_to_add), (0, 0)), mode='constant', constant_values=0)
    )

In [8]:
# Wrap the padded matrices in a frame aligned to the index of the filtered embeddings
padded_embeddings_df = pd.DataFrame(
    data={'embeddings': padded_embeddings},
    index=trimmed_size_embeddings_df.index,
)

In [9]:
# Load the peptide-level dataset.
# Only 'Padded_Binary_Positions' is decoded from JSON: 'Binary_Positions' is dropped
# right below, so decoding it as well would be wasted work.
df = pd.read_csv('data/OsmoticStress_with_binary_positions_padded.csv')
df['Padded_Binary_Positions'] = df['Padded_Binary_Positions'].apply(json.loads)

# Remove the columns that are not used downstream and index the rows by protein ID.
# Built as a chain (no inplace=True) so the result does not depend on prior mutation.
df = (
    df.drop(columns=['full_sequence', 'Peptide_sequence', 'Binary_Positions'])
      .set_index('Uniprot_ID')
)

df

Unnamed: 0_level_0,Log2FC(LiP_norm),Padded_Binary_Positions
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
P15703,-2.176707,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P15703,0.285029,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P15703,-0.498240,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P15703,0.348193,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P15703,-0.434615,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
P38887,-0.008441,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P53093,0.006678,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
Q04772,-0.015149,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
P21192,-0.010109,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# Inner-join on the index of both frames: rows of `df` without a matching
# padded embedding are dropped, all others gain an 'embeddings' column.
dataset = pd.merge(df, padded_embeddings_df, how='inner', left_index=True, right_index=True)

In [11]:
# The merged frame has exactly three columns:
# 'Log2FC(LiP_norm)', 'Padded_Binary_Positions' and 'embeddings'
dataset.head()

Unnamed: 0_level_0,Log2FC(LiP_norm),Padded_Binary_Positions,embeddings
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O13516,4.378006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.345066,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.458429,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.607978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,4.070368,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."


In [12]:
import re

# Helper for columns that store a list of integers as a string
def convert_to_list(number_string):
    """Parse a stringified list of non-negative integers, e.g. "[0, 1, 1]".

    Every character other than digits, commas and whitespace is stripped first
    (so brackets and signs are discarded); the remaining comma-separated tokens
    are then converted with int().

    Args:
        number_string: Text such as "[0, 1, 1, 0]".

    Returns:
        A list of ints. Empty tokens are skipped, so "[]", "" and a trailing
        comma yield the parsed values (possibly an empty list) instead of
        raising ValueError.
    """
    cleaned_string = re.sub(r"[^\d,\s]", "", number_string)
    # str.split(',') yields '' for an empty list or a trailing comma; int('') would fail.
    return [int(token) for token in cleaned_string.split(',') if token.strip()]

#dataset['Binary_Positions'] = dataset['Binary_Positions'].apply(convert_to_list)

In [13]:
# def matrix_shape_lists(x):
#    return len(x), len(x[0])
# 
# dataset['Binary_Positions_shape'] = dataset['Binary_Positions'].apply(len)
# dataset['full_embedding_shape'] = dataset['embeddings'].apply(matrix_shape_lists)

In [14]:
# Preview the merged dataset (one row per measurement, indexed by Uniprot_ID)
dataset

Unnamed: 0_level_0,Log2FC(LiP_norm),Padded_Binary_Positions,embeddings
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O13516,4.378006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.345066,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.458429,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,3.607978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
O13516,4.070368,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.011881312355399132, -0.09332677721977234, ..."
...,...,...,...
Q6Q547,-0.050568,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.03893417492508888, -0.04581452161073685, -..."
Q6Q547,0.543480,"[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[[0.03893417492508888, -0.04581452161073685, -..."
Q6Q547,-0.153268,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.03893417492508888, -0.04581452161073685, -..."
Q6Q547,-0.113537,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.03893417492508888, -0.04581452161073685, -..."


In [15]:
# Keep only the first 3000 rows (positional slice).
# NOTE(review): presumably a cap to limit tensor size — confirm before a full run.
dataset = dataset.iloc[:3000]

In [16]:
import torch
import numpy as np

# 'dataset' is the merged DataFrame; use the GPU when one is available, else the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stack each column's per-row values into one float32 NumPy array and create the
# tensor directly on `device` (which is the CPU when CUDA is unavailable).
embeddings_tensor = torch.tensor(np.array(dataset['embeddings'].tolist(), dtype=np.float32), device=device)
binary_positions_tensor = torch.tensor(np.array(dataset['Padded_Binary_Positions'].tolist(), dtype=np.float32), device=device)

In [17]:
# Sanity-check the tensor shapes before combining them
print(embeddings_tensor.shape)
print(binary_positions_tensor.shape)

torch.Size([3000, 1000, 1280])
torch.Size([3000, 1000])


In [18]:
# Append a trailing singleton axis to the position mask
binary_positions_tensor = torch.unsqueeze(binary_positions_tensor, dim=-1)

In [19]:
# Add the position mask onto the embeddings *in place*: add_ broadcasts the mask's
# trailing singleton axis and writes into embeddings_tensor's own storage, so no
# second full-size tensor is allocated (the previous `a + b` form allocated one).
# X aliases embeddings_tensor afterwards — do not re-run this cell without
# rebuilding the tensors, or the mask would be added a second time.
X = embeddings_tensor.add_(binary_positions_tensor)

In [20]:
# Regression target: the Log2FC(LiP_norm) column as a float32 tensor on `device`
target_values = np.array(dataset['Log2FC(LiP_norm)'].tolist(), dtype=np.float32)
y = torch.tensor(target_values, device=device)

# Drop the names of the intermediate tensors, then ask PyTorch to release
# any cached GPU blocks that are no longer referenced.
del binary_positions_tensor, embeddings_tensor
torch.cuda.empty_cache()


In [21]:
# Release the source DataFrames to free host memory
del embeddings, df


## Model Training
Now that we have both input and target data, we can create our model.

In [31]:
import torch
# Show the installed PyTorch build
print(torch.__version__)


1.3.1+cu92


In [23]:
# Show the selected compute device
print(device)

cuda


In [24]:
# Confirm inputs and targets agree on the number of samples (first dimension)
print(X.shape)
print(y.shape)

torch.Size([3000, 1000, 1280])
torch.Size([3000])


In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import copy
from torch.utils.data import TensorDataset, DataLoader, random_split

# Seed PyTorch's RNG so the random train/test split and the shuffling order
# are reproducible across kernel restarts.
torch.manual_seed(42)

# Pair inputs with targets and split them 80 % / 20 % into train and test subsets
dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Only the training batches are shuffled; evaluation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Attention-based regressor: self-attention over positions, mean-pool, linear head
class PeptideRegressor(nn.Module):
    """Predict one scalar per sequence from per-position feature vectors.

    Args:
        input_dim: Size of each position's feature vector; used as the attention
            embed_dim and as the input size of the final linear layer.
        num_heads: Number of attention heads.
        dropout_rate: Dropout passed to nn.MultiheadAttention.
    """

    def __init__(self, input_dim, num_heads, dropout_rate):
        super(PeptideRegressor, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=dropout_rate)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Run the model on a batch-first input.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, 1).
        """
        # nn.MultiheadAttention (as constructed here) expects (seq_len, batch, embed_dim).
        # Feeding the batch-first tensor directly made it attend across the samples
        # of a batch instead of across the positions of each sequence.
        seq_first = x.transpose(0, 1)
        attn_output, _ = self.attention(seq_first, seq_first, seq_first)
        # (seq_len, batch, dim) -> (batch, dim, seq_len) so the pooling averages over positions.
        # NOTE(review): zero-padded positions still take part in attention and in the average.
        channels_first = attn_output.permute(1, 2, 0).contiguous()
        pooled = self.pooling(channels_first)
        flattened = pooled.squeeze(-1)
        output = self.fc(flattened)
        return output

# Model hyper-parameters
input_dim = 1280      # feature size of each position
num_heads = 16        # number of attention heads
dropout_rate = 0.1
model = PeptideRegressor(input_dim, num_heads, dropout_rate).to(device)

# Objective and optimiser
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training settings and per-epoch history
n_epochs = 5
batch_size = 64  # NOTE(review): not referenced in this cell; the loaders are built with batch_size=32
best_val_loss = float('inf')
train_losses_avg, val_losse_avg = [], []

# Training loop: each epoch runs one optimisation pass over train_loader followed
# by one evaluation pass over test_loader. The per-epoch averages (mean of the
# per-batch MSE values) are appended to train_losses_avg / val_losse_avg, and the
# weights of the epoch with the lowest test MSE are snapshotted in best_model.
for epoch in range(n_epochs):
    model.train()
    train_losses = []
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        # Targets are (batch,); add a trailing axis to match the (batch, 1) predictions
        loss = loss_fn(y_pred, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)
    train_losses_avg.append(avg_train_loss)
    print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss}')

    model.eval()
    val_losses = []
    # No gradients are needed for evaluation; the previously computed (and never
    # used) per-batch MAE has been removed.
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            mse = loss_fn(y_pred, y_batch.unsqueeze(1))
            val_losses.append(mse.item())
    avg_val_loss = sum(val_losses) / len(val_losses)
    val_losse_avg.append(avg_val_loss)
    print(f'Epoch {epoch+1}, Test MSE: {avg_val_loss}')
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Deep copy so later optimiser steps cannot modify the saved weights
        best_model = copy.deepcopy(model.state_dict())

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [None]:
# # Visualize the training process
import matplotlib.pyplot as plt

# Plot the per-epoch average train and test losses on a single set of axes
fig, ax = plt.subplots()
ax.plot(train_losses_avg, label='Train Loss')
ax.plot(val_losse_avg, label='Test Loss')
ax.legend()
plt.show()
