# Deep Learning Model
We are aiming to predict the Lip_norm parameter based on the binary positions and full embedding of a peptide.

## Dataset preparation
Read the dataframed with the embeddings and binary positions, and then format them.

In [1]:
import pandas as pd
import pickle

In [2]:
import json

# Get the embeddings
with open('data/embeddings_test_1.pkl', 'rb') as file:
    embeddings = pickle.load(file)
embeddings.set_index('Uniprot_ID', inplace=True)
embeddings = embeddings.drop(columns=['full_sequence'])

# Get the dataset
df = pd.read_csv('data/OsmoticStress_with_binary_positions.csv')
df['Binary_Positions'] = df['Binary_Positions'].apply(json.loads)
df.drop(columns=['full_sequence', 'Peptide_sequence'], inplace=True)
#df["full_sequence"]= df["full_sequence"].astype(str)
df.set_index('Uniprot_ID', inplace=True)

# Merge the two and get the final dataset
dataset = df.merge(embeddings, left_index=True, right_index=True)

In [3]:
# We can see that we have only the columns: 'Lip_norm', 'Binary Position', 'Embeddings'
dataset.head()

Unnamed: 0_level_0,Log2FC(LiP_norm),Binary_Positions,full_embedding
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O13516,4.378006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,..."
O13516,3.345066,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,..."
O13516,3.458429,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,..."
O13516,3.607978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,..."
O13516,4.070368,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,..."


In [4]:
import re

# Both columns are strings, we need to convert them to lists of integers so we can use them later
def convert_to_list(number_string):
    cleaned_string = re.sub(r"[^\d,\s]", "", number_string)
    return list(map(int, cleaned_string.split(',')))

#dataset['Binary_Positions'] = dataset['Binary_Positions'].apply(convert_to_list)

In [5]:
def matrix_shape_lists(x):
   return len(x), len(x[0])

dataset['Binary_Positions_shape'] = dataset['Binary_Positions'].apply(len)
dataset['full_embedding_shape'] = dataset['full_embedding'].apply(matrix_shape_lists)

In [6]:
dataset

Unnamed: 0_level_0,Log2FC(LiP_norm),Binary_Positions,full_embedding,Binary_Positions_shape,full_embedding_shape
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
O13516,4.378006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,...",197,"(199, 1280)"
O13516,3.345066,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,...",197,"(199, 1280)"
O13516,3.458429,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,...",197,"(199, 1280)"
O13516,3.607978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,...",197,"(199, 1280)"
O13516,4.070368,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.060491494834423065, -0.004284240305423737,...",197,"(199, 1280)"
...,...,...,...,...,...
Q6Q547,-0.050568,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.04242077097296715, 0.0044928123243153095, ...",58,"(60, 1280)"
Q6Q547,0.543480,"[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[[0.04242077097296715, 0.0044928123243153095, ...",58,"(60, 1280)"
Q6Q547,-0.153268,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.04242077097296715, 0.0044928123243153095, ...",58,"(60, 1280)"
Q6Q547,-0.113537,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.04242077097296715, 0.0044928123243153095, ...",58,"(60, 1280)"


## Creating combined embedding
Now that we have the dataframe processed, we wil first create the embeddings of the binary position, and then sum them up with the peptide embeding.

In [7]:
import numpy as np
trimmed_embeddings = [np.array(embedding)[1:-1, :] for embedding in dataset['full_embedding']]

In [None]:
max_length = max(embedding.shape[0] for embedding in trimmed_embeddings)
print(max_length)

1090


In [None]:
import torch
import torch.nn as nn
import numpy as np

padded_embeddings = [np.pad(embedding, ((0, max_length - embedding.shape[0]), (0, 0)), 
                             mode='constant', constant_values=0)
                     for embedding in trimmed_embeddings]
padded_embeddings_array = np.array(padded_embeddings, dtype=np.float32)
full_embedding = torch.tensor(padded_embeddings_array)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
full_embedding.shape

torch.Size([395, 1090, 1280])

In [None]:
binary_positions = dataset['Binary_Positions']
padded_binary_positions = [
    np.pad(position, (0, max_length - len(position)), 
           mode='constant', constant_values=0)
    for position in binary_positions
]
padded_binary_positions_array = np.array(padded_embeddings, dtype=np.float32)
full_binary_positions = torch.tensor(padded_embeddings_array)

In [None]:
full_binary_positions.shape

torch.Size([395, 1090, 1280])

In [None]:
combined_embeddings = full_embedding + full_binary_positions

In [None]:
# Select the target variable Lip_norm as the output y
y_list = np.array(dataset['Log2FC(LiP_norm)'])
y = torch.tensor(y_list, dtype=torch.float32)

In [None]:
y.shape

torch.Size([395])

## Model Training
Now that we have both input and target data, we can create our model.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import copy
from torch.utils.data import TensorDataset, DataLoader, random_split

dataset = TensorDataset(combined_embeddings, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model definition based on suggestion
class PeptideRegressor(nn.Module):
    def __init__(self, input_dim, num_heads, dropout_rate):
        super(PeptideRegressor, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(input_dim, 1) 

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        transposed = attn_output.transpose(1, 2)
        pooled = self.pooling(transposed)
        flattened = pooled.squeeze(-1)
        output = self.fc(flattened)
        return output


# Initialize the model
input_dim = 1280
num_heads = 4    # number of attention heads
dropout_rate = 0.1
model = PeptideRegressor(input_dim=input_dim, num_heads=num_heads, dropout_rate=dropout_rate)

# Loss and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training settings
n_epochs = 5
batch_size = 32
best_val_loss = float('inf')
train_losses = []
val_losses = []

# Training loop
for epoch in range(n_epochs):
    model.train()
    train_losses = []
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    print(f'Epoch {epoch+1}, Train Loss: {sum(train_losses) / len(train_losses)}')

    model.eval()
    val_losses = []
    for X_batch, y_batch in test_loader:
        with torch.no_grad():
            y_pred = model(X_batch)
            mse = loss_fn(y_pred, y_batch.unsqueeze(1))
            mae = nn.L1Loss()(y_pred, y_batch.unsqueeze(1))
            val_losses.append(mse.item())
    avg_val_loss = sum(val_losses) / len(val_losses)
    print(f'Epoch {epoch+1}, Test MSE: {avg_val_loss}, Test MAE: {mae.item()}')
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model = copy.deepcopy(model.state_dict())

Epoch 1, Train Loss: 0.5057569712400436
Epoch 1, Test MSE: 0.34749752779801685, Test MAE: 0.34432071447372437
Epoch 2, Train Loss: 0.5082614421844482
Epoch 2, Test MSE: 0.3375875651836395, Test MAE: 0.32637590169906616
Epoch 3, Train Loss: 0.4870763003826141
Epoch 3, Test MSE: 0.3441019058227539, Test MAE: 0.33810997009277344


KeyboardInterrupt: 

In [None]:
# plot the training and validation losses
import matplotlib.pyplot as plt

plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

: 

In [None]:
# Test model in test set
# Just run one time!

model.load_state_dict(best_model)
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test)
    test_mse = loss_fn(y_test_pred, y_test)
    print(f'Final Test MSE: {test_mse.item()}')

Final Test MSE: 0.5424415469169617
