# Deep Learning Model
We aim to predict the LiP_norm value (the `Log2FC(LiP_norm)` column) of a peptide from its binary position vector and the full embedding.

## Dataset preparation

In [1]:
import json
import pickle
import pandas as pd

def load_and_process_embedding(file_path):
    """
    Read one pickled embeddings file and return it as a DataFrame.

    The unpickled object is wrapped in a DataFrame, indexed by its
    'Uniprot_ID' column, and returned without the 'full_sequence' column.

    NOTE: pickle.load can execute arbitrary code while loading, so only
    pass files that come from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        raw_records = pickle.load(handle)
    return (
        pd.DataFrame(raw_records)
        .set_index('Uniprot_ID')
        .drop(columns=['full_sequence'])
    )

# Embedding files to load. Only the first file is enabled; un-comment the
# other entries to include them as well.
embedding_files = [
    'data/embeddings_new_3_1.pkl',
    # 'data/embeddings_new_3_2.pkl',
    # 'data/embeddings_new_3_3.pkl',
]

# Load every file first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
loaded_frames = []
for file_path in embedding_files:
    print(file_path)
    current_embeddings = load_and_process_embedding(file_path)
    loaded_frames.append(current_embeddings)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame (the value this cell used to start from) when no file is listed.
embeddings = pd.concat(loaded_frames) if loaded_frames else pd.DataFrame()

data/embeddings_new_3_1.pkl


In [2]:
# Copy of `embeddings` with the index (set to 'Uniprot_ID' by
# load_and_process_embedding) moved back into a regular column.
embeddings_ = embeddings.reset_index()

In [6]:
# embeddings_filter = embeddings_[embeddings_['Uniprot_ID'] == 'P32485']

In [9]:
# embeddings_filter.reset_index(drop=True, inplace=True)
# embeddings_filter.to_csv('./data/P32485_peptide.csv')

In [8]:
import numpy as np


def _drop_bos_eos(embedding):
    """Return `embedding` as an array without its first and last rows."""
    return np.array(embedding)[1:-1, :]


# Each stored embedding has one extra row at the start and one at the end for
# the BOS and EOS tokens, so both are removed here. From the model's
# supplementary material
# (https://www.science.org/doi/suppl/10.1126/science.ade2574/suppl_file/science.ade2574_sm.pdf):
# "We used BOS and EOS tokens to signal the beginning and end of a real
# protein, to allow the model to separate a full-sized protein from a cropped one."
# NOTE(review): assumes axis 0 of every 'full_embedding' entry is the token axis — confirm.
trimmed_embeddings = list(map(_drop_bos_eos, embeddings['full_embedding']))
trimmed_embeddings_df = pd.DataFrame(
    data={'embeddings': trimmed_embeddings},
    index=embeddings.index,
)

In [9]:
# Preview the first rows to confirm the trimmed embeddings are ready to be used
trimmed_embeddings_df.head()

Unnamed: 0_level_0,embeddings
Uniprot_ID,Unnamed: 1_level_1
Q04739,"[[0.01952355168759823, 0.0036598031874746084, ..."
P53172,"[[0.06398601084947586, -0.04467635229229927, -..."
P33298,"[[0.04065754637122154, -0.021706463769078255, ..."
Q12045,"[[0.0963035449385643, -0.10714740306138992, -0..."
Q03264,"[[0.013744623400270939, 0.028120659291744232, ..."


### Read Binary position data
Read the binary position data and target values from the CSV file generated in notebook 1. The data needs no further preparation: the binary positions are already padded to a fixed length of one thousand (note: the file name says `padded_5000` — verify which length applies).

In [10]:
# Get the dataset
df = pd.read_csv('data/OsmoticStress_with_binary_positions_padded_5000.csv')
df['Binary_Positions'] = df['Binary_Positions'].apply(json.loads)
df['Padded_Binary_Positions'] = df['Padded_Binary_Positions'].apply(json.loads)
df.drop(columns=['full_sequence', 'Peptide_sequence', "Binary_Positions"], inplace=True)
df.set_index('Uniprot_ID', inplace=True)

df.head()

Unnamed: 0_level_0,index,Log2FC(LiP_norm),Qvalue(LiP),Padded_Binary_Positions
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P15703,0,-2.176707,0.003686,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P06169,101,2.264383,0.003686,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P38174,18,1.216913,0.003686,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P00359,280,2.790874,0.003686,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
P0CH08;P0CH09,331,-0.647026,0.004177,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Count how many rows each Uniprot_ID has in the full table.
# reset_index turns the 'Uniprot_ID' index back into a column so that
# value_counts can be called on it; results are sorted by descending count.
df_ = df.reset_index()
counts = df_['Uniprot_ID'].value_counts()
print(counts)

P32324    116
P02994    108
P10081     40
P06169     39
P00549     36
         ... 
P32466      1
P38066      1
Q04223      1
Q12513      1
P47137      1
Name: Uniprot_ID, Length: 1490, dtype: int64


### Prepare training dataset
Merge the trimmed embeddings and the padded binary positions into one table, joined on `Uniprot_ID`.

In [12]:
# dataset = df.merge(trimmed_embeddings_df, left_index=True, right_index=True)
# Inner join on the shared index: rows of `df` whose index value has no match
# in `trimmed_embeddings_df` are dropped, so only rows with a loaded embedding remain.
dataset = df.merge(trimmed_embeddings_df, left_index=True, right_index=True, how='inner')
# The merged frame keeps all columns of `df` and adds the 'embeddings' column;
# the printed shape gives the resulting number of rows and columns.
# print(dataset.head())
print(dataset.shape)

(637, 5)


In [13]:
# Count how many rows each Uniprot_ID has after the merge.
# NOTE: this overwrites the `counts` variable computed earlier for `df`.
dataset_ = dataset.reset_index()
counts = dataset_['Uniprot_ID'].value_counts()
print(counts)

P15624    9
P22203    6
P00927    6
P32356    5
Q12154    5
         ..
P40099    1
P40091    1
P40078    1
P40056    1
O13547    1
Name: Uniprot_ID, Length: 455, dtype: int64


In [14]:
# Display the merged dataset (pandas truncates the rendered rows).
dataset

Unnamed: 0_level_0,index,Log2FC(LiP_norm),Qvalue(LiP),Padded_Binary_Positions,embeddings
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
O13547,28401,0.240404,0.049373,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.020321033895015717, 0.057242896407842636, ..."
P00572,28283,0.485887,0.049034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.06270206719636917, -0.10046922415494919, -..."
P00812,24722,0.476236,0.039900,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.1953696757555008, -0.1298731565475464, 0.0..."
P00812,24723,-0.530722,0.046914,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[[0.1953696757555008, -0.1298731565475464, 0.0..."
P00927,24626,0.541523,0.039875,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.10202319175004959, 0.04706750065088272, -0..."
...,...,...,...,...,...
Q99258,24759,0.157271,0.044713,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.018650177866220474, 0.04459123685956001, ..."
Q99258,24760,-0.516572,0.046971,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.018650177866220474, 0.04459123685956001, ..."
Q99258,24761,-0.535843,0.049788,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.018650177866220474, 0.04459123685956001, ..."
Q99321,25764,0.189942,0.042230,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.04402398690581322, -0.057297028601169586, ..."


In [15]:
# Persist the merged dataset (targets, padded binary positions and embeddings)
# as a pickle file.
dataset.to_pickle('data/dataset_3.pkl')