# Embedding generation
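This notebook generates ESM-2 (`esm2_t33_650M_UR50D`) embeddings for each unique protein (one per `Uniprot_ID`) in the osmotic-stress dataset and saves them, together with the sequence table, to a pickle file.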

In [None]:
import esm
import torch
import pandas as pd
import tqdm
import tqdm
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pretrained ESM-2 model (esm2_t33_650M_UR50D) together with its
# alphabet; the alphabet later supplies the batch converter used to tokenise
# the sequences.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# Switch the model to evaluation mode before inference. As the last expression
# of the cell, this call also displays the module structure.
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [None]:
# Load the already-filtered sequence table. Only the columns "Uniprot_ID",
# "full_sequence" and "Qvalue(LiP)" are used by the following cells.
df = pd.read_csv('data/OsmoticStress_with_binary_positions_padded_5000.csv')

In [None]:
# Keep only the columns needed for embedding generation and re-number the rows.
# reset_index returns a new frame, so nothing is mutated in place on a slice
# of df and the cell can be re-run safely.
df_clean = df[["Uniprot_ID", "full_sequence", "Qvalue(LiP)"]].reset_index(drop=True)

In [None]:
# Fix type of full_sequence and remove NaN values.
# NaN is a float, so every row whose full_sequence is a float is dropped.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df_clean = df_clean[
    ~df_clean['full_sequence'].apply(lambda x: isinstance(x, float))
].copy()
df_clean['full_sequence'] = df_clean['full_sequence'].astype(str)

In [None]:
# Retrieve sequences for unique protein IDs to avoid generating duplicate embeddings
# (the first row of each Uniprot_ID is kept).
# .copy() detaches the result from df_clean: later cells add columns to
# df_clean_dropped, which otherwise raises SettingWithCopyWarning.
df_clean_dropped = df_clean.drop_duplicates(subset='Uniprot_ID', keep='first').copy()

In [None]:
# Display the de-duplicated frame (one row per Uniprot_ID) as a sanity check
df_clean_dropped

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
1,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,0.003686
2,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,0.003686
3,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,0.003686
5,Q04305,MSTARPRIITSKAPLLPQQTTPEQRYWRQYTSAQLVKEHNSVTHIS...,0.005126
...,...,...,...
4957,P36000,MPPLDKRIKKFLKDSIRIAPKISGKGELSELRTGLVSQYPQTRKDA...,0.051109
4959,P80428,MSNAALQVYGGDEVSAVVIDPGSYTTNIGYSGSDFPQSILPSVYGK...,0.051109
4964,P33441,MPLSQKQIDQVRTKVHYSEVDTPFNKYLDILGKVTKLTGSIINGTL...,0.051150
4987,P34240,MEKIPRWLLFSLISSVLCILGALCVPLLSVAFDSKRNSQSKLVNYG...,0.051166


In [None]:
def generate_embeddings(model, alphabet, sequences, repr_layer=33):
    """
    Generate token-level embeddings for each sequence, one sequence per forward pass.

    Parameters
    ----------
    model : callable
        Invoked as ``model(batch_tokens, repr_layers=[repr_layer])``; must return
        a mapping whose ``['representations'][repr_layer]`` entry is a tensor
        with a leading batch axis of size 1.
    alphabet : object
        Must provide ``get_batch_converter()``; the converter is called with
        ``[(label, sequence)]`` and returns ``(labels, strs, tokens)``.
    sequences : iterable of str
        Sequences to embed.
    repr_layer : int, optional
        Layer whose representations are extracted. Defaults to 33, the value
        that was previously hard-coded.

    Returns
    -------
    list of numpy.ndarray
        One array per input sequence, in input order. Each array is the
        requested representation with only the batch axis removed, so it has
        one row per token produced by the batch converter.
    """
    embeddings = []

    batch_converter = alphabet.get_batch_converter()

    # Run on whatever device the model's parameters live on, so the function
    # also works when the caller has moved the model off the CPU. Objects
    # without parameters are called with the tokens left where they are.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = None

    for sequence in tqdm.tqdm(sequences, desc="Generating Embeddings"):
        data = [(0, sequence)]
        batch_labels, batch_strs, batch_tokens = batch_converter(data)
        if device is not None:
            batch_tokens = batch_tokens.to(device)

        # Generate embeddings (inference only, no gradients needed)
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[repr_layer])
            token_embeddings = results['representations'][repr_layer]

        # Create full embeddings: drop only the batch axis (a bare squeeze()
        # would also drop any other axis of size 1) and bring the tensor back
        # to host memory before converting to numpy.
        full_embedding = token_embeddings.squeeze(0).cpu().numpy()
        embeddings.append(full_embedding)

    return embeddings

In [None]:
def pad_sequence(sequence, target_length=1000, pad_token='<pad>'):
    """
    Right-pad a sequence string with pad tokens.

    One ``pad_token`` is appended for every position by which ``sequence`` is
    shorter than ``target_length`` (1000 by default). A sequence that is
    already at least ``target_length`` long is returned unchanged; nothing is
    ever truncated.
    """
    missing = target_length - len(sequence)
    # range() of a non-positive number is empty, so no padding is added then.
    padding = "".join(pad_token for _ in range(missing))
    return sequence + padding

# Store the <pad>-extended version of every sequence in a new column
# (pad_sequence is used with its default target length and pad token)
df_clean_dropped['padded_sequence'] = df_clean_dropped['full_sequence'].apply(pad_sequence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Check if the padding is correct: the original length plus the number of
# appended <pad> tokens should equal the expected padded length.
df_clean_dropped['full_sequence_length'] = df_clean_dropped['full_sequence'].apply(len)
df_clean_dropped['num_pad_tokens'] = df_clean_dropped['padded_sequence'].apply(lambda x: x.count('<pad>'))
df_clean_dropped['padded_sequence_length'] = df_clean_dropped['full_sequence_length'] + df_clean_dropped['num_pad_tokens']
df_clean_dropped['expected_padded_length'] = 1000
# The columns above only record the numbers, so actually compare them: display
# how many rows deviate from the expected length (0 means every sequence is
# padded correctly). Sequences longer than the target are not truncated by
# pad_sequence and are counted here.
(df_clean_dropped['padded_sequence_length'] != df_clean_dropped['expected_padded_length']).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [None]:
# Re-number the rows, then keep only the rows from position 1000 onwards; the
# recorded run below embeds the 476 sequences that remain.
# .copy() detaches the slice from its parent so that later column assignments
# do not raise SettingWithCopyWarning.
# NOTE: this cell rebinds df_clean_dropped, so running it a second time drops
# another 1000 rows. Re-run the cells above first.
df_clean_dropped = df_clean_dropped.reset_index(drop=True).iloc[1000:].copy()

In [None]:
# Generate the embeddings!
# One forward pass is made per padded sequence; the recorded run took about
# 1h14m for 476 sequences (see the progress bar output of this cell).
sequences = df_clean_dropped['padded_sequence'].tolist()
embeddings = generate_embeddings(model, alphabet, sequences)

Generating Embeddings: 100%|██████████| 476/476 [1:13:50<00:00,  9.31s/it]


In [None]:
# df_test_dropped is not defined in any of the visible cells, so on a fresh
# kernel this cell raised NameError. Build it from df_clean_dropped, the frame
# the sequences were taken from, so rows and embeddings line up one-to-one.
# .copy() also avoids SettingWithCopyWarning on the assignment.
df_test_dropped = df_clean_dropped.copy()
# Embeddings are stored as nested Python lists, one per row.
df_test_dropped['full_embedding'] = [e.tolist() for e in embeddings]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Save embeddings: pickle the whole frame, including the full_embedding
# column, using pickle protocol 4.
df_test_dropped.to_pickle('data/embeddings_new_3_3.pkl', protocol=4)