In [1]:
import esm
import torch
import pandas as pd
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loadsm the model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [3]:
df = pd.read_csv('data/OsmoticStress_with_binary_positions_padded.csv')

In [4]:
df_clean = df[["Uniprot_ID", "full_sequence"]].drop_duplicates()
df_clean.reset_index(drop=True, inplace=True)

In [5]:
df_clean.shape

(2490, 2)

In [6]:
df_clean = df_clean[~df_clean['full_sequence'].apply(lambda x: isinstance(x, float))]
df_clean['full_sequence'] = df_clean['full_sequence'].astype(str)

In [7]:
df_clean

Unnamed: 0,Uniprot_ID,full_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
1,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
2,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
3,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
5,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...
2485,P38887,MILKLVHCLVALTGLIFAKPYQQQQAVLAPSQDVPLRDIHIGDINF...
2486,P53093,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...
2487,Q04772,MSSDGMNRDVSNSKPNVRFAAPQRLSVAHPAISSPLHMPMSKSSRK...
2488,P21192,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...


In [8]:
df_full = pd.read_excel('data/OsmoticStress.xlsx')

In [9]:
# add Qvalue(LiP) information again to dataframe to get the sequences with lowest Qvalue(LiP) to create the embeddings
df_clean = pd.merge(df_clean, df_full[['Uniprot_ID', 'Qvalue(LiP)']], on='Uniprot_ID', how='left')

In [10]:
df_clean

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
1,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.034862
2,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.039093
3,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.042584
4,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.048552
...,...,...,...
27139,P38887,MILKLVHCLVALTGLIFAKPYQQQQAVLAPSQDVPLRDIHIGDINF...,0.376924
27140,P53093,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...,0.381210
27141,Q04772,MSSDGMNRDVSNSKPNVRFAAPQRLSVAHPAISSPLHMPMSKSSRK...,0.381696
27142,P21192,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...,0.382355


In [11]:
sorted_df = df_clean.sort_values(by='Qvalue(LiP)', ascending=True)
sorted_df.head()

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,0.003686
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,0.003686
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,0.003686
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...,0.005126


In [12]:
max_length = sorted_df['full_sequence'].str.len().max()
max_length

997

In [13]:
import tqdm
import numpy as np
import torch

def generate_embeddings(model, alphabet, sequences):
    """
    Function to generate the embeddings.
    """
    embeddings = []
    
    batch_converter = alphabet.get_batch_converter()
    
    for sequence in tqdm.tqdm(sequences, desc="Generating Embeddings"):
        data = [(0, sequence)]
        batch_labels, batch_strs, batch_tokens = batch_converter(data)

        # Generate embeddings
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33]) 
            token_embeddings = results['representations'][33]

        # Average embeddings across all tokens and convert to numpy
        # averaged_embedding = token_embeddings.mean(dim=1).numpy()
        # embeddings.append(averaged_embedding)

        # Create full embeddings
        full_embedding = token_embeddings.squeeze().numpy()
        embeddings.append(full_embedding)
    
    #return np.vstack(embeddings) 
    return embeddings

df_test = df_clean[:500].copy()

In [14]:
df_test

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
1,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.034862
2,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.039093
3,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.042584
4,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.048552
...,...,...,...
495,Q00055,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...,0.292011
496,Q00055,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...,0.361558
497,P49626,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...,0.007762
498,P49626,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...,0.027188


In [15]:
def pad_sequence(sequence, target_length=1000, pad_token='<pad>'):
    padding_length = target_length - len(sequence)
    return sequence + pad_token * padding_length

# Apply padding to each sequence with <pad>
df_test['padded_sequence'] = df_test['full_sequence'].apply(lambda x: pad_sequence(x))

In [16]:
df_test

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP),padded_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
1,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.034862,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
2,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.039093,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
3,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.042584,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
4,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.048552,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
...,...,...,...,...
495,Q00055,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...,0.292011,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...
496,Q00055,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...,0.361558,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAAEKPFKVTVIGSGNWG...
497,P49626,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...,0.007762,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...
498,P49626,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...,0.027188,MSRPQVTVHSLTGEATANALPLPAVFSAPIRPDIVHTVFTSVNKNK...


In [17]:
# Check if the padding is correct
df_test['full_sequence_length'] = df_test['full_sequence'].apply(len)
df_test['num_pad_tokens'] = df_test['padded_sequence'].apply(lambda x: x.count('<pad>'))
df_test['padded_sequence_length'] = df_test['full_sequence_length'] + df_test['num_pad_tokens']
df_test['expected_padded_length'] = 1000

In [18]:
# Generate the embeddings!

sequences = df_test['padded_sequence'].tolist()
embeddings = generate_embeddings(model, alphabet, sequences)

Generating Embeddings: 100%|██████████| 500/500 [1:20:04<00:00,  9.61s/it]


In [19]:
df_test['full_embedding'] = [e.tolist() for e in embeddings]

In [20]:
df_test.to_pickle('data/embeddings_new.pkl', protocol=4)