In [1]:
import esm
import torch
import pandas as pd
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loadsm the model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [3]:
df = pd.read_csv('data/OsmoticStress_with_binary_positions_padded.csv')

In [4]:
df_clean = df[["Uniprot_ID", "full_sequence"]].drop_duplicates()
df_clean.reset_index(drop=True, inplace=True)

In [5]:
df_clean.shape

(2490, 2)

In [6]:
df_clean = df_clean[~df_clean['full_sequence'].apply(lambda x: isinstance(x, float))]
df_clean['full_sequence'] = df_clean['full_sequence'].astype(str)

In [7]:
df_clean

Unnamed: 0,Uniprot_ID,full_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
1,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
2,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
3,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
5,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...
2485,P38887,MILKLVHCLVALTGLIFAKPYQQQQAVLAPSQDVPLRDIHIGDINF...
2486,P53093,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...
2487,Q04772,MSSDGMNRDVSNSKPNVRFAAPQRLSVAHPAISSPLHMPMSKSSRK...
2488,P21192,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...


In [8]:
df_full = pd.read_excel('data/OsmoticStress.xlsx')

In [9]:
# add Qvalue(LiP) information again to dataframe to get the sequences with lowest Qvalue(LiP) to create the embeddings
df_clean = pd.merge(df_clean, df_full[['Uniprot_ID', 'Qvalue(LiP)']], on='Uniprot_ID', how='left')

In [10]:
df_clean

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
1,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.034862
2,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.039093
3,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.042584
4,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.048552
...,...,...,...
27139,P38887,MILKLVHCLVALTGLIFAKPYQQQQAVLAPSQDVPLRDIHIGDINF...,0.376924
27140,P53093,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...,0.381210
27141,Q04772,MSSDGMNRDVSNSKPNVRFAAPQRLSVAHPAISSPLHMPMSKSSRK...,0.381696
27142,P21192,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...,0.382355


In [11]:
sorted_df = df_clean.sort_values(by='Qvalue(LiP)', ascending=True)
sorted_df.head()

Unnamed: 0,Uniprot_ID,full_sequence,Qvalue(LiP)
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,0.003686
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,0.003686
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,0.003686
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,0.003686
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...,0.005126


In [12]:
print(len(sorted_df['Uniprot_ID'].unique()))
print(len(sorted_df['Uniprot_ID']))


2441
27144


In [13]:
max_length = sorted_df['full_sequence'].str.len().max()
max_length

997

In [14]:
df_test = sorted_df.head(5000)
df_test = df_test[['Uniprot_ID', 'full_sequence']]

In [15]:
df_test

Unnamed: 0,Uniprot_ID,full_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...
11455,P38891,MLQRHSLKLGKFSIRTLATGAPLDASKLKITRNPNPSKPRPNEELV...
7175,P54115,MTKLHFDTAEPVKITLPNGLTYEQPTGLFINNKFMKAQDGKTYPVE...
11215,P27616,MSITKTELDGILPLVARGKVRDIYEVDAGTLLFVATDRISAYDVIM...
19163,P22336,MSSVQLSRGDFHSIFTNKQRYDNPTGGVYQVYNTRKSDGANSNRKN...


In [16]:
df_test_dropped = df_test.drop_duplicates(subset='Uniprot_ID', keep='first')

In [17]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...
23796,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...
23805,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...
23813,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...
23814,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...


In [18]:
import tqdm
import numpy as np
import torch

def generate_embeddings(model, alphabet, sequences):
    """
    Function to generate the embeddings.
    """
    embeddings = []
    
    batch_converter = alphabet.get_batch_converter()
    
    for sequence in tqdm.tqdm(sequences, desc="Generating Embeddings"):
        data = [(0, sequence)]
        batch_labels, batch_strs, batch_tokens = batch_converter(data)

        # Generate embeddings
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33]) 
            token_embeddings = results['representations'][33]

        # Average embeddings across all tokens and convert to numpy
        # averaged_embedding = token_embeddings.mean(dim=1).numpy()
        # embeddings.append(averaged_embedding)

        # Create full embeddings
        full_embedding = token_embeddings.squeeze().numpy()
        embeddings.append(full_embedding)
    
    #return np.vstack(embeddings) 
    return embeddings

In [19]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...
23796,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...
23805,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...
23813,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...
23814,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...


In [20]:
def pad_sequence(sequence, target_length=1000, pad_token='<pad>'):
    padding_length = target_length - len(sequence)
    return sequence + pad_token * padding_length

# Apply padding to each sequence with <pad>
df_test_dropped['padded_sequence'] = df_test_dropped['full_sequence'].apply(lambda x: pad_sequence(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence,padded_sequence
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...
...,...,...,...
23796,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...
23805,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...
23813,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...
23814,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...


In [22]:
# Check if the padding is correct
df_test_dropped['full_sequence_length'] = df_test_dropped['full_sequence'].apply(len)
df_test_dropped['num_pad_tokens'] = df_test_dropped['padded_sequence'].apply(lambda x: x.count('<pad>'))
df_test_dropped['padded_sequence_length'] = df_test_dropped['full_sequence_length'] + df_test_dropped['num_pad_tokens']
df_test_dropped['expected_padded_length'] = 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [23]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence,padded_sequence,full_sequence_length,num_pad_tokens,padded_sequence_length,expected_padded_length
0,P15703,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,313,687,1000,1000
36,P06169,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,563,437,1000,1000
18,P38174,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,421,579,1000,1000
215,P00359,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,332,668,1000,1000
266,P37292,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...,MFPRASALAKCMATVHRRGLLTSGAQSLVSKPVSEGDPEMFDILQQ...,490,510,1000,1000
...,...,...,...,...,...,...,...
23796,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,156,844,1000,1000
23805,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,245,755,1000,1000
23813,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,158,842,1000,1000
23814,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,451,549,1000,1000


In [24]:
df_test_dropped.reset_index(drop=True, inplace=True)
df_test_dropped = df_test_dropped[1000:]

In [25]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence,padded_sequence,full_sequence_length,num_pad_tokens,padded_sequence_length,expected_padded_length
1000,Q04739,MAGDNPENKDASMLDVSDAASNTTINGKHSADSTNEASLAYTFSQM...,MAGDNPENKDASMLDVSDAASNTTINGKHSADSTNEASLAYTFSQM...,417,583,1000,1000
1001,P53172,MPQNTRHTSIVEMLSTPPQLPNSTDLNSLSEQTDKNTEANKSDTES...,MPQNTRHTSIVEMLSTPPQLPNSTDLNSLSEQTDKNTEANKSDTES...,527,473,1000,1000
1002,P33298,MEELGIVTPVEKAVEEKPAVKSYASLLAQLNGTVNNNSALSNVNSD...,MEELGIVTPVEKAVEEKPAVKSYASLLAQLNGTVNNNSALSNVNSD...,428,572,1000,1000
1003,Q12045,MASQQNKHAFLSKNRIFHNPDNVSSSKSRNLMDITNTTNTMNGSRP...,MASQQNKHAFLSKNRIFHNPDNVSSSKSRNLMDITNTTNTMNGSRP...,647,353,1000,1000
1004,Q03264,MTQDKEVKVVAPDVAPDQEVEINKSVKDAKHQTNDDSLLQHKKKGK...,MTQDKEVKVVAPDVAPDQEVEINKSVKDAKHQTNDDSLLQHKKKGK...,515,485,1000,1000
...,...,...,...,...,...,...,...
1471,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,156,844,1000,1000
1472,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,245,755,1000,1000
1473,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,158,842,1000,1000
1474,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,451,549,1000,1000


In [26]:
# Generate the embeddings!

sequences = df_test_dropped['padded_sequence'].tolist()
embeddings = generate_embeddings(model, alphabet, sequences)

Generating Embeddings: 100%|██████████| 476/476 [1:13:50<00:00,  9.31s/it]


In [27]:
df_test_dropped['full_embedding'] = [e.tolist() for e in embeddings]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
df_test_dropped.to_pickle('data/embeddings_new_3_3.pkl', protocol=4)

In [29]:
df_test_dropped

Unnamed: 0,Uniprot_ID,full_sequence,padded_sequence,full_sequence_length,num_pad_tokens,padded_sequence_length,expected_padded_length,full_embedding
1000,Q04739,MAGDNPENKDASMLDVSDAASNTTINGKHSADSTNEASLAYTFSQM...,MAGDNPENKDASMLDVSDAASNTTINGKHSADSTNEASLAYTFSQM...,417,583,1000,1000,"[[0.04509701952338219, -0.03058936633169651, 0..."
1001,P53172,MPQNTRHTSIVEMLSTPPQLPNSTDLNSLSEQTDKNTEANKSDTES...,MPQNTRHTSIVEMLSTPPQLPNSTDLNSLSEQTDKNTEANKSDTES...,527,473,1000,1000,"[[0.04759582504630089, -0.0024248778354376554,..."
1002,P33298,MEELGIVTPVEKAVEEKPAVKSYASLLAQLNGTVNNNSALSNVNSD...,MEELGIVTPVEKAVEEKPAVKSYASLLAQLNGTVNNNSALSNVNSD...,428,572,1000,1000,"[[0.07428184151649475, -0.027576325461268425, ..."
1003,Q12045,MASQQNKHAFLSKNRIFHNPDNVSSSKSRNLMDITNTTNTMNGSRP...,MASQQNKHAFLSKNRIFHNPDNVSSSKSRNLMDITNTTNTMNGSRP...,647,353,1000,1000,"[[0.049321383237838745, -0.024682389572262764,..."
1004,Q03264,MTQDKEVKVVAPDVAPDQEVEINKSVKDAKHQTNDDSLLQHKKKGK...,MTQDKEVKVVAPDVAPDQEVEINKSVKDAKHQTNDDSLLQHKKKGK...,515,485,1000,1000,"[[0.018463565036654472, -0.003225350519642234,..."
...,...,...,...,...,...,...,...,...
1471,Q03667,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,MARSRGSSRPISRSRPTQTRSASTMAAPVHPQQQQQPNAYSHPPAA...,156,844,1000,1000,"[[0.0809628814458847, -0.09280642122030258, 0...."
1472,Q12522,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,MATRTQFENSNEIGVFSKLTNTYCLVAVGGSENFYSAFEAELGDAI...,245,755,1000,1000,"[[0.13922131061553955, 0.007300123106688261, 0..."
1473,P53875,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,MSQAAKNVIVKLIVGAGQAAPSPPVGPALGSKGIKAIDFCKEFNAR...,158,842,1000,1000,"[[0.08811846375465393, -0.034994419664144516, ..."
1474,P38295,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,MSEVSKWPAINPFHWGYNGTVSHIVGENGSIKLHLKDNKEQVDFDE...,451,549,1000,1000,"[[0.0455806627869606, 0.009255562908947468, 0...."
