In [1]:
import esm
import torch
import pandas as pd
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_excel('data/OsmoticStress.xlsx')

In [3]:
# Get a specific protein
df_P32485 = df[df["Uniprot_ID"] == "P32485"]

In [4]:
df_P32485

Unnamed: 0,Uniprot_ID,Gene_name,Systematic_gene_name,Protein_Description,Peptide_sequence,Log2FC(LiP_raw),Log2FC(LiP_norm),Pvalue(LiP),Qvalue(LiP),Log2FC(P.Abundance),Pvalue(P.Abundance),Qvalue(P.Abundance)
1552,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,IQDPQMTGYVSTR,1.758023,1.758023,2.6e-05,0.010327,-0.1424,0.324092,0.257883
1553,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,DVINTICSENTLK,0.405436,0.405436,0.005463,0.036264,-0.1424,0.324092,0.257883
1554,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,PFSTAVLAK,-0.281359,-0.281359,0.033448,0.057266,-0.1424,0.324092,0.257883
1555,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,ICDFGLAR,0.209505,0.209505,0.042918,0.063228,-0.1424,0.324092,0.257883
1556,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,SAPYHDPTDEPVADAK,1.183001,1.183001,0.119974,0.100925,-0.1424,0.324092,0.257883
1557,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,APEIMLTWQK,1.479126,1.479126,0.126023,0.10349,-0.1424,0.324092,0.257883
1558,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,VSDHVAANDTITDYGNQ,0.759893,0.759893,0.14523,0.111707,-0.1424,0.324092,0.257883
1559,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,LLQTRPLEK,0.280905,0.280905,0.199228,0.134569,-0.1424,0.324092,0.257883
1560,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,TVEPDAVDLLEK,-0.061325,-0.061325,0.210163,0.139067,-0.1424,0.324092,0.257883
1561,P32485,HOG1,YLR113W,Mitogen-activated protein kinase HOG1,PSNILINENCDLK,-0.457868,-0.457868,0.218313,0.142298,-0.1424,0.324092,0.257883


In [5]:
df_2 = pd.read_csv('data/OsmoticStress_with_binary_positions_padded.csv')
df_2 = df_2.drop(columns=['Binary_Positions', 'Padded_Binary_Positions'])
df_2_P32485 = df_2[df_2["Uniprot_ID"] == "P32485"]
df_2_P32485['full_sequence_len'] = df_2_P32485['full_sequence'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [6]:
# Only get the P32485 HOG1
df_2_P32485_HOG1 = df_2_P32485.iloc[:1]
df_2_P32485_HOG1.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),full_sequence,full_sequence_len
1336,P32485,IQDPQMTGYVSTR,1.758023,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435


In [7]:
# Get the context of the peptide in the full sequence: 50 amino acids before and after the peptide
def get_peptide_context(full_seq, peptide, context_size=50):
    start_index = full_seq.find(peptide)
    if start_index == -1:
        return None, None, None  # Peptide not found
    start_slice = max(0, start_index - context_size)
    end_slice = min(len(full_seq), start_index + len(peptide) + context_size)
    return full_seq[start_slice:end_slice], start_slice, end_slice

# Apply the function to the first row of the dataframe
def apply_context(row):
    context, start_slice, end_slice = get_peptide_context(row['full_sequence'], row['Peptide_sequence'], context_size=50)
    return context, start_slice, end_slice

# Apply the function using the external lambda definition
df_context = df_2_P32485_HOG1.apply(apply_context, axis=1, result_type="expand")
df_context.columns = ['context_including_peptide', 'start_slice', 'end_slice']

In [8]:
df_2_P32485_HOG1

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),full_sequence,full_sequence_len
1336,P32485,IQDPQMTGYVSTR,1.758023,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435


In [10]:
peptide_sequence = df_2_P32485_HOG1.iloc[0, 1]
print(len(peptide_sequence))
protein_sequence = df_2_P32485_HOG1.iloc[0, 3]
print(len(protein_sequence))

13
435


In [11]:
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Function to generate all mutant sequences
def generate_mutant_sequences(full_seq, context, start, end):
    mutants = []
    for index in range(len(context)):
        original_aa = context[index]
        for aa in amino_acids:
            if aa != original_aa:
                # Create the mutant sequence within the context
                mutant_context = context[:index] + aa + context[index + 1:]
                # Rebuild the full sequence with the mutated context
                mutant_sequence = full_seq[:start] + mutant_context + full_seq[end:]
                mutants.append((start + index, aa, mutant_sequence))
    return mutants

# Get all mutants
context, start, end = get_peptide_context(protein_sequence, peptide_sequence, 50)
mutants = generate_mutant_sequences(protein_sequence, context, start, end)

# Convert to DataFrame
df_mutants = pd.DataFrame(mutants, columns=['Position', 'Substituted_AA', 'Mutant_Sequence'])

In [12]:
df_mutants

Unnamed: 0,Position,Substituted_AA,Mutant_Sequence
0,117,A,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
1,117,C,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
2,117,D,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
3,117,E,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
4,117,F,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
...,...,...,...
2142,229,R,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
2143,229,S,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
2144,229,V,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...
2145,229,W,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...


In [13]:
def pad_sequence(sequence, target_length=1000, pad_token='<pad>'):
    """
    Add do the sequence <pad> to reach 1000.
    """
    padding_length = target_length - len(sequence)
    return sequence + pad_token * padding_length

# Apply padding to each sequence with <pad>
df_mutants['padded_sequence'] = df_mutants['Mutant_Sequence'].apply(lambda x: pad_sequence(x))

In [14]:
# Check if the padding is correct
df_mutants['full_sequence_length'] = df_mutants['Mutant_Sequence'].apply(len)
df_mutants['num_pad_tokens'] = df_mutants['padded_sequence'].apply(lambda x: x.count('<pad>'))
df_mutants['padded_sequence_length'] = df_mutants['full_sequence_length'] + df_mutants['num_pad_tokens']
df_mutants['expected_padded_length'] = 1000

In [15]:
df_mutants

Unnamed: 0,Position,Substituted_AA,Mutant_Sequence,padded_sequence,full_sequence_length,num_pad_tokens,padded_sequence_length,expected_padded_length
0,117,A,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
1,117,C,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
2,117,D,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
3,117,E,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
4,117,F,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
...,...,...,...,...,...,...,...,...
2142,229,R,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
2143,229,S,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
2144,229,V,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000
2145,229,W,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,MTTNEEFIRTQIFGTVFEITNRYNDLNPVGMGAFGLVCSATDTLTS...,435,565,1000,1000


In [16]:
# Loadsm the model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [17]:
import tqdm
import numpy as np
import torch

def generate_embeddings(model, alphabet, sequences):
    """
    Function to generate the embeddings.
    """
    embeddings = []
    
    batch_converter = alphabet.get_batch_converter()
    
    for sequence in tqdm.tqdm(sequences, desc="Generating Embeddings"):
        data = [(0, sequence)]
        batch_labels, batch_strs, batch_tokens = batch_converter(data)

        # Generate embeddings
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33]) 
            token_embeddings = results['representations'][33]

        # Create full embeddings
        full_embedding = token_embeddings.squeeze().numpy()
        embeddings.append(full_embedding)
    
    return embeddings

In [18]:
df_clean = df_mutants[1000:]

In [19]:
sequences = df_clean['Mutant_Sequence'].tolist()
print(len(sequences))

1147


In [20]:
# Generate the embeddings!
embeddings = generate_embeddings(model, alphabet, sequences)

Generating Embeddings: 100%|██████████| 1147/1147 [1:10:51<00:00,  3.71s/it]


In [21]:
df_clean['embeddings'] = [e.tolist() for e in embeddings]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
# Save the data in a pickle file
df_clean.to_pickle('data/mutant_sequences_50_context_fix_2.pkl')