In [None]:
import esm
import torch
import pandas as pd
import tqdm

In [None]:
# Load the raw table from the OsmoticStress Excel workbook.
df = pd.read_excel('data/OsmoticStress.xlsx')

In [None]:
# Keep only the rows whose Uniprot_ID equals P32485.
df_P32485 = df[df["Uniprot_ID"] == "P32485"]

In [None]:
# Show the P32485 rows for a visual check.
df_P32485

In [None]:
# Load the padded binary-position table and discard the two position-encoding
# columns, which are not needed below.
df_2 = pd.read_csv('data/OsmoticStress_with_binary_positions_padded.csv')
df_2 = df_2.drop(columns=['Binary_Positions', 'Padded_Binary_Positions'])

# Take an explicit copy of the P32485 rows: adding a column to a boolean-mask
# selection of df_2 would otherwise trigger pandas' SettingWithCopyWarning and
# leave it ambiguous whether df_2 itself is being modified.
df_2_P32485 = df_2[df_2["Uniprot_ID"] == "P32485"].copy()
df_2_P32485['full_sequence_len'] = df_2_P32485['full_sequence'].apply(len)

In [None]:
# Keep only the first P32485 row. The explicit copy makes this an independent
# frame, so the column assigned onto it in a later cell does not raise
# SettingWithCopyWarning.
df_2_P32485_HOG1 = df_2_P32485.iloc[:1].copy()
df_2_P32485_HOG1.head()

In [None]:
def get_peptide_context(full_seq, peptide, context_size=50):
    """Return ``peptide`` together with its flanking residues from ``full_seq``.

    The window covers the first occurrence of ``peptide`` in ``full_seq`` plus up
    to ``context_size`` characters on each side, clipped at the sequence ends.

    Args:
        full_seq: The full sequence to search in.
        peptide: The sub-sequence to locate.
        context_size: Maximum number of flanking characters kept on each side.

    Returns:
        The slice of ``full_seq`` containing the peptide and its context.

    Raises:
        ValueError: If ``peptide`` does not occur in ``full_seq``.
    """
    start_index = full_seq.find(peptide)
    # str.find signals "not found" with -1; slicing with that value would
    # silently return an unrelated prefix of full_seq, so fail loudly instead.
    if start_index == -1:
        raise ValueError(f"Peptide {peptide!r} not found in the full sequence")
    start_slice = max(0, start_index - context_size)
    end_slice = min(len(full_seq), start_index + len(peptide) + context_size)
    return full_seq[start_slice:end_slice]

def apply_context(row):
    """Row-wise adapter for ``DataFrame.apply``.

    Reads the 'full_sequence' and 'Peptide_sequence' fields of ``row`` and
    returns the peptide's context window using the default context size.
    """
    full_seq = row['full_sequence']
    peptide = row['Peptide_sequence']
    return get_peptide_context(full_seq, peptide)

# Compute the context window for every row via the named helper apply_context
# (axis=1 passes each row to the helper) and store it as a new column.
df_2_P32485_HOG1['context_including_peptide'] = df_2_P32485_HOG1.apply(apply_context, axis=1)

In [None]:
# Show the selected row with its new context column.
df_2_P32485_HOG1

In [None]:
# Pull the sequence to mutate from the first row, selected by column position 5.
# NOTE(review): presumably position 5 is the 'context_including_peptide' column
# added earlier — confirm, and prefer selecting by label so a change in column
# order cannot silently pick a different field.
sequence = df_2_P32485_HOG1.iloc[0, 5]
print(len(sequence))

In [None]:
# One-letter codes of the 20 amino acids used as substitution candidates
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

def generate_mutant_sequences(sequence):
    """Enumerate every single-residue substitution of ``sequence``.

    Each position is replaced in turn by every letter of ``amino_acids`` that
    differs from the residue already at that position.

    Returns:
        A list of ``(position, substituted_residue, mutant_sequence)`` tuples,
        ordered by position and then by the order of ``amino_acids``.
    """
    return [
        (pos, residue, sequence[:pos] + residue + sequence[pos + 1:])
        for pos, original in enumerate(sequence)
        for residue in amino_acids
        if residue != original  # skip the no-op "mutation" to the same residue
    ]

# Enumerate all single-substitution mutants of the selected sequence
mutants = generate_mutant_sequences(sequence)

# Tabulate them: one row per mutant (position, new residue, mutant sequence)
df_mutants = pd.DataFrame(
    data=mutants,
    columns=['Position', 'Substituted_AA', 'Mutant_Sequence'],
)

In [None]:
# Show the mutant table.
df_mutants

In [None]:
# Load the pretrained ESM-2 model (esm2_t33_650M_UR50D) and its alphabet
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# Put the model in evaluation mode before running inference
model.eval()

In [13]:
import tqdm
import numpy as np
import torch

def generate_embeddings(model, alphabet, sequences, repr_layer=33):
    """Compute per-token embeddings for each sequence, one sequence at a time.

    Args:
        model: ESM model (a ``torch.nn.Module``), called as
            ``model(tokens, repr_layers=[...])``.
        alphabet: The matching ESM alphabet; supplies the batch converter used
            to tokenize each sequence.
        sequences: Iterable of sequence strings.
        repr_layer: Index of the layer whose representations are returned.
            Defaults to 33, the value that used to be hard-coded here.

    Returns:
        A list with one numpy array per input sequence: the requested layer's
        representation for that sequence with the batch axis removed
        (assumes the representation tensor is laid out as
        (batch, tokens, features), as described in the ESM documentation).
    """
    embeddings = []
    batch_converter = alphabet.get_batch_converter()

    # Tokens must be on the same device as the model weights; otherwise the
    # forward pass fails for a model that has been moved to a GPU.
    first_param = next(model.parameters(), None)

    for sequence in tqdm.tqdm(sequences, desc="Generating Embeddings"):
        _, _, batch_tokens = batch_converter([(0, sequence)])
        if first_param is not None:
            batch_tokens = batch_tokens.to(first_param.device)

        # Inference only: skip building the autograd graph
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[repr_layer])
        token_embeddings = results['representations'][repr_layer]

        # Keep the full per-token embedding (no mean pooling). Index the batch
        # axis explicitly instead of squeeze() so no other size-1 axis can be
        # dropped by accident; .cpu() is a no-op on CPU and is required before
        # .numpy() when the model runs on a GPU.
        embeddings.append(token_embeddings[0].cpu().numpy())

    return embeddings

In [14]:
# Work on an independent copy: a plain assignment would only alias df_mutants,
# so the 'embeddings' column added to df_clean later would also appear on
# df_mutants.
df_clean = df_mutants.copy()

In [15]:
# Collect the mutant sequences as a plain list, in row order, and report how many there are.
sequences = df_clean['Mutant_Sequence'].tolist()
print(len(sequences))

2147


In [None]:
# Run the model over every mutant sequence.
# NOTE: long-running — the saved progress output shows roughly 1.45 s per sequence.
embeddings = generate_embeddings(model, alphabet, sequences)

Generating Embeddings:  48%|████▊     | 1026/2147 [25:11<27:09,  1.45s/it] 

In [None]:
# Attach each sequence's embedding to its row, converted to nested Python lists.
# NOTE(review): .tolist() turns every array element into a Python object, which
# needs far more memory than the arrays themselves — consider storing the arrays
# directly if downstream readers of the pickle allow it.
df_clean['embeddings'] = [e.tolist() for e in embeddings]

In [None]:
# Persist the mutant table, including the embeddings column, as a pickle file.
df_clean.to_pickle('data/mutant_sequences_50_context.pkl')