In [6]:
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, SamplingConfig
from esm.utils.constants.models import ESM3_OPEN_SMALL
import torch
import pandas as pd
import json
from tqdm import tqdm
from huggingface_hub import login

# Authenticate with the Hugging Face Hub (called without arguments, so it
# asks for the token interactively)
login()

# Read the input table; the processing loop further down reads its
# 'Sequence' and 'Embedding' columns
print("Loading CSV file...")
input_csv = 'OmicsSomaticMutations_split_4_failed.csv'
df = pd.read_csv(input_csv)

# Initialize the model. Only the GPU client (created below) is active; the CPU variant is kept commented out for reference.
# cpu_client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device=torch.device("cpu"))

# def get_embedding(sequence):
#     print(len(sequence))
#     torch.cuda.empty_cache()
#     protein = ESMProtein(sequence=sequence)
#     protein_tensor = cpu_client.encode(protein)
#     output = cpu_client.forward_and_sample(
#         protein_tensor, SamplingConfig(return_mean_embedding=True)
#     )
#     whole_protein_embedding = output.mean_embedding
#     embedding_list = whole_protein_embedding.tolist()
#     embedding_string = ", ".join(f"{value}" for value in embedding_list)
#     return embedding_string

# Load the ESM3_OPEN_SMALL model onto the CUDA device
cuda_device = torch.device("cuda")
gpu_client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device=cuda_device)

def get_embedding(sequence, max_length=3900):
    """Compute the mean ESM3 embedding of a protein sequence as a string.

    The sequence is wrapped in an ``ESMProtein``, encoded with the module-level
    ``gpu_client`` and passed through ``forward_and_sample`` with
    ``return_mean_embedding=True``.

    Args:
        sequence: Protein sequence; must support ``len()``.
        max_length: Longest sequence that will be sent to the model. Longer
            inputs are rejected up front instead of being attempted on the
            GPU. Defaults to 3900, the limit previously hard-coded here.

    Returns:
        The values of ``output.mean_embedding`` joined with ``", "``.

    Raises:
        ValueError: If ``len(sequence)`` exceeds ``max_length``.
    """
    seq_len = len(sequence)
    print(seq_len)
    if seq_len > max_length:
        raise ValueError(f"Sequence length ({seq_len}) exceeds maximum length of {max_length}")

    # Release cached, unused CUDA blocks before the next forward pass.
    torch.cuda.empty_cache()

    protein = ESMProtein(sequence=sequence)
    # Inference only: no autograd state is needed for encoding or sampling.
    with torch.no_grad():
        protein_tensor = gpu_client.encode(protein)
        output = gpu_client.forward_and_sample(
            protein_tensor, SamplingConfig(return_mean_embedding=True)
        )

    whole_protein_embedding = output.mean_embedding
    print(len(whole_protein_embedding))
    # NOTE(review): assumes mean_embedding is a 1-D tensor so tolist() yields
    # scalars -- confirm against the ESM SDK.
    return ", ".join(str(value) for value in whole_protein_embedding.tolist())


# One entry per row, in row order: the embedding string, or None on failure
embedding_list = []

# Process each row in the DataFrame
print("Processing sequences...")
row_values = zip(df['Sequence'], df['Embedding'])
for sequence, embedding in tqdm(row_values, total=len(df), desc="Processing rows"):
    # Reuse an embedding that is already present. pd.notna treats both NaN
    # (how read_csv represents an empty cell) and None as missing.
    if pd.notna(embedding):
        embedding_list.append(embedding)
        continue

    try:
        embedding = get_embedding(sequence)
    except Exception as e:
        # Best-effort: report the failure and keep going so a single bad
        # sequence (e.g. CUDA out of memory, over-length input) does not
        # abort the whole run.
        print(f"Failed to process sequence: {str(e)}")
        embedding_list.append(None)
    else:
        embedding_list.append(embedding)

# Add the Embedding column to the DataFrame
df['Embedding'] = embedding_list

# Save the updated DataFrame to a new CSV file
output_file = 'OmicsSomaticMutations_split_4_failed_1.csv'
df.to_csv(output_file, index=False)
print(f"Data with embeddings saved to '{output_file}'")

# Print summary
succeeded = sum(item is not None for item in embedding_list)
print(f"Total rows: {len(df)}")
print(f"Successful embeddings: {succeeded}")
print(f"Failed embeddings: {len(embedding_list) - succeeded}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading CSV file...


  state_dict = torch.load(


Processing sequences...


Processing rows:   0%|          | 0/3 [00:00<?, ?it/s]

3897


Processing rows:  33%|███▎      | 1/3 [00:00<00:01,  1.79it/s]

Failed to process sequence: CUDA out of memory. Tried to allocate 14.50 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.49 GiB is free. Process 3688433 has 70.60 GiB memory in use. Of the allocated memory 53.60 GiB is allocated by PyTorch, and 16.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
3812


Processing rows:  67%|██████▋   | 2/3 [00:01<00:00,  1.45it/s]

Failed to process sequence: CUDA out of memory. Tried to allocate 13.87 GiB. GPU 0 has a total capacity of 79.10 GiB of which 10.99 GiB is free. Process 3688433 has 68.10 GiB memory in use. Of the allocated memory 51.41 GiB is allocated by PyTorch, and 16.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
3843


Processing rows: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]

Failed to process sequence: CUDA out of memory. Tried to allocate 14.10 GiB. GPU 0 has a total capacity of 79.10 GiB of which 10.08 GiB is free. Process 3688433 has 69.00 GiB memory in use. Of the allocated memory 52.20 GiB is allocated by PyTorch, and 16.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Data with embeddings saved to 'OmicsSomaticMutations_split_4_failed_1.csv'
Total rows: 3
Successful embeddings: 0
Failed embeddings: 3





In [1]:
# Show the second collected embedding. Depends on `embedding_list` created by
# the embedding cell; on a kernel where that cell has not run this raises
# NameError (as in the recorded output).
print(embedding_list[1])

NameError: name 'embedding_list' is not defined

In [None]:
# Length of `embedding`, the variable left over from the row-processing loop.
# NOTE(review): that is the character count of the last row's embedding string,
# or a TypeError if the last row failed and `embedding` is still NaN -- confirm
# this is the value intended here.
print(len(embedding))

In [4]:
# Keep only the rows that still have no embedding
missing_mask = df['Embedding'].isna()
df_failed = df.loc[missing_mask]

# Disabled: export of the rows that do have an embedding.
# NOTE(review): `df_success` is not defined anywhere in the visible notebook.
# output_file_success = 'OmicsSomaticMutations_split_4_success.csv'
# df_success.to_csv(output_file_success, index=False)
# print(f"Data with successful embeddings saved to '{output_file_success}'")

# Write the rows without an embedding to their own CSV file.
# NOTE(review): this is the same path the embedding cell reads as its input,
# so the input file is overwritten -- confirm that is intended.
output_file_failed = 'OmicsSomaticMutations_split_4_failed.csv'
df_failed.to_csv(output_file_failed, index=False)
print(f"Data with failed embeddings saved to '{output_file_failed}'")

Data with failed embeddings saved to 'OmicsSomaticMutations_split_4_failed.csv'
