In [None]:
# Mount Google Drive into the Colab runtime so that files under
# /content/drive/MyDrive can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. Install the brain (Transformers library)
# Run this cell once
!pip install transformers torch pandas tqdm

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc

# 2. Setup the GPU (falls back to CPU when no CUDA device is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 3. Load the Model (The 150 Million Parameter version) - NOW WITH FP16 OPTIMIZATION
model_name = "facebook/esm2_t30_150M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- OPTIMIZATION 1: Load model in 16-bit precision ---
# FP16 halves VRAM usage and speeds up calculation on the T4 GPU.
# Half precision is only applied when running on CUDA: on the CPU fallback,
# FP16 kernels can be very slow or unavailable depending on the PyTorch
# version, so the model is kept in FP32 there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = AutoModel.from_pretrained(model_name).to(device).to(model_dtype)

# Switch to evaluation mode (disables dropout). This does not disable gradient
# tracking by itself; inference code must still run under torch.no_grad().
model.eval()

# 4. Define the Helper Function to read FASTA
def read_fasta(file_path):
    """Parse a FASTA file into parallel lists of record IDs and sequences.

    Each record starts with a header line beginning with ``>``. The record ID
    is the first whitespace-delimited token after the ``>`` (so ``>abc desc``
    and ``> abc desc`` both yield ``"abc"``). All following non-header lines,
    up to the next header, are concatenated into that record's sequence.

    Args:
        file_path: Path to the FASTA file to read.

    Returns:
        A tuple ``(ids, sequences)`` of two lists with equal length, where
        ``sequences[k]`` belongs to ``ids[k]``. An empty file yields
        ``([], [])``.

    Notes:
        * A header with no identifier token (a bare ``>``) is kept with an
          empty-string ID rather than being silently dropped together with
          its sequence.
        * Blank lines are ignored, and any text before the first header is
          discarded.
    """
    ids = []
    sequences = []
    current_id = None   # None means "no header seen yet"
    current_seq = []

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if line.startswith(">"):
                # Flush the previous record. Compare against None (not
                # truthiness) so that a record with an empty ID is not lost.
                if current_id is not None:
                    ids.append(current_id)
                    sequences.append("".join(current_seq))

                # Drop the leading '>' first so that "> id" parses like ">id".
                header_tokens = line[1:].split()
                current_id = header_tokens[0] if header_tokens else ""
                current_seq = []
            elif current_id is not None:
                current_seq.append(line)

    # Flush the final record, which has no following header to trigger it.
    if current_id is not None:
        ids.append(current_id)
        sequences.append("".join(current_seq))

    return ids, sequences

# 5. Run the Factory (Optimized Loop)
# Reads every training sequence, embeds them with the ESM-2 model in GPU
# batches, and writes one .npy file of pooled embeddings per chunk to Drive.
print("Reading Data...")
fasta_path = "/content/drive/MyDrive/Colab Notebooks/CAFA6_Project/train_sequences.fasta"
ids, sequences = read_fasta(fasta_path)
print(f"Loaded {len(ids)} sequences.")

# --- OPTIMIZATION 2: Increase Batch Size & Set Start Point ---
BATCH_SIZE = 128    # Increased from 32 to utilize T4 VRAM
CHUNK_SIZE = 5000   # Number of sequences stored in each saved .npy file
START_INDEX = 20000 # Skips chunks 0, 5000, 10000, 15000
MAX_LENGTH = 1024   # Tokenizer truncation limit; longer sequences are cut off

print(f"Starting Optimized Generation from index {START_INDEX}...")
print(f"Using BATCH_SIZE: {BATCH_SIZE} and FP16.")


for i in range(START_INDEX, len(sequences), CHUNK_SIZE):
    chunk_seqs = sequences[i : i + CHUNK_SIZE]
    chunk_ids = ids[i : i + CHUNK_SIZE]

    chunk_embeddings = []

    # Inner loop for GPU batches
    for j in range(0, len(chunk_seqs), BATCH_SIZE):
        batch = chunk_seqs[j : j + BATCH_SIZE]

        # Tokenize; shorter sequences are padded up to the longest in the batch
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)

        # Move the tokenizer outputs to the model's device. These are integer
        # tensors (token ids / attention mask), so no FP16 cast is involved.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

            # Mean-pool over REAL tokens only. A plain .mean(dim=1) would also
            # average the hidden states at padded positions, making a
            # sequence's embedding depend on the longest sequence it happened
            # to be batched with. The pooling is done in FP32 to avoid
            # precision loss when summing many FP16 values.
            # NOTE: chunks produced by an earlier unmasked-mean version of
            # this loop are not comparable and should be regenerated.
            token_states = outputs.last_hidden_state.float()
            token_mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            token_counts = token_mask.sum(dim=1).clamp(min=1.0)
            pooled = (token_states * token_mask).sum(dim=1) / token_counts

            embeddings = pooled.cpu().numpy()
            chunk_embeddings.append(embeddings)

    # Save this chunk to Drive
    full_chunk = np.concatenate(chunk_embeddings)
    # Rows are only identifiable by position, so they must line up with the ids
    assert full_chunk.shape[0] == len(chunk_ids), "embedding rows do not match ids in chunk"
    save_path = f"/content/drive/MyDrive/Colab Notebooks/CAFA6_Project/embeddings_chunk_{i}.npy"
    np.save(save_path, full_chunk)
    print(f"Saved chunk {i} to Drive")

    # Clean up memory before the next chunk
    del full_chunk, chunk_embeddings, inputs, outputs
    del token_states, token_mask, token_counts, pooled
    gc.collect()
    torch.cuda.empty_cache()

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reading Data...
Loaded 82404 sequences.
Starting Optimized Generation from index 20000...
Using BATCH_SIZE: 128 and FP16.
Saved chunk 20000 to Drive
Saved chunk 25000 to Drive
Saved chunk 30000 to Drive
Saved chunk 35000 to Drive
Saved chunk 40000 to Drive
Saved chunk 45000 to Drive
Saved chunk 50000 to Drive
Saved chunk 55000 to Drive
Saved chunk 60000 to Drive
Saved chunk 65000 to Drive
Saved chunk 70000 to Drive
Saved chunk 75000 to Drive
Saved chunk 80000 to Drive


In [None]:
# List the contents of the CAFA6_Project folder on Google Drive (to check which files are present)
!ls -l /content/drive/MyDrive/"Colab Notebooks"/CAFA6_Project

total 52065
-rw------- 1 root root 53314140 Nov 18 18:21 train_sequences.fasta
