In [1]:
from transformers import AutoTokenizer, AutoModel,  AutoModelForSequenceClassification
from Bio import SeqIO
import torch
from torch.utils.data import DataLoader, TensorDataset
import random
import numpy as np
import torch.nn as nn
# https://huggingface.co/blog/AmelieSchreiber/esmbind
# the minimum for the ESM2 is 650M if we want better performance than ESM1b with 650M as well.
import pandas as pd
from pathlib import Path
from itertools import islice

In [2]:
# Ten toy samples: row i holds the floats 4*i .. 4*i+3 and carries the integer label i.
features = torch.arange(40, dtype=torch.float32).reshape(10, 4)
labels = torch.arange(10)
dataset = TensorDataset(features, labels)

In [3]:
def seed_worker(worker_id):
    """
    Seed Python's ``random`` and NumPy inside a DataLoader worker.

    Intended to be passed as ``worker_init_fn``. The seed is derived from
    ``torch.initial_seed()`` so both libraries follow torch's seed.

    Parameters
    ----------
    worker_id : int
        Index of the worker; accepted for the callback signature but unused.
    """
    # NumPy only accepts seeds that fit in 32 bits, so reduce torch's seed first.
    seed = torch.initial_seed() % (1 << 32)
    random.seed(seed)
    np.random.seed(seed)


In [14]:
# NOTE(review): the saved output below is the tensor itself rather than a torch.Size,
# so it was produced by a different version of this cell — re-run to refresh it.
torch.arange(40).shape

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39])

In [36]:
# Push the toy batches through a 1-D convolution to inspect the resulting shapes.
a = nn.Conv1d(1, 10, 2)
# A dedicated, seeded generator drives the shuffling of the loader.
g = torch.Generator()
g.manual_seed(0)
dl = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)
for i in range(2):
    print("Epoch", i)
    for batch, label in dl:
        # Insert a channel axis of size 1 so the input is (batch, channels, length).
        channelled = batch.unsqueeze(1)
        print(channelled.shape)
        print(channelled)
        o = a(channelled)
        print(o.shape)
        print(o)

Epoch 0
torch.Size([4, 1, 4])
tensor([[[12., 13., 14., 15.]],

        [[28., 29., 30., 31.]],

        [[20., 21., 22., 23.]],

        [[ 8.,  9., 10., 11.]]])
torch.Size([4, 10, 3])
tensor([[[  7.9515,   8.6100,   9.2686],
         [ -5.7440,  -6.2499,  -6.7559],
         [ -2.4015,  -2.5816,  -2.7617],
         [-12.2291, -13.2001, -14.1711],
         [ 10.4888,  11.3591,  12.2294],
         [  3.9718,   4.3172,   4.6625],
         [  1.2479,   1.3680,   1.4881],
         [ -0.7694,  -0.8526,  -0.9357],
         [-12.2041, -13.1510, -14.0978],
         [ -2.9194,  -3.1050,  -3.2906]],

        [[ 18.4879,  19.1464,  19.8049],
         [-13.8394, -14.3453, -14.8513],
         [ -5.2831,  -5.4632,  -5.6433],
         [-27.7649, -28.7359, -29.7069],
         [ 24.4138,  25.2841,  26.1544],
         [  9.4968,   9.8421,  10.1874],
         [  3.1696,   3.2897,   3.4098],
         [ -2.0995,  -2.1826,  -2.2658],
         [-27.3533, -28.3001, -29.2469],
         [ -5.8897,  -6.0753,  -6.

In [2]:
# Single switch for device selection. The notebook currently pins everything to the
# CPU; set FORCE_CPU to False to use the first CUDA device when one is available.
FORCE_CPU = True
device = torch.device("cuda:0" if torch.cuda.is_available() and not FORCE_CPU else "cpu")

In [6]:
# Parse every record up front, then index the raw sequences by record id.
with open('../data/whole_sequence.fasta', 'r') as f:
    seqs = [record for record in SeqIO.parse(f, 'fasta')]
seq = dict((s.id, str(s.seq)) for s in seqs)

In [7]:
a = iter(seq.items())

In [8]:
# Peek at the next ten (id, sequence) pairs. islice advances the shared iterator `a`,
# so re-running this cell shows the following ten entries, not the same ones.
u = islice(a, 10)
dict(u)

{'EH1(72)': 'MLLPETRNLLDLMDAATRGGRPRLETLPHAVGRKAVDKMSEDGEADPPEVAEVANGGFAGPASEIRFRRYRPLGEAAGLLPTLIYYHGGGFVIGNIETHDSTCRRLANKSRCQVISIDYRLAPEHPFPAPIDDGIAAFRHIRDNAESFGADAARLAVGGDSAGGAMAAVVCQACRDAGETGPAFQMLIYPATDSSRESASRVAFAEGYFLSKALMDWFWEAYVPEDTDLTDLRLSPLLATDFTGLPPAFVLTAGYDPLRDEGRAYADRLIEAGIKTTYVNYPGTIHGFFSLTRFLSQGLKANDEAAAVMGAHFGT',
 'EH2(71)': 'MGLQKLIVRTLMKLPESWILKLAGGTPVEIDGRTMDPRIQLLAAQGAKAPSMTSMSIEDARKSADEGLALLDAKPRRTVSILSRTIPGPAGDLHVRIYTPAGATGPLPGIVYYHMGGCVIGNLETCNTFCSILADDCRAIVVSVDYRLAPEHKFPAAMDDAVASFDWVSENAAALGIDPTRLGVGGDSAGGWLSAVVCQTRKAEGKTQPKAQLLIYPATDLDAKEGSMQSCAEIYPLTAEIMDWFMQQFLNSPEDAKDLKASPAHSEDLSGLAPALIMTAGFDVLRDQGEAYGNRLRDAGVPVTYRCYDSLSHAYTAFSGAVPAARQACEEIARDMARALG',
 'EH3(69)': 'MPDTTSLNIADDVRMDPRLKAMLAAFPMMEQQTFQTREEQVANANTPEATAAREQLKMMMDMMDSEEFAPSDNLDISTREFTSSPDGNAIKIQFIRPKGKQKVPCVYYIHGGGMMIMSAFYGNYRAWGKMIANNGVAVAMVDFRNCLSPSSAPEVAPFPAGLNDCVSGLKWVSENADELSIDKNKIIIAGESGGGNLTLATGLKLKQDGNIDLVKGLYALCPYIAGKWPQDRFPSSSENNGIMIELHNNQGALAYGIEQLEAENPLAWPSFASAEDMQGLPPTVINVNECDPLRDEGID

In [9]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
# Tokenize only the first two sequences, padded to a common length, as torch tensors.
first_two_sequences = list(islice(seq.values(), 2))
tok = tokenizer(
    first_two_sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
    is_split_into_words=False,
)

In [10]:
checkpoint = "facebook/esm2_t6_8M_UR50D"
# Encoder without the pooling layer, returning the hidden states of every layer.
model = AutoModel.from_pretrained(checkpoint, add_pooling_layer=False, output_hidden_states=True).to(device)
model.eval()
# Same checkpoint with a 2-label classification head; the head weights are newly
# initialised (see the warning in the output).
n = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
out = n(**tok)

In [13]:
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    output = model(**tok.to(device))

In [14]:
# NOTE(review): `tok.to(device)` in the forward-pass cell should already have moved every
# tensor of the encoding to `device`, which would make this reassignment redundant — confirm.
tok["attention_mask"]  = tok["attention_mask"].to(device)

In [15]:
output.last_hidden_state.shape

torch.Size([2, 343, 320])

In [16]:
# Mean-pool the last hidden layer over the positions kept by the attention mask,
# producing one vector per sequence keyed by its position in the batch.
mask = tok["attention_mask"].bool()
results = {}
for num, (x, keep) in enumerate(zip(output.hidden_states[-1], mask)):
    masked_x = x[keep]
    pooled = masked_x.mean(dim=0)
    results[num] = pooled.detach().cpu().numpy()

In [103]:
path = "data2.csv"
# One row per sequence, one column per embedding dimension.
embeddings = pd.DataFrame(results).T
# Append mode: the header row is written only when the file does not exist yet.
write_header = not Path(path).exists()
embeddings.to_csv(path, mode='a', header=write_header)

In [18]:
# NOTE(review): `x` is the loop variable left over from the pooling cell, i.e. the hidden
# states of the last sequence in the batch, yet it is indexed with the mask of sequence 0.
x[tok["attention_mask"].bool()[0]].shape

torch.Size([317, 320])

## Load large datasets

In [19]:
from datasets import load_dataset, Dataset
from Bio import SeqIO
import pandas as pd
# https://huggingface.co/docs/datasets/loading

There are many ways to load files with the `datasets` library: from local or remote files (JSON, CSV, text, Parquet).
FASTA is not one of the supported formats: the `text` loader treats every line as a row, so the number of rows doubles (in FASTA the first line of each record is an id, not a sequence).
Instead, we can parse the file ourselves into a pandas DataFrame, a generator, a dictionary or a list of dictionaries, and build a `Dataset` from that rather than calling `load_dataset`.
`load_dataset` returns a `DatasetDict` whose keys are the splits (train, test, val) and whose values are `Dataset` objects.

By using `Dataset` directly we work with a single dataset object, without the splits.

To load FASTA files, use `Dataset.from_generator`: the other constructors need all the data in memory at once, and the file might be too large for that.

In [33]:
# The plain-text loader yields one row per line: 294 rows here, twice the 147 records
# that the generator-based dataset built further down reports.
a = load_dataset("text", data_files="../data/whole_sequence.fasta")
a

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 294
    })
})

In [20]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    """
    Lazily yield the records of a FASTA file as dictionaries.

    Parameters
    ----------
    fasta_file : str
        Path to the FASTA file.

    Yields
    ------
    dict[str, str]
        ``{"id": <record id>, "seq": <sequence as a string>}`` for each record.
    """
    with open(fasta_file, 'r') as handle:
        yield from (
            {"id": record.id, "seq": str(record.seq)}
            for record in SeqIO.parse(handle, 'fasta')
        )

# In-memory alternative: a one-column DataFrame of sequences indexed by record id.
with open("../data/whole_sequence.fasta", 'r') as f:
    seqs = SeqIO.parse(f, 'fasta')
    sequences_by_id = {record.id: str(record.seq) for record in seqs}
    d = pd.Series(sequences_by_id, name="sequences").to_frame()

In [21]:
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
b

Dataset({
    features: ['id', 'seq'],
    num_rows: 147
})

## Process or tokenize

In [22]:
from dataclasses import dataclass, field

Use `map` to apply the tokenizer function to the entire dataset.
Then select the newly generated columns to pass to the model; their format has to be changed to torch tensors first.

In [23]:
def tokenize_examples(examples):
    """Tokenize one batch of rows from its ``"seq"`` column, returning NumPy arrays."""
    return tokenizer(examples["seq"], return_tensors="np", padding=True, truncation=True)


dataset = b.map(tokenize_examples, batched=True)

In [24]:
# `u` is a new dataset restricted to the model inputs; it is overwritten in a later cell.
u = dataset.select_columns(["input_ids", "attention_mask"])
# Changes `dataset` itself: the listed columns are returned as torch tensors on `device`.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], device=device)

In [25]:
dataset

Dataset({
    features: ['id', 'seq', 'input_ids', 'attention_mask'],
    num_rows: 147
})

In [40]:
# Iterate over every batch and keep only the last one in `u`. With 147 rows and
# batch_size=4 the final batch has 3 rows, as the output below shows.
dataloader = DataLoader(dataset, batch_size=4)
for batch in dataloader:
    u = batch
u

{'input_ids': tensor([[ 0, 20, 10,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [41]:
@dataclass(slots=True)
class LLMConfig:
    """
    Configuration for the language model.

    Parameters
    ----------  
    model_name : str
        Name of the language model.
    _device : str
        Device to use for the language model.
    disbale_gpu : bool
        Whether to disable the GPU.
    """
    model_name: str = "facebook/esm2_t6_8M_UR50D"
    disbale_gpu: bool = False
    _device: str = "cuda" if torch.cuda.is_available() else "cpu"

    @property
    def device(self):
        if self.disbale_gpu:
            return "cpu"
        return self._device
    

@dataclass(slots=True)
class TokenizeFasta:
    """
    Read a FASTA file and tokenize its sequences for the configured model.

    Parameters
    ----------
    config : LLMConfig
        Supplies the checkpoint name for the tokenizer and the target device.

    Attributes
    ----------
    tokenizer
        Tokenizer loaded from ``config.model_name`` in ``__post_init__``.
    """
    config: LLMConfig
    # Filled in by __post_init__; not an __init__ argument.
    tokenizer: object = field(default=None, init=False)

    def __post_init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

    def chunks(self, fasta_file: str):
        """
        Split the fasta file into individual examples.

        Parameters
        ----------
        fasta_file : str
            Path to the FASTA file.

        Yields
        ------
        dict[str, str]
            ``{"id": <record id>, "seq": <sequence>}`` for each record.
        """
        with open(fasta_file, 'r') as f:
            for record in SeqIO.parse(f, 'fasta'):
                yield {"id": record.id, "seq": str(record.seq)}

    def tokenize(self, fasta_file: str):
        """
        Tokenize every sequence of a FASTA file.

        Parameters
        ----------
        fasta_file : str
            Path to the FASTA file.

        Returns
        -------
        Dataset
            Dataset holding the original ``id`` and ``seq`` columns plus the
            tokenizer outputs. ``input_ids`` and ``attention_mask`` are
            formatted as torch tensors on ``config.device``.
        """
        dataset = Dataset.from_generator(self.chunks, gen_kwargs={"fasta_file": fasta_file})
        # batch_size=None hands the whole dataset to the tokenizer as a single batch, so
        # padding=True pads every row to the same length. With the default 1000-row map
        # batches, a file with more sequences than that could end up with rows of
        # different lengths, which a DataLoader cannot stack into one batch.
        tok = dataset.map(
            lambda examples: self.tokenizer(examples["seq"], return_tensors="np", padding=True, truncation=True),
            batched=True,
            batch_size=None,
        )
        tok.set_format(type="torch", columns=["input_ids", "attention_mask"], device=self.config.device)
        return tok

@dataclass(slots=True)
class ExtractEmbeddings:
    config: LLMConfig
    model: None = field(default=None, init=False)

    def __post_init__(self):

        self.model = AutoModel.from_pretrained(self.config.model_name, add_pooling_layer=False, output_hidden_states=True)
        self.model.to(self.config.device)
    
    def extract(self, batch_seq_keys: list[str], tok: dict[str, torch.Tensor]):
        """
        Extract embeddings from the tokenized sequences.

        Parameters
        ----------
        batch_seq_keys : list[str]
            Keys for the batch of sequences.
        tok : dict[str, torch.Tensor]
            Tokenized sequences.
        Returns
        -------
        dict[str, np.array]
            Extracted embeddings.
        """
        results = {}
        output = self.model(**tok)
        mask = tok["attention_mask"].bool()
        for num, x in enumerate(output.last_hidden_state):
            masked_x = x[mask[num]]
            results[batch_seq_keys[num]] = masked_x.mean(dim=0).detach().cpu().numpy()
        return results
    
    def save(self, results: dict[str, np.array], path: str):
        """
        Save the embeddings to a CSV file.

        Parameters
        ----------
        results : dict[str, np.array]
            Embeddings to save.
        path : str
            Path to the CSV file.
        """
        embeddings = pd.DataFrame(results).T
        embeddings.to_csv(path, mode='a', header=not Path(path).exists())

        
def generate_embeddings(model_name, fasta_file, disable_gpu=False, batch_size=8,
                        save_path = "embeddings.csv"):
    """
    Embed every sequence of a FASTA file and append the results to a CSV file.

    Parameters
    ----------
    model_name : str
        Checkpoint name handed to ``LLMConfig``.
    fasta_file : str
        Path to the FASTA file.
    disable_gpu : bool
        Passed to ``LLMConfig`` to force the CPU.
    batch_size : int
        Number of sequences per forward pass.
    save_path : str
        CSV file the embeddings are appended to, one batch at a time.

    Returns
    -------
    None
        The embeddings are written to ``save_path`` rather than returned.
    """
    settings = LLMConfig(model_name, disbale_gpu=disable_gpu)
    fasta_tokenizer = TokenizeFasta(settings)
    extractor = ExtractEmbeddings(settings)
    tokenized = fasta_tokenizer.tokenize(fasta_file)

    ids = list(tokenized["id"])
    loader = DataLoader(tokenized, batch_size=batch_size)
    for batch_index, batch in enumerate(loader):
        # The loader is not shuffled, so batch k covers rows k*batch_size onward.
        first = batch_index * batch_size
        batch_embeddings = extractor.extract(ids[first:first + batch_size], batch)
        extractor.save(batch_embeddings, save_path)

In [30]:
# Same steps as generate_embeddings, unrolled with the default configuration.
fasta_tokenizer = TokenizeFasta(LLMConfig())
data = fasta_tokenizer.tokenize("../data/whole_sequence.fasta")
embed = ExtractEmbeddings(LLMConfig())
seq_keys = [key for key in data["id"]]

In [31]:
batch_size = 8

In [42]:
# Embed batch by batch; each batch's rows are appended to embeddings.csv.
loader = DataLoader(data, batch_size=batch_size)
for num, batch in enumerate(loader):
    first = num * batch_size
    batch_seq_keys = seq_keys[first:first + batch_size]
    results = embed.extract(batch_seq_keys, batch)
    embed.save(results, "embeddings.csv")

## Other ways to create embeddings

In [82]:
attention_weights = torch.nn.Linear(320, 1)

In [87]:
attention_scores = attention_weights(output.hidden_states[-1])
# Normalise over the sequence axis (dim=1). The scores have shape (batch, seq_len, 1),
# so a softmax over the last axis would return 1.0 for every position.
# NOTE(review): this rebinds `attention_weights` from the Linear layer to a tensor, so
# running the cell a second time raises the TypeError shown below; re-create the layer first.
attention_weights = torch.softmax(attention_scores, dim=1)

TypeError: 'Tensor' object is not callable

In [89]:
attention_weights.shape

torch.Size([2, 343, 1])

In [29]:
# Flatten each sequence's (seq_len, hidden) matrix into a single row vector.
last_hidden = output.hidden_states[-1]
_temp = last_hidden.reshape(last_hidden.shape[0], -1)
_temp.shape

torch.Size([2, 109760])

In [53]:
_temp[0]

tensor([ 0.1419,  0.5839, -0.0722,  ...,  0.4682, -0.6849, -0.3094],
       grad_fn=<SelectBackward0>)

In [32]:
(0, 2048 - _temp.shape[1])

(0, -107712)

In [66]:
# NOTE(review): the pad width 2048 - _temp.shape[1] is negative here (-107712 in the cell
# above), so this presumably crops each row to 2048 values instead of padding — confirm
# that truncation is what is intended.
o = torch.nn.functional.pad(_temp, (0, 2048 - _temp.shape[1]))

In [68]:
o[0][:10]

tensor([ 0.1419,  0.5839, -0.0722,  0.3390, -0.1853, -0.0982, -0.9235,  0.1019,
        -0.4527, -0.6959], grad_fn=<SliceBackward0>)

In [50]:
# Count the distinct values that the first row of `o` shares with the first row of `_temp`.
len(set(o[0].detach().numpy()).intersection(_temp[0].detach().numpy()))

109670

In [74]:
len(set(output.hidden_states[-1][0][0].detach().numpy()).intersection(_temp[0][:100].detach().numpy()))

100