In [1]:
# !pip install dgeb

In [2]:
import logging
from collections import defaultdict

import dgeb
from dgeb.evaluators import ClusteringEvaluator
from dgeb.modality import Modality
from dgeb.models import BioSeqTransformer
from dgeb.tasks import Dataset, Task, TaskMetadata, TaskResult

logger = logging.getLogger(__name__)


def run_clustering_task(model: BioSeqTransformer, metadata: TaskMetadata) -> TaskResult:
    """Evaluate clustering task. Utilizes the ClusteringEvaluator."""
    if len(metadata.datasets) != 1:
        raise ValueError("Clustering tasks require 1 dataset.")
    ds = metadata.datasets[0].load()["train"]
    embeds = model.encode(ds["Sequence"])
    layer_results = defaultdict(dict)
    for i, layer in enumerate(model.layers):
        labels = ds["Label"]
        evaluator = ClusteringEvaluator(embeds[:, i], labels)
        layer_results["layers"][layer] = evaluator()
        logger.info(
            f"Layer: {layer}, {metadata.display_name} results: {layer_results['layers'][layer]}"
        )
    return TaskResult.from_dict(metadata, layer_results, model.metadata)


class RNAclustering(Task):
    metadata = TaskMetadata(
        id="ecoli_rna_clustering",
        display_name="E.coli RNA Clustering",
        description="Evaluate on RNA clustering task for sRNA/tRNA/rRNA segments in E.coli K-12.",
        type="clustering",
        modality=Modality.DNA,
        datasets=[
            Dataset(
                path="tattabio/e_coli_rnas",
                revision="4c134bb4bdb2b0ef1d59fe10797efdfeaf318de6",
            )
        ],
        primary_metric_id="v_measure",
    )

    def run(self, model: BioSeqTransformer) -> TaskResult:
        return run_clustering_task(model, self.metadata)

net = dgeb.get_model("togethercomputer/evo-1-8k-base")

# Run task
task = RNAclustering()
print(net)
result = task.run(net)

# # This part is not particularly helpful, this is for seeing how dgeb task works

  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- configuration_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- engine.py
- utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/ev

ImportError: This modeling file requires the following packages that were not found in your environment: einops, flash_attn. Run `pip install einops flash_attn`

In [None]:
# from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# model_name = "togethercomputer/evo-1-8k-base"
# config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, revision="1.1_fix")
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     config=config,
#     trust_remote_code=True,
#     revision="1.1_fix"
# )


KeyboardInterrupt: 

In [None]:
from datasets import load_dataset

ds = load_dataset("tattabio/e_coli_rnas") # Load dataset from huggingface

In [None]:
print(ds['train'])
test1 = ds['train'][0]

embeddings = net.encode(test1["Sequence"])
print(len(test1["Sequence"]))
print(embeddings.shape) # See what the data looks like

Dataset({
    features: ['Sequence', 'Label'],
    num_rows: 161
})




100
(100, 2, 768)


In [None]:
import numpy as np
from tqdm import tqdm
import pickle
from torch.utils.data import DataLoader

batch_size = 32
sequences = [data["Sequence"] for data in ds['train']]
labels = [data["Label"] for data in ds['train']]

dataloader = DataLoader(list(zip(sequences, labels)), batch_size=batch_size, shuffle=False)

features = []
label_list = []

# Extract the embeddings of NT for each sequence
for batch in tqdm(dataloader):
    batch_seqs, batch_labels = batch
    embeddings = net.encode(batch_seqs)  # shape: (batch_size, num_layers, 512)
    features.extend(embeddings)
    label_list.extend(batch_labels)

# Save both
np.save("features-evo.npy", np.array(features))
np.save("labels-evo.npy", np.array(label_list)) 


  0%|          | 0/6 [00:00<?, ?it/s]

Parameter 'transform'=functools.partial(<bound method BioSeqTransformer._tokenize_func of <dgeb.models.NTModel object at 0x7f8d69dba490>>, EsmTokenizer(name_or_path='InstaDeepAI/nucleotide-transformer-v2-250m-multi-species', vocab_size=4107, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '<unk>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, 

In [None]:
print(net.encode.__doc__)

Returns a list of embeddings for the given sequences.
        Args:
            sequences (`List[str]`): List of sequences to encode
        Returns:
            `np.ndarray`: Embeddings for the given sequences of shape [num_sequences, num_layers, embedding_dim].
        
