In [8]:
from dataset import DNADataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from transformers.models.bert.configuration_bert import BertConfig
import tqdm
import numpy as np

In [26]:
model_path = "PoetschLab/GROVER"
tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        # padding_side="right",
        trust_remote_code=True,
        # padding="max_length",
        padding='max_length', truncation=True, max_length=42
    )
model = AutoModel.from_pretrained(
            model_path, 
            trust_remote_code=True,
        )

In [None]:
### WITHOUT ATTENTION MASK

In [39]:
idx = [1,2,3,4,5]
dna_sequences = DNADataset(["ATCGGCAT", "ATCAAAAT","ATCAGCAT","ATCGTTAT","ATCGGTAT"])
batch_size = 5
n_gpu=1
is_hyenadna = False
device = "cpu"
model = model.to("cpu")
first_iteration = None
data_loader = DataLoader(
    dna_sequences,
    batch_size=batch_size * n_gpu,
    shuffle=False,
    num_workers=1 #2 * n_gpu,
)
for i, batch in enumerate(tqdm.tqdm(data_loader)):
    with torch.no_grad():
        inputs = tokenizer(batch, return_tensors="pt", padding=True)[
                "input_ids"
            ].to(device)
        hidden_states = model(inputs)[0]  # index tuple returned by model
        embedding = torch.mean(hidden_states, dim=1)  # average 
        
        if i == 0:
                embeddings = embedding
        else:
            embeddings = torch.cat(
                (embeddings, embedding), dim=0
            )  # concatenate along the batch dimension
    embeddings = np.array(embeddings.detach().cpu())

    embeddings = embeddings[np.argsort(idx)]
print(embeddings.shape)

  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:02<00:00,  2.62s/it]

(5, 768)





In [None]:
###WITH ATTENTION MASK

In [34]:
idx = [1,2,3,4,5]
dna_sequences = DNADataset(["ATCGGCAT", "ATCAAAAT","ATCAGCAT","ATCGTTAT","ATCGGTAT"])
batch_size = 5
n_gpu=1
is_hyenadna = False
device = "cpu"
model = model.to("cpu")
first_iteration = None
data_loader = DataLoader(
    dna_sequences,
    batch_size=batch_size * n_gpu,
    shuffle=False,
    num_workers=1 #2 * n_gpu,
)
for i, batch in enumerate(tqdm.tqdm(data_loader)):
    with torch.no_grad():
        input_tokens = tokenizer.batch_encode_plus(
            batch, return_tensors="pt", padding=True
        )
        input_ids = input_tokens["input_ids"].to(device)
        attention_mask = input_tokens['attention_mask'].to(device)
        
        if is_hyenadna:
            model_output = model.forward(input_ids=input_ids)[0].detach().cpu()
        else:
            model_output = model.forward(input_ids=input_ids, attention_mask=attention_mask)[0].detach().cpu()
        
        attention_mask = attention_mask.unsqueeze(-1).detach().cpu()
        embedding = torch.sum(model_output*attention_mask, dim=1) / torch.sum(attention_mask, dim=1) # along the sequence length
        
        if i == 0:
            embeddings = embedding
            first_iteration = np.array(embeddings)
        else:
            embeddings = torch.cat((embeddings, embedding))
embeddings = np.array(embeddings.detach().cpu())
embeddings = embeddings[np.argsort(idx)]
print(embeddings.shape)

  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:01<00:00,  1.25s/it]

(5, 768)





In [40]:
dna = "TTTTTTTT"
inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"]
hidden_states = model(inputs)[0] # [1, sequence_length, 768]
print(hidden_states.shape) # expect to be [1, sequence_length, 768]

# embedding with mean pooling
print(hidden_states[0].shape)
embedding_mean = torch.mean(hidden_states[0], dim=0).unsqueeze(0)
second_test = np.array(embedding_mean.detach().cpu())[0,:]

torch.Size([1, 3, 768])
torch.Size([3, 768])


In [41]:
for i in range(embeddings.shape[0]):
    embedding_i = embeddings[i]
    print(embedding_i[0:5])
    print(second_test[0:5])
    print(i, np.array_equal(first_iteration,second_test))
    print(i, np.allclose(embedding_i, second_test, atol=1e-6))


[-0.28138414 -0.10935557 -0.08664906  0.1703153   0.03078712]
[-0.24650352 -0.12896508 -0.11860963  0.1611344   0.0012951 ]
0 False
0 False
[-0.28152242 -0.0984172  -0.08429249  0.16923341  0.01058975]
[-0.24650352 -0.12896508 -0.11860963  0.1611344   0.0012951 ]
1 False
1 False
[-0.26513562 -0.04229131 -0.01140821  0.22488098 -0.05971776]
[-0.24650352 -0.12896508 -0.11860963  0.1611344   0.0012951 ]
2 False
2 False
[-0.27973628 -0.04341701  0.12714693  0.3036515  -0.24680538]
[-0.24650352 -0.12896508 -0.11860963  0.1611344   0.0012951 ]
3 False
3 False
[-0.23722088 -0.03440793  0.05095869  0.22647448 -0.12337677]
[-0.24650352 -0.12896508 -0.11860963  0.1611344   0.0012951 ]
4 False
4 False


In [45]:
test = np.concatenate([embeddings, embeddings[1:3,:]], axis=0)

In [46]:
test.shape

(7, 768)

In [18]:
#### GROVER


1360

In [27]:
idx = [1,2,3,4,5]
dna_sequences = DNADataset(["ATCGGCAT"*171, "ATCAAAAT"*171,"ATCAGCAT"*171,"ATCGTTAT"*171,"ATCGGTAT"*171])
batch_size = 5
n_gpu=1
is_hyenadna = True
device = "cpu"
model = model.to("cpu")
first_iteration = None
data_loader = DataLoader(
    dna_sequences,
    batch_size=batch_size * n_gpu,
    shuffle=False,
    num_workers=1 #2 * n_gpu,
)
for i, batch in enumerate(tqdm.tqdm(data_loader)):
    with torch.no_grad():
        input_tokens = tokenizer.batch_encode_plus(
            batch, return_tensors="pt", padding=True, return_attention_mask=True
        )
        print(input_tokens.keys())
        input_ids = input_tokens["input_ids"].to(device)
        print(input_ids.shape)
        attention_mask = input_tokens['attention_mask'].to(device)
        print(attention_mask.shape)
        
        if is_hyenadna:
            model_output = model.forward(input_ids=input_ids)[0].detach().cpu()
        else:
            model_output = model.forward(input_ids=input_ids, attention_mask=attention_mask)[0].detach().cpu()
        print(model_output.shape)
        attention_mask = attention_mask.unsqueeze(-1).detach().cpu()
        embedding = torch.sum(model_output*attention_mask, dim=1) / torch.sum(attention_mask, dim=1) # along the sequence length
        
        if i == 0:
            embeddings = embedding
            first_iteration = np.array(embeddings)
        else:
            embeddings = torch.cat((embeddings, embedding))
embeddings = np.array(embeddings.detach().cpu())
embeddings = embeddings[np.argsort(idx)]
print(embeddings.shape)

  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 0/1 [00:01<?, ?it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([5, 515])
torch.Size([5, 515])





RuntimeError: The expanded size of the tensor (515) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [5, 515].  Tensor sizes: [1, 512]

In [29]:
import os

In [48]:
def validate_input_array(array):
    "Returns array similar to input array but C-contiguous and with own data."
    if not array.flags["C_CONTIGUOUS"]:
        array = np.ascontiguousarray(array)
    if not array.flags["OWNDATA"]:
        array = array.copy()

    assert array.flags["C_CONTIGUOUS"] and array.flags["OWNDATA"]

    return array

def calculate_vamb_embedding(dna_sequences: list[str], model_path: str) -> np.array:
    tnf_embeddings = os.path.join("../embeddings", "TNF.npy")

    # if os.path.exists(tnf_embeddings):
    print(f"Load TNF-embedding from file {tnf_embeddings}")
    tnf_embeddings = np.load(tnf_embeddings)
    # else:
    #     tnf_embeddings = calculate_tnf(dna_sequences)

    pretrained_vamb_embeddings = np.load(model_path)  # dim (256,100)
    kernel = validate_input_array(npz["arr_0"])
    print(f"Load VAMB-embedding from file {model_path}")
    print(f"shape of VAMB-embedding: {pretrained_vamb_embeddings.shape}")
    # tnf_embeddings += -(1 / 256)
    # embeddings = np.dot(tnf_embeddings, pretrained_vamb_embeddings)

    return embeddings

In [49]:
calculate_vamb_embedding(["asasd,"], "../helpers/vamb_embedding.npy")

Load TNF-embedding from file ../embeddings/TNF.npy
Load VAMB-embedding from file ../helpers/vamb_embedding.npy


AttributeError: 'NpzFile' object has no attribute 'shape'