# Compare output between BioNeMo2 And Evo2

In BioNeMo2, I have selected two fasta sequences, run predictions on them, and save the `input_ids` and generated `tokens_logits`.

Here, I apply Evo2's model on the same two fasta sequences, run predictions on them, and compare the `input_ids`, `logits`, and final predictions.

I find that while the `input_ids` between the two sequences line up, our model and evo2's output different logit values (and thus different predictions)

In [1]:
# Install dependencies
!pip install matplotlib pandas seaborn scikit-learn openpyxl

import torch
import glob
import json
from pathlib import Path
from Bio import SeqIO
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.metrics import roc_auc_score

# Set root path
os.chdir('../..')



In [46]:
%load_ext autoreload
%autoreload 2
from evo2.models import Evo2

# Load model
model = Evo2('evo2_1b_base')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 409.19it/s]


Extra keys in state_dict: {'blocks.10.mixer.dense._extra_state', 'blocks.3.mixer.dense._extra_state', 'blocks.17.mixer.dense._extra_state', 'blocks.24.mixer.dense._extra_state', 'unembed.weight'}


## Load in sequence used in bionemo2, and the bionemo2 predictions/logits for it, and compare

## 1. Load Selected Sequences

In [6]:
from Bio import SeqIO

selected_dir = Path("notebooks/brca1_selected_seqs")


file_path = selected_dir / "brca1_selected_sequences.fasta"
sequences = {}

for record in SeqIO.parse(file_path, "fasta"):
    sequences[record.id] = str(record.seq)

bionemo_selected_seqs = [v for k, v in sequences.items()]
bionemo_selected_seqs

['TTAAACACTTTTCAAACCAGGCAATATTTTAGGCCTACTGTATATTTGCATTTTGAGCTTCCAATACGGATAAGTGACTGGAAAAAGCAGCTAGGTTTAGGTTGAAAAACAACAACCCACCGGGGAACACATTTTAGCAAATTCTTCTGAAAGTCAAAAATGTTATAGTCATAGGTAAAAAGTTACAAAGAACTACCAATTGTCAGAAATAGCTGCCAATATTGACTTAGAAGACAGCAGAAGGAATTTTAGTTCAAGAAACCTAAAACAGGCTGAAAACCTTACCTACCCTATAGCTACCACAAATAACACTGTTTCCAGTCATGATCATTCCTGATCACATATTAAGACATAACTGCAAATTGTGCTATACTGTACTATATTAAAAGGAAGTGAAATATGATCCCTATCCTAGAACTTTCCATACAAATGAATGTAAAACACCATAAAAATTAATCTTAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGTGGGCGGATCACGAGGTCAGGAAGTGGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGTGGACGCCTGTAGTCCCAGCTACTTGGGGGGCCGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATGGCGCCACTGCACTCCGGCCTGGGTGAAAGAGCGAGACTCCGTCTCAAAAACAAAACAAACAAAAATTAATCTTAAGCCAGGCGCAGTGGCTCACGCCAGCACTTTGGAAGGCCGAGGCGGGTGGATCACGAGATCAGGACTTCAAGACCAGCCTGACCAACGTGATGAAACCCTATCTCTACTAAAAATACAAAATTAGCCGGCCACGGTGGCGTGCGCCTATAATCCCAGCTACTCAGGAGGCTGAGGCAGGAGAAGCGCTTGAACTTGAACCTGGCAGGCGGAGGTTGCAGTGAGCCA

## Comparing Preds

### BioNeMo Preds:

In [7]:
# Find and load prediction files
selected_seq_pred_files = glob.glob(os.path.join(selected_dir, "predictions__rank_*.pt"))

# Load sequence ID maps (maps sequence ID -> prediction index)
with open(os.path.join(selected_dir, "seq_idx_map.json"), "r") as f:
    ref_seq_idx_map = json.load(f)

# Load predictions
bionemo_preds = torch.load(selected_seq_pred_files[0])
bionemo_preds

  bionemo_preds = torch.load(selected_seq_pred_files[0])


{'log_probs_seqs': tensor([-4.2812, -4.2779]), 'seq_idx': tensor([0, 1])}

### Get Preds From evo2's model:

In [40]:
!ls

activations_debug.log  evo2.jpg     model_outputs.pt  README.md		test
AUTHORS		       LICENSE	    notebooks	      requirements.txt	vortex
evo2		       MANIFEST.in  NOTICE	      setup.py


In [47]:
# [np.float32(-0.9418016), np.float32(-0.94185793)]
model.score_sequences(bionemo_selected_seqs)

 50%|██████████████████████▌                      | 1/2 [00:14<00:14, 14.10s/it]

Model outputs saved to model_outputs.pt


100%|█████████████████████████████████████████████| 2/2 [00:14<00:00,  7.25s/it]


[np.float32(-0.9418016), np.float32(-0.94185793)]

## Thus: BioNeMo Preds != Evo2 Preds

For bionemo2 on the same sequence, we get:

`{'log_probs_seqs': tensor([-1.0852, -1.0871]), 'seq_idx': tensor([0, 1])}`

But arc gets:

`[np.float32(-0.9418016), np.float32(-0.94185793)]`

What about for just the output logits?

---

## Compare Logits For Same Sequence:

### BioNeMo logits:

In [10]:
# selected_dir2 = Path("notebooks/brca_selected_seq2")

# selected_dir_just_foward = selected_dir
selected_logits_dir = selected_dir / "logits"

# Find and load prediction files
selected_seq_pred_just_forward_files = glob.glob(os.path.join(selected_logits_dir, "predictions__rank_*.pt"))

# Load sequence ID maps (maps sequence ID -> prediction index)
with open(os.path.join(selected_logits_dir, "seq_idx_map.json"), "r") as f:
    ref_seq_idx_map_jf = json.load(f)

# Load predictions
selected_preds_jf = torch.load(selected_seq_pred_just_forward_files[0])
print(f"logit shape: {selected_preds_jf['token_logits'].shape}")
selected_preds_jf

logit shape: torch.Size([2, 8192, 512])


  selected_preds_jf = torch.load(selected_seq_pred_just_forward_files[0])


{'token_logits': tensor([[[ -4.8125, -14.8125, -14.7500,  ..., -14.8125, -14.7500, -14.7500],
          [ -4.6875, -16.7500, -16.7500,  ..., -16.7500, -16.7500, -16.7500],
          [ -5.0938, -19.7500, -19.7500,  ..., -19.7500, -19.7500, -19.7500],
          ...,
          [ -0.6641,  -2.4844,  -2.5000,  ...,  -2.4844,  -2.4844,  -2.4844],
          [ -1.2031,  -3.3438,  -3.3281,  ...,  -3.3438,  -3.3281,  -3.3281],
          [ -2.7812, -19.5000, -19.5000,  ..., -19.5000, -19.5000, -19.5000]],
 
         [[ -4.8125, -14.8125, -14.7500,  ..., -14.8125, -14.7500, -14.7500],
          [ -4.6875, -16.7500, -16.7500,  ..., -16.7500, -16.7500, -16.7500],
          [ -6.0938, -34.0000, -34.0000,  ..., -34.0000, -34.0000, -34.0000],
          ...,
          [ -1.1250,  -2.7812,  -2.7812,  ...,  -2.7812,  -2.7812,  -2.7812],
          [ -2.7656, -19.5000, -19.5000,  ..., -19.5000, -19.5000, -19.5000],
          [ -1.1953,  -3.3281,  -3.3281,  ...,  -3.3438,  -3.3281,  -3.3281]]]),
 'pad_mask':

## Evo2 Logits:

To get Evo2 logits, run through their `prepare_batch` func then pass thru model.

(Note - I've confirmed this is the same output they get via pdb):

In [11]:
def prepare_batch(
        seqs,
        prepend_bos: bool = False,
        device: str = 'cuda:0'
):
    """
    Takes in a list of sequences, tokenizes them, and puts them in a tensor batch.
    Tokenization is done directly by converting characters to ASCII values.
    e.g. TTA... -> ([84, 84, 65, ...)]
    """
    seq_lengths = [len(seq) for seq in seqs]
    print(f"seq_lens: {seq_lengths}")
    input_ids = []
    for seq in seqs:
        # Direct tokenization using list comprehension with ord()
        tokens = [ord(char) for char in seq]
        input_ids.append(
            torch.tensor(tokens, dtype=torch.long).to(device).unsqueeze(0)
        )
    input_ids = torch.cat(input_ids, dim=0)
    return input_ids, seq_lengths


In [12]:
input_ids, seq_lengths = prepare_batch(bionemo_selected_seqs, device='cuda:0', prepend_bos=False)
assert len(seq_lengths) == input_ids.shape[0]


seq_lens: [8192, 8192]


In [13]:
with torch.inference_mode():
        logits, _ = model(input_ids)

In [14]:
logits

(tensor([[[ -7.7188, -47.7500, -47.7500,  ..., -47.7500, -47.7500, -47.7500],
          [ -6.1875, -47.2500, -47.2500,  ..., -47.5000, -47.2500, -47.2500],
          [ -6.0625, -47.5000, -47.5000,  ..., -47.5000, -47.5000, -47.5000],
          ...,
          [ -6.3750, -42.7500, -42.7500,  ..., -42.7500, -42.7500, -42.7500],
          [ -6.3750, -46.5000, -46.5000,  ..., -46.7500, -46.5000, -46.5000],
          [ -6.1250, -43.2500, -43.2500,  ..., -43.2500, -43.2500, -43.2500]],
 
         [[ -7.6875, -47.5000, -47.5000,  ..., -47.5000, -47.5000, -47.5000],
          [ -6.1875, -47.5000, -47.5000,  ..., -47.5000, -47.5000, -47.5000],
          [ -6.0312, -47.5000, -47.5000,  ..., -47.5000, -47.5000, -47.5000],
          ...,
          [ -6.2188, -43.5000, -43.5000,  ..., -43.7500, -43.5000, -43.5000],
          [ -6.3125, -42.7500, -42.7500,  ..., -42.7500, -42.7500, -42.7500],
          [ -6.2188, -46.5000, -46.5000,  ..., -46.5000, -46.5000, -46.5000]]],
        device='cuda:0', dtyp

## Thus: BioNeMo 2 Logits != Evo2 Logits

Let's make sure the `input_ids` are the same.

---

## Compare `input_id`s


### BioNeMo2 Input Id's:

In [15]:
bionemo_input_ids = selected_preds_jf['input_ids']
bionemo_input_ids

tensor([[84, 84, 84,  ..., 84, 84, 67],
        [84, 84, 65,  ..., 84, 67, 65]])

### Evo2 Input Id's:

In [16]:
input_ids

tensor([[84, 84, 65,  ..., 84, 67, 65],
        [84, 84, 84,  ..., 84, 84, 67]], device='cuda:0')

In [17]:
torch.sum(input_ids[0].cpu() == bionemo_input_ids[0]) == 8192

tensor(False)

In [18]:
torch.sum(input_ids[1].cpu() == bionemo_input_ids[1]) == 8192

tensor(False)

In [26]:
input_ids.tolist()[0][:9]

[84, 84, 65, 65, 65, 67, 65, 67, 84]

In [27]:
bionemo_input_ids.tolist()[0][:10]

[84, 84, 84, 65, 65, 65, 67, 65, 67, 84]

---

# Conclusion:

Despite equal `input_id`s: BioNeMo2 and Evo2 Models produce different logits for the same fasta sequence.

## Appendix

More stuff

In [15]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def categorical_heatmap_comparison(preds1, preds2, title="Categorical Heatmap of Model Argmax Predictions"):
    preds1_np = preds1.flatten().cpu().numpy()
    preds2_np = preds2.flatten().cpu().numpy()

    # Stack both tensors for visualization (rows: model versions, columns: positions)
    data = np.vstack([preds1_np, preds2_np])

    # Create categorical colormap (e.g., 'tab10' for 10 distinct categories, 'Set3' for more)
    unique_values = np.unique(data)  # Get unique argmax values
    palette = sns.color_palette("tab10", n_colors=len(unique_values))  # Generate categorical colors
    lut = dict(zip(unique_values, palette))  # Map each unique value to a color
    data_colors = np.vectorize(lut.get)(data)  # Convert data values to colors

    # Create heatmap
    plt.figure(figsize=(12, 3))  # Wide and short for large tensors
    sns.heatmap(data, cmap=palette, cbar=True, xticklabels=500, yticklabels=["Model 1", "Model 2"])

    # Labels and Title
    plt.xlabel("Position in Tensor")
    plt.ylabel("Model Version")
    plt.title(title)

    plt.show()


### Compare Argmax of logits to each other

In [28]:
logits[0][0].argmax(dim=-1)

tensor([67, 84, 84,  ..., 67, 65, 67], device='cuda:0')

In [29]:
selected_preds_jf['token_logits'][0].argmax(dim=-1)

tensor([67, 67, 67,  ..., 84, 67, 65])

In [30]:
logits_evo2_1 = logits[0][0].argmax(dim=-1)
logits_bionemo_1 = selected_preds_jf['token_logits'][0].argmax(dim=-1)

logits_evo2_2 = logits[0][1].argmax(dim=-1)
logits_bionemo_2 = selected_preds_jf['token_logits'][1].argmax(dim=-1)

In [33]:
print(f"Percent logits same: Sequence 1: {(torch.sum(logits_evo2_1.cpu() == logits_bionemo_1.cpu()) / 8192.).item()}")
# categorical_heatmap_comparison(logits_evo2_1, logits_bionemo_1, title="Heatmap of Model Argmax Predictions Sequence 1")

Percent logits same: Sequence 1: 0.3841552734375


In [34]:
print(f"Percent logits same: Sequence 2: {(torch.sum(logits_evo2_2.cpu() == logits_bionemo_2.cpu()) / 8192.).item()}")
# categorical_heatmap_comparison(logits_evo2_2, logits_bionemo_2, title="Heatmap of Model Argmax Predictions Sequence 2")

Percent logits same: Sequence 2: 0.3916015625
