In [1]:
import polars as pl

In [2]:
# Read the train-split prediction table from parquet into a Polars DataFrame.
df = pl.read_parquet("../data/msg_finetune/train_predictions.parquet")

In [3]:
# Inspect the shape of the first row's predicted fingerprint.
df["predicted_fingerprint"][0].shape

(4096,)

In [6]:
# Convert the first row's predicted fingerprint to a NumPy array.
fp = df["predicted_fingerprint"][0].to_numpy()

In [8]:
import numpy as np

In [9]:
# Binarise the predicted fingerprint: entries strictly above 0.5 become 1,
# everything else becomes 0; the result keeps the dtype of `fp`.
processed_fp = (fp > 0.5).astype(fp.dtype)

In [12]:
# Display the thresholded (0/1) fingerprint.
processed_fp

array([0., 0., 0., ..., 0., 0., 0.], shape=(4096,), dtype=float32)

In [None]:
import polars as pl
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# ---------- helper functions -------------------------------------------------
def mol_from_inchi(inchi: str):
    """Parse an InChI string into an RDKit Mol.

    Parsing is requested with sanitization enabled and hydrogens removed.
    Any exception raised while parsing is swallowed on purpose so that a
    caller iterating over many strings can simply skip the bad ones.

    Returns:
        Whatever ``Chem.MolFromInchi`` returns, or ``None`` if it raised.
    """
    try:
        parsed = Chem.MolFromInchi(inchi, sanitize=True, removeHs=True)
    except Exception:
        # Best-effort conversion: a failure is reported as None, not raised.
        return None
    return parsed

def fp_as_numpy(mol, n_bits: int, radius: int = 2) -> np.ndarray:
    """Compute a Morgan bit-vector fingerprint and return it as a NumPy array.

    Args:
        mol: RDKit molecule to fingerprint.
        n_bits: Length of the bit vector (passed to RDKit as ``nBits``).
        radius: Morgan radius (default 2).

    Returns:
        An ``int8`` array that RDKit fills from the bit vector.
    """
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        mol,
        radius=radius,
        nBits=n_bits,
    )
    # RDKit writes the bits into a caller-supplied buffer.
    bits = np.zeros((n_bits,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def fold_4096_to_2048(fp4096: np.ndarray) -> np.ndarray:
    """Fold a 4096-bit fingerprint down to 2048 bits.

    Bit ``i`` of the result is the OR of bits ``i`` and ``i + 2048`` of the
    input, i.e. a bit is set in the folded vector if it is set in either half.

    Args:
        fp4096: 1-D array of 0/1 integers (or booleans) of length 4096.

    Returns:
        Array of length 2048 with the same dtype as the input.
    """
    lower_half = fp4096[:2048]
    upper_half = fp4096[2048:]
    # OR, not XOR: when both halves have the bit set the folded bit must stay
    # set. XOR would clear it and under-count the on-bits.
    return lower_half | upper_half

# ---------- main workflow ----------------------------------------------------
# 1. Load data and draw a reproducible sample of at most 1 000 rows
SAMPLE_SIZE = 1_000   # upper bound on the number of rows to check
SAMPLE_SEED = 0       # fixed seed so a re-run picks the same rows

df = pl.read_parquet("../test_predictions.parquet")
sample = df.sample(
    n=min(SAMPLE_SIZE, df.height),   # sampling without replacement cannot exceed the row count
    with_replacement=False,
    seed=SAMPLE_SEED,
)

records = []
for inchi in sample["inchi"]:
    mol = mol_from_inchi(inchi)
    if mol is None:
        continue                               # skip malformed InChI strings

    # 2. True fingerprints
    true_fp2048 = fp_as_numpy(mol, 2048)
    fp4096      = fp_as_numpy(mol, 4096)

    # 3. Fold 4096 → 2048
    folded_fp = fold_4096_to_2048(fp4096)

    # 4. Bit‑wise difference (Hamming distance)
    diff = np.count_nonzero(true_fp2048 != folded_fp)

    records.append({"inchi": inchi, "diff_bits": int(diff)})

# 5. Collate results
# An explicit schema keeps both columns present even when no molecule parsed.
result_df = pl.DataFrame(
    records,
    schema={"inchi": pl.Utf8, "diff_bits": pl.Int64},
)
print(result_df.head())                 # preview

mean_diff = result_df["diff_bits"].mean()
if mean_diff is None:
    # The mean of an empty column is None and cannot be formatted as a float.
    print("Mean bit difference: n/a (no valid molecules)")
else:
    print(f"Mean bit difference: {mean_diff:.2f}")



shape: (5, 2)
┌─────────────────────────────────┬───────────┐
│ inchi                           ┆ diff_bits │
│ ---                             ┆ ---       │
│ str                             ┆ i64       │
╞═════════════════════════════════╪═══════════╡
│ InChI=1S/C37H42N4O6S2/c1-21(42… ┆ 0         │
│ InChI=1S/C28H39NO3S/c1-19(2)21… ┆ 0         │
│ InChI=1S/C37H48N4O5/c1-25(2)34… ┆ 0         │
│ InChI=1S/C25H25F3N4O6/c1-24(15… ┆ 0         │
│ InChI=1S/C29H36O6/c1-16-12-23-… ┆ 0         │
└─────────────────────────────────┴───────────┘
Mean bit difference: 0.00


In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer identified by "ibm/materials.selfies-ted".
tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")

In [4]:
# Show the tokenizer's vocabulary (token string → integer id mapping).
tokenizer.vocab

{'[156Dy]': 889,
 '.[N-1]': 613,
 '[B@H1-1]': 2098,
 '[=Gd]': 212,
 '[105Rh]': 448,
 '[=Zr+2]': 1137,
 '[Se+1]': 1371,
 '[58Fe]': 828,
 '[SeH8+6]': 2731,
 '.[AsH3]': 590,
 '[152Gd]': 1615,
 '[100Tc+5]': 1856,
 '[31P]': 1295,
 '[185Ta]': 1232,
 '[81Br-1]': 328,
 '[95Nb]': 1679,
 '[/Sn]': 92,
 '[/Si-1]': 571,
 '[=11CH1]': 2282,
 '.[203Pb+2]': 1946,
 '[Cr+6]': 2573,
 '[43Ca]': 1918,
 '[212Ra]': 237,
 '[104Tc]': 2532,
 '[/ClH1+1]': 335,
 '[Cl+2]': 1030,
 '.[Y+3]': 2991,
 '.[P-3]': 1217,
 '[Pa]': 2576,
 '[124Te]': 2032,
 '[Nb]': 1855,
 '[53Cr+6]': 1595,
 '.[Se-2]': 646,
 '[226Ac]': 1195,
 '.[Yb]': 915,
 '[#Branch1].[Cl-1]': 26,
 '[121SnH2]': 1448,
 '.[Gd+3]': 238,
 '[Si+4]': 795,
 '[=Hf]': 949,
 '[105Rh+3]': 2842,
 '[97Tc]': 1412,
 '[InH2]': 2735,
 '[112Cd]': 2581,
 '[197Hg]': 2840,
 '[236U]': 2687,
 '[#Tl]': 3065,
 '[131Cs+1]': 1818,
 '[#SH1]': 248,
 '[In]': 3107,
 '.[Hg+2]': 807,
 '.[204Hg+1]': 2447,
 '[134IH1]': 772,
 '.[11CH4]': 2118,
 '[144Sm]': 2738,
 '[Fm]': 2938,
 '[#Ho]': 2112,
 '[