# InChI db EDA

## Load db

In [None]:
import pandas as pd
import os
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

# Resolve the InChI dump relative to the notebook's working directory.
script_dir = '.'   # current directory (where the notebook is running)
inchi_file = os.path.join(script_dir, "inchi_output.txt")

# Read the file as a single headerless column named "InChI". InChI strings
# contain commas, so a tab separator keeps each line in one column.
db = pd.read_csv(
    inchi_file,
    sep="\t",
    header=None,
    names=["InChI"],
)

db.head()

Unnamed: 0,InChI
0,"InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3"
1,InChI=1S/CH4/h1H4
2,"InChI=1S/CH4O/c1-2/h2H,1H3"
3,"InChI=1S/C2H7NS/c3-1-2-4/h4H,1-3H2"
4,InChI=1S/C2H8N2/c3-1-2-4/h1-4H2


## Maximum InChI string length

In [None]:
# --- 1. Max raw string length ---
# Use the vectorized .str accessor instead of apply(len) + builtin max():
# a missing value becomes NaN (skipped by .max()) rather than raising
# TypeError from len(float). int() keeps the printed value a plain integer.
max_str_len = int(db["InChI"].str.len().max())
print(f"Longest InChI (characters): {max_str_len}")

Longest InChI (characters): 3915


## Load tokenizer

In [6]:
from tokenizers import ByteLevelBPETokenizer
import os
# --- 2. Load your trained tokenizer ---
# (Adjust tokenizer_dir to wherever the vocab/merges files were saved)
tokenizer_dir = os.path.join(script_dir, "inchi_tokenizer")
vocab_path = os.path.join(tokenizer_dir, "vocab.json")
merges_path = os.path.join(tokenizer_dir, "merges.txt")

# Build the byte-level BPE tokenizer from the saved vocabulary and merge rules
tokenizer = ByteLevelBPETokenizer(vocab=vocab_path, merges=merges_path)
print("Loaded ByteLevelBPETokenizer")

Loaded ByteLevelBPETokenizer


## Tokenize longest InChI

In [8]:
# --- 3. Find the InChI with maximum string length and tokenize it ---
# Character count per row, then the index label of the longest entry
inchi_lengths = db["InChI"].str.len()
longest_row = inchi_lengths.idxmax()
longest_inchi = db.loc[longest_row, "InChI"]

# Encode the string and count the resulting token ids
encoded = tokenizer.encode(longest_inchi)
token_len = len(encoded.ids)

print(f"Longest InChI tokenized length: {token_len} tokens")
print(f"Sample tokens: {encoded.tokens[:15]}...")  # show first 15

Longest InChI tokenized length: 2068 tokens
Sample tokens: ['InChI', '=', '1', 'S', '/', 'C', '325', 'H', '387', 'N', '118', 'O', '209', 'P', '29']...


In [15]:
# Record the library versions this notebook was run with (for reproducibility)
import transformers
import tokenizers
import numpy as np
import torch

print(f"Transformers version: {transformers.__version__}")
print(f"Tokenizers version: {tokenizers.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Torch version: {torch.__version__}")

Transformers version: 5.1.0
Tokenizers version: 0.22.2
Numpy version: 2.0.1
Torch version: 2.7.0+cpu
