In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gbyuvd/bionat-selfies-gen-tokenizer-wordlevel")
    
# Ensure [MASK] token exists
if not hasattr(tokenizer, 'mask_token_id') or tokenizer.mask_token_id is None:
    tokenizer.add_special_tokens({'mask_token': '[MASK]'})

print(f"Vocab size: {tokenizer.vocab_size}")
print(f"[MASK] token ID: {tokenizer.mask_token_id}")

Vocab size: 412
[MASK] token ID: 4


In [None]:
from model import ImplicitRefinementModel, ImplicitRefinementConfig 
seq_len = 90
config = ImplicitRefinementConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=320,
        num_layers=6,
        num_heads=4,
        max_seq_len=seq_len,          
        max_refinement_steps=10,
        dropout=0.1,
        use_self_cond=True,
        stop_threshold=0.02,
        min_refine_uncertainty=0.1,
        ema_decay=0.995,
        diversity_weight=0.05,
        sampling_temperature=1.0,
        use_refine_gate=True,  # Enable internal refinement gate
        use_gradient_checkpointing=False  # Enable for larger models
    )

model = ImplicitRefinementModel(config, tokenizer=tokenizer).to(device)

In [4]:
checkpoint_path = "best_chemistrySELFIESmodel.pth"  # or "refinement_model_z_final.pth"

torch.serialization.add_safe_globals([chemistrySELFIESmodelfinal])
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # set to evaluation mode
print("✅ Model weights loaded successfully!")

NameError: name 'chemistrySELFIESmodelfinal' is not defined

In [6]:
def generate_and_decode(model, tokenizer, num_samples=5, max_len=90, temperature=1.0):
    """Generate chemistry molecules using adaptive refinement."""
    model.eval()
    print(f"\n🧪 Generating {num_samples} SELFIES molecules...")
    print(f"   Temperature: {temperature}")
    print("="*70)
    
    with torch.no_grad():
        samples = model.sample(batch_size=2, max_len=90, device='cuda')
    
    # Compute actual lengths (stop at first pad or eos)
    actual_lengths = []
    for s in samples:
        # Find first pad or eos
        s_list = s if isinstance(s, list) else s.tolist()
        length = len(s_list)
        for i, tok in enumerate(s_list):
            if tok == tokenizer.pad_token_id or tok == tokenizer.eos_token_id:
                length = i
                break
        actual_lengths.append(torch.tensor(length))
    
    for i, (sample, length) in enumerate(zip(samples, actual_lengths)):
        decoded = tokenizer.decode(sample[:length.item()], skip_special_tokens=True)
        print(f"{i+1}. (len={length.item()}) {decoded}")
    
    print("="*70)


In [12]:
# Try different temperatures
for temp in [0.8, 1.0, 1.2]:
    print(f"\n--- Temperature: {temp} ---")
    generate_and_decode(
        model=model,
        tokenizer=tokenizer,
        num_samples=5,
        temperature=temp
    )


--- Temperature: 0.8 ---

🧪 Generating 5 SELFIES molecules...
   Temperature: 0.8
✅ Stopped at step 3 (change: 0.00%)
1. (len=19) [C] [C] [C] [C] [C] [C] [C] [C] [C] [C] [=Branch1] [C] [C] [C] [=C] [C] [C] [=C]
2. (len=47) [C] [=C] [C] [C] [=C] [=C] [C] [Ring2] [C] [=Branch2] [N] [C] [C] [C] [=Branch1] [C] [C] [Branch1] [=C] [O] [C] [C] [C] [C] [Ring1] [=C] [N] [C] [Branch2] [Branch2] [C] [C] [C] [O] [C] [=C] [C] [C] [O] [C] [Ring1] [C] [C] [=C] [Branch1] [C]

--- Temperature: 1.0 ---

🧪 Generating 5 SELFIES molecules...
   Temperature: 1.0
✅ Stopped at step 5 (change: 0.00%)
1. (len=42) [C] [C] [C] [C] [#C] [C] [Ring1] [Branch1] [C] [C] [C] [O] [=Branch1] [C] [C] [=C] [C] [C] [C] [C] [C] [C] [C] [C] [=C] [C] [=C] [=C] [O] [Ring2] [C] [Ring1] [=O] [Branch2] [O] [C] [Ring1] [C] [=Branch1] [S] [=C]
2. (len=50) [C] [C] [C] [C] [=C] [=O] [C] [Ring2] [=Branch1] [C] [=C] [C] [=Branch1] [=C] [=C] [C] [O] [N] [=C] [=C] [NH1] [C] [N] [C] [C] [Ring1] [=C] [C] [O] [C] [C] [C] [=C] [C] [C] [C] [C

In [3]:
import selfies as sf

tokens = "[C] [=C] [=C] [C] [C] [Branch2] [C] [Ring1] [=C] [C] [C] [C] [C] [C] [C] [C] [C] [=C] [C] [N] [Branch1] [C] [C] [C] [=Branch1] [C] [Branch2] [C] [C] [C] [C] [Ring1] [C] [#Branch2] [C] [C] [Branch1] [Branch2] [C] [C] [C] [C] [C] [Branch1] [Ring1] [C] [C] [C] [C] [=C] [C] [Ring1] [C] [Branch1] [C] [Ring2] [=C] [Ring2] [Ring2] [C] [C]"
tokens = tokens.replace(" ", "")
print(sf.decoder(tokens))

CF


In [18]:
model.eval()
analysis = model.analyze_refinement_trajectory(
    max_len=16,
    device='cuda',
    seed=22
)
model.print_refinement_trajectory(analysis, tokenizer=tokenizer)

🔍 Refinement Trajectory (INTERNAL GATE)

t=0: [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]
        ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑
       Gate: 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
t=1: [ [3]] [ [5]] [ [5]] [[22]] [ [5]] [ [5]] [ [5]] [ [7]] [ [8]] [[15]] [ [7]] [ [7]] [ [5]] [[22]] [ [5]] [ [5]]
        ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑
       Gate: 0.74 0.93 0.94 0.92 0.92 0.92 0.92 0.92 0.92 0.92 0.92 0.91 0.91 0.91 0.91 0.92
t=2: [ [3]] [ [5]] [ [9]] [ [5]] [ [5]] [ [5]] [ [5]] [[13]] [ [9]] [[15]] [[10]] [ [5]] [ [5]] [ [5]] [ [5]] [ [5]]
          ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑   ↑ ↑ ↑ ↑ ↑ ↑
       Gate: 0.26 0.58 0.74 0.84 0.72 0.73 0.75 0.63 0.80 0.49 0.62 0.60 0.72 0.84 0.73 0.75
t=3: [ [3]] [ [5]] [ [5]] [[18]] [ [5]] [ [5]] [ [5]] [ [7]] [ [8]] [[15]] [[33]] [ [6]] [ [6]] [ [7]] [ [5]] [ [8]]
          ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑   ↑ ↑ ↑ ↑ ↑ ↑
       Gate: 0.29 0.52 0.77 0.63 0.62 0.62 0.65 0.70