In [1]:
%load_ext autoreload
%autoreload 2

import json
import random
import string
import re
from pathlib import Path
from collections import Counter

import torch
import nnsight
from transformers import AutoTokenizer

# --- Parameters ---
# Match the plan's requirements.
# DATASET_SIZE and LEXICON_CONDITION are both embedded in the output filename below.
DATASET_SIZE = 128
LEXICON_CONDITION = "mixed"
SEED = 23
MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"

# Build the output filename from the parameters above so the name on disk can
# never disagree with the settings that produced the file.
# NOTE(review): this path used to be hard-coded as "patching_dataset_N32_mixed.jsonl"
# even though DATASET_SIZE is 128. Confirm 128 is the intended size and update any
# consumer that still reads the old N32 filename.
OUTPUT_FILE = Path(f"../data/patching_dataset_N{DATASET_SIZE}_{LEXICON_CONDITION}.jsonl")

# For reproducibility: seed both Python's `random` module and PyTorch's RNG.
random.seed(SEED)
torch.manual_seed(SEED)

# Make sure the output directory exists (no error if it is already there).
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

In [2]:
from transformers import AutoTokenizer
import nnsight
import torch

# --- Load Model and Tokenizer (Revised) ---

# 1. Load the "fast" (Rust-backed) tokenizer explicitly.
# Only a fast tokenizer returns encodings (BatchEncoding) that support
# .char_to_token(), which we need.
# trust_remote_code is deliberately left at its default (False): Llama is
# implemented natively in transformers, so no code from the model repository
# needs to be executed to load it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# use_fast=True is only a preference: if no fast tokenizer is available,
# transformers returns a slow one instead. Fail here, with a clear message,
# rather than later when .char_to_token() is called.
if not tokenizer.is_fast:
    raise RuntimeError(
        f"Expected a fast tokenizer for {MODEL_NAME!r}, got {type(tokenizer).__name__}; "
        "char_to_token() requires a fast tokenizer."
    )

# If the tokenizer ships without a pad_token, fall back to the eos_token.
# This is good practice and prevents potential issues later (e.g. when padding batches).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Load the model via nnsight, passing our pre-loaded tokenizer.
# This ensures nnsight uses the exact tokenizer object configured above.
model = nnsight.LanguageModel(
    MODEL_NAME,
    tokenizer=tokenizer,  # Pass the tokenizer here
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

print("Model and fast tokenizer loaded successfully.")

Model and fast tokenizer loaded successfully.


In [None]:
# NOTE(review): "R1" presumably refers to the first relation used when building
# the dataset, with "made of" as its name — confirm against the cells that read R1_NAME.
R1_NAME = "made of"
