In [1]:
from data.utils import prepare_data
from tokenization.utils import build_asin_id_tokenizer

In [2]:
# Load the "Toys_and_Games" data; prepare_data hands back three dataset splits
# and the asin2id mapping, which is then used to build the tokenizer.
ds_train, ds_valid, ds_test, asin2id = prepare_data("Toys_and_Games")
tokenizer = build_asin_id_tokenizer(asin2id)


Map:   0%|          | 0/356872 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

In [6]:
def print_model_parameters(model):
    """Print the total and trainable parameter counts of ``model``.

    Args:
        model: Any object whose ``parameters()`` yields items exposing
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    The trainable ratio is printed as 0.00% for a model without parameters
    instead of raising ``ZeroDivisionError``.
    """
    total_params = 0
    trainable_params = 0
    # Single pass over the parameters collects both counts.
    for p in model.parameters():
        count = p.numel()
        total_params += count
        if p.requires_grad:
            trainable_params += count

    # Guard the division: an empty model has no meaningful ratio.
    ratio = 100 * trainable_params / total_params if total_params else 0.0
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Trainable ratio: {ratio:.2f}%")


In [21]:
def tokenize_function(examples):
    """Tokenize one example's prompt and completion as a single string.

    The two fields are joined with one space and passed to the notebook-level
    ``tokenizer`` with truncation enabled and padding disabled.
    """
    prompt, completion = examples["prompt"], examples["completion"]
    joined = prompt + " " + completion
    return tokenizer(joined, truncation=True, padding=False)

# Tokenize every split; map() is called without batched=True, so
# tokenize_function receives one example at a time.
tokenized_train = ds_train.map(tokenize_function)
tokenized_valid = ds_valid.map(tokenize_function)
tokenized_test = ds_test.map(tokenize_function)

Map:   0%|          | 0/356872 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

In [None]:
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import torch
import numpy as np

# -------------------------
# 0) Tokenizer
# -------------------------
# use your tokenizer; must match the model's vocab
# tokenizer = ...
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# -------------------------
# 1) Datasets: bundle the three splits under the conventional split names
# -------------------------
ds = DatasetDict(
    {
        "train": ds_train,
        "validation": ds_valid,
        "test": ds_test,
    }
)

# -------------------------
# 2) Preprocess
#    - Train: loss only on completion (mask prompt with -100)
#    - Valid: evaluate last completion token (mask everywhere except last pos)
# -------------------------
def tok_ids(text):
    """Return the ``input_ids`` of ``text`` encoded without special tokens."""
    encoding = tokenizer(text, add_special_tokens=False)
    return encoding["input_ids"]

def preprocess_train(ex):
    """Build a training example whose loss covers only the completion.

    The prompt ids are followed by the completion ids plus EOS (so the model
    learns to stop). Prompt positions are masked with -100 in ``labels``.

    Returns:
        dict with ``input_ids``, an all-ones ``attention_mask`` and ``labels``.
    """
    prompt_ids = tok_ids(ex["prompt"])
    completion_ids = tok_ids(ex["completion"]) + [tokenizer.eos_token_id]

    sequence = prompt_ids + completion_ids
    ignored_prompt = [-100] * len(prompt_ids)

    return {
        "input_ids": sequence,
        "attention_mask": [1] * len(sequence),
        "labels": ignored_prompt + completion_ids,
    }

def preprocess_valid_last_token(ex):
    """Build an eval example that targets the last completion token.

    The model input is the prompt plus every completion token except the last.
    ``labels`` is -100 everywhere except its final element, which holds the
    held-out token id; that one non-ignored entry also marks the position the
    metric code reads logits from.

    NOTE(review): Hugging Face causal-LM losses usually shift labels by one
    position; if that holds for the model used here, the reported eval loss
    scores the logits one step before the last position - confirm.
    NOTE(review): with an empty prompt and a one-token completion the input is
    empty while ``labels`` still has one element.

    Raises:
        AssertionError: If the completion tokenizes to nothing.
    """
    prompt_ids = tok_ids(ex["prompt"])
    completion_ids = tok_ids(ex["completion"])
    assert len(completion_ids) > 0, "Completion must not be empty"

    # Hold out the final completion token; everything before it is context.
    target_id = completion_ids[-1]
    context_ids = prompt_ids + completion_ids[:-1]

    # All positions ignored except the last, which carries the target.
    n_ignored = len(context_ids) - 1
    labels = [-100] * n_ignored
    labels.append(target_id)

    return {
        "input_ids": context_ids,
        "attention_mask": [1] * len(context_ids),
        "labels": labels,
    }

# Tokenize each split and drop the raw columns. Train keeps completion-masked
# labels; validation/test keep a single last-token label per row.
tokenized_train = ds["train"].map(preprocess_train, remove_columns=ds["train"].column_names)
tokenized_valid = ds["validation"].map(preprocess_valid_last_token, remove_columns=ds["validation"].column_names)
tokenized_test = ds["test"].map(preprocess_valid_last_token, remove_columns=ds["test"].column_names) if ds["test"] else None

# DataCollatorWithPadding pads only the tokenizer's own model inputs, so the
# variable-length "labels" lists produced above cannot be stacked into a tensor.
# DataCollatorForSeq2Seq also pads labels, using -100 so the loss ignores them.
from transformers import DataCollatorForSeq2Seq

collate_fn = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)

# -------------------------
# 3) Metrics: Recall@100 on the last token
#    preprocess_logits_for_metrics reduces every batch to the top-K token ids
#    at the single position where labels != -100, so the Trainer accumulates
#    [N, K] ints rather than [N, V] float logits (with V around 166k and
#    ~44.6k validation rows the full logits would need tens of GB).
# -------------------------
def preprocess_logits_for_metrics(logits, labels, k=100):
    """Return the top-``k`` token ids at each row's labelled position.

    Args:
        logits: ``[B, T, V]`` tensor, or a tuple whose first item is that tensor.
        labels: ``[B, T]`` tensor; the first position != -100 of each row is
            used (rows are expected to contain exactly one such position).
        k: Number of candidate ids to keep; clipped to the vocabulary size.

    Returns:
        ``[B, min(k, V)]`` integer tensor of token ids, highest logit first.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    with torch.no_grad():
        mask = labels != -100  # [B, T]
        # argmax over the 0/1 mask yields the index of the first True per row.
        idx = torch.argmax(mask.to(torch.int32), dim=1)  # [B]
        batch_idx = torch.arange(logits.size(0), device=logits.device)
        selected = logits[batch_idx, idx]  # [B, V]
        top_ids = torch.topk(selected, k=min(k, selected.size(-1)), dim=-1).indices
    return top_ids


def compute_metrics(eval_pred):
    """Compute Recall@100 of the single labelled token of every row.

    Args:
        eval_pred: Object with ``predictions`` and ``label_ids``. Predictions
            are either ``[N, K]`` integer top-K ids (the output of
            ``preprocess_logits_for_metrics``) or ``[N, V]`` float logits.
            Labels are ``[N, T]`` ints with exactly one entry != -100 per row.

    Returns:
        ``{"recall@100_last_token": float}``; 0.0 when there are no rows.

    Raises:
        ValueError: If the number of labelled targets differs from the number
            of prediction rows (would otherwise broadcast silently).
    """
    k = 100
    preds = np.asarray(eval_pred.predictions)
    labels = np.asarray(eval_pred.label_ids)

    if preds.shape[0] == 0:
        return {"recall@100_last_token": 0.0}

    # One target id per row: the only label that is not -100.
    target_ids = labels[labels != -100].reshape(-1)
    if target_ids.shape[0] != preds.shape[0]:
        raise ValueError(
            f"Expected exactly one labelled position per row: got "
            f"{target_ids.shape[0]} targets for {preds.shape[0]} rows"
        )

    if np.issubdtype(preds.dtype, np.integer):
        # Already top-K token ids.
        topk_idx = preds[:, :k]
    else:
        # Raw logits: take the K largest per row (unordered within K).
        kth = min(k - 1, preds.shape[1] - 1)
        topk_idx = np.argpartition(-preds, kth=kth, axis=1)[:, :k]

    hits = (topk_idx == target_ids[:, None]).any(axis=1).astype(np.float32)
    recall_at_100 = float(hits.mean()) if hits.size > 0 else 0.0
    return {"recall@100_last_token": recall_at_100}

# -------------------------
# 4) TrainingArguments (fix names)
# -------------------------
args = Seq2SeqTrainingArguments(
    output_dir="./qwen3_prompt_completion",
    eval_strategy="steps",            # current name; `evaluation_strategy` is rejected with a TypeError here
    eval_steps=100,                   # integer number of steps
    logging_steps=50,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    learning_rate=5e-4,
    warmup_steps=0,
    weight_decay=0.0,
    report_to="none",
    predict_with_generate=False,      # we want logits, not generated tokens
    include_inputs_for_metrics=False, # not needed
    fp16=False, bf16=False,
    seed=42,
)

# -------------------------
# 5) Trainer
# -------------------------
# NOTE(review): `model` is not created in this cell; it must already exist in
# the kernel when this runs.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,       # replaces the deprecated `tokenizer=` argument
    data_collator=collate_fn,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Train + Eval
trainer.train()
print("Validation:", trainer.evaluate())

# Optional: test with same metric
if tokenized_test is not None:
    print("Test:", trainer.evaluate(eval_dataset=tokenized_test))


Map:   0%|          | 0/356872 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
from transformers import DataCollatorForSeq2Seq


# Shrink the base dimensions by `scale`; every value is floored and kept >= 1.
# NOTE(review): Qwen3Config / Qwen3ForCausalLM are imported only in a later
# cell of this notebook - confirm they are in scope when this cell runs.
scale = 16
config = Qwen3Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=max(1, int(4096/scale)),  # 256 at scale=16
    intermediate_size=max(1, int(22016/scale)),  # 1376
    num_hidden_layers=max(1, int(32/scale)),  # 2
    num_attention_heads=max(1, int(32/scale)),  # 2
    num_key_value_heads=max(1, int(32/scale)),  # 2
    head_dim=max(1, int(128/scale)),  # 8; head count and head size both shrink with scale
)
# Freshly initialised model (no pretrained weights are loaded here).
model = Qwen3ForCausalLM(config)
# Keep the model's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.generation_config.max_new_tokens = 1  # adjust if your completions are longer

# Collator that pads labels as well as inputs; replaces the notebook-level
# `collate_fn` used by the trainer below.
collate_fn = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,                 # optional but recommended
    label_pad_token_id=-100,     # ignore padded label positions
    pad_to_multiple_of=8,        # optional; good for Tensor Cores
)


args = Seq2SeqTrainingArguments(
    output_dir="./qwen3_prompt_completion",
    eval_strategy="steps",            # current name; the older `evaluation_strategy` kwarg is not accepted
    eval_steps=100,                   # integer number of steps
    logging_steps=50,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    learning_rate=5e-4,
    warmup_steps=0,
    weight_decay=0.0,
    report_to="none",
    predict_with_generate=False,      # we want logits, not generated tokens
    include_inputs_for_metrics=False, # not needed
    fp16=False, bf16=False,
    seed=42,
)

# -------------------------
# 5) Trainer
# -------------------------
# Uses the notebook-level tokenized datasets and metric callbacks; they must
# already be defined in the kernel.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,       # replaces the deprecated `tokenizer=` argument
    data_collator=collate_fn,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Train + Eval
trainer.train()
print("Validation:", trainer.evaluate())

# Optional: test with same metric
if tokenized_test is not None:
    print("Test:", trainer.evaluate(eval_dataset=tokenized_test))


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 165912, 'bos_token_id': 165911, 'pad_token_id': 165910}.


Step,Training Loss,Validation Loss


In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding


# DataLoader is not imported anywhere else in the notebook.
from torch.utils.data import DataLoader

# Only one collator is needed: pad each batch to its longest sequence.
collator = DataCollatorWithPadding(tokenizer)
# NOTE(review): `ds_valid_proc` is not defined in the visible notebook - confirm
# which processed validation split is meant before running this cell.
valid_loader = DataLoader(ds_valid_proc, batch_size=32, shuffle=False, collate_fn=collator)

In [None]:

# Shrink the base dimensions by `scale`; every value is floored and kept >= 1.
# NOTE(review): Qwen3Config / Qwen3ForCausalLM are imported only in a later
# cell of this notebook - confirm they are in scope when this cell runs.
scale = 20
config = Qwen3Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=max(1, int(4096/scale)),  # 204 at scale=20
    intermediate_size=max(1, int(22016/scale)),  # 1100
    num_hidden_layers=max(1, int(32/scale)),  # 1
    num_attention_heads=max(1, int(32/scale)),  # 1
    num_key_value_heads=max(1, int(32/scale)),  # 1
    head_dim=max(1, int(128/scale)),  # 6; head count and head size both shrink with scale
)
# Freshly initialised model (no pretrained weights are loaded here).
model = Qwen3ForCausalLM(config)
# Keep the model's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.generation_config.max_new_tokens = 1  # adjust if your completions are longer

def count_params(m):
    """Print the total and trainable parameter counts of ``m``.

    Each count is shown with thousands separators and in millions.
    """
    n_all = 0
    n_trainable = 0
    for param in m.parameters():
        size = param.numel()
        n_all += size
        if param.requires_grad:
            n_trainable += size
    print(f"Total parameters:     {n_all:,}  ({n_all/1e6:.2f}M)")
    print(f"Trainable parameters: {n_trainable:,}  ({n_trainable/1e6:.2f}M)")
count_params(model)

# -------------------------
# 6) Train
# -------------------------
# NOTE(review): SEED, collate_fn, compute_metrics and the tokenized datasets are
# not defined in this cell; they come from whichever cell last set them.
args = Seq2SeqTrainingArguments(
    output_dir="./qwen3_prompt_completion",
    eval_strategy="steps",
    eval_steps=0.1,  # float below 1: fraction of total training steps, i.e. evaluate every 10%
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=32,               # safer defaults; raise if you have GPU
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    learning_rate=5e-4,
    warmup_steps=0,
    weight_decay=0.0,
    report_to="none",
    predict_with_generate=True,                   # compute_metrics receives generated token ids
    include_inputs_for_metrics=False,
    fp16=False, bf16=False,
    seed=SEED,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,                   # replaces the deprecated `tokenizer=` argument
    data_collator=collate_fn,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

trainer.train()
print("Validation:", trainer.evaluate())
print("Test:", trainer.evaluate(eval_dataset=tokenized_test))


In [None]:
# tiny_qwen3_addition_for_prompts.py
# Same training loop, now for datasets with {prompt, completion} and a prebuilt tokenizer.

import os, random
import numpy as np
import torch

from transformers import (
    Qwen3Config,
    Qwen3ForCausalLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    set_seed,
)

from datasets import DatasetDict

# -------------------------
# 0) Repro
# -------------------------
SEED = 42
# Seed Python's `random`, NumPy and PyTorch, then call transformers' set_seed
# with the same value.
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); set_seed(SEED)

# --- helper: robust encode for str OR list (ints/strs) ---
def _encode_field(value, tok):
    # If it's already a string: standard encode
    if isinstance(value, str):
        return tok(value, add_special_tokens=False)

    # If it's a list (e.g., [123, 45, 6] or ["B08..", "B07.."]):
    # 1) cast ints -> strings
    # 2) tell HF these are pre-tokenized "words"
    if isinstance(value, list):
        tokens = [str(x) for x in value]
        return tok(tokens, add_special_tokens=False, is_split_into_words=True)

    # Anything else is unsupported
    raise ValueError(f"Unsupported field type for tokenization: {type(value)}")

# Reverse lookup of asin2id: integer id -> ASIN token.
id2asin = {idx: asin for asin, idx in asin2id.items()}

def _to_token_words(value):
    """Convert a prompt/completion field into a list of token strings.

    A string is wrapped in a one-element list. An int is looked up in
    ``id2asin``, falling back to its decimal string. A list is converted item
    by item: ints via ``id2asin``, everything else via ``str``.

    Raises:
        ValueError: For any other type.
    """
    def _word(item):
        # Ints map back to their ASIN when known; otherwise stringify.
        if isinstance(item, int):
            return id2asin.get(item, str(item))
        return str(item)

    if isinstance(value, str):
        return [value]
    if isinstance(value, int):
        return [id2asin.get(value, str(value))]
    if isinstance(value, list):
        return [_word(item) for item in value]
    raise ValueError(f"Unsupported field type for tokenization: {type(value)}")

# ensure left padding + special tokens exist
# This mutates the shared notebook-level tokenizer, so every later cell sees
# left padding as well.
tokenizer.padding_side = "left"
# Register "<pad>" / "<eos>" only when the tokenizer has no id for them yet.
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
if tokenizer.eos_token_id is None:
    tokenizer.add_special_tokens({"eos_token": "<eos>"})


def tok_train(ex, tok):
    """Build a training example: prompt + completion + EOS as one sequence.

    Prompt positions are masked with -100 in ``labels`` so only the completion
    and the trailing EOS contribute to the loss.
    """
    encode_opts = {"add_special_tokens": False, "is_split_into_words": True}
    prompt_ids = tok(_to_token_words(ex["prompt"]), **encode_opts)["input_ids"]
    completion_ids = tok(_to_token_words(ex["completion"]), **encode_opts)["input_ids"]

    target_ids = completion_ids + [tok.eos_token_id]
    sequence = prompt_ids + target_ids

    return {
        "input_ids": sequence,
        "attention_mask": [1] * len(sequence),
        "labels": [-100] * len(prompt_ids) + target_ids,
    }

def tok_eval(ex, tok):
    """Build an eval example: the prompt is the input, completion + EOS the labels.

    Unlike the training variant, ``input_ids`` holds the prompt only, so
    ``labels`` and ``input_ids`` may differ in length.
    """
    encode_opts = {"add_special_tokens": False, "is_split_into_words": True}
    prompt_ids = tok(_to_token_words(ex["prompt"]), **encode_opts)["input_ids"]
    completion_ids = tok(_to_token_words(ex["completion"]), **encode_opts)["input_ids"]

    return {
        "input_ids": prompt_ids,
        "attention_mask": [1] * len(prompt_ids),
        "labels": completion_ids + [tok.eos_token_id],
    }


# -------------------------
# 2) Collator (LEFT padding everywhere)
# -------------------------
def make_collate_fn(tokenizer):
    """Create a collate function that left-pads every field of a batch.

    The pad id is read from ``tokenizer.pad_token_id`` once, when the factory
    is called. The returned ``collate_fn(features)`` pads ``input_ids`` with
    that id, ``attention_mask`` with 0 and ``labels`` with -100, all on the
    left, to the length of the longest ``input_ids`` in the batch. A ``labels``
    list longer than that width is cut on the right down to the width.

    Returns:
        A callable mapping a list of feature dicts to a dict of ``torch.long``
        tensors with keys ``input_ids``, ``attention_mask`` and ``labels``.
    """
    label_fill = -100
    pad_fill = tokenizer.pad_token_id

    def _left_pad(seq, fill, n_missing, width):
        # A negative n_missing adds nothing; the slice then caps the length.
        return ([fill] * n_missing + seq)[:width]

    def collate_fn(features):
        rows = [
            (list(f["input_ids"]), list(f["attention_mask"]), list(f["labels"]))
            for f in features
        ]
        width = max(len(ids) for ids, _, _ in rows)

        ids_batch, mask_batch, label_batch = [], [], []
        for ids, mask, labels in rows:
            # The mask is padded by the same amount as its input_ids.
            n_missing_in = width - len(ids)
            n_missing_lb = width - len(labels)
            ids_batch.append(_left_pad(ids, pad_fill, n_missing_in, width))
            mask_batch.append(_left_pad(mask, 0, n_missing_in, width))
            label_batch.append(_left_pad(labels, label_fill, n_missing_lb, width))

        return {
            "input_ids": torch.tensor(ids_batch, dtype=torch.long),
            "attention_mask": torch.tensor(mask_batch, dtype=torch.long),
            "labels": torch.tensor(label_batch, dtype=torch.long),
        }

    return collate_fn





# -------------------------
# 3) Metrics: compare tail(pred) with labels length (no "=" assumption)
# -------------------------
def make_compute_metrics(tokenizer, max_print=3):
    """Build a ``compute_metrics`` callback that scores the first completion token.

    Args:
        tokenizer: Used only to render token strings in mistake printouts
            (``convert_ids_to_tokens``).
        max_print: Maximum number of mistakes printed per evaluation call.

    Returns:
        ``compute_metrics(eval_pred)`` where ``eval_pred`` unpacks into
        ``(preds, labels)``. For each row the gold tokens are the labels that
        are not -100; the prediction is aligned by taking the tail of the
        predicted row with the same length as the gold span, and its first
        token is compared with the first gold token. Rows without gold tokens
        are skipped. The result is
        ``{"accuracy_first_token": float, "n_mistakes": int}``.
    """
    LABEL_PAD_ID = -100

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        n_total = 0
        n_wrong = 0
        printed = 0
        skipped_empty = 0

        for i, (p, l) in enumerate(zip(preds, labels)):
            p = p.tolist() if hasattr(p, "tolist") else list(p)
            l = l.tolist() if hasattr(l, "tolist") else list(l)

            gold_ids = [t for t in l if t != LABEL_PAD_ID]
            if not gold_ids:
                skipped_empty += 1
                continue

            first_gold = gold_ids[0]
            # A prediction shorter than the gold span has no aligned token. A
            # negative offset would wrap around or raise IndexError, so such a
            # row is counted as a mistake instead.
            offset = len(p) - len(gold_ids)
            first_pred = p[offset] if offset >= 0 else None

            n_total += 1
            if first_pred != first_gold:
                n_wrong += 1
                if printed < max_print:
                    pred_tok = (
                        tokenizer.convert_ids_to_tokens([first_pred])
                        if first_pred is not None
                        else None
                    )
                    print(f"[mistake {printed+1}] idx={i}\n"
                          f"  pred_id={first_pred}, gold_id={first_gold}\n"
                          f"  pred_tok={pred_tok}\n"
                          f"  gold_tok={tokenizer.convert_ids_to_tokens([first_gold])}")
                    printed += 1

        if n_total == 0:
            print(f"[metric] WARNING: no non-empty labels (skipped={skipped_empty}).")
            return {"accuracy_first_token": 0.0, "n_mistakes": 0}

        acc = 1.0 - n_wrong / n_total
        return {"accuracy_first_token": acc, "n_mistakes": n_wrong}

    return compute_metrics


# -------------------------
# 4) Build tokenized datasets
# -------------------------
# Bundle the three splits under the conventional split names.
ds = DatasetDict({
    "train": ds_train,
    "validation": ds_valid,
    "test": ds_test,
})

# Train rows hold prompt + completion in input_ids; validation/test rows hold
# the prompt only, with the completion kept in labels. The raw "prompt" and
# "completion" columns are dropped from every split.
tokenized_train = ds["train"].map(lambda ex: tok_train(ex, tokenizer),
                                  remove_columns=["prompt", "completion"])
tokenized_valid = ds["validation"].map(lambda ex: tok_eval(ex, tokenizer),
                                       remove_columns=["prompt", "completion"])
tokenized_test  = ds["test"].map(lambda ex: tok_eval(ex, tokenizer),
                                 remove_columns=["prompt", "completion"])

# These rebind the notebook-level `collate_fn` / `compute_metrics` names that
# the trainer cells read.
collate_fn = make_collate_fn(tokenizer)
compute_metrics = make_compute_metrics(tokenizer)

# -------------------------
# 5) Tiny model config
# -------------------------
# Shrink the base dimensions by `scale`; every value is floored and kept >= 1.
scale = 20
config = Qwen3Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=max(1, int(4096/scale)),  # 204 at scale=20
    intermediate_size=max(1, int(22016/scale)),  # 1100
    num_hidden_layers=max(1, int(32/scale)),  # 1
    num_attention_heads=max(1, int(32/scale)),  # 1
    num_key_value_heads=max(1, int(32/scale)),  # 1
    head_dim=max(1, int(128/scale)),  # 6; head count and head size both shrink with scale
)
# Freshly initialised model (no pretrained weights are loaded here).
model = Qwen3ForCausalLM(config)
# Keep the model's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.generation_config.max_new_tokens = 1  # adjust if your completions are longer

def count_params(m):
    """Print the total and trainable parameter counts of ``m``.

    Each count is shown with thousands separators and in millions.
    """
    n_all = 0
    n_trainable = 0
    for param in m.parameters():
        size = param.numel()
        n_all += size
        if param.requires_grad:
            n_trainable += size
    print(f"Total parameters:     {n_all:,}  ({n_all/1e6:.2f}M)")
    print(f"Trainable parameters: {n_trainable:,}  ({n_trainable/1e6:.2f}M)")
count_params(model)

# -------------------------
# 6) Train
# -------------------------
args = Seq2SeqTrainingArguments(
    output_dir="./qwen3_prompt_completion",
    eval_strategy="steps",
    eval_steps=0.1,  # float below 1: fraction of total training steps, i.e. evaluate every 10%
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=32,               # safer defaults; raise if you have GPU
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    learning_rate=5e-4,
    warmup_steps=0,
    weight_decay=0.0,
    report_to="none",
    predict_with_generate=True,                   # compute_metrics receives generated token ids
    include_inputs_for_metrics=False,
    fp16=False, bf16=False,
    seed=SEED,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,                   # replaces the deprecated `tokenizer=` argument
    data_collator=collate_fn,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

trainer.train()
print("Validation:", trainer.evaluate())
print("Test:", trainer.evaluate(eval_dataset=tokenized_test))


Map:   0%|          | 0/356872 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

Map:   0%|          | 0/44609 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 165912, 'bos_token_id': 165911, 'pad_token_id': 165910}.


Total parameters:     68,372,040  (68.37M)
Trainable parameters: 68,372,040  (68.37M)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
# Shrink the base dimensions by `scale`; every value is floored and kept >= 1.
scale = 32
config = Qwen3Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=max(1, int(4096/scale)),  # 128 at scale=32
    intermediate_size=max(1, int(22016/scale)),  # 688
    num_hidden_layers=max(1, int(32/scale)),  # 1
    num_attention_heads=max(1, int(32/scale)),  # 1
    num_key_value_heads=max(1, int(32/scale)),  # 1
    head_dim=max(1, int(128/scale)),  # 4; head count and head size both shrink with scale
)
# Freshly initialised model (no pretrained weights are loaded here).
model = Qwen3ForCausalLM(config)
# Keep the model's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.generation_config.max_new_tokens = 1  # adjust if your completions are longer

def count_params(m):
    """Print the total and trainable parameter counts of ``m``.

    Each count is shown with thousands separators and in millions.
    """
    n_all = 0
    n_trainable = 0
    for param in m.parameters():
        size = param.numel()
        n_all += size
        if param.requires_grad:
            n_trainable += size
    print(f"Total parameters:     {n_all:,}  ({n_all/1e6:.2f}M)")
    print(f"Trainable parameters: {n_trainable:,}  ({n_trainable/1e6:.2f}M)")
count_params(model)

# -------------------------
# 6) Train
# -------------------------
# NOTE(review): SEED, collate_fn, compute_metrics and the tokenized datasets are
# not defined in this cell; they come from whichever cell last set them.
args = Seq2SeqTrainingArguments(
    output_dir="./qwen3_prompt_completion",
    eval_strategy="steps",
    eval_steps=0.1,  # float below 1: fraction of total training steps, i.e. evaluate every 10%
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=32,               # safer defaults; raise if you have GPU
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    learning_rate=5e-4,
    warmup_steps=0,
    weight_decay=0.0,
    report_to="none",
    predict_with_generate=True,                   # compute_metrics receives generated token ids
    include_inputs_for_metrics=False,
    fp16=False, bf16=False,
    seed=SEED,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,                   # replaces the deprecated `tokenizer=` argument
    data_collator=collate_fn,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

trainer.train()
print("Validation:", trainer.evaluate())
print("Test:", trainer.evaluate(eval_dataset=tokenized_test))


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 165912, 'bos_token_id': 165911, 'pad_token_id': 165910}.


Total parameters:     42,740,872  (42.74M)
Trainable parameters: 42,740,872  (42.74M)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 