In [8]:
from textwrap import wrap

import pandas as pd
import torch
from torch import tensor
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained(
    "hackathon-pretrain-mpt-60m-tinystories/1742956447-inventive-lionfish/ep3-ba13672-rank0",
    padding_side='left',
)
tokenizer.pad_token = "1"
tokenizer

GPTNeoXTokenizerFast(name_or_path='hackathon-pretrain-mpt-60m-tinystories/1742956447-inventive-lionfish/ep3-ba13672-rank0', vocab_size=50254, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '1'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    "hackathon-pretrain-mpt-60m-tinystories/1742956447-inventive-lionfish/ep3-ba13672-rank0",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_cache=True,
)
model = torch.compile(model)

In [4]:
df = pd.read_csv("data/evaluation_prompts.csv")
df

Unnamed: 0,prompt,completion
0,"Once upon a time, there lived a bunny in a fie...",
1,"Once upon a time, there lived a bunny in a fie...",
2,"Once upon a time, there lived a bunny in a fie...",
3,"Once upon a time, there lived a bunny in a fie...",
4,"Once upon a time, there lived a bunny in a fie...",
...,...,...
295,"Once upon a time, in a small village, there li...",
296,"Once upon a time, in a small village, there li...",
297,"Once upon a time, in a small village, there li...",
298,"Once upon a time, in a small village, there li...",


In [5]:
encoded_prompts = [
    tensor(tokenizer.encode(p)).to(model.device) for p in df.prompt
]

In [21]:
def generate_batch(prompts, batch_size=8):
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )
    return [tokenizer.decode(output, skip_special_tokens=True).lstrip("1") for output in outputs]

# Process in batches
batch_size = 8
for i in range(0, len(df), batch_size):
    batch = df.prompt.iloc[i:i+batch_size].tolist()
    completions = generate_batch(batch)
    df.loc[i:i+batch_size-1, 'completion'] = completions

In [26]:
df

Unnamed: 0,prompt,completion
0,"Once upon a time, there lived a bunny in a fie...","Once upon a time, there lived a bunny in a fie..."
1,"Once upon a time, there lived a bunny in a fie...","Once upon a time, there lived a bunny in a fie..."
2,"Once upon a time, there lived a bunny in a fie...","Once upon a time, there lived a bunny in a fie..."
3,"Once upon a time, there lived a bunny in a fie...","Once upon a time, there lived a bunny in a fie..."
4,"Once upon a time, there lived a bunny in a fie...","Once upon a time, there lived a bunny in a fie..."
...,...,...
295,"Once upon a time, in a small village, there li...","Once upon a time, in a small village, there li..."
296,"Once upon a time, in a small village, there li...","Once upon a time, in a small village, there li..."
297,"Once upon a time, in a small village, there li...","Once upon a time, in a small village, there li..."
298,"Once upon a time, in a small village, there li...","Once upon a time, in a small village, there li..."


In [22]:
wrap(
    df.completion[0],
)

['Once upon a time, there lived a bunny in a field. Her name was Lucy.',
 'Lucy loved to have feasts and parties with her bunny friends. One day,',
 "when Lucy was about to leave for a feast at a friend's house, she",
 "realized she's starting to feel sick. She was so weak she could barely",
 'breathe.  Lucy: Oh no! Where are you okay? I am still feeling sick?',
 'Her best friend, Ella, noticed Lucy sitting alone in the field. Ella',
 'had an idea. Ella told Lucy that she was eating delicious carrots and',
 'lettuce, or maybe hopping around, but she might get worse. Lucy: Why',
 "don't you want to eat carrots? If you eat them, you can go anywhere",
 'you want! Lucy had enough carrots to be healthy']

In [23]:
wrap(
    df.completion[1],
)

['Once upon a time, there lived a bunny in a field. Her name was Lucy.',
 'Lucy loved to have feasts and parties with her bunny friends. One day,',
 "when Lucy was about to leave for a feast at a friend's house, she",
 "realized she's starting to feel sick. She was so weak she could barely",
 'move!  Lucy\'s best friend, Tommy, came running over to her. "Lucy, are',
 'you okay?" Tommy asked. Lucy shook her head. "I\'m just feeling weak,"',
 'she replied.  Tommy grinned. "I\'m here for you, Lucy." He handed her a',
 'plate of carrots, and Lucy happily ate them all up. Then, he made sure',
 "Lucy finished eating all the carrots she'd been eaten.  Lucy thanked",
 'Tommy for helping']

In [31]:
df.loc[:, 'completion'] = df.apply(
    lambda row: row.completion[len(row.prompt):],
    axis=1,
)

In [33]:
df.to_csv("hackathon_submissions/20250331_submission.csv", index=False)

In [34]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

50962944

In [38]:
def generate_batch(prompts, batch_size=8):
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )
    return [tokenizer.decode(output, skip_special_tokens=True).lstrip("1") for output in outputs]


completions = generate_batch(df.prompt.iloc[:batch_size].tolist())

In [52]:
[len(tokenizer.encode(x)) for x in completions]


[1061, 1063, 1062, 1062, 1063, 1039, 1039, 1038]

In [56]:
any("<|endoftext|>" in x for x in df.completion)

False