In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F
import numpy as np


In [26]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
import urllib.request

url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-train.txt"
filename = "TinyStoriesV2-GPT4-train.txt"

urllib.request.urlretrieve(url, filename)
print("Download complete!")


Download complete!


In [32]:
with open("TinyStoriesV2-GPT4-train.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

all_stories = raw_text.split("<|endoftext|>")

all_stories = [story.strip() for story in all_stories if story.strip()]

stories_subset = all_stories[:100]

print(f"Loaded {len(stories_subset)} stories.")
print("Sample story:", stories_subset[1])

Loaded 100 stories.
Sample story: Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.
One day, Ollie's mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish. He saw his friend, the duck. "Hi, Ollie!" said the duck. "Hi, duck!" said Ollie. "I need to hurry and catch fish for my family."
While Ollie was catching fish, he found a big shiny stone. He thought, "This is not a fish, but it is so pretty!" Ollie took the shiny stone home to show his family. They all looked at the shiny stone and smiled. The shiny stone made everyone happy, and they forgot about the fish for dinner.


In [None]:
import re

prompts = []

for story in stories_subset:
    match = re.split(r'(?<=[.!?])\s+', story, maxsplit=1)
    if match:
        prompt = match[0]
        prompts.append(prompt)
print(f"Extracted {len(prompts)} prompts.")
print("Sample prompt:", prompts[99])


Extracted 100 prompts.
Sample prompt: Once upon a time, there was a polite farm dog named Spot.


In [33]:
def generate_text(prompt, model, tokenizer, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,  # how many *new* tokens to generate
        do_sample=True,                 # use sampling instead of greedy decoding
        temperature=0.9,                # controls randomness
        top_k=50,                       # top-k sampling
        top_p=0.95,                     # nucleus sampling
        repetition_penalty=1.2,         # penalize repeating phrases
        no_repeat_ngram_size=3,         # avoid short n-gram loops
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [38]:
generated_text = []

for i, prompt in enumerate(prompts):
    full_story = generate_text(prompt, model, tokenizer)
    generated_text.append(full_story)

generated_text[0]


'Once upon a time there was a little boy named Ben. When they saw him, he became more of the same than his father\'s brother and auntie; but to this day some people have said that it came from an imaginary being who wished never again for any help with their own family." - Charles Dickens (13)\nYou\'ll be able to see how all kinds go wrong if you look closely at Wikipedia page on The Little Doctor Who story "The Lord Of Shadow". A man in Blackadder Street goes through most stories after having died while walking past'

Metrics

Predictive Qualities

Perplexity - ability to predict the next word in sequence (1 is perfect), calculated based on the likelihood of the generated tokens given the model's training

In [35]:
import math

def calculate_perplexity(model, tokenizer, text):
    encodings = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings["input_ids"])
        loss = outputs.loss
    return math.exp(loss.item())

In [None]:
calculate_perplexity(model, tokenizer, generated_text) 

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


37.475495546998715

BLEU, ROUGE

In [41]:
from evaluate import load

bleu = load("bleu")
rouge = load("rouge")

# For each generated and reference pair
results = bleu.compute(predictions=generated_text, references=stories_subset)
print("BLEU:", results["bleu"])

results = rouge.compute(predictions=generated_text, references=stories_subset)
print("ROUGE:", results)

BLEU: 0.06726828559805573
ROUGE: {'rouge1': np.float64(0.2518043131539909), 'rouge2': np.float64(0.09347634019934056), 'rougeL': np.float64(0.17336363995553644), 'rougeLsum': np.float64(0.20800055106219872)}


Diversity / Creativity

Self-BLEU (penalizes similar generations)

In [42]:
from nltk.translate.bleu_score import sentence_bleu

def compute_self_bleu(texts):
    scores = []
    for i in range(len(texts)):
        references = [t.split() for j, t in enumerate(texts) if j != i]
        hypothesis = texts[i].split()
        score = sentence_bleu(references, hypothesis)
        scores.append(score)
    return sum(scores) / len(scores)


In [43]:
compute_self_bleu(generated_text)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.16357457616414095

Distinct-n (n-gram diversity)

In [45]:
def distinct_n(texts, n=2):
    all_ngrams = set()
    total_ngrams = 0
    for text in texts:
        tokens = text.split()
        ngrams = zip(*[tokens[i:] for i in range(n)])
        ngram_list = list(ngrams)
        all_ngrams.update(ngram_list)
        total_ngrams += len(ngram_list)
    return len(all_ngrams) / total_ngrams if total_ngrams > 0 else 0


In [46]:
distinct_n(generated_text)

0.7854216087987923

 Entropy

In [47]:
from collections import Counter
import math

def calculate_entropy(texts, n=1):
    counter = Counter()
    total = 0
    for text in texts:
        tokens = text.split()
        ngrams = list(zip(*[tokens[i:] for i in range(n)]))
        counter.update(ngrams)
        total += len(ngrams)
    probs = [count / total for count in counter.values()]
    entropy = -sum(p * math.log(p) for p in probs)
    return entropy


In [48]:
calculate_entropy(generated_text)

6.590104667252185

Efficiency

In [49]:
import time

start = time.time()
_ = generate_text(prompt, model, tokenizer)
end = time.time()

print(f"Inference time: {end - start:.2f} sec")


Inference time: 0.91 sec


## GPT2 with MC Dropout

In [51]:
def generate_story_mc_dropout(prompt, num_samples=20, max_new_tokens=100):
    model_inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = model_inputs["input_ids"]

    for _ in range(max_new_tokens):
        all_probs = []

        # MC dropout passes
        for _ in range(num_samples):
            with torch.no_grad():
                outputs = model(input_ids)
                logits = outputs.logits  # shape: (1, seq_len, vocab_size)
                next_token_logits = logits[0, -1, :]  # logits for next token
                probs = F.softmax(next_token_logits, dim=-1)
                all_probs.append(probs.cpu().numpy())

        avg_probs = np.mean(all_probs, axis=0)  # posterior predictive
        next_token_id = np.random.choice(len(avg_probs), p=avg_probs)
        next_token = torch.tensor([[next_token_id]])

        # Append to input
        input_ids = torch.cat([input_ids, next_token], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)



In [53]:
test = prompts[1]
print(generate_story_mc_dropout(test))

Once upon a time, there was a reliable otter named Ollie. She lived with a cousin too nervous to ever be a Terrier. Her Syndrome performed very well her second term in office, multiple had a Box. Another year, and another mermaid par been implemented, and with the times, that had been exaggerated.8 The Terrier Girl had she not, a seemingly distant cousin, was once in MOST TERRIBLE TIMES.


3

At the bus stop hours of 7 a.m. Mall Ave., for about a


Metrics

Predictive Quality

Perplexity

Bleu Score

Diversity / Creativity

Self-Bleu and ROUGE

Entropy

Distinct-n (n-gram diversity)

Efficiency