In [1]:
import torch
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel

############################################
# 1. Setup GPT-2 Model/Tokenizer
############################################
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

############################################
# 2. Poem: The Snow Man (One line per element)
############################################
poem_lines = [
    "One must have a mind of winter",
    "To regard the frost and the boughs",
    "Of the pine-trees crusted with snow;",
    "And have been cold a long time",
    "To behold the junipers shagged with ice,",
    "The spruces rough in the distant glitter",
    "Of the January sun; and not to think",
    "Of any misery in the sound of the wind,",
    "In the sound of a few leaves,",
    "Which is the sound of the land",
    "Full of the same wind",
    "That is blowing in the same bare place",
    "For the listener, who listens in the snow,",
    "And, nothing himself, beholds",
    "Nothing that is not there and the nothing that is."
]

############################################
# 3. Function: Get Clean Top-K Next Tokens
############################################
def get_clean_topk_tokens(model, tokenizer, prompt, top_k=10):
    """
    Returns a list of (token_str, probability) for the top_k most likely next
    tokens from GPT-2, filtering out anything that isn't purely alphabetic.
    """
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(input_ids)

    logits = outputs.logits[0, -1, :]  # last position
    probs = torch.softmax(logits, dim=-1)

    # We'll fetch more than top_k in case we skip many
    top_tokens = torch.topk(probs, top_k * 10)

    filtered = []
    for token_id, prob_val in zip(top_tokens.indices, top_tokens.values):
        token_str = tokenizer.decode([token_id.item()],
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
        token_str = token_str.strip()

        # Skip anything that is empty or has non-alphabetic chars
        # -> This excludes punctuation, question marks, partial subwords, etc.
        if not token_str:
            continue
        if not token_str.isalpha():
            continue

        # If we reach here, it's a good token
        filtered.append((token_str, prob_val.item()))
        if len(filtered) >= top_k:
            break

    return filtered

############################################
# 4. Replace the LAST word with the n-th next token
############################################
def p_plus_n_transform_last_word(line, n=7, topk_size=15):
    """
    Replaces *only the last word* in `line` with the n-th most probable
    GPT-2 next token, filtering to purely alphabetic tokens only.
    Returns:
      new_line: the line with the last word replaced
      chosen_word: the new token
      chosen_prob: probability of that token
    """
    words = line.strip().split()
    if not words:
        return line, None, None

    # last token
    last_word = words[-1]
    match = re.match(r"^(.*?)(\W*)$", last_word)
    if match:
        main_word = match.group(1)
        punctuation = match.group(2)
    else:
        main_word = last_word
        punctuation = ""

    # context is everything except the last word
    context = " ".join(words[:-1])
    if context:
        context += " "

    # get top-k from GPT-2
    topk = get_clean_topk_tokens(model, tokenizer, context, top_k=max(n, topk_size))

    if not topk:
        # no valid tokens found, keep original
        return line, None, None

    # choose the n-th token or fallback
    if len(topk) < n:
        chosen_token, chosen_prob = topk[-1]
    else:
        chosen_token, chosen_prob = topk[n-1]

    # preserve capitalization
    if main_word and main_word[0].isupper():
        chosen_token = chosen_token.capitalize()

    # rebuild line
    new_line = words[:-1]
    new_line.append(chosen_token + punctuation)
    new_line = " ".join(new_line)

    return new_line, chosen_token, chosen_prob

############################################
# 5. Generate Poem Transformations
############################################
def generate_pn_poem(poem_lines, n=7, topk_size=15):
    """
    For each line, replaces the last word with the n-th next token
    that is purely alphabetic. Skips punctuation tokens, question marks, etc.
    Returns: (transformed_lines, changes)
    where changes is a list of (original_last_word, chosen_word, chosen_prob).
    """
    transformed = []
    changes = []
    for line in poem_lines:
        if not line.strip():
            transformed.append(line)
            continue

        last_word_raw = line.strip().split()[-1]
        new_line, chosen_word, chosen_prob = p_plus_n_transform_last_word(line, n=n, topk_size=topk_size)
        transformed.append(new_line)

        if chosen_word is not None and chosen_prob is not None:
            changes.append((last_word_raw, chosen_word, chosen_prob))

    return transformed, changes

############################################
# 6. Create P+7 and P+X
############################################
p7_lines, p7_changes = generate_pn_poem(poem_lines, n=7, topk_size=15)
x_value = 10
px_lines, px_changes = generate_pn_poem(poem_lines, n=x_value, topk_size=15)

############################################
# 7. Write P+7 to a File
############################################
with open("P+7.txt", "w", encoding="utf-8") as f:
    f.write("=== P+7 Version (Last Word, Alphabetic Only) ===\n\n")
    for line in p7_lines:
        f.write(line + "\n")
    f.write("\n--- Changed Words (Original -> Replacement, prob) ---\n")
    for (orig, repl, prob) in p7_changes:
        f.write(f"{orig} -> {repl} (prob={prob:.5f})\n")

print("P+7 transformation saved to P+7.txt")

############################################
# 8. Write P+X to a File
############################################
px_filename = f"P+{x_value}.txt"
with open(px_filename, "w", encoding="utf-8") as f:
    f.write(f"=== P+{x_value} Version (Last Word, Alphabetic Only) ===\n\n")
    for line in px_lines:
        f.write(line + "\n")
    f.write("\n--- Changed Words (Original -> Replacement, prob) ---\n")
    for (orig, repl, prob) in px_changes:
        f.write(f"{orig} -> {repl} (prob={prob:.5f})\n")

print(f"P+{x_value} transformation saved to {px_filename}")

############################################
# 9. Print a Sample
############################################
print("\nSample P+7 lines:\n")
for ln in p7_lines[:5]:
    print(ln)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

P+7 transformation saved to P+7.txt
P+10 transformation saved to P+10.txt

Sample P+7 lines:

One must have a mind of iced
To regard the frost and the ills
Of the pine-trees crusted with ik;
And have been cold a long ich
To behold the junipers shagged with ips,
