# Instalación y Carga Previa

In [1]:
from datasets import load_dataset
# Load only the "test" split of the "abisee/cnn_dailymail" dataset,
# using its "3.0.0" configuration.
ds = load_dataset(
    path="abisee/cnn_dailymail",
    name="3.0.0",
    split="test",
)
# Print the dataset summary (feature names and number of rows).
print(ds)

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 11490
})


In [2]:
import os, sys
# Absolute path of the parent of the current working directory, so that
# packages living one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)

# Register the path only once so re-running the cell does not add duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Añadido '{module_path}' a sys.path")

Añadido 'c:\fespa-dev\nlp-curso\nlp-proyecto03' a sys.path


In [3]:
# NOTE(review): the wildcard import hides which names come from src.utils.
# Only setup_chapter is used in this cell; check the rest of the notebook
# before narrowing the import.
from src.utils import *
# Project helper; judging by the cell output it reports the installed
# transformers/datasets versions — TODO confirm against src.utils.
setup_chapter()

Using transformers v4.51.3
Using datasets v3.6.0


* device: Nos aseguramos de estar usando el poder computacional de nuestra GPU ;p
* model: Modelo preentrenado; en este caso usaremos `gpt2-xl`
* tokenizer: Tokenizador para nuestro modelo; se ajusta según el modelo seleccionado


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One checkpoint identifier drives both the tokenizer and the model so the
# two always match.
model_name = "gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pretrained causal language model, then move it to the chosen device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [5]:
# Show which device was selected (the bare last expression becomes the cell output).
device

'cuda'

In [6]:
# Display the tokenizer's repr to inspect its configuration
# (vocabulary size, maximum length, special tokens).
tokenizer

GPT2TokenizerFast(name_or_path='gpt2-xl', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

# Greedy Search Decoding --> torch implementation

In [7]:
import pandas as pd
def greedy_search_generation(model, tokenizer, input_txt, n_steps=8, choices_per_step=5):
    """Decode greedily, one token at a time, recording the top candidates per step.

    At every step the model is run on the current sequence, the next-token
    distribution is read from the last position, the ``choices_per_step`` most
    probable tokens are recorded, and the single most probable token is
    appended to the sequence before the next step.

    Args:
        model: Causal language model. It is called as ``model(input_ids=...)``
            and must return an object whose ``logits`` tensor is indexed as
            ``[batch, position, vocab]``. It must also expose a ``device``
            attribute, which is where the prompt tensor is placed.
        tokenizer: Tokenizer matching ``model``. It is called as
            ``tokenizer(text, return_tensors="pt")`` and ``tokenizer.decode(ids)``.
        input_txt: Prompt text to continue.
        n_steps: Number of tokens to generate (default 8). A value of 0 or
            less produces an empty result.
        choices_per_step: Number of top candidates recorded per step
            (default 5). Must be at least 1.

    Returns:
        A list with one dict per step. Each dict holds an ``"Input"`` entry
        (the decoded text fed to the model at that step) followed by
        ``"Choice 1"`` .. ``"Choice k"`` entries formatted as
        ``"<token> (<probability>%)"``, ordered from most to least probable.

    Raises:
        ValueError: If ``choices_per_step`` is smaller than 1.
    """
    if choices_per_step < 1:
        # The greedy step needs at least the single best candidate.
        raise ValueError("choices_per_step must be at least 1")

    # Put the prompt on the model's own device rather than relying on a
    # notebook-level global, so the function works on a fresh kernel.
    input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(model.device)

    iterations = []
    with torch.no_grad():
        for _ in range(n_steps):
            iteration = {"Input": tokenizer.decode(input_ids[0])}
            output = model(input_ids=input_ids)
            # First batch element, last position: logits for the next token.
            next_token_logits = output.logits[0, -1, :]
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            # Only the k best entries are needed, so top-k replaces a full
            # sort over the vocabulary; results come back in descending order.
            top_probs, top_ids = torch.topk(next_token_probs, k=choices_per_step)
            # Scale in the tensor's own dtype and copy to the host once per
            # step instead of once per candidate.
            percentages = (100 * top_probs).tolist()
            candidates = zip(top_ids.tolist(), percentages)
            for rank, (token_id, percentage) in enumerate(candidates, start=1):
                iteration[f"Choice {rank}"] = (
                    f"{tokenizer.decode(token_id)} ({percentage:.2f}%)"
                )
            # Greedy step: append the most probable token as a (1, 1) tensor
            # so it becomes part of the input for the next iteration.
            input_ids = torch.cat([input_ids, top_ids[:1].unsqueeze(0)], dim=-1)
            iterations.append(iteration)
    return iterations

In [8]:
# Prompt that the model will continue.
input_txt = "Once upon a time, in a land far, far away, there lived a dragon who loved to"

# Run the step-by-step greedy decoding and keep the per-step records.
iterations = greedy_search_generation(
    model=model,
    tokenizer=tokenizer,
    input_txt=input_txt,
)

# One row per decoding step; columns are the input text and the ranked choices.
df = pd.DataFrame(data=iterations)
df

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,"Once upon a time, in a land far, far away, the...",eat (14.80%),fly (6.00%),play (4.72%),read (4.45%),sing (3.22%)
1,"Once upon a time, in a land far, far away, the...",. (13.28%),people (11.11%),dragons (3.08%),humans (2.38%),the (2.10%)
2,"Once upon a time, in a land far, far away, the...",He (12.89%),One (12.45%),\n (11.91%),His (5.39%),The (4.74%)
3,"Once upon a time, in a land far, far away, the...",had (22.78%),was (16.69%),loved (8.75%),lived (5.97%),would (3.87%)
4,"Once upon a time, in a land far, far away, the...",a (36.92%),many (18.03%),three (5.50%),two (5.39%),one (2.75%)
5,"Once upon a time, in a land far, far away, the...",lot (5.06%),very (3.61%),great (2.82%),dragon (2.31%),large (2.27%)
6,"Once upon a time, in a land far, far away, the...",of (97.88%),to (1.27%),", (0.19%)",. (0.12%),more (0.10%)
7,"Once upon a time, in a land far, far away, the...",friends (52.05%),dragon (4.48%),food (3.58%),dragons (3.21%),followers (2.05%)


In [9]:
# Inspect the record of the final decoding step: the full input text fed to
# the model and its top-ranked candidate tokens.
iterations[-1]

{'Input': 'Once upon a time, in a land far, far away, there lived a dragon who loved to eat. He had a lot of',
 'Choice 1': ' friends (52.05%)',
 'Choice 2': ' dragon (4.48%)',
 'Choice 3': ' food (3.58%)',
 'Choice 4': ' dragons (3.21%)',
 'Choice 5': ' followers (2.05%)'}