Скачаем датасет с короткими постами sentiment140

In [1]:
from pathlib import Path

import requests

# Download the raw tweets file and store it under data/.
url = "https://code.s3.yandex.net/deep-learning/tweets.txt"
raw_path = Path("data/raw_dataset.csv")

# Create the target directory up front so a fresh checkout does not fail
# when the file is written.
raw_path.parent.mkdir(parents=True, exist_ok=True)

# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() prevents an HTTP error page from being saved as data.
response = requests.get(url, timeout=60)
response.raise_for_status()

raw_path.write_bytes(response.content)


В data_utils.py
- удалим ссылки на сайты
- приведём к нижнему регистру
- удалим все, кроме латинских букв, цифр и пробелов
- удалим дублирующиеся пробелы, пробелы по краям
- токенизируем текст
- разобьем на 3 датасета: тренировочный, валидационный, тестовый

"очистим" текст

In [None]:
import importlib

import src.data_utils

# Reload the helper module so that edits made to it after the kernel started
# are picked up without a restart, then take the function from the fresh copy.
data_utils = importlib.reload(src.data_utils)
preprocess_dataset = data_utils.preprocess_dataset

# Paths are passed positionally: the raw download first, the processed CSV second.
raw_csv = "data/raw_dataset.csv"
processed_csv = "data/dataset_processed.csv"

preprocess_dataset(raw_csv, processed_csv)


токенизируем текст

In [5]:
import importlib

import src.data_utils

# Reload the helper module so that edits made to it after the kernel started
# are picked up without a restart, then take the function from the fresh copy.
data_utils = importlib.reload(src.data_utils)
tokenize_dataset = data_utils.tokenize_dataset

# Paths are passed positionally: the processed CSV first, the tokenized CSV second.
processed_csv = "data/dataset_processed.csv"
tokenized_csv = "data/tokenized_dataset.csv"

tokenize_dataset(processed_csv, tokenized_csv)


Разобьём на 3 датасета: тренировочный (80%), валидационный (10%), тестовый (10%)

In [6]:
import importlib

import src.data_utils

# Reload the helper module so that edits made to it after the kernel started
# are picked up without a restart, then take the function from the fresh copy.
data_utils = importlib.reload(src.data_utils)
split_dataset = data_utils.split_dataset

# Keyword arguments for the split: one input file, three output files.
split_paths = {
    "input_path": "data/tokenized_dataset.csv",
    "train_path": "data/train.csv",
    "val_path": "data/val.csv",
    "test_path": "data/test.csv",
}

split_dataset(**split_paths)


In [None]:
В next_token_dataset.py
Создадим даталоадеры для train.csv, val.csv, test.csv

In [7]:
from transformers import AutoTokenizer

from src.next_token_dataset import make_dataloader

# The tokenizer is needed here only for its padding id (and later for decoding).
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pad_id = tokenizer.pad_token_id  # 0 for bert-base-uncased

batch_size = 64
max_len = 128

# Settings shared by all three loaders; only the training loader is shuffled.
loader_kwargs = dict(batch_size=batch_size, max_len=max_len, pad_id=pad_id)

# make_dataloader returns a pair; only the second element (the loader) is kept.
_, train_loader = make_dataloader("data/train.csv", shuffle=True, **loader_kwargs)
_, val_loader = make_dataloader("data/val.csv", shuffle=False, **loader_kwargs)
_, test_loader = make_dataloader("data/test.csv", shuffle=False, **loader_kwargs)

# Report the number of batches per split.
for label, loader in (
    ("train batches:", train_loader),
    ("val batches:", val_loader),
    ("test batches:", test_loader),
):
    print(label, len(loader))


train batches: 20015
val batches: 2502
test batches: 2502


In [8]:
# Pull a single batch from the training loader and show the tensor shapes.
batch = next(iter(train_loader))

for key in ("input_ids", "targets", "lengths"):
    print(f"{key}:", batch[key].shape)

# Look at one example of the batch with its padding cut off.
sample_idx = 0
seq_len = batch["lengths"][sample_idx].item()

x_ids = batch["input_ids"][sample_idx, :seq_len].tolist()
y_ids = batch["targets"][sample_idx, :seq_len].tolist()

# Show at most the first 20 tokens of the input and of the target.
for label, ids in (("X tokens:", x_ids), ("Y tokens:", y_ids)):
    print(label, tokenizer.convert_ids_to_tokens(ids[:20]))

# Sanity check: the targets must equal the inputs shifted by one position.
shift_ok = x_ids[1:] == y_ids[:-1]
print("Shift OK:", shift_ok)


input_ids: torch.Size([64, 31])
targets: torch.Size([64, 31])
lengths: torch.Size([64])
X tokens: ['[CLS]', 'my', '##sw', '##eet', '##mel', '##od', '##y', 'nice', 'tie', 'i', 'love', 'the', 'green']
Y tokens: ['my', '##sw', '##eet', '##mel', '##od', '##y', 'nice', 'tie', 'i', 'love', 'the', 'green', '[SEP]']
Shift OK: True


Проверю метрики на тестовой выборке

In [10]:
import torch
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer

from src.lstm_model import LSTMNextToken
from src.next_token_dataset import make_dataloader

# Evaluate the trained LSTM on the test split: for every test sequence the
# first PROMPT_FRACTION of the tokens is used as a prompt, the model generates
# as many tokens as were held back, and ROUGE is computed between the
# generated tail and the held-back tail.

# --- settings ---
ckpt_path = "models/lstm_next_token.pt"   # checkpoint with weights and config
test_csv = "data/test.csv"
SEED = 42                # generate() is called with temperature/top_k (presumably sampling), so fix the RNG
MAX_EVAL_SAMPLES = None  # set to an int (e.g. 2000) for a quick estimate; None = whole test set
PROMPT_FRACTION = 0.75   # share of each sequence that is fed to the model as the prompt
NUM_EXAMPLES = 3         # how many REF/GEN pairs to print at the end

# Seed before any generation so that repeated runs give the same scores.
torch.manual_seed(SEED)

# --- load checkpoint ---
# The checkpoint is read as a dict with "config", "vocab_size" and
# "model_state_dict"; "model_name" and "pad_id" are optional.
ckpt = torch.load(ckpt_path, map_location="cpu")
cfg = ckpt["config"]

model_name = ckpt.get("model_name", "bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained(model_name)
pad_id = ckpt.get("pad_id", tokenizer.pad_token_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMNextToken(
    vocab_size=ckpt["vocab_size"],
    emb_dim=cfg["emb_dim"],
    hidden_dim=cfg["hidden_dim"],
    num_layers=cfg["num_layers"],
    dropout=cfg["dropout"],
    pad_id=pad_id,
).to(device)

model.load_state_dict(ckpt["model_state_dict"])
model.eval()

# --- dataloader ---
_, test_loader = make_dataloader(
    test_csv,
    batch_size=64,
    shuffle=False,
    max_len=cfg["max_len"],
    pad_id=pad_id,
)

rouge = evaluate.load("rouge")

predictions = []
references = []


def _limit_reached():
    """Return True once MAX_EVAL_SAMPLES pairs have been collected (never when it is None)."""
    return MAX_EVAL_SAMPLES is not None and len(predictions) >= MAX_EVAL_SAMPLES


with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test ROUGE"):
        if _limit_reached():
            break

        x = batch["input_ids"].to(device)     # [B, T]
        y = batch["targets"].to(device)       # [B, T]
        # Read the lengths once as plain ints instead of calling .item() on a
        # device tensor for every sample.
        lengths = batch["lengths"].tolist()   # B ints

        for i, seq_len in enumerate(lengths):
            if _limit_reached():
                break

            seq_len = int(seq_len)
            if seq_len <= 2:
                continue

            # Rebuild the (approximately) full token sequence:
            # the inputs plus the last target token.
            full = torch.cat([x[i, :seq_len], y[i, seq_len - 1:seq_len]], dim=0)  # [seq_len + 1]

            full_len = full.size(0)
            prompt_len = max(1, int(full_len * PROMPT_FRACTION))
            if prompt_len >= full_len:
                continue

            prompt = full[:prompt_len]
            ref_tail = full[prompt_len:]
            num_new = int(ref_tail.size(0))

            # Decode the reference first: if it contains nothing but special
            # tokens it decodes to "", which scores zero for any prediction
            # and only drags the average down, so skip it before generating.
            ref_text = tokenizer.decode(ref_tail.tolist(), skip_special_tokens=True).strip()
            if not ref_text:
                continue

            gen_full = model.generate(
                input_ids=prompt,
                num_new_tokens=num_new,
                max_len=cfg["max_len"],
                temperature=1.0,
                top_k=50,
                eos_id=None,  # generate a fixed number of tokens
            )

            # Keep only what was produced after the prompt.
            gen_tail = gen_full[prompt_len:]
            pred_text = tokenizer.decode(gen_tail.tolist(), skip_special_tokens=True).strip()

            predictions.append(pred_text)
            references.append(ref_text)

if predictions:
    results = rouge.compute(predictions=predictions, references=references)

    print("ROUGE on test:")
    for k, v in results.items():
        print(f"{k}: {v:.4f}")

    print("\nExamples:")
    # Slicing keeps this safe when fewer than NUM_EXAMPLES pairs were collected.
    for ref_text, pred_text in zip(references[:NUM_EXAMPLES], predictions[:NUM_EXAMPLES]):
        print("REF :", ref_text)
        print("GEN :", pred_text)
        print("-" * 80)
else:
    print("No evaluable test samples were collected; ROUGE was not computed.")


KeyboardInterrupt: 