In [None]:
# !pip install datasets==2.16.1

In [None]:
import random
import re
from typing import List

from datasets import load_dataset, list_datasets, Dataset
from IPython.display import Image
from IPython.core.display import HTML
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model
from tqdm import tqdm

https://huggingface.co/docs/transformers/model_doc/gpt2

https://huggingface.co/docs/transformers/main_classes/tokenizer

In [None]:
# Load the pretrained tokenizer registered under the "gpt2" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
# DEVICE = torch.device("cpu")
print(f"Our device is {DEVICE}")

# Как работает генерация под капотом с точки зрения кода:

In [None]:
# Prompt (in Russian) whose next token is predicted in the cells below.
TEXT_INPUT = "Парламент- это не место для"

In [None]:
# Tokenize the prompt; return_tensors="pt" asks for PyTorch tensors.
# The result is iterated as key/tensor pairs and unpacked into the model calls below.
inputs = tokenizer(TEXT_INPUT, return_tensors="pt")

In [None]:
# Move every tensor produced by the tokenizer onto the selected device,
# storing it back under the same key.
for field_name in list(inputs.keys()):
    inputs[field_name] = inputs[field_name].to(DEVICE)

In [None]:
# Load the bare GPT-2 transformer (GPT2Model) and switch it to inference mode;
# the projection onto the vocabulary is done by hand further down.
bare_model = GPT2Model.from_pretrained("gpt2")
bare_model.eval()
bare_model.to(DEVICE)
# Pure inference: disable autograd so no computation graph is built or kept
# alive for this forward pass.
with torch.no_grad():
    bare_outputs = bare_model(**inputs, output_hidden_states=True)

In [None]:
# Last expression of the cell: the notebook displays the model's repr (its module structure).
bare_model

In [None]:
# Hidden states after the final layer; indexed below with [-1][-1] to pick the
# last position of the last sequence (assumes a (batch, seq, hidden) layout — TODO confirm).
last_hidden_states = bare_outputs.last_hidden_state

Weight tying

In [None]:
# Render the weight-tying illustration hosted on lena-voita.github.io inline in the notebook.
Image(url= "https://lena-voita.github.io/resources/lectures/lang_models/practical/weight_tying_idea-min.png", width=1900, height=900)

In [None]:
# Weight tying by hand: project the hidden state of the last position onto the
# (transposed) token-embedding matrix to obtain one score per vocabulary entry.
final_position_state = last_hidden_states[-1][-1]
logits = final_position_state @ bare_model.wte.weight.T

In [None]:
# Normalize the scores into a probability distribution; dim=0 because `logits`
# is expected to be 1-D here (one score per vocabulary entry — TODO confirm shape).
bare_probas = F.softmax(logits, dim=0)

In [None]:
# Index of the highest-probability entry, i.e. the predicted next token id.
torch.argmax(bare_probas)

# Упрощённая генерация:

In [None]:
# Load GPT-2 with its language-modelling head; the EOS token id doubles as the pad id.
llm_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
llm_model.eval()
llm_model.to(DEVICE)
# Pure inference: disable autograd so no computation graph is built or kept
# alive for this forward pass.
with torch.no_grad():
    llm_outputs = llm_model(**inputs)

In [None]:
# Count every parameter element of the model.
n_params = sum(param.numel() for param in llm_model.parameters(recurse=True))

# Format with thousands separators. The previous hand-rolled slicing silently
# dropped the leading digits whenever the digit count was not a multiple of 3
# (e.g. 12345678 was rendered as "345,678"); the "," format option is correct
# for any length. As before, `n_params` ends up holding the formatted string.
n_params = f"{n_params:,}"
print(f"Number of parameters: {n_params}")

In [None]:
# Predicted next token id straight from the LM head: highest logit at the last
# position of the last sequence.
torch.argmax(llm_outputs.logits[-1][-1])

In [None]:
# Probability distribution over the vocabulary for the same position, from the LM head.
llm_probas = F.softmax(llm_outputs.logits[-1][-1], dim=0)

In [None]:
# Sanity check: the manual projection through the embedding matrix should give
# the same distribution as the LM head (within a relative tolerance of 1e-4).
torch.allclose(bare_probas, llm_probas, rtol=1e-4)

# Задание 1: написать свою имплементацию BPE

Как работает алгоритм: \
- У алгоритма один гиперпараметр — число итераций (слияний) \
- На каждой итерации мы находим самую частую пару токенов, идущих подряд \(для примера назовём их a, b) \
- Мы создаём новый токен, соответствующий конкатенации пары из предыдущего пункта \(a, b -> ab), и все вхождения этой пары в тренировочных данных заменяем на новый токен

In [None]:
def update_training_data(training_data, max_key):
    """Apply one BPE merge to a token sequence.

    Scans ``training_data`` left to right and replaces every adjacent pair
    whose concatenation equals ``max_key`` with the single token ``max_key``.
    Matches are non-overlapping: once a pair is merged, scanning resumes
    after it.

    Args:
        training_data: Sequence of tokens (strings).
        max_key: The merged token, i.e. the concatenation of the pair to fuse.

    Returns:
        A new list with the merges applied; the input is not modified.
    """
    merged = []
    last = len(training_data) - 1
    pos = 0
    while pos <= last:
        # A pair can only start at a position that still has a right neighbour.
        if pos < last and training_data[pos] + training_data[pos + 1] == max_key:
            merged.append(max_key)
            pos += 2
            continue
        merged.append(training_data[pos])
        pos += 1
    return merged

In [None]:
# Toy corpus for the BPE exercise: one single-character token per list element.
training_data = list("abcd" + "e" + "a" + "abcd" + "bcd")

In [None]:
# Initial vocabulary: the distinct tokens present in the corpus.
# The merge loop below adds each newly created token to this set.
ALPHABET = set(training_data)

In [None]:
# Show the vocabulary and its size before any merges (blank line in between).
print(
    f"This is my alphabeth: {ALPHABET}",
    "",
    f"Its length is {len(ALPHABET)}",
    sep="\n",
)

In [None]:
# Number of merge iterations to run — the single hyperparameter of the algorithm.
NUM_MERGES = 3

In [None]:
from collections import Counter

# BPE training: on every iteration find the most frequent adjacent pair, add
# its concatenation to the vocabulary and rewrite the corpus with the merge.
# NOTE: this cell rebinds `training_data`, so re-running it continues from the
# already-merged corpus rather than starting over.
for _ in range(NUM_MERGES):
    # Pairs are keyed by their concatenation; ties are resolved in favour of
    # the pair seen first (dict insertion order + max()).
    counter = Counter(
        cur_token + next_token
        for cur_token, next_token in zip(training_data, training_data[1:])
    )

    # With fewer than two tokens left there is nothing to merge, and max()
    # would raise ValueError on the empty counter — stop early instead.
    if not counter:
        break

    max_key = max(counter, key=counter.get)
    print(f"Found new max key: {max_key}!")
    ALPHABET.add(max_key)

    training_data = update_training_data(training_data, max_key)

In [None]:
# Show the vocabulary and its size after the merges (blank line in between).
print(
    f"This is my alphabeth: {ALPHABET}",
    "",
    f"Its length is {len(ALPHABET)}",
    sep="\n",
)

# Задание 2: написать fine-tuning для языковой модели под набор данных:

Описание датасета можно найти тут: \
https://paperswithcode.com/dataset/rucos \
https://huggingface.co/datasets/RussianNLP/russian_super_glue

In [None]:
# Load the 'rucos' configuration of Russian SuperGLUE; the cells below read its
# 'train' split and the 'passage' field of each record.
dataset = load_dataset("RussianNLP/russian_super_glue", name='rucos')

In [None]:
# Matches either '@' followed by one or more lowercase ASCII letters, or a newline.
# Every match is replaced with a space when passages are cleaned.
# (presumably the @-tokens are dataset markup such as "@highlight" — confirm against the data)
RE_BAD_PATTERNS = re.compile("(@[a-z]+|\n)")

In [None]:
# Pick one training record uniformly at random and compare its passage before
# and after cleaning with RE_BAD_PATTERNS.
random_idx_from_train = random.randrange(len(dataset['train']))  # 66411
random_object = dataset['train'][random_idx_from_train]['passage']
filtered_random_object = RE_BAD_PATTERNS.sub(" ", random_object)

# Raw passage, a separator line, then the cleaned passage.
print(random_object, "*" * 20, filtered_random_object, sep="\n")

In [None]:
# Register "<|endoftext|>" as the padding token so the tokenizer can pad batches;
# the training loop later masks this token id out of the loss.
tokenizer.add_special_tokens({'pad_token': "<|endoftext|>"})

def texts_to_batch(texts: List[dict]) -> "transformers.BatchEncoding":
    """Collate raw dataset records into one tokenized batch.

    Used as ``collate_fn`` of the training DataLoader.

    Args:
        texts: The records of one batch. Each record is indexed by
            ``"passage"`` to obtain the text (so it is a mapping, not a
            plain string).

    Returns:
        The tokenizer output for the cleaned passages as PyTorch tensors,
        padded with ``padding="max_length"`` and truncated. The training loop
        reads its ``input_ids`` and ``attention_mask`` entries.
    """
    # Replace @-markup and newlines with spaces before tokenizing.
    clean_texts = [
        RE_BAD_PATTERNS.sub(" ", record["passage"]) for record in texts
    ]
    tokenized_texts = tokenizer(
        text=clean_texts,
        return_tensors="pt",
        add_special_tokens=True,
        padding="max_length",
        truncation=True
    )
    return tokenized_texts

In [None]:
# Number of passages per training batch.
BATCH_SIZE = 2

# Training loader over the 'train' split; texts_to_batch turns each list of raw
# records into a tokenized batch.
train_dl = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=BATCH_SIZE,
    shuffle=True,  # random order on each pass (no seed is set in the visible cells)
    drop_last=True,  # skip a trailing batch smaller than BATCH_SIZE
    collate_fn=texts_to_batch
)

In [None]:
# Upper bound on the number of optimisation steps in the fine-tuning loop.
N_ITERATIONS = 1000

# Fixed prompt used to inspect generations; note that this rebinds `inputs`.
inputs = tokenizer("В прошлый четверг президенты Казахстана и России", return_tensors="pt")
# Move every tokenizer output tensor onto the selected device.
for field_name in list(inputs.keys()):
    inputs[field_name] = inputs[field_name].to(DEVICE)

In [None]:
# Number of new tokens requested from generate() (passed as max_new_tokens).
OUTPUT_SIZE = 40

In [None]:
# Start fine-tuning from a fresh copy of the pretrained GPT-2 LM;
# the EOS token id doubles as the pad id.
llm_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
llm_model.eval()
llm_model.to(DEVICE)
# Smoke-test forward pass on the prompt. It is inference only, so autograd is
# disabled to avoid building a computation graph.
with torch.no_grad():
    llm_outputs = llm_model(**inputs)

In [None]:
# Fine-tune the LM on passages with a next-token objective and, after every
# optimisation step, print beam-search continuations of the fixed prompt.
loss = nn.CrossEntropyLoss()  # targets equal to -100 (the default ignore_index) are skipped
optimizer = torch.optim.Adam(
    params=llm_model.parameters(), lr=1e-6
)

cur_iteration = 0
for batch in train_dl:
    if cur_iteration == N_ITERATIONS:
        break

    llm_model.train()
    #################
    # Shift by one: the model sees tokens [0, T-1) and must predict tokens [1, T).
    input_tokens = batch['input_ids'][:, :-1].to(DEVICE)
    labels = batch['input_ids'].clone()[:, 1:].to(DEVICE)
    attention_mask = batch['attention_mask'][:, :-1].to(DEVICE)
    out_logits = llm_model(input_ids=input_tokens, attention_mask=attention_mask).logits
    # Exclude padding positions from the loss.
    labels[labels == tokenizer.pad_token_id] = -100
    # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq).
    loss_value = loss(out_logits.permute(0, 2, 1), labels)
    #################
    print(f"Loss value: {loss_value.item()}")
    # Reset gradients left over from the previous step; without this they
    # accumulate and every update would use the sum of all past gradients.
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()

    # Qualitative check: continuations of the prompt for several beam widths.
    llm_model.eval()
    for n_beams in range(2, 5):
        beam_output = llm_model.generate(**inputs, max_new_tokens=OUTPUT_SIZE, num_beams=n_beams)
        print(f"Beam size={n_beams}")
        print(tokenizer.decode(beam_output[0], skip_special_tokens=True))
        print()

    print("*" * 20)
    cur_iteration += 1

# Задание 3: написать greedy search, сравнить результаты с имплементацией от transformers

In [None]:
# Number of tokens to generate in the greedy-search comparison
# (used as the loop count and as max_new_tokens for generate()).
OUTPUT_SIZE = 40

In [None]:
# Reload the original pretrained GPT-2 LM, replacing the `llm_model` bound
# earlier; the EOS token id doubles as the pad id.
llm_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
llm_model.eval()
llm_model.to(DEVICE)
# Inference-only forward pass: disable autograd so no computation graph is built.
with torch.no_grad():
    llm_outputs = llm_model(**inputs)

In [None]:
def convert_to_expected_input(input_ids, attention_mask):
    """Pack token ids and attention mask into model-ready keyword arguments.

    Args:
        input_ids: Token ids in a form accepted by ``torch.tensor`` (the
            greedy loop passes nested Python lists).
        attention_mask: Attention mask in the same form as ``input_ids``.

    Returns:
        A dict with keys ``"input_ids"`` and ``"attention_mask"`` whose values
        are tensors created on ``DEVICE``.
    """
    raw_fields = {"input_ids": input_ids, "attention_mask": attention_mask}
    return {
        field_name: torch.tensor(values, device=DEVICE)
        for field_name, values in raw_fields.items()
    }

In [None]:
# Naive greedy search on top of the bare transformer.
# `input_ids` starts as the prompt and accumulates the generated tokens;
# `attention_mask` grows in lockstep with it.
input_ids, attention_mask = inputs["input_ids"].tolist(), inputs["attention_mask"].tolist()

# Generation is inference only: without no_grad every one of the OUTPUT_SIZE
# forward passes would build (and hold on to) an autograd graph.
with torch.no_grad():
    for _ in range(OUTPUT_SIZE):
        ######
        # Re-run the model on the whole sequence generated so far.
        bare_inputs = convert_to_expected_input(input_ids, attention_mask)
        bare_outputs = bare_model(**bare_inputs, output_hidden_states=True)
        # Tied weights: project the last position onto the embedding matrix.
        logits = torch.matmul(
            bare_outputs.last_hidden_state[-1][-1],
            bare_model.wte.weight.T
        )
        bare_probas = F.softmax(logits, dim=0)
        # Greedy choice: take the single most probable token and append it.
        next_token = torch.argmax(bare_probas).item()
        input_ids[-1].append(next_token)
        attention_mask[-1].append(1)
        ######
  ######

In [None]:
# Reference continuation from the library's generate() for the same prompt and length.
# (no sampling/beam arguments are passed — presumably this is greedy decoding; confirm)
llm_predictions = llm_model.generate(**inputs, max_new_tokens=OUTPUT_SIZE)

Ниже проверяем, что наивная имплементация совпадает с ожидаемой:

In [None]:
# The last OUTPUT_SIZE tokens from the manual greedy loop must equal the last
# OUTPUT_SIZE tokens returned by generate().
assert input_ids[-1][-OUTPUT_SIZE:] == llm_predictions[-1][-OUTPUT_SIZE:].tolist()

In [None]:
# Turn the first generated sequence back into text, dropping special tokens.
tokenizer.decode(llm_predictions[0], skip_special_tokens=True)