In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from google.colab import files

# Trigger Colab's file-upload helper; whatever it returns is kept in `upload`.
# NOTE(review): presumably this is how dataset.jsonl reaches the runtime — confirm.
upload = files.upload()

In [None]:
!ls -la

In [None]:
!mv 'dataset.jsonl' 'data/dataset.jsonl'

In [None]:
DATASET_PATH = 'data/dataset.jsonl'

# Parse the JSON Lines file (one JSON object per line) into a DataFrame indexed by 'id'.
# The open handle itself is handed to pandas, so the file is opened exactly once,
# and the text encoding is stated explicitly instead of relying on the platform default.
with open(DATASET_PATH, encoding='utf-8') as f:
    df = pd.read_json(f, lines=True).set_index('id')

In [None]:
print(df.dtypes)
# display(df)

In [None]:
df.head()

In [None]:
# Rating distribution; the x-range is cut at the 99th percentile so that
# outliers do not stretch the plot.
rating = df['rating'].dropna()
quantile = rating.quantile(q=0.99)
rating.hist(bins=100, range=(rating.min(), quantile))
plt.gca().set(xlabel='rating', ylabel='count')
plt.show()

In [None]:
# Number of quotes (non-null 'text' values) per calendar year, drawn as a bar chart.
by_year = df['text'].groupby(df['date'].dt.year).count()
by_year.plot(kind='bar')
plt.gca().set(xlabel='year', ylabel='count')
plt.show()

In [None]:
# Compare how many quotes have no rating (NaN) with how many have one.
nans = df['rating'].isna().sum()
not_nans = df.shape[0] - nans

bars = [nans, not_nans]
y_pos = np.arange(len(bars))
plt.bar(y_pos, bars)
plt.xticks(ticks=y_pos, labels=('NaN', 'Not NaN'))
plt.ylabel('count')
plt.show()

In [None]:
pd.set_option('display.max_colwidth', 150)
# pd.set_option('display.width', 500)

In [None]:
# df['text'][0:5]
# Show the 'text' value selected by key 2.
# NOTE(review): df was re-indexed by 'id' when it was loaded, so with an integer index
# this is a label lookup (id == 2), not the third row — confirm which one is intended.
df['text'][2]

Задание 1

In [None]:
# Download the tweets file
!wget https://gist.githubusercontent.com/avidale/d3da0ded85a4a16db6eb84d8331638ce/raw/a188084e5ef37b43b01fef0534b55c865b9a569e/tweets.txt

In [None]:
!ls

In [None]:
# Load the downloaded tweets; individual tweets are separated by a blank line.
# The encoding is given explicitly so that reading the Cyrillic text does not
# depend on the platform's default locale.
with open('tweets.txt', 'r', encoding='utf-8') as f:
    tweets = f.read().strip().split('\n\n')
print(len(tweets))
# Preview at most the first three tweets (slicing stays safe when fewer are available).
for tweet in tweets[:3]:
    print(tweet)

In [None]:
import torch

# Use the GPU when one is present, otherwise fall back to the CPU so that the
# notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the "large" ruGPT-3 checkpoint, which is built on top of GPT-2
model_name = 'sberbank-ai/rugpt3large_based_on_gpt2'
#model_name = 'Grossmend/rudialogpt3_medium_based_on_gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [None]:
import random

In [None]:
# We want GPT to pick up the pattern that, after three asterisks ('***'),
# it should produce meaningful text that reads like a tweet.
#
# The prompt is built from 5 tweets drawn at random from the loaded list.

sep = '\n***\n'  # marks the end of one tweet and the place where the next one starts
# sep = '\n27479153	Sandy_mustache	2021-02-18 16:44:00	'


# A fresh random sample is drawn on every run, so the prompt (and therefore
# the conditioning of the model) differs from run to run.
prefix = sep + sep.join(random.sample(tweets, k=5)) + sep

# Tokenize the prompt and move every tensor to the device the model lives on.
tokens = {
    name: tensor.to(model.device)
    for name, tensor in tokenizer(prefix, return_tensors='pt').items()
}
end_token_id = tokenizer.encode('***')[0]  # first token id of '***', meant to mark the end of a tweet

# Show what is fed to the model
print(prefix)

In [None]:
# Generate a continuation of the few-shot prompt that should read like a new tweet.

size = tokens['input_ids'].shape[1]  # prompt length in tokens
output = model.generate(
    **tokens,
    #end_token=end_token_id,
    do_sample=False,  # no sampling: plain beam search
    max_length=size+128,  # prompt length plus at most 128 generated tokens
    # max_length=size+64,
    repetition_penalty=4.2,  # penalty for repeating individual tokens
    temperature=0.7,  # NOTE(review): temperature rescales sampling probabilities; with do_sample=False it likely has no effect — confirm
    num_beams=10,  # beam width: 10 hypotheses are kept at every step
    # no_repeat_ngram_size=3  # would forbid any 3-gram from occurring twice
)
decoded = tokenizer.decode(output[0])  # full text: prompt followed by the continuation
# Decode only the newly generated tokens. Cutting `decoded` by len(prefix) is fragile,
# because decoding the tokenized prompt is not guaranteed to reproduce `prefix`
# character for character.
result = tokenizer.decode(output[0][size:])
print(result)

In [None]:
# Generate a continuation of the few-shot prompt that should read like a new tweet
# (second parameter set: narrower beam, n-gram blocking enabled).

size = tokens['input_ids'].shape[1]  # prompt length in tokens
output = model.generate(
    **tokens,
    #end_token=end_token_id,
    do_sample=False,  # no sampling: plain beam search
    max_length=size+128,  # prompt length plus at most 128 generated tokens
    # max_length=size+64,
    repetition_penalty=4.2,   # penalty for repeating individual tokens
    temperature=1.3,  # NOTE(review): temperature rescales sampling probabilities; with do_sample=False it likely has no effect — confirm
    num_beams=7,  # beam width: 7 hypotheses are kept at every step
    no_repeat_ngram_size=5  # no sequence of 5 consecutive tokens may occur twice
)
decoded = tokenizer.decode(output[0])  # full text: prompt followed by the continuation
# Decode only the newly generated tokens. Cutting `decoded` by len(prefix) is fragile,
# because decoding the tokenized prompt is not guaranteed to reproduce `prefix`
# character for character.
result = tokenizer.decode(output[0][size:])
print(result)

Задание 2

In [None]:
!pip install razdel networkx pymorphy2[fast] nltk rouge==0.3.1
!pip install --upgrade datasets tqdm transformers

In [None]:
# !pip install datasets

In [None]:
!ls

In [None]:
from datasets import load_dataset

# Gazeta corpus pinned to revision v1.0; only the first 10% of the train and
# test splits are requested.
dataset_train, dataset_test = (
    load_dataset('IlyaGusev/gazeta', revision="v1.0", split=f'{part}[:10%]')
    for part in ('train', 'test')
)

In [None]:
dataset_train

In [None]:
dataset_test

In [None]:
dataset_test['summary'][0]

In [None]:
dataset_test['title'][0]

In [None]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [None]:
def len_tok(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    words = text.split()
    return len(words)

In [None]:
# Longest summary and longest title in the training split, measured in
# whitespace-separated words.
max_len_sum = max(len_tok(summary) for summary in dataset_train['summary'])
max_len_tl = max(len_tok(title) for title in dataset_train['title'])
max_len_sum, max_len_tl

In [None]:
# Replace the measured maxima with fixed caps: 60 for summaries, 15 for titles.
max_len_sum, max_len_tl = 60, 15

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of examples for summary -> title fine-tuning.

    Args:
        batch: mapping with a 'summary' and a 'title' entry, each a list of strings
            (the batched form that ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenized summaries (padded/truncated to ``max_len_sum``) with an extra
        'labels' entry holding the tokenized titles (padded/truncated to ``max_len_tl``).
        Padding positions in the labels are set to -100 so that the loss ignores them.
    """
    tokenized_input = tokenizer(batch['summary'], padding='max_length', truncation=True, max_length=max_len_sum)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)

    # Without masking, the model would also be trained to predict the padding tokens.
    # -100 is the label value that the cross-entropy loss skips.
    pad_id = tokenizer.pad_token_id
    tokenized_input['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in tokenized_label['input_ids']
    ]

    return tokenized_input

# Tokenize both splits in batches of 8 examples.
dataset_train, dataset_test = (
    split.map(tokenize, batched=True, batch_size=8)
    for split in (dataset_train, dataset_test)
)

# Expose only the model inputs, returned as numpy arrays.
for _tokenized in (dataset_train, dataset_test):
    _tokenized.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Persist the tokenized splits under gazeta/ so they can be reloaded later.
dataset_train.save_to_disk('gazeta/train')
dataset_test.save_to_disk('gazeta/test')

In [None]:
!ls

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments


# Load the pretrained T5 checkpoint named by `model_name`.
# Note: this rebinds `model`, which earlier in the notebook held the GPT model.
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
output_dir = 'gazeta/output'

# Fine-tuning configuration: evaluation runs every 500 steps and the checkpoint
# with the lowest evaluation loss is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps whose outputs are accumulated on the GPU before being moved off it (the higher, the more VRAM used)
    prediction_loss_only=True, # Compute only the loss during evaluation, no other metrics; uses less RAM
    learning_rate=0.00001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Maximum number of checkpoints to keep
    remove_unused_columns=True, # Drop dataset columns the model does not use
    run_name='run_gazeta', # Wandb run name
    logging_steps=500, # How often to log loss to wandb
    eval_steps=500, # How often to run evaluation on the eval set
    logging_first_step=False, # Whether to also log the very first training step to wandb
    load_best_model_at_end=True, # Reload the best checkpoint found during evaluation once training ends
    metric_for_best_model="loss", # Use loss to pick the best model.
    greater_is_better=False # Best model is the one with the lowest loss, not the highest.
)

In [None]:
%%time

# Training; the run is configured for 10 epochs.
# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test
)

trainer.train()

In [None]:
trainer.save_model(output_dir + '/model')

In [None]:
# Pick one test example and show its summary (model input) and reference title.
INX = 100
print(f"SUMMARY: | {dataset_test['summary'][INX]}")
print(f"TITLE: | {dataset_test['title'][INX]}")

In [None]:
import torch

# Device used for inference: the GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import torch

input_text = dataset_test['summary'][INX]

# Switch to inference mode: after training the model may still be in training
# mode, in which case dropout would add noise to the generated title.
model.eval()

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    # Place the inputs on the device the model actually lives on, so that a
    # mismatch with a separately configured device cannot break generate().
    source_ids = tokenized_text['input_ids'].to(model.device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(model.device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask,
        max_length=512,
        num_beams=7,  # beam width
        temperature = 1.3,  # NOTE(review): likely has no effect without do_sample=True — confirm
        repetition_penalty=1,
        length_penalty=1,
        early_stopping=True,
        no_repeat_ngram_size=2  # no pair of consecutive tokens (2-gram) may occur twice
    )

    # These generation parameters were chosen experimentally

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)

In [None]:
# Second example: show the summary and reference title, then generate a title for it.
INX = 0
print("SUMMARY: | {}".format(dataset_test['summary'][INX]))
print("TITLE: | {}".format(dataset_test['title'][INX]))

input_text = dataset_test['summary'][INX]

# Switch to inference mode: after training the model may still be in training
# mode, in which case dropout would add noise to the generated title.
model.eval()

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    # Place the inputs on the device the model actually lives on, so that a
    # mismatch with a separately configured device cannot break generate().
    source_ids = tokenized_text['input_ids'].to(model.device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(model.device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask,
        max_length=512,
        num_beams=7,  # beam width
        temperature = 1.3,  # NOTE(review): likely has no effect without do_sample=True — confirm
        repetition_penalty=1,
        length_penalty=1,
        early_stopping=True,
        no_repeat_ngram_size=2  # no pair of consecutive tokens (2-gram) may occur twice
    )

    # These generation parameters were chosen experimentally
    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)

Вывод

Модель сформировала достаточно приемлемый заголовок.

Правда, во втором случае он сообщает о событии как о случившемся, хотя оно ещё не произошло.