In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel
from transformers import pipeline

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split

from datasets import load_data

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

In [7]:
# оставим только самый первый контекст, в остальных много пропусков и они не несут особой информации
data = pd.read_csv('/kaggle/working/data.csv')
new_data = data.loc[:, ['context_1', 'response']]
new_data.dropna(inplace=True)
new_data = new_data.rename({'context_1': 'input', 'response': 'output'}, axis=1)
new_data = new_data[new_data.apply(lambda row: all(len(str(cell)) >= 10 for cell in row), axis=1)]
new_data = new_data.reset_index(drop=True)
new_data.to_csv('/kaggle/working/new_data.csv', index=False)

In [39]:
new_data['full_mes'] = new_data['input'] + ' <SEP> ' + new_data['output']

In [41]:
nl = []
for i in new_data['full_mes']:
    nl.append(i)

In [44]:
import re
def build_text_files(txts, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for txt in txts:
        summary = txt.strip()
        summary = re.sub(r"\n", " ", summary)
        data += summary + "  "
    f.write(data)

In [45]:
train, test = train_test_split(nl, test_size=0.2)

build_text_files(train,'/kaggle/working/train_dataset.txt')
build_text_files(test,'/kaggle/working/test_dataset.txt')

In [46]:
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2')
model = GPT2LMHeadModel.from_pretrained("sberbank-ai/rugpt3medium_based_on_gpt2")

train_path = '/kaggle/working/train_dataset.txt'
test_path = '/kaggle/working/test_dataset.txt'

In [75]:
class CleanedTextDataset(TextDataset):
    def __getitem__(self, i):
        item = self.examples[i]
        cleaned_item = [x if x is not None else 3 for x in item]
        return torch.tensor(cleaned_item, dtype=torch.long)

def load_dataset(train_path, test_path, tokenizer):
    train_dataset = CleanedTextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=16)

    test_dataset = CleanedTextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=16)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)
    
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

In [78]:
output_directory = '/kaggle/working/rugpt-3/'
training_args = TrainingArguments(
    output_dir=output_directory,
    overwrite_output_dir = True,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    num_train_epochs=5, 
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    optim="adafactor",
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,  
    eval_steps = 1000, 
    save_steps=3000, 
    logging_steps=50,
    report_to='wandb',
    save_total_limit = 2,
    no_cuda=False,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

Step,Training Loss
10,5.2216
20,5.0989
30,4.985
40,4.8934
50,4.8191
60,4.737
70,4.7214
80,4.67
90,4.648
100,4.6227


TrainOutput(global_step=3725, training_loss=3.580711025007619, metrics={'train_runtime': 5654.9173, 'train_samples_per_second': 84.318, 'train_steps_per_second': 0.659, 'total_flos': 1.3830442918084608e+16, 'train_loss': 3.580711025007619, 'epoch': 5.0})

In [113]:
model.save_pretrained('/kaggle/working/finetuned_model/')
tokenizer.save_pretrained('/kaggle_working/finetuned_tokenizer/')

('/kaggle_working/finetuned_tokenizer/tokenizer_config.json',
 '/kaggle_working/finetuned_tokenizer/special_tokens_map.json',
 '/kaggle_working/finetuned_tokenizer/vocab.json',
 '/kaggle_working/finetuned_tokenizer/merges.txt',
 '/kaggle_working/finetuned_tokenizer/added_tokens.json')

In [155]:
# загрузим на huggingface, чтобы в последствии можно было вызывать по имени
model.push_to_hub(repo_id="danzzzll/verystupid_rugpt-3")
tokenizer.push_to_hub(repo_id="danzzzll/verystupid_rugpt-3")

CommitInfo(commit_url='https://huggingface.co/danzzzll/verystupid_rugpt-3/commit/1ec0d736976e7f92e4f1c695b7525b553b732105', commit_message='Upload tokenizer', commit_description='', oid='1ec0d736976e7f92e4f1c695b7525b553b732105', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# проверка работы модели
# на gpu - 3-4 секунды, на cpu - 10-15
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

model_name = 'danzzzll/verystupid_rugpt-3'
generation = pipeline('text-generation', model=model_name, tokenizer=model_name, device=0) #если только cpu, то device='cpu'

while True:
    user_input = input("You: ")
    modify_input = f'Вопрос: {user_input} Ответ:'
    if user_input.lower() in ["quit", "exit", "bye"]:
        break

    response = generation(modify_input, max_length=100, min_length=30, temperature=.9, num_beams=3, repetition_penalty=1.5,  num_return_sequences=2, no_repeat_ngram_size=2, do_sample=True)[0]['generated_text']
    response = re.sub(f'{modify_input}', '', response)
    print(f"Bot: {response}")