# Генерация русских народных сказок

In [None]:
# Clone the faiky-tails repository; the next cell changes into the resulting directory.
!git clone https://github.com/edbons/faiky-tails.git

In [None]:
# Make the cloned repository the working directory, so later relative paths resolve inside it.
%cd faiky-tails

In [None]:
# Install the Python dependencies listed in requirements.txt of the current directory.
!pip install -r requirements.txt

In [None]:
import os
import pandas as pd
import pprint
from itertools import product
from collections import defaultdict
import torch
from transformers import GPT2Tokenizer
from zipfile import ZipFile
import nltk
# Download the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): they are not used directly in this notebook's cells — presumably the
# project's scripts need them; confirm.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Download the archive with the trained models (savedir.zip) from Google Drive.
# The inner wget requests the download page, stores the session cookies in /tmp/cookies.txt
# and pipes the page to sed, which extracts the value of the `confirm=` token.
# The outer wget reuses those cookies together with the token to fetch the file into
# $FILENAME; the cookie file is removed afterwards.

!FILEID='1f1MU0bgIo1X_78vpuc-DqKH8joHRcbgT' && \
FILENAME='savedir.zip' && \
FILEDEST="https://docs.google.com/uc?export=download&id=${FILEID}" && \
wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate ${FILEDEST} -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O $FILENAME && rm -rf /tmp/cookies.txt


In [None]:
# Unpack the downloaded archive into the current directory.
# The context manager closes the archive handle once extraction finishes
# (the original left the file open for the rest of the session).
with ZipFile('savedir.zip', mode='r') as zip_f:
    zip_f.extractall(path='.')

# Демо генерации текста

In [None]:
# Read the user's own key phrases that seed the generation prompt.
# Input ends when the user types 'exit' (any letter case, surrounding whitespace ignored)
# or as soon as more than 20 phrases have been collected.

print("Напишите на русском языке кратко события происходящие с героями. Для окончания укажите слово 'exit' или укажите более 20 фраз:")
kw = []
# The limit is checked BEFORE asking for the next line, so no extra phrase is
# read from the user only to be silently discarded.
while len(kw) <= 20:
    prompt = input().strip()
    if prompt.lower() == 'exit':
        break
    # Blank lines would otherwise become empty key phrases in the prompt.
    if prompt:
        kw.append(prompt)

print("Введенные фразы:", kw)

In [None]:
# Set up the tokenizer and load the fine-tuned model used for generation.

tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", add_prefix_space=True)
# Register the special tokens that frame the prompt: sequence start/end markers,
# the key-phrase separator (_kw_), the end-of-key-phrases marker (_endkw_) and [SEP].
tokenizer.add_special_tokens({
    'bos_token': '<s>',
    'eos_token': '</s>',
    'additional_special_tokens': ['[SEP]', '_kw_', '_endkw_'],
})

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

output_dir = os.path.join('savedir', 's_kw')
# SECURITY: torch.load unpickles arbitrary Python objects — only load checkpoints
# that come from a trusted source.
# NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which rejects fully
# pickled model objects — confirm against the torch version pinned in requirements.txt.
with open(os.path.join(output_dir, 'checkpoints/checkpoint.pt'), 'rb') as f:
    model = torch.load(f, map_location=device)

# The unpickled module keeps whatever train/eval flag it was saved with; switch to
# inference mode so that dropout is disabled during generation.
model.eval()

In [None]:
# Build the prompt tensor: <s> + encoded key phrases joined by _kw_ + _endkw_ + [SEP]

# Ids of the special tokens used to frame the prompt and to stop generation.
septok, starttok, endtok, endkeytok = (
    tokenizer.convert_tokens_to_ids(token)
    for token in ('[SEP]', '<s>', '</s>', '_endkw_')
)

# Key phrases are glued into one string and tokenized in a single pass.
kw_text = " _kw_ ".join(kw)
kw_ids = tokenizer.encode(kw_text)

context = [starttok, *kw_ids, endkeytok, septok]
# Add a leading batch dimension of size 1.
input_ids = torch.LongTensor(context).unsqueeze(0)

In [None]:
# Tunable generation options; they are unpacked into the model.generate(...) call.

params = dict(
    num_beams=4,
    top_p=0.95,
    top_k=0,
    temperature=1.0,
    repetition_penalty=2.0,
)

In [None]:
# Generate text from the prepared prompt

# Fixed decoding settings; the tunable options are supplied separately via `params`.
fixed_settings = dict(
    max_length=512,
    do_sample=True,
    eos_token_id=endtok,
    bos_token_id=tokenizer.bos_token_id,
    decoder_start_token_id=septok,
    pad_token_id=0,
    min_length=256,
    num_return_sequences=1,
    no_repeat_ngram_size=3,
    forced_eos_token_id=endtok,
    early_stopping=True,
)

# The prompt must be on the same device as the model.
prompt_ids = input_ids.to(device)
sample_output = model.generate(prompt_ids, **fixed_settings, **params)

In [None]:
# Decode the prompt and the generated continuation, keeping special tokens visible.
decode_opts = dict(skip_special_tokens=False, clean_up_tokenization_spaces=False)

# The output starts with the prompt tokens; slice them off to keep only the new text.
prompt_len = input_ids.shape[-1]
generated_ids = sample_output[:, prompt_len:]

context_txt = tokenizer.batch_decode(input_ids, **decode_opts)
hyps = tokenizer.batch_decode(generated_ids, **decode_opts)

print("Затравка:", context_txt[0], sep='\n')
print("\nТекст:", hyps[0], sep='\n')

# Генерация текстов из тестовой выборки

## Генерация с настройками по умолчанию

In [None]:
# Run generate.py for the `s_kw` experiment with the script's default sampling settings
# (only --gen_len, --n_ctx, --n_batch and --use_ner are passed explicitly).
!python generate.py --experiment_name s_kw --output_dir ./savedir --hf_model "sberbank-ai/rugpt3small_based_on_gpt2" --gen_len 512 --n_ctx 70 --n_batch 2 --use_ner

In [None]:
# Same generation run as for `s_kw`, but for the `s_kw_ner` experiment.
!python generate.py --experiment_name s_kw_ner --output_dir ./savedir --hf_model "sberbank-ai/rugpt3small_based_on_gpt2" --gen_len 512 --n_ctx 70 --n_batch 2 --use_ner

In [None]:
# Same generation run, but for the `baseline` experiment.
!python generate.py --experiment_name baseline --output_dir ./savedir --hf_model "sberbank-ai/rugpt3small_based_on_gpt2" --gen_len 512 --n_ctx 70 --n_batch 2 --use_ner

## Генерация для различных настроек генерации

In [None]:
# Values to sweep over: temperatures, top-p thresholds and beam counts.
temp = [0.7, 1, 1.3]
top_p = [0.95]
beams = [1]

# Run generate.py for the `s_kw` experiment once per combination of the values above;
# the loop variables are substituted into the shell command via IPython's {name} syntax.
for t, p, n_beam in product(temp, top_p, beams):
  !python generate.py --experiment_name s_kw --output_dir ./savedir --hf_model "sberbank-ai/rugpt3small_based_on_gpt2" --num_beams {n_beam} --k 0 --p {p} --temperature {t} --gen_len 512 --n_ctx 70 --n_batch 8 --use_ner

# Вычисление метрик качества

In [None]:
# Set TOKENIZERS_PARALLELISM=false in this process's environment before the evaluation
# scripts are launched.
# NOTE(review): presumably this disables parallelism in the Hugging Face tokenizers
# library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Run evaluate.py for the `s_kw` experiment, with ./savedir as the output directory.
!python evaluate.py --output_dir ./savedir --experiment_name s_kw 

In [None]:
# Run evaluate.py for the `s_kw_ner` experiment, with ./savedir as the output directory.
!python evaluate.py --output_dir ./savedir --experiment_name s_kw_ner 

In [None]:
# Run evaluate.py for the `baseline` experiment, with ./savedir as the output directory.
!python evaluate.py --output_dir ./savedir --experiment_name baseline

## Подготовка сводной таблицы с метриками экспериментов

In [None]:
# Run present_eval_results.py over ./savedir.
# NOTE(review): presumably this writes experiments_results.csv, which the next cell reads — confirm.
!python present_eval_results.py --output_dir ./savedir

In [None]:
# Load the summary table and list, for every winning row, the metrics on which it is best.
df_report = pd.read_csv('experiments_results.csv', sep='|', encoding='utf-8')
pd.options.display.float_format = '{:.4f}'.format
# Rows with any missing value are excluded from the comparison.
df_report.dropna(axis=0, inplace=True)

# Metrics for which the largest value wins.
cols_max = ['ms_jaccard2', 'ms_jaccard3', 'ms_jaccard4', 'ms_jaccard5',  'forward_bleu2', 'backward_bleu2', 'ha_bleu2', 'forward_bleu3', 'backward_bleu3', 'ha_bleu3', 'forward_bleu4', 'backward_bleu4', 'ha_bleu4', 'forward_bleu5', 'backward_bleu5', 'ha_bleu5',  'rouge-1', 'rouge-2', 'rouge-l', 'bertscore_f1_l11', 'bertscore_f1_l12']
# Metrics for which the smallest value wins.
cols_min = ['tfidf_distance', 'fbd_1-6', 'fbd_7-12', 'self_bleu2', 'self_bleu3', 'self_bleu4', 'self_bleu5']

# Maps the first-column value of the winning row (presumably the experiment name)
# to the list of metrics that row wins.
best = defaultdict(list)
for col in cols_max:
    # argmax() returns a row POSITION, so both row and column are addressed positionally
    # through .iloc; `row[0]` on a label-indexed Series relies on a deprecated fallback.
    best[df_report.iloc[df_report[col].argmax(), 0]].append(col)

for col in cols_min:
    best[df_report.iloc[df_report[col].argmin(), 0]].append(col)

pprint.pprint(best)

# Обучение

* Обучение (fine-tuning) новых моделей протестировано только на ресурсах NVIDIA V100 (32 GB GPU RAM).

In [None]:
# Train your own model under the experiment name `my_model`:
# train.py is started from the "sberbank-ai/rugpt3small_based_on_gpt2" checkpoint,
# with savedir passed as the output directory.

!python train.py --experiment_name my_model --output_dir savedir --hf_model "sberbank-ai/rugpt3small_based_on_gpt2" --n_batch 2 --num_epochs 10 --pad_len 2048 --n_ctx 70