In [None]:
!pip install transformers

In [None]:
import re
import torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
# Read the raw corpus. The encoding is explicit so the Cyrillic text decodes
# the same way regardless of the platform's default locale.
with open('esenin.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

# A verse line starts with two tabs and is directly followed by another
# two-tab line. The trailing delimiter is checked with a lookahead so it is
# NOT consumed: consuming it (as `\n\t\t.+\n\t\t` did) makes the
# non-overlapping scan of re.findall skip every second line, because the
# "\n\t\t" that opens the next line has already been eaten by the previous match.
# The capture group yields the line content without the surrounding delimiters.
text = re.findall(r'\n\t\t(.+)(?=\n\t\t)', raw_text)
text[:10]

['Стихи мои,',
 '…И послушайте песню про горе,',
 'Думы печальные, думы глубокие,',
 'Думы от счастия вечно далекие,',
 'Белая береза',
 'Принакрылась снегом,',
 'Погуляйте, ратнички,',
 'Лошади запряжены,',
 'Повестили под окнами сотские',
 'Загыгыкали бабы слободские,']

In [None]:
# Persist the extracted lines, one per row, as the plain-text training corpus.
train_path = 'train_dataset.txt'
# Write as UTF-8 explicitly: the text is Cyrillic, and relying on the
# platform's default encoding can fail or produce a file that is later
# decoded differently from how it was written.
with open(train_path, "w", encoding="utf-8") as f:
    f.write('\n'.join(text) + '\n')

In [None]:
# Checkpoint identifier; the tokenizer and the language-model weights are
# both loaded from this same name.
model_name = "ai-forever/rugpt3medium_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path=model_name
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

In [None]:
# Build the training set from the corpus file written to `train_path`,
# tokenized with `tokenizer` and using a block size of 32.
train_dataset = TextDataset(
    file_path=train_path,
    tokenizer=tokenizer,
    block_size=32,
)

# Batch collator for language modeling; mlm=False turns off
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)



In [None]:
# Settings for the fine-tuning run.
training_args = TrainingArguments(
    # Output location; existing content there may be overwritten.
    output_dir="./finetuned",
    overwrite_output_dir=True,
    # Length of the run.
    num_train_epochs=40,
    # Per-device batch sizes (training / evaluation).
    per_device_train_batch_size=20,
    per_device_eval_batch_size=32,
    # Gradients are accumulated over 5 steps before each update.
    gradient_accumulation_steps=5,
    # Warmup steps for the learning-rate scheduler.
    warmup_steps=9,
)

# The optimizer is passed in explicitly (AdamW, lr=1e-5); the scheduler slot
# of the `optimizers` pair is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None),
)

In [None]:
# Run the fine-tuning loop. The call is the last expression of the cell, so
# the returned TrainOutput (global step, training loss, metrics) is displayed.
trainer.train()

Step,Training Loss
500,3.1848


TrainOutput(global_step=520, training_loss=3.168393428509052, metrics={'train_runtime': 1172.494, 'train_samples_per_second': 45.339, 'train_steps_per_second': 0.443, 'total_flos': 2994014951571456.0, 'train_loss': 3.168393428509052, 'epoch': 38.81})

In [None]:
# Store the tokenizer files next to the fine-tuned weights so that the
# 'model_esenin' directory can be reloaded on its own with from_pretrained
# (previously only the model weights and config were written there).
# The model is saved last so the cell still ends on a call returning None.
tokenizer.save_pretrained('model_esenin')
model.save_pretrained('model_esenin')

In [None]:
def generate(input_text, max_length=100):
    """Sample a continuation of ``input_text`` from ``model`` and print it.

    Args:
        input_text: Prompt that the generated text starts with.
        max_length: Upper bound passed to ``model.generate`` as ``max_length``.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        None. The decoded text of the first returned sequence is printed.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask. The tensors are moved to whatever device the model lives on, so
    # the function is not tied to 'cuda:0'.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    model.eval()
    with torch.no_grad():
        out = model.generate(
            input_ids=encoded["input_ids"],
            # Passing the mask and pad id explicitly removes the
            # "attention mask and the pad token id were not set" warning;
            # the pad id is the same eos id that generate() fell back to.
            attention_mask=encoded["attention_mask"],
            pad_token_id=model.config.eos_token_id,
            do_sample=True,
            num_beams=3,
            temperature=1.5,
            # top_p is a probability mass and must lie in (0, 1]. The former
            # value 2.2 was out of range and amounted to "no nucleus
            # filtering", which is exactly what 1.0 expresses.
            top_p=1.0,
            max_length=max_length,
        )
    # Only the first returned sequence is shown, so decode just that one.
    generated_text = tokenizer.decode(out[0])
    print(generated_text)

In [None]:
# Prompt literal repaired from mis-decoded UTF-8 back to Cyrillic.
generate("Кто я? Что я? Только лишь мечтатель,")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Кто я? Что я? Только лишь мечтатель,
Я ищу счастья, покоя и уюта,
Но не здесь ли ты, моя муза,
Ты давно уж ушла от меня в дальнюю дорогу,
Как печально, печально! Я искал тебя, искал и не мог!
Ужели это ты, моя милая муза?
Ты ли это, моя милая муза?
Ужели это ты, моя милая муза?
Ужели это ты, моя милая му


In [None]:
# Prompt literal repaired from mis-decoded UTF-8 back to Cyrillic.
generate("Кто я?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Кто я?
Кто ты?
Я знаю —
Я помню —
Я все забыл.
Я хочу в туманы,
Только не здесь.
Дрогнул вечер.
И я вижу —
Небо плачет.
Плачет о чем-то дорогом
Слишком мало я жил.
Может быть, это снится
Я не знаю.
Не помню, что я видел.
Я не умею сказать.
Может быть, я сплю?
Может быть


In [None]:
# Prompt literal repaired from mis-decoded UTF-8 back to Cyrillic
# (the leading space of the original prompt is kept).
generate(" Друг мой, друг мой,")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 Друг мой, друг мой,
Низкий дом без крыши,
Но и дома без стропил
Грустит ветер,
Видно, он помнит
Слышен плач детей,
Подымает усталый
Клен ты мой опавший,
Храни тебя бог.
Был бы жив Ленин,
Вынул б он эти строки
И сказал бы он:
«Слишком часто мы
Не в силах мы
Слишком часто
Оттого что


In [None]:
# Prompt literal repaired from mis-decoded UTF-8 back to Cyrillic.
generate("Зачем все это?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Зачем все это? Зачем?
Ведь все равно я любить не перестану,
Все равно буду я ночи и дни считать,
Ах, зачем же я так часто в думах хожу,
Ах, зачем же я по ночам, прижавшись к изголовью,
Буду думать о том, как бы мне под одною с тобою крышей
Как же я теперь счастлив, что не умею так любить!
Ах, зачем ж я теперь так часто по ночам, прижавшись к изголовью,

