In [1]:
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

model_name_or_path = "sberbank-ai/rugpt3small_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path,
                                          cache_dir="rugpt3small_based_on_gpt2_cached_token/")
model = GPT2LMHeadModel.from_pretrained(model_name_or_path, pad_token_id=tokenizer.eos_token_id,
                                        cache_dir="rugpt3small_based_on_gpt2_cached_model/").to(DEVICE)



In [3]:
# –°–æ—Ö—Ä–∞–Ω–∏–º –æ–±—É—á–∞—é—â–∏–µ –¥–∞–Ω–Ω—ã–µ –≤ .txt —Ñ–∞–π–ª 
train_path = 'train.txt'
def load_doc(filename):
    file = open(filename, 'r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

data = load_doc('book.txt')
with open(train_path, "w") as f:
    f.write(data)

# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–∞
train_dataset = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=64)
  
# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞–ª–æ–¥–µ—Ä–∞ (–Ω–∞—Ä–µ–∑–∞–µ—Ç —Ç–µ–∫—Å—Ç –Ω–∞ –æ–ø—Ç–∏–º–∞–ª—å–Ω—ã–µ –ø–æ –¥–ª–∏–Ω–µ –∫—É—Å–∫–∏)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=False)



In [4]:
print(len(train_dataset))

2124


In [5]:
training_args = TrainingArguments(
    output_dir="./finetuned", 
    overwrite_output_dir=True, 
    num_train_epochs=50, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,  
    warmup_steps=10, 
    gradient_accumulation_steps=8, 
    )

torch.cuda.set_per_process_memory_fraction(0.8, 0)
torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5), None)
)
trainer.train()

***** Running training *****
  Num examples = 2124
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 800
 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 500/800 [36:16<21:01,  4.20s/it]  Saving model checkpoint to ./finetuned\checkpoint-500
Configuration saved in ./finetuned\checkpoint-500\config.json


{'loss': 3.2345, 'learning_rate': 3.7974683544303802e-06, 'epoch': 31.24}


Model weights saved in ./finetuned\checkpoint-500\pytorch_model.bin
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [51:23<00:00,  2.76s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [51:23<00:00,  3.85s/it]

{'train_runtime': 3083.706, 'train_samples_per_second': 34.439, 'train_steps_per_second': 0.259, 'train_loss': 3.074898910522461, 'epoch': 49.96}





TrainOutput(global_step=800, training_loss=3.074898910522461, metrics={'train_runtime': 3083.706, 'train_samples_per_second': 34.439, 'train_steps_per_second': 0.259, 'train_loss': 3.074898910522461, 'epoch': 49.96})

In [9]:
torch.save(model, 'my_model.h5')

In [11]:
model = torch.load('my_model.h5')

In [12]:

text = "–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫? "
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model = torch.load('my_model.h5')
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        repetition_penalty=5.0,
                        do_sample=True,
                        use_cache=False,
                        num_beams=2,
                        temperature=1,
                        top_p=0.95,
                        top_k=10,
                        max_length=1000,
                        )

# generated_text = list(map(tokenizer.decode, out))[0]
tokenizer.decode(out[0]).replace('\xa0‚Äî', ' ').replace('\n', ' ')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫?  –° —Ç–æ–≥–æ, —á—Ç–æ –æ–Ω –¥–µ–ª–∞–µ—Ç.   ‚Äì –ê —Å —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –º—É–∂—á–∏–Ω–∞? ‚Äì —Å–ø—Ä–æ—Å–∏–ª —è.   ‚Äì –°–Ω–∞—á–∞–ª–∞ –≤—ã –Ω–∞—á–∏–Ω–∞–µ—Ç–µ –¥—É–º–∞—Ç—å –æ —Ç–æ–º, –∫–∞–∫ —Å–¥–µ–ª–∞—Ç—å —Ç–∞–∫, —á—Ç–æ–±—ã —É –≤–∞—Å –Ω–µ –±—ã–ª–æ –Ω–∏–∫–∞–∫–∏—Ö –æ–±—è–∑–∞—Ç–µ–ª—å—Å—Ç–≤ –ø–µ—Ä–µ–¥ –∫–µ–º –±—ã —Ç–æ –Ω–∏ –±—ã–ª–æ –≤–æ–æ–±—â–µ.  –¢–æ –µ—Å—Ç—å —ç—Ç–æ –º–æ–∂–µ—Ç –±—ã—Ç—å –ª—é–±–æ–π —á–µ–ª–æ–≤–µ–∫, –∫–æ—Ç–æ—Ä–æ–≥–æ –≤—ã –≤—Å—Ç—Ä–µ—Ç–∏—Ç–µ –Ω–∞ —Å–≤–æ–µ–º –ø—É—Ç–∏.  –ù–æ –µ—Å–ª–∏ –≤—ã –ø–æ–ø—ã—Ç–∞–µ—Ç–µ—Å—å –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç—å —Å–µ–±–µ —ç—Ç—É —Å–∏—Ç—É–∞—Ü–∏—é —Ç–∞–∫–∏–º –æ–±—Ä–∞–∑–æ–º, –≤–∞–º –ø—Ä–∏–¥–µ—Ç—Å—è –æ—á–µ–Ω—å –¥–æ–ª–≥–æ –∏ –º—É—á–∏—Ç–µ–ª—å–Ω–æ —Ä–∞–±–æ—Ç–∞—Ç—å –Ω–∞–¥ —Ç–µ–º, —á—Ç–æ–±—ã –ø–æ–Ω—è—Ç—å, –≤ —á–µ–º –∏–º–µ–Ω–Ω–æ –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤–∞—à–∞ –ø—Ä–æ–±–ª–µ–º–∞.  –í—ã –¥–æ–ª–∂–Ω—ã —á–µ—Ç–∫–æ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è—Ç—å —Å–µ–±–µ, –∑–∞—á–µ–º –≤—ã –∏–¥–µ—Ç–µ –∫ —ç—Ç–æ–π —Ü–µ–ª–∏.  –ò –≤–æ—Ç —Ç–æ–≥–¥–∞-—

In [13]:
text = "–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫? "
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model = torch.load('my_model.h5')
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        repetition_penalty=5.0,
                        do_sample=True,
                        use_cache=False,
                        num_beams=2,
                        temperature=1,
                        top_p=0.95,
                        top_k=10,
                        max_length=1000,
                        )

# generated_text = list(map(tokenizer.decode, out))[0]
tokenizer.decode(out[0]).replace('\xa0‚Äî', ' ').replace('\n', ' ')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫?  –° —Ç–æ–≥–æ, —á—Ç–æ –æ–Ω –¥–µ–ª–∞–µ—Ç?   ‚Äì –ù—É –¥–∞. –ê –∫–∞–∫ –∏–Ω–∞—á–µ? –ß–µ–ª–æ–≤–µ–∫ —Ä–æ–∂–¥–∞–µ—Ç—Å—è —Å —ç—Ç–∏–º —Å–∞–º—ã–º ¬´—è¬ª, –∏ —É –Ω–µ–≥–æ –µ—Å—Ç—å –¥–≤–∞ –ø—É—Ç–∏ ‚Äì –∏–ª–∏ –¥–∞–∂–µ —Ç—Ä–∏: –ª–∏–±–æ –æ–Ω –∏–¥–µ—Ç –ø–æ –æ–¥–Ω–æ–º—É –∏–∑ —ç—Ç–∏—Ö –ø—É—Ç–µ–π, –ª–∏–±–æ –æ–Ω –∏–¥–µ—Ç –ø–æ –¥—Ä—É–≥–æ–º—É‚Ä¶ –ù–æ –≤—Å–µ —ç—Ç–∏ –≤–∞—Ä–∏–∞–Ω—Ç—ã –Ω–µ –≤—Å–µ–≥–¥–∞ –æ–∫–∞–∑—ã–≤–∞—é—Ç—Å—è –ø—Ä–∞–≤–∏–ª—å–Ω—ã–º–∏. –ò–Ω–æ–≥–¥–∞ –æ–Ω–∏ —Å—Ç–∞–Ω–æ–≤—è—Ç—Å—è –ø—Ä–æ—Å—Ç–æ –Ω–µ–≤–æ–∑–º–æ–∂–Ω—ã–º–∏. –ò —Ç–æ–≥–¥–∞ –Ω–∞—Å—Ç—É–ø–∞–µ—Ç –∫–æ–Ω–µ—Ü —Å–≤–µ—Ç–∞. –í–æ—Ç —ç—Ç–æ —Å–∞–º–æ–µ —Å—Ç—Ä–∞—à–Ω–æ–µ. –ü–æ—Ç–æ–º—É —á—Ç–æ –º—ã –∂–∏–≤–µ–º –≤ —Ç–∞–∫–æ–µ –≤—Ä–µ–º—è, –∫–æ–≥–¥–∞ –∫–∞–∂–¥—ã–π –Ω–æ–≤—ã–π —à–∞–≥ –≤–ø–µ—Ä–µ–¥ –≤–µ–¥–µ—Ç –Ω–∞—Å –∫ —Ç–æ–º—É, —á—Ç–æ–±—ã —Å—Ç–∞—Ç—å –∫–µ–º-—Ç–æ –¥—Ä—É–≥–∏–º. –¢–æ –µ—Å—Ç—å –ø–µ—Ä–µ—Å—Ç–∞—Ç—å –±—ã—Ç—å —Ç–µ–º, –∫–µ–º —Ç—ã –±—ã–ª —Ä–∞–Ω—å—à–µ. –°—Ç–∞—Ç—å —Å–æ–±–æ–π. –ü–æ–Ω–∏–º–∞–µ—Ç–µ? –≠—Ç–æ –∑–Ω–∞—á–∏—Ç —Å–

In [14]:
text = "–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫? "
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model = torch.load('my_model.h5')
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        repetition_penalty=5.0,
                        do_sample=True,
                        use_cache=False,
                        num_beams=2,
                        temperature=1,
                        top_p=0.95,
                        top_k=10,
                        max_length=1000,
                        )

# generated_text = list(map(tokenizer.decode, out))[0]
tokenizer.decode(out[0]).replace('\xa0‚Äî', ' ').replace('\n', ' ')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'–° —á–µ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —á–µ–ª–æ–≤–µ–∫?  –° —Ç–æ–≥–æ, —á—Ç–æ –æ–Ω –¥–µ–ª–∞–µ—Ç?   ‚Äì –ù—É –¥–∞.  –°–Ω–∞—á–∞–ª–∞ —è –¥—É–º–∞–ª, —á—Ç–æ —ç—Ç–æ –∫–∞–∫–æ–π-—Ç–æ —Å–ª–æ–∂–Ω—ã–π –º–µ—Ö–∞–Ω–∏–∑–º, —Å –ø–æ–º–æ—â—å—é –∫–æ—Ç–æ—Ä–æ–≥–æ –º–æ–∑–≥ –ø–æ–ª—É—á–∞–µ—Ç –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –∏–∑ –≤–Ω–µ—à–Ω–µ–≥–æ –º–∏—Ä–∞.  –ü–æ—Ç–æ–º —è –ø–æ–Ω—è–ª, —á—Ç–æ —ç—Ç–æ –Ω–µ —Ç–∞–∫.  –ò –≤–æ—Ç —Ç–æ–≥–¥–∞ –º–Ω–µ –ø–æ–∫–∞–∑–∞–ª–æ—Å—å, —á—Ç–æ –≤—Å–µ —ç—Ç–∏ —Å–ª–æ–∂–Ω—ã–µ –º–µ—Ö–∞–Ω–∏–∑–º—ã —Ä–∞–±–æ—Ç–∞—é—Ç –∫–∞–∫-—Ç–æ –ø–æ-–¥—Ä—É–≥–æ–º—É.   ‚Äì –¢–æ –µ—Å—Ç—å –≤—ã —Ö–æ—Ç–∏—Ç–µ —Å–∫–∞–∑–∞—Ç—å, —á—Ç–æ –≤ –≤–∞—à–µ–º –≤–æ—Å–ø—Ä–∏—è—Ç–∏–∏ –º–∏—Ä —Å–æ–≤—Å–µ–º –¥—Ä—É–≥–æ–π?   ‚Äì –°–æ–≤–µ—Ä—à–µ–Ω–Ω–æ —Ç–æ—á–Ω–æ.  –Ø –¥–∞–∂–µ –Ω–µ –º–æ–≥—É –æ–±—ä—è—Å–Ω–∏—Ç—å, –æ —á–µ–º –∏–º–µ–Ω–Ω–æ –∏–¥–µ—Ç —Ä–µ—á—å.  –ù–æ —Ç–æ, —á—Ç–æ —è —É–≤–∏–¥–µ–ª, –ø–æ—Ä–∞–∑–∏–ª–æ –º–µ–Ω—è –¥–æ –≥–ª—É–±–∏–Ω—ã –¥—É—à–∏.  –≠—Ç–æ –±—ã–ª–æ –∫–∞–∫–æ–µ-—Ç–æ —É–¥–∏–≤–∏—Ç–µ–ª—å–Ω–æ–µ –∫–æ—Å–º–∏—á–µ—Å–∫–æ–µ –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–µ –≤–æ –≤—Ä–µ–º–µ–Ω–∏ –

#### –ö–æ–Ω–µ—á–Ω–æ, –¥–æ –ø–æ–ª–Ω–æ—Ü–µ–Ω–Ω–æ–≥–æ –ø—Ä–æ–∏–∑–≤–µ–¥–µ–Ω–∏—è –µ—â–µ –¥–æ–ª–≥–æ, –Ω–æ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–º —è –¥–æ–≤–æ–ª—å–Ω–∞. –ß—É–≤—Å—Ç–≤—É–µ—Ç—Å—è —Ä—É–∫–∞ –º–∞—Å—Ç–µ—Ä–∞!