#### Libraries

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling)

##### config

In [None]:
MODEL_NAME = "ai-forever/rugpt3medium_based_on_gpt2"
CACHE_DIR = 'data/finetune_gpt/model_cache'
OUTPUT_DIR = 'data/finetune_gpt/fine_tuned_model'
MODEL_DIR = "./trained_model"

#### Data

In [3]:
data_full = pd.read_csv("/content/synthetic.csv")

In [4]:
data_full.shape

(14040, 4)

In [5]:
data_full['prompt'] = data_full.apply(lambda row: f"–ö–∞—Ç–µ–≥–æ—Ä–∏—è: {row['category']} | –û—Ç–∑—ã–≤: {row['clean_text']} | –ê–Ω–∞–ª–∏–∑: {row['synthetic']}", axis=1)

In [6]:
data_full.head(3)

Unnamed: 0,category,sentiment,clean_text,synthetic,prompt
0,–û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ,–ù–µ–≥–∞—Ç–∏–≤–Ω–∞—è,–ù–∏–∫–æ–º—É –Ω–µ —Å–æ–≤–µ—Ç—É—é –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –¥–æ—Å—Ç–∞–≤–∫—É –∏–∑ —ç—Ç–æ–≥–æ...,–°–∏–ª—å–Ω—ã–µ —Å—Ç–æ—Ä–æ–Ω—ã:\n- –Ω–µ—Ç –∫–æ–Ω–∫—Ä–µ—Ç–∏–∫–∏\n\n–°–ª–∞–±—ã–µ —Å...,–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ | –û—Ç–∑—ã–≤: –ù–∏–∫–æ–º...
1,–û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ,–ù–µ–≥–∞—Ç–∏–≤–Ω–∞—è,"–£–∂–∞—Å–Ω–æ. –ù–æ—Ä–º–∞–ª—å–Ω–æ–≥–æ –ø–æ–¥—ä–µ–∑–¥–∞ –Ω–µ—Ç, –ø–æ—Å—Ç–æ—è–Ω–Ω–æ 3 ...",–°–∏–ª—å–Ω—ã–µ —Å—Ç–æ—Ä–æ–Ω—ã:\n- –æ—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç\n\n–°–ª–∞–±—ã–µ —Å—Ç–æ—Ä...,–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ | –û—Ç–∑—ã–≤: –£–∂–∞—Å–Ω...
2,–û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ,–ù–µ–≥–∞—Ç–∏–≤–Ω–∞—è,–°—É–ø –ª–∞–ø—à–∞ –∏ –ø—é—Ä–µ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω–æ–µ –∫–∏—Å–ª–æ–µ –∏ –∏—Å–ø–æ—Ä—á–µ...,–°–∏–ª—å–Ω—ã–µ —Å—Ç–æ—Ä–æ–Ω—ã:\n- –æ—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç\n\n–°–ª–∞–±—ã–µ —Å—Ç–æ—Ä...,–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ | –û—Ç–∑—ã–≤: –°—É–ø –ª...


In [7]:
train_data, val_data = train_test_split(data_full, test_size = 0.1, random_state = 42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

In [None]:
def tokenize_data(texts, batch_size=1000, max_length=512):
    # –°–æ–∑–¥–∞–µ–º —Å–ª–æ–≤–∞—Ä—å –¥–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    encodings = {'input_ids': [], 'attention_mask': [], 'labels': []}
    
    # –ü—Ä–æ—Ö–æ–¥–∏–º –ø–æ —Ç–µ–∫—Å—Ç–∞–º –±–∞—Ç—á–∞–º–∏
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º —Ç–µ–∫—É—â–∏–π –±–∞—Ç—á —Ç–µ–∫—Å—Ç–æ–≤
        batch_encodings = tokenizer(batch_texts, truncation=True, padding='max_length', max_length=max_length)
        encodings['input_ids'].extend(batch_encodings['input_ids'])
        encodings['attention_mask'].extend(batch_encodings['attention_mask'])
        encodings['labels'].extend(batch_encodings['input_ids'])

    return encodings

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Ç–µ–∫—Å—Ç—ã –≤ —Å–ø–∏—Å–æ–∫
train_texts = train_data['prompt'].tolist()
val_texts = val_data['prompt'].tolist()

# –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

# –°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç—ã –∏–∑ —Ç–æ–∫–µ–Ω–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
train_dataset = Dataset.from_dict(train_encodings)
val_dataset = Dataset.from_dict(val_encodings)

# –í—ã–≤–æ–¥–∏–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∑–∞–ø–∏—Å–µ–π
print(f"–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: {len(train_dataset)} –∑–∞–ø–∏—Å–µ–π")
print(f"–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞: {len(val_dataset)} –∑–∞–ø–∏—Å–µ–π")

–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: 12636 –∑–∞–ø–∏—Å–µ–π
–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞: 1404 –∑–∞–ø–∏—Å–µ–π


In [None]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å –¥–ª—è –æ–±—É—á–µ–Ω–∏—è
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# –ù–∞—Å—Ç—Ä–æ–π–∫–∏ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è –º–æ–¥–µ–ª–∏
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    overwrite_output_dir=True,
    num_train_epochs=1,  # –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö –æ–±—É—á–µ–Ω–∏—è
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4, 
    gradient_accumulation_steps=2,  # –®–∞–≥–∏ –Ω–∞–∫–æ–ø–ª–µ–Ω–∏—è –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞
    evaluation_strategy="epoch", 
    eval_steps=None,  # –û—Ü–µ–Ω–∫–∞ –≤ –∫–æ–Ω—Ü–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_strategy="epoch", 
    save_steps=None,  # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ –∫–æ–Ω—Ü–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_total_limit=2, 
    learning_rate=5e-5,
    lr_scheduler_type="linear", 
    warmup_ratio=0.1,
    weight_decay=0.01,  # –ö–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏–∏
    fp16=True, 
    load_best_model_at_end=True,  # –ó–∞–≥—Ä—É–∑–∫–∞ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ –≤ –∫–æ–Ω—Ü–µ –æ–±—É—á–µ–Ω–∏—è
    metric_for_best_model="loss",
    greater_is_better=False, 
    logging_strategy="steps", 
    logging_steps=50,
    report_to="none", 
)

# –°–æ–∑–¥–∞–µ–º –∫–æ–ª–ª–∞—Ç–æ—Ä –¥–∞–Ω–Ω—ã—Ö 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# –°–æ–∑–¥–∞–µ–º –æ–±—ä–µ–∫—Ç Trainer –¥–ª—è –æ–±—É—á–µ–Ω–∏—è –º–æ–¥–µ–ª–∏
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator, 
)

# –ó–∞–ø—É—Å–∫ 
trainer.train()




Epoch,Training Loss,Validation Loss
0,2.0563,1.996687


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1579, training_loss=2.3727000202084736, metrics={'train_runtime': 2002.9292, 'train_samples_per_second': 6.309, 'train_steps_per_second': 0.788, 'total_flos': 1.1731347173277696e+16, 'train_loss': 2.3727000202084736, 'epoch': 0.9996834441278886})

In [11]:
trainer.evaluate()

model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.json',
 './trained_model/merges.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [12]:
MODEL_DIR = "./trained_model"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR)

model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [None]:
def generate_review(category, review, prompt_text="", max_length=256, min_length=64, num_return_sequences=1, temperature=0.4, top_p=0.50, top_k=30):
    input_prompt = f"–ö–∞—Ç–µ–≥–æ—Ä–∏—è: {category} | –û—Ç–∑—ã–≤: {review}"
    inputs = tokenizer.encode(input_prompt, return_tensors="pt")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            do_sample=True,
            max_length=max_length,
            min_length=min_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_p=top_p,
            temperature=temperature,
            top_k=top_k,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    generated_reviews = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    return generated_reviews

category = "–û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ"
review = "–ü—Ä–µ–∫—Ä–∞—Å–Ω–æ–µ –∫–∞—Ñ–µ —Å –≤–∫—É—Å–Ω—ã–º–∏ –∑–∞–≤—Ç—Ä–∞–∫–∞–º–∏ –¥–æ 16.00 –∏ –Ω–µ —Ç–æ–ª—å–∫–æ. –ó–∞–∫–∞–∑—ã–≤–∞–ª–∞ –∑–∞–≤—Ç—Ä–∞–∫ —Å –∫–æ—Ñ–µ –æ–º–ª–µ—Ç–æ–º- –∑–∞–ø–ª–∞—Ç–∏–ª–∞ 350 —Ä—É–±. –ü—Ä–∏–Ω–µ—Å–ª–∏ –æ–º–ª–µ—Ç —Å —Ö–ª–µ–±—É—à–∫–æ–º, –æ–≤–æ—â–∞–º–∏ –∏ –∫–∞–ø—É—á–∏–Ω–æ —Å –º–æ–∏–º –ø–æ—Ä—Ç—Ä–µ—Ç–æ–º, —è –æ—á–µ–Ω—å –±—ã–ª–∞ —É–¥–∏–≤–ª–µ–Ω–∞ –∏ –æ–±—Ä–∞–¥–æ–≤–∞–Ω–∞. –ö—Ä–∞—Å–∏–≤–∞—è –ø–æ—Å—É–¥–∞, –∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–∞—è –ø–æ–¥–∞—á–∞ –±–ª—é–¥–∞, –ø—Ä–∏—è—Ç–Ω–∞—è –∞—Ç–º–æ—Å—Ñ–µ—Ä–∞. –†–µ–∫–æ–º–µ–Ω–¥—É—é"
prompt_text = ""

output = generate_review(category, review, prompt_text)

for i, review in enumerate(output, 1):
    print(f"–û—Ç–∑—ã–≤ {i}:\n{review}\n")

–û—Ç–∑—ã–≤ 1:
–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ | –û—Ç–∑—ã–≤: –ü—Ä–µ–∫—Ä–∞—Å–Ω–æ–µ –∫–∞—Ñ–µ —Å –≤–∫—É—Å–Ω—ã–º–∏ –∑–∞–≤—Ç—Ä–∞–∫–∞–º–∏ –¥–æ 16.00 –∏ –Ω–µ —Ç–æ–ª—å–∫–æ. –ó–∞–∫–∞–∑—ã–≤–∞–ª–∞ –∑–∞–≤—Ç—Ä–∞–∫ —Å –∫–æ—Ñ–µ –æ–º–ª–µ—Ç–æ–º- –∑–∞–ø–ª–∞—Ç–∏–ª–∞ 350 —Ä—É–±. –ü—Ä–∏–Ω–µ—Å–ª–∏ –æ–º–ª–µ—Ç —Å —Ö–ª–µ–±—É—à–∫–æ–º, –æ–≤–æ—â–∞–º–∏ –∏ –∫–∞–ø—É—á–∏–Ω–æ —Å –º–æ–∏–º –ø–æ—Ä—Ç—Ä–µ—Ç–æ–º, —è –æ—á–µ–Ω—å –±—ã–ª–∞ —É–¥–∏–≤–ª–µ–Ω–∞ –∏ –æ–±—Ä–∞–¥–æ–≤–∞–Ω–∞. –ö—Ä–∞—Å–∏–≤–∞—è –ø–æ—Å—É–¥–∞, –∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–∞—è –ø–æ–¥–∞—á–∞ –±–ª—é–¥–∞, –ø—Ä–∏—è—Ç–Ω–∞—è –∞—Ç–º–æ—Å—Ñ–µ—Ä–∞. –†–µ–∫–æ–º–µ–Ω–¥—É—é –∫ –ø–æ—Å–µ—â–µ–Ω–∏—é. –í–∫—É—Å–Ω—ã–π –∫–æ—Ñ–µ, –æ—á–µ–Ω—å –≤–∫—É—Å–Ω—ã–π, –Ω–æ –Ω–µ –æ—á–µ–Ω—å –¥–æ—Ä–æ–≥–æ–π. –û—á–µ–Ω—å –≤–∫—É—Å–Ω–æ, –æ—Å–æ–±–µ–Ω–Ω–æ —Å –º–æ–ª–æ–∫–æ–º. –ù–∞–ø–∏—Ç–∫–∏ —Ç–æ–∂–µ –æ—á–µ–Ω—å –≤–∫—É—Å–Ω—ã–µ, –≤—Å–µ —Å–≤–µ–∂–µ–µ. –ü–µ—Ä—Å–æ–Ω–∞–ª –≤–µ–∂–ª–∏–≤—ã–π, –≤—Å–µ–≥–¥–∞ –ø–æ–¥—Å–∫–∞–∂—É—Ç, –ø–æ–º–æ–≥—É—Ç. –ù–æ, –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é, –Ω–µ –≤—Å–µ–≥–¥–∞ –µ—Å—Ç—å –≤–æ