In [None]:
!pip install unsloth -q

In [2]:
!pip install --upgrade pyarrow -q

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

from unsloth import FastModel
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments, pipeline
from datasets import Dataset
from unsloth.chat_templates import train_on_responses_only

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
df = pd.read_csv('combined_tat_detox_corpus.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25449 entries, 0 to 25448
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   toxic_tt  25449 non-null  object
 1   detox_tt  25449 non-null  object
dtypes: object(2)
memory usage: 397.8+ KB


In [5]:
dataset = Dataset.from_pandas(df)

In [6]:
train_valid_test = dataset.train_test_split(test_size=0.2, seed=42)
train_valid = train_valid_test['train'].train_test_split(test_size=0.5, seed=42)
train_dataset = train_valid['train']
valid_dataset = train_valid['test']
test_dataset = train_valid_test['test']

print(f"Train: {len(train_dataset)}, Valid: {len(valid_dataset)}, Test: {len(test_dataset)}")

Train: 10179, Valid: 10180, Test: 5090


In [7]:
# моделька
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-270m-it",
    max_seq_length = 256,
    load_in_4bit = True,
)

# LoRA
model = FastModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

==((====))==  Unsloth 2025.12.9: Fast Gemma3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


model.safetensors:   0%|          | 0.00/393M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


In [8]:
def convert_to_chatml(row):
    return {
        "conversations": [
            {"role": "system", "content": "Rewrite a toxic tatar phrase in non-toxic style. Save the wording, but detoxify it."},
            # также пробовали на татарском, результат был совсем плохой
            {"role": "user", "content": row['toxic_tt']},
            {"role": "assistant", "content": row['detox_tt']}
        ]
    }

train_dataset = train_dataset.map(convert_to_chatml)
valid_dataset = valid_dataset.map(convert_to_chatml)
test_dataset  = test_dataset.map(convert_to_chatml)

Map:   0%|          | 0/10179 [00:00<?, ? examples/s]

Map:   0%|          | 0/10180 [00:00<?, ? examples/s]

Map:   0%|          | 0/5090 [00:00<?, ? examples/s]

In [9]:
def formatting_prompts_func(examples):
       convos = examples["conversations"]
       texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
       return { "text" : texts, }

train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True)
test_dataset  = test_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/10179 [00:00<?, ? examples/s]

Map:   0%|          | 0/10180 [00:00<?, ? examples/s]

Map:   0%|          | 0/5090 [00:00<?, ? examples/s]

Учим

In [10]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 300,
        learning_rate = 5e-5,
        logging_steps = 10,
        eval_steps=50,
        save_steps=100,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/10179 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/10180 [00:00<?, ? examples/s]

🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


In [11]:
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=6):   0%|          | 0/10179 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/10180 [00:00<?, ? examples/s]

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,179 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 1,898,496 of 269,996,672 (0.70% trained)


Step,Training Loss
10,3.1563
20,2.4718
30,2.1225
40,2.2584
50,2.1927
60,2.0115
70,2.0122
80,1.9957
90,2.1718
100,2.0987


In [13]:
trainer.save_model("./gemma3-lora-tat-detox-new-prompt")
tokenizer.save_pretrained("./gemma3-lora-tat-detox-new-prompt")

('./gemma3-lora-tat-detox-new-prompt/tokenizer_config.json',
 './gemma3-lora-tat-detox-new-prompt/special_tokens_map.json',
 './gemma3-lora-tat-detox-new-prompt/chat_template.jinja',
 './gemma3-lora-tat-detox-new-prompt/tokenizer.model',
 './gemma3-lora-tat-detox-new-prompt/added_tokens.json',
 './gemma3-lora-tat-detox-new-prompt/tokenizer.json')

In [14]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

Device set to use cuda:0


In [17]:
def extract_response(full_text, input_text):
    # Убираем входной промпт, оставляем только ответ модели
    if full_text.startswith(input_text):
        return full_text[len(input_text):].strip()
    return full_text.strip()

print("\n=== Примеры ответов модели на тестовых данных ===")
test_samples = test_dataset.select(range(15))
examples = []

for i, sample in enumerate(test_samples):
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "Rewrite a toxic tatar phrase in non-toxic style. Save the wording, but detoxify it."},
            {"role": "user", "content": sample['toxic_tt']}
        ],
        tokenize=False,
        add_generation_prompt=True
    ).removeprefix('<bos>')

    output = generator(prompt)[0]['generated_text']
    predicted = extract_response(output, prompt)
    ground_truth = sample['detox_tt']

    examples.append({
        "toxic": sample['toxic_tt'],
        "predicted": predicted,
        "ground_truth": ground_truth
    })

    print(f"\n{i+1}. Токсик:      {sample['toxic_tt']}")
    print(f"   Предсказано: {predicted}")
    print(f"   Оригинал:    {ground_truth}")
    print("-" * 80)


=== Примеры ответов модели на тестовых данных ===

1. Токсик:      Рәхмәт , сукмак , мин сине яратам !
   Предсказано: Мин сине яратомсыз, сукмак.
   Оригинал:    Рәхмәт, Бести, мин сине яратам!
--------------------------------------------------------------------------------

2. Токсик:      Ни өчен мин 13 нче елгы корабны сатып алганчы бу хәл була алмаган ?
   Предсказано: Ни өчен мин 13 нче елгы балды хәл була алмаган?
   Оригинал:    Ни өчен мин 13нче елгы корабны сатып алганчы бу хәл була алмаган?
--------------------------------------------------------------------------------

3. Токсик:      Бу чүпне шактый кисү кирәк .
   Предсказано: Бу некаңыл кисү кирәк.
   Оригинал:    Бу өлешне кисеп ташларга кирәк.
--------------------------------------------------------------------------------

4. Токсик:      Мин шулкадәр генә туямын , мин моны эшли алам .
   Предсказано: Мин моны эшли алам.
   Оригинал:    Мин шулкадәр генә туямын, мин моны эшли алам.
----------------------------------

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



5. Токсик:      LOL зур Шон шифасы мин бар кайнар дип уйлаган идем
   Предсказано: LOL сур Шон тагым мин бар кайнар дип уйлап када бипре։
   Оригинал:    Бу зур Шон, мин барны кызу дип уйлаган идем.
--------------------------------------------------------------------------------

6. Токсик:      Минем өчен зур бүләкләр !
   Предсказано: Минем өчен сур түткәрләр!
   Оригинал:    Миндә киң күңелле аерым яшәү яхшы түгел.
--------------------------------------------------------------------------------

7. Токсик:      Узе тудыра бара шул уйны кут курсэтеп
   Предсказано: Узе тудыра бара тулны кут курсэтеп.
   Оригинал:    Үзе тудыра бара шул уйны арт күрсәтеп
--------------------------------------------------------------------------------

8. Токсик:      Ну мин анлар идем, когда кеше булмаганда эйтсэлэр, дебилы!, шуны аңла
   Предсказано: Ну мин анлар идем, кеше булмаганда эйтсэлэр, шуны аңла
   Оригинал:    Ну мин аңлар идем, когда кеше булмаганда әйтсәләр, дебилы!!!
-------------------

In [18]:
import shutil
shutil.make_archive("gemma3-lora-tat-detox-new-prompt", 'zip', "gemma3-lora-tat-detox-new-prompt")

'/content/gemma3-lora-tat-detox-new-prompt.zip'