In [None]:
import pandas as pd

# Read the corpus CSV and rename its two text columns in one chained expression.
COLUMN_RENAMES = {"toxic_tt": "input", "detox_tt": "target"}
df = pd.read_csv("/content/combined_tat_detox_corpus.csv").rename(columns=COLUMN_RENAMES)

# Plain-text preview of the first three rows.
preview = df.head(3)
print(preview)



                                               input  \
0  @user, ну чапай чапай эйтер идем инде;-)эйтеп ...   
1  @user, ну чапай чапай эйтер идем яла;-)эйтеп т...   
2  @USER, ну чапай чапай эйтер идем инде;-)эйтеп ...   

                                              target  
0  @user, ну чапай чапай әйтер идем инде;-)әйтеп ...  
1  @user, ну чапай чапай әйтер идем инде;-)әйтеп ...  
2  @user, ну чапай чапай әйтер идем инде;-)әйтеп ...  


In [None]:
# Show the current column labels of `df` as a plain Python list.
column_names = df.columns.tolist()
print(column_names)


['toxic_tt', 'detox_tt']


In [None]:
# ----------------------------
# 1. Импорты
# ----------------------------
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# ----------------------------
# 2. Load your dataset (replace with your own path)
# ----------------------------
# Example:
# df = pd.read_csv("tat_detox.csv")
# `df` must contain one of the (source, target) column pairs checked in 3.1.

# 3.1 Work out which column pair df actually has
cols = set(df.columns)

if {"toxic_tt", "detox_tt"}.issubset(cols):
    SRC_COL, TGT_COL = "toxic_tt", "detox_tt"
elif {"input", "target"}.issubset(cols):
    SRC_COL, TGT_COL = "input", "target"
elif {"tat_toxic", "tat_detox"}.issubset(cols):
    SRC_COL, TGT_COL = "tat_toxic", "tat_detox"
else:
    raise ValueError(f"Не нашёл ожидаемые колонки. Сейчас в df: {df.columns.tolist()}")

# 3.2 Keep only the two text columns and drop rows with NaN in either
df2 = df[[SRC_COL, TGT_COL]].dropna().copy()

# (optional) force both columns to str
df2[SRC_COL] = df2[SRC_COL].astype(str)
df2[TGT_COL] = df2[TGT_COL].astype(str)

# 3.3 Build the Dataset and derive input_text / target_text
# preserve_index=False: dropna() can leave a non-contiguous index, which
# from_pandas would otherwise store as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df2, preserve_index=False)

dataset = dataset.map(
    lambda x: {
        "input_text": f"detox: {x[SRC_COL].strip()}",
        "target_text": x[TGT_COL].strip()
    },
    remove_columns=[SRC_COL, TGT_COL]
)

# 3.4 Drop examples whose source or target is empty after stripping
dataset = dataset.filter(
    lambda x: x["input_text"].strip() != "" and x["target_text"].strip() != ""
)

# 3.5 Train / eval split (90 / 10, fixed seed)
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print("Использую колонки:", SRC_COL, "->", TGT_COL)
print(f"Train: {len(train_dataset)} примеров")
print(f"Eval:  {len(eval_dataset)} примеров")
print("Пример:", train_dataset[0])

# ----------------------------
# 4. Model and tokenizer
# ----------------------------
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Fall back to EOS as the pad token if the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ----------------------------
# 5. Tokenization
# ----------------------------
def tokenize_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Both ``examples["input_text"]`` and ``examples["target_text"]`` are
    truncated and padded to exactly 128 tokens. The target ids are stored
    under ``"labels"`` with every pad id replaced by -100 (the conventional
    ignore value for the loss).

    Returns the source encoding with the extra ``"labels"`` entry.
    """
    tokenizer_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    model_inputs = tokenizer(examples["input_text"], **tokenizer_kwargs)
    target_encoding = tokenizer(examples["target_text"], **tokenizer_kwargs)

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for target_ids in target_encoding["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else int(token_id) for token_id in target_ids]
        )

    model_inputs["labels"] = masked_labels
    return model_inputs

# Tokenize both splits and drop the raw text columns in the same pass
TEXT_COLUMNS = ["input_text", "target_text"]
train_tokenized = train_dataset.map(
    tokenize_function, batched=True, remove_columns=TEXT_COLUMNS
)
eval_tokenized = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=TEXT_COLUMNS
)

# ----------------------------
# 6. Training
# ----------------------------
OUTPUT_DIR = "./mt5-tat-detox-full"

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # optimisation
    optim="adamw_torch",
    learning_rate=3e-5,
    num_train_epochs=5,
    fp16=False,                  # must stay False for mT5 (author's note)
    # batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # evaluation
    eval_strategy="epoch",
    predict_with_generate=True,  # so that generation is used during evaluation
    # logging and checkpoints
    logging_steps=50,
    logging_nan_inf_filter=True,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=2,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    tokenizer=tokenizer  # (the warning this triggers can be ignored)
)

# Run training
trainer.train()

# Save the final model and tokenizer next to the checkpoints
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Map:   0%|          | 0/25449 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25449 [00:00<?, ? examples/s]

Использую колонки: input -> target
Train: 22904 примеров
Eval:  2545 примеров
Пример: {'input_text': 'detox: пидораслар чүп-чарга чумдылар динне акчага саталар!, аңлыйсыңмы', 'target_text': 'сәер кешеләр чүп-чарга чумдылар динне акчага саталар!'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/22904 [00:00<?, ? examples/s]

Map:   0%|          | 0/2545 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,2.4851,1.812266
2,2.1026,1.55787
3,1.9759,1.450458
4,1.8803,1.402033
5,1.7991,1.387328


('./mt5-tat-detox-full/tokenizer_config.json',
 './mt5-tat-detox-full/special_tokens_map.json',
 './mt5-tat-detox-full/spiece.model',
 './mt5-tat-detox-full/added_tokens.json',
 './mt5-tat-detox-full/tokenizer.json')

In [None]:
# Recursively zip the training output directory.
# The archive path is relative, so the zip is written to the current working
# directory. NOTE(review): the download cell reads /content/mt5-tat-detox-full.zip,
# which only matches when the working directory is /content - confirm.
!zip -r mt5-tat-detox-full.zip /content/mt5-tat-detox-full


  adding: content/mt5-tat-detox-full/ (stored 0%)
  adding: content/mt5-tat-detox-full/special_tokens_map.json (deflated 73%)
  adding: content/mt5-tat-detox-full/tokenizer_config.json (deflated 95%)
  adding: content/mt5-tat-detox-full/training_args.bin (deflated 54%)
  adding: content/mt5-tat-detox-full/config.json (deflated 47%)
  adding: content/mt5-tat-detox-full/spiece.model (deflated 46%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/ (stored 0%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/special_tokens_map.json (deflated 73%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/tokenizer_config.json (deflated 95%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/training_args.bin (deflated 54%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/config.json (deflated 47%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/scheduler.pt (deflated 61%)
  adding: content/mt5-tat-detox-full/checkpoint-5728/spiece.model (deflated 46%)
  adding: content/mt5-tat

In [None]:
from google.colab import files

# Hand the zipped model directory to the browser for download.
archive_path = "/content/mt5-tat-detox-full.zip"
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Quick smoke test of the fine-tuned model on one hand-written sentence.
text = "detox: синең сүзләрең бик начар һәм мыскыллы"

# Switch off dropout for inference.
model.eval()

# The tokenizer returns CPU tensors, while the model may live on the GPU after
# training; generate() then fails with "Expected all tensors to be on the same
# device". Moving the encoded inputs to the model's device fixes that.
# max_length is explicit because truncation=True without it does nothing for
# this tokenizer; 128 matches the limit used when tokenizing the training data.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=128,
).to(model.device)

outputs = model.generate(
    **inputs,
    max_length=128,
    num_beams=5
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

# **Попытка номер миллион**
Облегченный файнтюн

In [None]:
import pandas as pd
from datasets import Dataset

DATA_PATH = "/content/combined_tat_detox_corpus.csv"
SRC_COL, TGT_COL = "toxic_tt", "detox_tt"

df = pd.read_csv(DATA_PATH)

# Fail early with a readable message if an expected column is missing
missing = [c for c in [SRC_COL, TGT_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Не нашёл колонки {missing}. Сейчас в df: {df.columns.tolist()}")

# Cleaning: drop NaN rows, strip whitespace, drop rows left empty
df2 = df[[SRC_COL, TGT_COL]].dropna().copy()
df2[SRC_COL] = df2[SRC_COL].astype(str).str.strip()
df2[TGT_COL] = df2[TGT_COL].astype(str).str.strip()
df2 = df2[(df2[SRC_COL] != "") & (df2[TGT_COL] != "")]
print("Rows after cleaning:", len(df2))

# preserve_index=False: once rows have been filtered out the index is no longer
# contiguous, and from_pandas would otherwise keep it as an extra
# "__index_level_0__" column next to "src" and "tgt".
dataset = Dataset.from_pandas(df2, preserve_index=False).rename_columns({SRC_COL:"src", TGT_COL:"tgt"})
split = dataset.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = split["train"], split["test"]

print("Train:", len(train_ds), "Eval:", len(eval_ds))
print("Sample:", train_ds[0])



Rows after cleaning: 25449
Train: 22904 Eval: 2545
Sample: {'src': 'пидораслар чүп-чарга чумдылар динне акчага саталар!, аңлыйсыңмы', 'tgt': 'сәер кешеләр чүп-чарга чумдылар динне акчага саталар!'}


*облегчённый full fine-tune mT5, динамический padding*

In [None]:
# ----------------------------
# 0. Импорты
# ----------------------------
import os
import pandas as pd
import numpy as np
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# ----------------------------
# 1. Load the dataset
# ----------------------------
DATA_PATH = "/content/combined_tat_detox_corpus.csv"
df = pd.read_csv(DATA_PATH)

# Accepted (source, target) column pairs, in priority order
KNOWN_COLUMN_PAIRS = (
    ("toxic_tt", "detox_tt"),
    ("input", "target"),
    ("tat_toxic", "tat_detox"),
)

cols = set(df.columns)
matching_pair = next((pair for pair in KNOWN_COLUMN_PAIRS if set(pair) <= cols), None)
if matching_pair is None:
    raise ValueError(f"Не нашёл ожидаемые колонки. Сейчас в df: {df.columns.tolist()}")
SRC_COL, TGT_COL = matching_pair

# Drop NaN rows, then normalise both columns to stripped strings
df2 = df[[SRC_COL, TGT_COL]].dropna().copy()
for text_col in (SRC_COL, TGT_COL):
    df2[text_col] = df2[text_col].astype(str).str.strip()

# Strict filtering: no empty strings, and the target must have at least 2 characters
# (author's note: very short targets are a frequent cause of nan/0 loss)
has_text = df2[SRC_COL].ne("") & df2[TGT_COL].ne("")
df2 = df2[has_text]
target_long_enough = df2[TGT_COL].str.len().ge(2)
df2 = df2[target_long_enough]

print("Использую колонки:", SRC_COL, "->", TGT_COL)
print("После чистки строк:", len(df2))

# ----------------------------
# 2. Dataset + input_text / target_text
# ----------------------------
def _to_text_pair(row):
    """Map one raw row to the prefixed source text and the target text."""
    return {
        "input_text": f"detox: {row[SRC_COL]}",
        "target_text": row[TGT_COL],
    }


def _has_both_texts(row):
    """True when neither text is None or blank after stripping."""
    source, target = row["input_text"], row["target_text"]
    if source is None or target is None:
        return False
    return source.strip() != "" and target.strip() != ""


dataset = Dataset.from_pandas(df2, preserve_index=False)
dataset = dataset.map(_to_text_pair, remove_columns=[SRC_COL, TGT_COL])

# Extra guard against hidden empty values
dataset = dataset.filter(_has_both_texts)

# ----------------------------
# 3. Model and tokenizer
# ----------------------------
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Safety net: reuse EOS as the pad token when no pad id is defined
needs_pad_fallback = tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None
if needs_pad_fallback:
    tokenizer.pad_token = tokenizer.eos_token

# ----------------------------
# 4. Drop abnormally long examples BEFORE tokenization
# ----------------------------
# Length is measured in characters (cheap) to cut the tail of the distribution.
# The thresholds can be tuned.
MAX_CHARS_SRC = 600
MAX_CHARS_TGT = 600


def _within_char_limits(row):
    """True when both texts fit their character limits."""
    return (
        len(row["input_text"]) <= MAX_CHARS_SRC
        and len(row["target_text"]) <= MAX_CHARS_TGT
    )


dataset = dataset.filter(_within_char_limits)

# ----------------------------
# 5. Split
# ----------------------------
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

print(f"Train: {len(train_dataset)} примеров")
print(f"Eval:  {len(eval_dataset)} примеров")
print("Пример:", train_dataset[0])

# ----------------------------
# 6. Tokenization (truncate only; padding is left to the data collator)
# ----------------------------
MAX_LEN = 160   # token limit per sequence; 128/160/192 are all reasonable


def tokenize_function(examples):
    """Tokenize one batch of examples without padding.

    ``examples["input_text"]`` and ``examples["target_text"]`` are truncated
    to ``MAX_LEN`` tokens but NOT padded, so every row keeps its natural
    length. Padding is done per batch by ``DataCollatorForSeq2Seq``, which
    pads ``input_ids`` / ``attention_mask`` to the longest row in the batch
    and pads ``labels`` with -100. This avoids spending compute on
    ``MAX_LEN``-long rows of pad tokens for short sentences.

    Returns the source encoding with an extra ``"labels"`` entry holding the
    target token ids.
    """
    inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        max_length=MAX_LEN
    )
    outputs = tokenizer(
        examples["target_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    # No padding was requested, so pad ids are not expected here; the mapping
    # to -100 is kept as a cheap safeguard in case one appears anyway.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(-100 if tok == pad_id else int(tok)) for tok in seq]
        for seq in outputs["input_ids"]
    ]
    return inputs

# Tokenize both splits; the raw text columns are dropped in the same pass
_TEXT_COLUMNS = ["input_text", "target_text"]
train_tokenized = train_dataset.map(
    tokenize_function, batched=True, remove_columns=_TEXT_COLUMNS
)
eval_tokenized = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=_TEXT_COLUMNS
)

# ----------------------------
# 7. Sanity check: make sure labels really contain tokens other than -100
# ----------------------------
def check_labels(ds, n=3):
    """Print how many label tokens differ from -100 for the first samples.

    Args:
        ds: indexable collection of samples, each with a ``"labels"`` list.
        n: maximum number of leading samples to inspect. Fewer are inspected
           when ``ds`` is shorter, instead of raising IndexError.

    Returns:
        None. One line per inspected sample is printed.
    """
    for i in range(min(n, len(ds))):
        lab = ds[i]["labels"]
        real = sum(1 for t in lab if t != -100)
        print(f"sample {i}: non_-100_tokens={real}")

# Inspect the first three samples of each tokenized split
for split_name, split_ds in (("train", train_tokenized), ("eval", eval_tokenized)):
    print(f"Label check ({split_name}):")
    check_labels(split_ds, 3)

# ----------------------------
# 8. Training (conservative settings chosen to avoid nan loss)
# ----------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/mt5-tat-detox-final",
    # optimisation
    optim="adamw_torch",
    learning_rate=3e-5,
    num_train_epochs=5,
    fp16=False,
    # batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # marked as critical by the author:
    eval_strategy="no",              # no evaluation during training
    predict_with_generate=False,     # no generation
    remove_unused_columns=False,     # author's note: required for mT5 (NOTE(review): unverified)
    # logging and checkpoints
    logging_steps=50,
    logging_nan_inf_filter=True,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=2,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    tokenizer=tokenizer
)

trainer.train()

# ----------------------------
# 9. Saving
# ----------------------------
# The final model goes to its own directory, separate from output_dir above
save_dir = "/content/mt5-tat-detox-stable"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

saved_files = os.listdir(save_dir)
print("Saved to:", save_dir)
print("Files:", saved_files)



Использую колонки: toxic_tt -> detox_tt
После чистки строк: 25448


Map:   0%|          | 0/25448 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25448 [00:00<?, ? examples/s]



Filter:   0%|          | 0/25448 [00:00<?, ? examples/s]

Train: 22901 примеров
Eval:  2545 примеров
Пример: {'input_text': 'detox: Исраилне Канада да яклаячак , әгәр Генераль Ассамблея башка бер АКШны куяр өчен җитәрлек ахмаклык күрсәтсә .', 'target_text': 'Исраил шулай ук Канада якланачак , әгәр Генераль Ассамблея башка бер U куяр өчен җитәрлек аңламаса'}


Map:   0%|          | 0/22901 [00:00<?, ? examples/s]

Map:   0%|          | 0/2545 [00:00<?, ? examples/s]

Label check (train):
sample 0: non_-100_tokens=36
sample 1: non_-100_tokens=17
sample 2: non_-100_tokens=27
Label check (eval):
sample 0: non_-100_tokens=23
sample 1: non_-100_tokens=19
sample 2: non_-100_tokens=26


  trainer = Seq2SeqTrainer(


Step,Training Loss


KeyboardInterrupt: 

# **ЛОРА**

In [None]:
# Quietly install or upgrade peft and accelerate.
# Versions are not pinned: -U pulls whatever release is newest at run time.
!pip -q install -U peft accelerate



In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model

# ---- data
DATA_PATH = "/content/combined_tat_detox_corpus.csv"
SRC_COL, TGT_COL = "toxic_tt", "detox_tt"

df = pd.read_csv(DATA_PATH)

# Drop NaN rows and strip whitespace in both text columns
df2 = df[[SRC_COL, TGT_COL]].dropna().copy()
for text_col in (SRC_COL, TGT_COL):
    df2[text_col] = df2[text_col].astype(str).str.strip()

# Remove empty rows first, then rows where either side exceeds 800 characters
non_empty = df2[SRC_COL].ne("") & df2[TGT_COL].ne("")
df2 = df2[non_empty]
short_enough = df2[SRC_COL].str.len().le(800) & df2[TGT_COL].str.len().le(800)
df2 = df2[short_enough]

dataset = Dataset.from_pandas(df2, preserve_index=False)
split = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
eval_ds = split["test"]

print("Train:", len(train_ds), "Eval:", len(eval_ds))

# ---- model/tokenizer
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Gradient checkpointing to save memory; the generation cache is switched off with it
base_model.gradient_checkpointing_enable()
base_model.config.use_cache = False

# LoRA adapter configuration (no target modules are named here)
lora_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()

# Token limit used by tokenize_function for both source and target
MAX_LEN = 160

def tokenize_function(batch):
    """Tokenize one batch of raw rows for seq2seq training.

    Each source text from ``batch[SRC_COL]`` is prefixed with ``"detox: "``.
    Sources and targets (``batch[TGT_COL]``) are truncated and padded to
    exactly ``MAX_LEN`` tokens. Target ids are stored under ``"labels"``
    with every pad id replaced by -100.

    Returns the source encoding with the extra ``"labels"`` entry.
    """
    shared_kwargs = dict(truncation=True, padding="max_length", max_length=MAX_LEN)

    prompts = []
    for source_text in batch[SRC_COL]:
        prompts.append(f"detox: {source_text}")

    model_inputs = tokenizer(prompts, **shared_kwargs)
    target_encoding = tokenizer(batch[TGT_COL], **shared_kwargs)

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for target_ids in target_encoding["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else int(token_id) for token_id in target_ids]
        )

    model_inputs["labels"] = masked_labels
    return model_inputs

# Tokenize both splits, dropping every original column
train_tok, eval_tok = (
    ds.map(tokenize_function, batched=True, remove_columns=ds.column_names)
    for ds in (train_ds, eval_ds)
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Checkpoints and the final adapter share one directory
save_dir = "/content/mt5-tat-detox-lora-no8bit"

# IMPORTANT: no evaluation or generation during training.
# Author's note: the argument name `eval_strategy` was chosen to match the
# installed transformers version.
training_args = Seq2SeqTrainingArguments(
    output_dir=save_dir,
    # optimisation
    learning_rate=2e-4,
    num_train_epochs=3,
    max_grad_norm=1.0,
    fp16=False,
    # batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch size 16
    # evaluation
    eval_strategy="no",
    predict_with_generate=False,
    # logging and checkpoints
    logging_steps=50,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=2,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_tok,
    tokenizer=tokenizer,
)

trainer.train()

trainer.save_model(save_dir)          # saves the LoRA adapters
tokenizer.save_pretrained(save_dir)

saved_files = os.listdir(save_dir)
print("Saved to:", save_dir)
print("Files:", saved_files)


Train: 22904 Eval: 2545




trainable params: 688,128 || all params: 300,864,896 || trainable%: 0.2287


Map:   0%|          | 0/22904 [00:00<?, ? examples/s]

Map:   0%|          | 0/2545 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
50,23.0047
100,16.0584
150,10.8184
200,7.5787
250,5.8658
300,5.1189
350,4.8754
400,4.5169
450,4.3838
500,4.3258


Saved to: /content/mt5-tat-detox-lora-no8bit
Files: ['checkpoint-2864', 'adapter_model.safetensors', 'special_tokens_map.json', 'adapter_config.json', 'tokenizer_config.json', 'training_args.bin', 'spiece.model', 'README.md', 'tokenizer.json', 'checkpoint-4296']


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# Directory holding the saved LoRA adapter and tokenizer files
ADAPTER_DIR = "/content/mt5-tat-detox-lora-no8bit"

# Rebuild the model: base weights from the hub, adapter from ADAPTER_DIR
base = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
tok = AutoTokenizer.from_pretrained(ADAPTER_DIR)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)

model.eval()

# Try the first source sentence of the cleaned corpus
first_source = df2["toxic_tt"].iloc[0]
text = "detox: " + first_source
inputs = tok(text, return_tensors="pt", truncation=True, max_length=160)

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=128)

decoded = tok.decode(out[0], skip_special_tokens=True)
print(decoded)


Он чапай эйтер идем инде;-)эйтеп тормыйм


In [None]:
def test_pair(i):
    src = df2["toxic_tt"].iloc[i]
    print("IN :", src)
    print("OUT:", generate(src))
    print("-"*50)

def generate(text):
    """Return the model output for ``text``.

    The text is prefixed with ``"detox: "``, tokenized with truncation at 160
    tokens, and decoded from the first sequence returned by a 4-beam,
    non-sampling ``model.generate`` call limited to 128 new tokens.
    """
    prompt = "detox: " + text
    encoded = tok(prompt, return_tensors="pt", truncation=True, max_length=160)
    generation_kwargs = dict(max_new_tokens=128, do_sample=False, num_beams=4)
    sequences = model.generate(**encoded, **generation_kwargs)
    best_sequence = sequences[0]
    return tok.decode(best_sequence, skip_special_tokens=True)

# Spot-check a handful of corpus rows
for sample_index in (0, 5, 10, 20):
    test_pair(sample_index)


IN : @user, ну чапай чапай эйтер идем инде;-)эйтеп тормыйм
OUT: Он чапай эйтер идем инде;-)эйтеп тормыйм
--------------------------------------------------
IN : Купме ашарга була яла? Симереп дуңгыз буласыз бит
OUT: Купме ашарга була яла? Симереп дуңгыз буласыз бит
--------------------------------------------------
IN : Айнур.Джамшед!башта яхшылап укыгыз авторнын нэрсэ эйтэсе килгэнен,бармак белэн кут....ы бутамагыз!
OUT: Айнур.Джамшед!башта яхшылап укыгыз авторнын нэрсэ эйтэсе килгэнен,бармак белэн кут....ы бутамагыз!
--------------------------------------------------
IN : Егетен булгач с ночевкой бармаска иде! Или калмаска иде! Уз кутенэ приключение эзлэгэнсен инде булдыргансын!!
OUT: Егетен булгач с ночевкой бармаска иде! Или калмаска иде! Уз кутенэ приключение эзлэгэнсен инде булдыргансын!!
--------------------------------------------------


In [None]:
import os, glob
# List every path under /content whose name starts with "mt5-tat-detox"
saved_model_dirs = glob.glob("/content/mt5-tat-detox*")
print(saved_model_dirs)


['/content/mt5-tat-detox-lora-no8bit', '/content/mt5-tat-detox-lite', '/content/mt5-tat-detox-stable', '/content/mt5-tat-detox-final']


In [None]:
# Recursively zip the LoRA output directory into /content.
# The whole directory is archived, including any checkpoint-* subfolders in it.
!zip -r /content/mt5-tat-detox-lora-no8bit.zip /content/mt5-tat-detox-lora-no8bit


  adding: content/mt5-tat-detox-lora-no8bit/ (stored 0%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/ (stored 0%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/adapter_model.safetensors (deflated 7%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/special_tokens_map.json (deflated 73%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/adapter_config.json (deflated 57%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/tokenizer_config.json (deflated 95%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/training_args.bin (deflated 53%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/scheduler.pt (deflated 61%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/spiece.model (deflated 46%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/optimizer.pt (deflated 8%)
  adding: content/mt5-tat-detox-lora-no8bit/checkpoint-2864/README.md (deflated 66%)
  adding: content/mt5-tat-detox-lora-n

In [None]:
from google.colab import files

# Hand the zipped LoRA directory to the browser for download.
lora_archive_path = "/content/mt5-tat-detox-lora-no8bit.zip"
files.download(lora_archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>