In [None]:
import os
import re
from dataclasses import dataclass
from time import strftime
from typing import Optional, Dict, Union, Any

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

import datasets
from datasets import Dataset, load_dataset, concatenate_datasets
import evaluate

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    GPTNeoXConfig,
    LlamaConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5Config,
    Trainer,
    TrainingArguments,
    logging as hf_logging,
)
from peft import LoraConfig, TaskType, get_peft_model
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

chrf_metric = evaluate.load("chrf")


In [None]:
os.environ['CLEARML_API_ACCESS_KEY'] = ''
os.environ['CLEARML_API_SECRET_KEY'] = ''
os.environ['CLEARML_API_HOST'] = 'https://api.clear.ml'
os.environ['CLEARML_LOG_MODEL'] = 'FALSE'
os.environ['CLEARML_TASK'] = 'ru-kazakh-post'

In [None]:
from transformers import AutoTokenizer
import datasets
from datasets import concatenate_datasets

MODEL = 'tencent/HY-MT1.5-7B'

tokenizer = AutoTokenizer.from_pretrained(MODEL)

def build_text_en_tt(example):
    """
    Format English->Chuvash training examples using HY-MT1.5 chat template.
    """
    source_text = example['en']
    target_text = example['tt']
    
    messages = [
        {
            "role": "user",
            "content": f"Translate the following segment into kazakh, without additional explanation.\n\n{source_text}"
        },
        {
            "role": "assistant",
            "content": target_text
        }
    ]
    
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    
    example["text"] = text
    return example

def build_text_en_tt(example):
    """
    Format Russian->Chuvash training examples using HY-MT1.5 chat template.
    """
    source_text = example['ru']
    target_text = example['kk']

    messages = [
        {
            "role": "user",
            "content": f"Translate the following segment into Kazakh, without additional explanation.\n\n{source_text}"
        },
        {
            "role": "assistant",
            "content": target_text
        }
    ]
    
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    
    example["text"] = text
    return example

def tokenize_example(example):
    """
    Tokenize the formatted text for training.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
        padding=False
    )
    example["input_ids"] = encoded["input_ids"]
    example["attention_mask"] = encoded["attention_mask"]
    
    example["labels"] = encoded["input_ids"].copy()
    
    return example

ru_kz_full = datasets.load_dataset('issai/kazparc')['train']


ru_kz_split = ru_kz_full.train_test_split(test_size=0.05, seed=42)
ru_kz_train = ru_kz_split['train']
ru_kz_test = ru_kz_split['test']

ru_kz_train = ru_kz_train.map(build_text_en_tt, num_proc=64)
ru_kz_train = ru_kz_train.map(tokenize_example, batched=False, num_proc=64)

ru_kz_test = ru_kz_test.map(build_text_en_tt, num_proc=8)
ru_kz_test = ru_kz_test.map(tokenize_example, batched=False, num_proc=16)

print(f"EN-CHV train: {len(ru_kz_train)} examples")
print(f"EN-CHV test: {len(ru_kz_test)} examples")


ds = datasets.DatasetDict({
    'train': ru_kz_train,
    'test': ru_kz_test
})

ds['test'] = ds['test'].select(range(10))

print(f"\nTrain: {len(ds['train'])} examples (EN+RU -> CHV, shuffled)")
print(f"Test: {len(ds['test'])} examples (EN -> CHV)")

EN-CHV train: 353306 examples
EN-CHV test: 18596 examples

Train: 353306 examples (EN+RU -> CHV, shuffled)
Test: 10 examples (EN -> CHV)


In [14]:
print(ds['test']['text'][0])

<|startoftext|>Translate the following segment into Kazakh, without additional explanation.

Для контрактов на переработку техногенных минеральных образований стартовый размер подписного бонуса устанавливается по формуле (С1 х 0,01%), но не менее 300-кратного размера месячного расчётного показателя, установленного законом о республиканском бюджете и действующего на дату опубликования условий конкурса или дату подписания протокола прямых переговоров по предоставлению права недропользования в соответствии с законодательством Республики Казахстан о недрах и недропользовании.<|extra_0|>Қол қою бонусының бастапқы мөлшері техногендік минералдық түзілімдерді қайта өңдеуге арналған келісімшарттар үшін (Қ1 х 0,01%) формуласы бойынша белгіленеді, бірақ республикалық бюджет туралы заңда белгіленген және конкурс шарттары жарияланған күнге немесе Қазақстан Республикасының жер қойнауы және жер қойнауын пайдалану туралы заңнамасына сәйкес жер қойнауын пайдалану құқығын беру жөніндегі тікелей келіссөз

In [15]:
ds['train'] = ds['train'].remove_columns(['kk', 'ru', 'text'])
ds['test'] = ds['test'].remove_columns(['kk', 'ru', 'text'])


In [None]:
MODEL = 'tencent/HY-MT1.5-7B'

tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side='left')

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL, 
    dtype=torch.bfloat16, 
).cuda()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=target_modules,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,518,199,808 || trainable%: 0.1813


In [None]:
IS_7B = True
if IS_7B:
    ASSISTANT_TOKEN = 127962
else:
    ASSISTANT_TOKEN = 120007

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # 2) If preds are logits: (bs, seq, vocab) -> take argmax
    if isinstance(preds, np.ndarray) and preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # 3) Replace -100 (or any negative ids) in both preds and labels
    preds = np.where(preds < 0, tokenizer.pad_token_id, preds)
    labels = np.where(labels < 0, tokenizer.pad_token_id, labels)

    # 4) just to be sure
    preds = preds.astype(np.int32)
    labels = labels.astype(np.int32)

    # 5) Slice to keep only tokens after assistant token (not including it)
    truncated_preds = []
    truncated_labels = []
    
    for pred_seq, label_seq in zip(preds, labels):
        pred_asst_pos = np.where(pred_seq == ASSISTANT_TOKEN)[0]
        if len(pred_asst_pos) > 0:
            start_pos = pred_asst_pos[0] + 1
            truncated_preds.append(pred_seq[start_pos:])
        else:
            truncated_preds.append(pred_seq)
        
        # Find assistant token in labels
        label_asst_pos = np.where(label_seq == ASSISTANT_TOKEN)[0]
        if len(label_asst_pos) > 0:
            start_pos = label_asst_pos[0] + 1
            truncated_labels.append(label_seq[start_pos:])
        else:
            truncated_labels.append(label_seq)

    # Decode only the answer portions (no prompts)
    decoded_preds = tokenizer.batch_decode(truncated_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(truncated_labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]
    
    print(f'Generated (first 3): {decoded_preds[:10]}')
    print(f'Labels (first 3): {decoded_labels[:10]}')
    
    result = chrf_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        word_order=2
    )
    return {"chrf": result["score"]}


class HYMTSeq2SeqTrainer(Seq2SeqTrainer):

    def _mask_prompt_labels(self, inputs):
        if "labels" not in inputs or "input_ids" not in inputs:
            return inputs

        input_ids = inputs["input_ids"]
        labels = inputs["labels"]
        if not torch.is_tensor(labels):
            return inputs

        labels = labels.clone()

        for i in range(input_ids.size(0)):
            pos = (input_ids[i] == ASSISTANT_TOKEN).nonzero(as_tuple=True)[0]
            if pos.numel() == 0:
                labels[i, :] = -100
            else:
                cut = int(pos[0].item()) + 1
                labels[i, :cut] = -100

        inputs["labels"] = labels
        return inputs

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        inputs = self._mask_prompt_labels(inputs)
        return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None, **gen_kwargs):
        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)

        if has_labels:
            inputs = self._mask_prompt_labels(inputs)

        with torch.no_grad():
            if has_labels:
                loss, _ = self.compute_loss(model, inputs, return_outputs=True)
                loss = loss.mean().detach()
            else:
                loss = None

        if prediction_loss_only:
            return (loss, None, None)

        if self.args.predict_with_generate:
            ASSISTANT_TOKEN = 120007
            input_ids = inputs["input_ids"]
            batch_size = input_ids.shape[0]

            truncated_ids = []
            truncated_masks = []

            for i in range(batch_size):
                seq = input_ids[i]
                pos = (seq == ASSISTANT_TOKEN).nonzero(as_tuple=True)[0]
                if pos.numel() > 0:
                    cut = int(pos[0].item()) + 1
                    seq = seq[:cut]
                truncated_ids.append(seq)
                truncated_masks.append(torch.ones_like(seq))

            max_len = max(x.numel() for x in truncated_ids)
            pad_token_id = self.processing_class.pad_token_id

            padded_ids = []
            padded_masks = []
            for ids, mask in zip(truncated_ids, truncated_masks):
                pad_len = max_len - ids.numel()
                if pad_len > 0:
                    ids = torch.cat([ids, torch.full((pad_len,), pad_token_id, dtype=ids.dtype, device=ids.device)])
                    mask = torch.cat([mask, torch.zeros(pad_len, dtype=mask.dtype, device=mask.device)])
                padded_ids.append(ids)
                padded_masks.append(mask)

            gru_inputs = {
                "input_ids": torch.stack(padded_ids),
                "attention_mask": torch.stack(padded_masks),
            }

            gen_kwargs = self._gen_kwargs.copy()
            gen_kwargs = dict(
                max_new_tokens=512,
                num_beams=1,
                do_sample=True,
                num_return_sequences=1,
                repetition_penalty=1.1,
                early_stopping=True,
                temperature=0.7,
                pad_token_id=tokenizer.pad_token_id,
            )
            if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
                gen_kwargs["max_new_tokens"] = self.args.generation_max_length
            if gen_kwargs.get("num_beams") is None:
                gen_kwargs["num_beams"] = self.args.generation_num_beams

            generated_tokens = model.module.generate(**gru_inputs, **gen_kwargs)
        else:
            generated_tokens = None

        labels = inputs["labels"] if has_labels else None
        return (loss, generated_tokens, labels)



data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8,
    label_pad_token_id=-100
)

training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints_7B_lora_translated/ru-kz-final",

    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,

    # Learning rate
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    # Optimizer
    optim="adamw_torch",
    weight_decay=0.01,

    # Gradient handling
    max_grad_norm=1.0,

    # Logging
    logging_steps=10,
    logging_dir="./logs",
    logging_strategy="steps",

    # Evaluation
    eval_strategy="steps",
    eval_steps=1_000,

    # Generation for chrf++
    predict_with_generate=True,
    generation_max_length=512,  # Max tokens to generate
    generation_num_beams=5,  # Beam search for better quality

    # Saving
    save_strategy="steps",
    save_steps=1_000,
    save_total_limit=1,
    save_only_model=True,
    save_safetensors=False,
    load_best_model_at_end=False,
    metric_for_best_model="chrf",  # Use chrF++ for best model
    greater_is_better=True,  # Higher chrF is better

    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Reproducibility
    seed=42,

    # Reporting
    report_to="clearml",

    remove_unused_columns=True,
    push_to_hub=False,
)


trainer = HYMTSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 127958}.


ClearML Task: created new task id=d1d4188cd1db44a289d7646105d0fd40
2026-01-12 04:30:23,070 - clearml.Task - INFO - No repository found, storing script code instead
ClearML results page: https://app.clear.ml/projects/c26a6bbc77a549af84da889d0ac9231f/experiments/d1d4188cd1db44a289d7646105d0fd40/output/log


Step,Training Loss,Validation Loss


: 

In [None]:
tokenizer.save_pretrained('checkpoints_7B_lora_translated/kazakh_final_lora')

('checkpoints_7B_lora_translated/tatar_final_lora/tokenizer_config.json',
 'checkpoints_7B_lora_translated/tatar_final_lora/special_tokens_map.json',
 'checkpoints_7B_lora_translated/tatar_final_lora/chat_template.jinja',
 'checkpoints_7B_lora_translated/tatar_final_lora/tokenizer.json')

In [None]:
trainer.model.save_pretrained('checkpoints_7B_lora_translated/kazakh_final_lora')

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 83b06db4-5513-4c6d-baa0-8e5a3b0ece78)')' thrown while requesting HEAD https://huggingface.co/tencent/HY-MT1.5-7B/resolve/main/config.json
Retrying in 1s [Retry 1/5].
