In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Speed/precision trade-off for float32 matmuls: permit TF32 on CUDA devices.
torch.backends.cuda.matmul.allow_tf32 = True
# Global float32 matmul precision knob; "high" permits reduced-precision internals
# (see the PyTorch docs for the exact semantics on a given backend).
torch.set_float32_matmul_precision("high")
from huggingface_hub import login

# Checkpoint directory; used for both the tokenizer here and the model weights further down.
MODEL = 'checkpoints_7B_lora_translated/ru-kz-final/checkpoint-23000'

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Left padding: the batched helper strips the prompt by slicing a fixed number of
# leading positions off every output row, which only works when padding is on the left.
tokenizer.padding_side='left'
# Fall back to EOS as the pad token when the tokenizer does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def generate_translation(english_sentences, model, max_new_tokens=128, target_lang='chv', temperature=0.7, **gen_kwargs):
    """
    Translate sentences one at a time with ``model.generate``.

    Despite the parameter name, the inputs may be in any source language. Each
    sentence is wrapped in the "Translate the following segment into
    <language>" prompt (per the original author's note, the HY-MT1.5 template
    for non-Chinese translations), rendered with the module-level
    ``tokenizer``'s chat template, and decoded on its own (no batching).

    Args:
        english_sentences: Iterable of source sentences (strings).
        model: Object exposing ``.device`` and ``.generate`` (a causal LM).
        max_new_tokens: Upper bound on newly generated tokens per sentence.
        target_lang: Language code; must be a key of the mapping below.
        temperature: Sampling temperature. Only sent to ``generate`` when
            sampling is switched on via ``do_sample=True`` in ``gen_kwargs``;
            the default decoding is greedy, where it would be ignored.
        **gen_kwargs: Extra ``model.generate`` arguments; they override the
            defaults assembled here.

    Returns:
        list[str]: One stripped translation per input sentence, in order.

    Raises:
        KeyError: If ``target_lang`` is not a known code.
    """
    target_lang_mapping = {
        "chv": "Chuvash", "kz": "Kazakh", "kyr": "Kyrgyz", "ru": "Russian",
        "cn": "Chinese", "en": "English", "de": "German", "bk": "Bashkir",
        "tt": "Tatar",
    }
    target_lang_str = target_lang_mapping[target_lang]

    # Greedy decoding by default, exactly as before.
    generation_args = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Caller overrides were previously accepted but silently dropped.
    generation_args.update(gen_kwargs)
    # Sampling knobs only matter (and only avoid warnings) when sampling is on.
    if generation_args.get("do_sample"):
        generation_args.setdefault("temperature", temperature)
        generation_args.setdefault("top_k", 20)
        generation_args.setdefault("top_p", 0.6)

    translations = []
    for sentence in english_sentences:
        messages = [
            {
                "role": "user",
                "content": f"Translate the following segment into {target_lang_str}, without additional explanation.\n\n{sentence}"
            }
        ]
        # NOTE(review): add_generation_prompt=False is kept from the original;
        # presumably it matches how this checkpoint's template was used — confirm.
        tokenized_chat = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=False,
            return_tensors="pt"
        )
        tokenized_chat = tokenized_chat.to(model.device)
        prompt_len = tokenized_chat.shape[1]

        with torch.no_grad():
            outputs = model.generate(tokenized_chat, **generation_args)

        # The output row echoes the prompt first; keep only the continuation.
        generated_tokens = outputs[0][prompt_len:]
        translations.append(
            tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        )

    return translations

# Load the checkpoint weights in bfloat16; device placement is delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    dtype=torch.bfloat16,
    device_map='auto',
)

# Russian smoke-test inputs.
test_sentences = [
    "Привет, как дела?",
    "Погода сегодня хорошая"
]

# English versions of the same two sentences (not used by the call below).
test_sentences_en = [
    "Hello, how are you?",
    "The weather is good today"
]

# Quick sanity check: Russian -> Kazakh. The labels name the actual source and
# target languages (they previously read "EN"/"CV", which matched neither).
translations = generate_translation(test_sentences, model, target_lang='kz')
for src, hyp in zip(test_sentences, translations):
    print(f"RU: {src}")
    print(f"KZ: {hyp}\n")

In [None]:
import torch
from tqdm.auto import tqdm
from transformers import pipeline
import math

def generate_translation_batched(
    english_sentences,
    batch_size=8,
    max_new_tokens=512,
    target_lang='chv',
    **gen_kwargs
):
    """
    Translate a list of sentences in left-padded batches with ``model.generate``.

    Uses the module-level ``tokenizer`` and ``model``. Despite the parameter
    name, the inputs may be in any source language. Each sentence is wrapped in
    the "Translate the following segment into <language>" prompt and rendered
    through the tokenizer's chat template before batching.

    Args:
        english_sentences: Sliceable sequence (e.g. list) of source sentences.
        batch_size: Number of prompts per ``generate`` call; must be >= 1.
        max_new_tokens: Upper bound on newly generated tokens per sentence.
        target_lang: Language code from the mapping below; unknown values are
            inserted into the prompt verbatim as the language name.
        **gen_kwargs: Extra ``model.generate`` arguments; they override the
            defaults assembled here (5-beam sampling, temperature 0.7).

    Returns:
        list[str]: One cleaned translation per input sentence, in order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    target_lang_mapping = {
        "chv": "Chuvash", "kz": "Kazakh", "kyr": "Kyrgyz",
        "ru": "Russian", "cn": "Chinese", "en": "English",
        "de": "German", "bk": "Bashkir", "tt": "Tatar",
    }
    target_lang_str = target_lang_mapping.get(target_lang, target_lang)

    translations = []
    original_padding_side = tokenizer.padding_side
    # Left padding keeps every prompt flush against the generated tokens, so the
    # prompt can be removed with a single column slice below.
    tokenizer.padding_side = 'left'
    try:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        default_gen_kwargs = dict(
            max_new_tokens=max_new_tokens,
            num_beams=5,
            do_sample=True,
            num_return_sequences=1,
            repetition_penalty=1.1,
            early_stopping=True,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
        )
        default_gen_kwargs.update(gen_kwargs)

        total_batches = (len(english_sentences) + batch_size - 1) // batch_size

        for start_idx in tqdm(range(0, len(english_sentences), batch_size), total=total_batches, desc="Translating"):
            batch_sents = english_sentences[start_idx : start_idx + batch_size]

            batch_prompts = [
                tokenizer.apply_chat_template(
                    [{
                        "role": "user",
                        "content": f"Translate the following segment into {target_lang_str}, without additional explanation.\n\n{sent}"
                    }],
                    tokenize=False,
                    add_generation_prompt=False,
                )
                for sent in batch_sents
            ]

            # The rendered template already carries its special tokens; adding
            # them again here could duplicate them and would diverge from the
            # tokenize=True path used by the single-sentence helper.
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=False,
                return_token_type_ids=False,
                add_special_tokens=False,
            ).to(model.device)

            inputs.pop("token_type_ids", None)
            input_len = inputs["input_ids"].shape[1]
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs.get("attention_mask", None),
                    **default_gen_kwargs,
                )

            # Drop the (padded) prompt columns; what remains is the continuation.
            generated_sequences = outputs[:, input_len:]
            decoded_batch = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)

            for text in decoded_batch:
                clean_text = text.strip()
                # This marker can survive skip_special_tokens; remove it explicitly.
                clean_text = clean_text.replace("<|extra_0|>", "").strip()
                translations.append(clean_text)
    finally:
        # Restore the caller's setting even when tokenization/generation fails.
        tokenizer.padding_side = original_padding_side

    return translations

def generate_translation_pipeline(
    english_sentences,
    batch_size=32,
    max_new_tokens=512,
    target_lang="bk",
    bad_assistant_token_id=127962, 
    **gen_kwargs,
):
    """
    Translate sentences through a ``text-generation`` pipeline, batch by batch.

    Relies on the module-level ``tokenizer`` and ``model``. The tokenizer is
    switched to left padding (and given EOS as pad token if it has none); these
    settings are left in place afterwards.

    Args:
        english_sentences: Iterable of source sentences (strings).
        batch_size: Number of prompts handed to the pipeline per call.
        max_new_tokens: Upper bound on newly generated tokens per sentence.
        target_lang: Language code; must be a key of the mapping below.
        bad_assistant_token_id: Token id banned from generation unless the
            caller supplies their own ``bad_words_ids``.
        **gen_kwargs: Extra generation arguments passed to the pipeline call.

    Returns:
        list[str]: One stripped translation per input sentence, in order.

    Raises:
        KeyError: If ``target_lang`` is not a known code.
    """
    language_names = dict(
        chv="Chuvash",
        kz="Kazakh",
        kyr="Kyrgyz",
        ru="Russian",
        cn="Chinese",
        en="English",
        de="German",
        bk="Bashkir",
        tt="Tatar",
    )
    language_name = language_names[target_lang]

    tokenizer.padding_side = "left"
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    def render_prompt(sentence):
        # Single-turn user message rendered to plain text by the chat template.
        chat = [{
            "role": "user",
            "content": (
                f"Translate the following segment into {language_name}, "
                f"without additional explanation.\n\n{sentence}"
            ),
        }]
        return tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=False,
        )

    prompts = [render_prompt(sentence) for sentence in english_sentences]

    # A caller-provided ban list wins over the default single-token ban.
    bad_words_ids = gen_kwargs.pop("bad_words_ids", [[bad_assistant_token_id]])

    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    translations = []
    prompt_count = len(prompts)
    batch_total = math.ceil(prompt_count / batch_size)
    batch_offsets = range(0, prompt_count, batch_size)

    for offset in tqdm(batch_offsets, total=batch_total, desc="Translating", unit="batch"):
        chunk = prompts[offset : offset + batch_size]
        results = generator(
            chunk,
            batch_size=len(chunk),
            max_new_tokens=max_new_tokens,
            return_full_text=False,
            pad_token_id=tokenizer.pad_token_id,
            bad_words_ids=bad_words_ids,
            **gen_kwargs,
        )
        # Each prompt yields a list of candidates; only the first is kept.
        for candidates in results:
            translations.append(candidates[0]["generated_text"].strip())

    return translations

# Smoke-test the pipeline helper on the Russian sample sentences with sampling enabled.
translations = generate_translation_pipeline(test_sentences, target_lang='kz',
                temperature=0.8,
                do_sample=True,
                top_k=20,
                top_p=0.6,
                repetition_penalty=1.05,
)

# The target label matches the requested target_lang ('kz'); it previously read "BK".
for src, hyp in zip(test_sentences, translations):
    print(f"RU: {src}")
    print(f"KZ: {hyp}\n")

Device set to use cuda:0


Translating:   0%|          | 0/1 [00:00<?, ?batch/s]

RU: Привет, как дела?
BK: Сәлам, ничек идегез?

RU: Погода сегодня хорошая
BK: Бүген яхшы хава



In [None]:
import pandas as pd
from tqdm import tqdm

# Load the evaluation set and pull out the source column as a plain list.
df = pd.read_csv('test_dataset/ru-kazakh.csv')
samples = df['source_en'].to_list()

# do_sample=True combined with num_beams=5 is stochastic; seed the RNG so a
# rerun of this cell can reproduce the same submission on the same setup.
torch.manual_seed(42)

chv_translations = generate_translation_pipeline(samples, target_lang='kz',
                do_sample=True,
                num_beams=5,
                repetition_penalty=1.2, batch_size=128,
)

# Later cells pair translations with rows by position, so the counts must agree.
assert len(chv_translations) == len(samples), "translation count does not match input rows"

Device set to use cuda:0
Translating: 100%|██████████| 6/6 [09:24<00:00, 94.05s/batch] 


In [10]:
# Pair each row id with its translation. chv_translations is a plain list, so the
# pairing is positional and relies on it being in the same order as df's rows.
submission_df = pd.DataFrame({'id': df['id'], 'submission': chv_translations})
submission_df

Unnamed: 0,id,submission
0,valid_1,Бүген яңгыр ява башлармы?
1,valid_2,бүген никадәр кызу булачак
2,valid_3,Никадәр эссе?
3,valid_4,Бүген көн якты булырмы?
4,valid_5,Бүген болытлымы?
...,...,...
685,test_496,Бу альбомны Old School Death Metalга өстә
686,test_497,"""Jersey Boys""дигән телесериалны карагыз"
687,test_498,Алты ай эчендә Танҗир янында урнашкан иң яхшы ...
688,test_499,Миңа Харкинс театрларындагы фильмнарны күрсәте...


In [5]:
# Spot-check a single translation via the frame (index label 2).
submission_df['submission'][2]

'Никадәр эссе?'

In [11]:
# Side copy of the submission with the source sentences attached for manual review.
# NOTE(review): the last character of this variable name looks like Cyrillic "с"
# rather than Latin "c" — confirm, and retype it consistently wherever it is used.
submission_df_с = submission_df.copy()
submission_df_с['eng_src'] = samples

In [7]:
# Spot-check the same position in the raw translation list.
chv_translations[2]

'Никадәр эссе?'

In [None]:
import pandas as pd
import re
import html

def clean_translation(text):
    """
    Tidy one model output: decode HTML entities and collapse degenerate repetition.

    Steps:
      1. Non-string values (e.g. NaN) are returned untouched.
      2. HTML entities are unescaped, including the space-broken forms
         ``& apos;`` and ``& quot;``.
      3. Any unit repeated four or more times in a row is reduced to a single
         occurrence. Units made only of digits are left alone, so numbers such
         as ``1000000`` keep their value (previously this became ``10``).
      4. Surrounding whitespace is stripped.

    Args:
        text: Value to clean; anything that is not a ``str`` is passed through.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = html.unescape(text)
    text = text.replace("& apos;", "'").replace("& quot;", '"')

    # Shortest unit (group 1) followed by at least three more copies of itself.
    pattern = r'(.+?)\1{3,}'

    def _collapse(match):
        unit = match.group(1)
        # Repeated digits are legitimate content (large numbers), not looping output.
        if unit.isdigit():
            return match.group(0)
        return unit

    # Two passes, as before: collapsing one run can expose another.
    for _ in range(2):
        text = re.sub(pattern, _collapse, text)

    return text.strip()

def process_dataframe(df):
    """
    Clean the translation column of ``df`` in place and return the same frame.

    The column is chosen by preference: ``'submission'``, then ``'sub_beam5'``,
    otherwise whatever column comes last. The chosen name is printed, and every
    value in it is run through ``clean_translation``.
    """
    for candidate in ('submission', 'sub_beam5'):
        if candidate in df.columns:
            target_col = candidate
            break
    else:
        # Neither known name is present: fall back to the last column.
        target_col = df.columns[-1]

    print(f"Processing column: {target_col}")
    cleaned = df[target_col].apply(clean_translation)
    df[target_col] = cleaned

    return df

# Clean the translations. process_dataframe assigns into the frame it is given and
# returns that same object, so submission_df is modified in place here.
submission_df = process_dataframe(submission_df)
submission_df

Processing column: submission


Unnamed: 0,id,submission
0,valid_1,Бүген яңгыр ява башлармы?
1,valid_2,бүген никадәр кызу булачак
2,valid_3,Никадәр эссе?
3,valid_4,Бүген көн якты булырмы?
4,valid_5,Бүген болытлымы?
...,...,...
685,test_496,Бу альбомны Old School Death Metalга өстә
686,test_497,"""Jersey Boys""дигән телесериалны карагыз"
687,test_498,Алты ай эчендә Танҗир янында урнашкан иң яхшы ...
688,test_499,Миңа Харкинс театрларындагы фильмнарны күрсәте...


In [15]:
# Write the cleaned submission (id + submission columns) without the index column.
submission_df.to_csv('tt_7B_lora_pipe_5_beams_clean.csv', index=False)

In [13]:
# Write the review copy (with the eng_src column) without the index column.
# NOTE(review): this copy was taken before the cleaning cell appears in the notebook,
# so it presumably holds the uncleaned translations — confirm that is intended.
submission_df_с.to_csv('test_tt_7B_lora_pipe_5_beams.csv', index=False)