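Датасет: https://www.kaggle.com/datasets/hijest/englishrussian-dictionary-for-machine-translate/data (ноутбук читает файл `rus.txt`, в котором английская фраза и её русский перевод разделены табуляцией).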

# 1. Установка необходимых библиотек

In [None]:
!pip install evaluate
!pip install hf_xet
!pip install sacremoses
!pip install sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

# 2. Импорт библиотек

In [None]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
    FSMTTokenizer,
    FSMTForConditionalGeneration
)
from evaluate import load
import pandas as pd
from tqdm import tqdm
import random
import numpy as np
from sacrebleu import BLEU

# 3. Загрузка и подготовка данных

In [None]:
# Path to the tab-separated corpus: column 0 is read as English, column 1 as Russian.
text = "/content/rus.txt"

# Explicit UTF-8: the file holds Cyrillic text and the platform default
# encoding is not guaranteed to be UTF-8 outside Colab.
with open(text, encoding="utf-8") as file:
    # Iterating the handle keeps the final record even when the file does not
    # end with a newline (the previous split("\n")[:-1] silently dropped it).
    lines = [line.rstrip("\n") for line in file]

pairs = []
for line in lines:
    parts = line.strip().split("\t")
    # Lines with fewer than two columns (e.g. blank lines) are skipped;
    # any columns after the second are ignored.
    if len(parts) >= 2:
        eng, rus = parts[0].strip(), parts[1].strip()
        pairs.append((eng, rus))

df = pd.DataFrame(pairs)
# A bare expression in the middle of a cell is not rendered, so display it explicitly.
display(df.head(5))

print("Всего пар =", len(pairs))

# All pairs are kept here; evaluate_bleu_fast sub-samples them via its sample_size argument.
test_pairs = pairs
print("Используем только", len(test_pairs))

Всего пар = 363386
Используем только 363386


# 4. Список моделей для оценки

In [None]:
# Checkpoint ids passed to from_pretrained() by evaluate_bleu_fast.
# Each id must contain one of the substrings that function dispatches on
# ("opus-mt", "t5" or "wmt19"); any other id raises ValueError inside its
# try-block, which is caught and reported as a BLEU of 0.0.
models = [
    "Helsinki-NLP/opus-mt-en-ru",
    "google-t5/t5-base",
    "facebook/wmt19-en-ru",
]

# 5. Функция для оценки BLEU

In [None]:
def _translate_in_batches(sources, translate_batch, batch_size, desc):
    """Apply ``translate_batch`` to ``sources`` in chunks of ``batch_size``, showing real progress."""
    predictions = []
    for start in tqdm(range(0, len(sources), batch_size), desc=desc):
        predictions.extend(translate_batch(sources[start:start + batch_size]))
    return predictions


def evaluate_bleu_fast(model_name, test_pairs, sample_size=10000, batch_size=32, max_length=128, seed=42):
    """Translate a sample of ``test_pairs`` with ``model_name`` and return corpus BLEU.

    Args:
        model_name: Checkpoint id. Must contain "opus-mt", "t5" or "wmt19"
            (case-insensitive); this substring selects the loading/generation path.
        test_pairs: Sequence of ``(source, reference)`` string pairs.
        sample_size: If ``test_pairs`` is longer than this, a random subset of
            this size (without replacement) is evaluated instead.
        batch_size: Number of sentences translated per forward pass.
        max_length: Tokenizer truncation length and generation length limit.
        seed: Seed for the subset draw. The default makes every call pick the
            same subset, so scores of different models are comparable and
            reproducible. Pass ``None`` for a fresh random subset per call.

    Returns:
        The sacreBLEU corpus score as a float, or ``0.0`` if anything fails
        (the error is printed, not raised), including an unsupported
        ``model_name``.
    """
    sacre_bleu = BLEU()

    # Draw the evaluation subset from a local, seeded generator so the global
    # NumPy random state is neither relied on nor modified.
    if len(test_pairs) > sample_size:
        rng = np.random.default_rng(seed)
        indices = rng.choice(len(test_pairs), sample_size, replace=False)
        test_pairs = [test_pairs[i] for i in indices]

    try:
        sources = [src for src, _ in test_pairs]
        references = [tgt for _, tgt in test_pairs]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        progress_label = f"Processing {model_name}"
        lowered_name = model_name.lower()

        if "opus-mt" in lowered_name:
            # Helsinki-NLP models go through the translation pipeline.
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            model.to(device)

            pipe = pipeline(
                "translation",
                model=model,
                tokenizer=tokenizer,
                device=device
            )

            def translate_batch(batch):
                # Calling the pipeline per batch lets tqdm track actual work;
                # wrapping one big pipe(...) call only iterates a finished list.
                outputs = pipe(batch, batch_size=batch_size, max_length=max_length)
                return [out['translation_text'] for out in outputs]

            predictions = _translate_in_batches(sources, translate_batch, batch_size, progress_label)

        elif "t5" in lowered_name:
            # T5 path: half precision on GPU, greedy decoding, task prefix on every input.
            # NOTE(review): the original T5 checkpoints are documented for
            # English->German/French/Romanian translation only, so a near-zero
            # BLEU on Russian is expected here -- confirm before drawing conclusions.
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )
            model.to(device)

            def translate_batch(batch):
                prompts = [f"translate to ru: {src}" for src in batch]
                tokenized = tokenizer(
                    prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length
                ).to(model.device)
                outputs = model.generate(
                    **tokenized,
                    max_new_tokens=max_length,
                    num_beams=1,
                    do_sample=False
                )
                return tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with torch.no_grad(), torch.autocast(device.type):
                predictions = _translate_in_batches(sources, translate_batch, batch_size, progress_label)

        elif "wmt19" in lowered_name:
            # Facebook WMT19 (FSMT) path.
            tokenizer = FSMTTokenizer.from_pretrained(model_name)
            model = FSMTForConditionalGeneration.from_pretrained(model_name)
            model.to(device)

            def translate_batch(batch):
                tokenized = tokenizer(
                    batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length
                ).to(device)
                # Batches are padded, so hand the attention mask to generate()
                # explicitly instead of relying on it being inferred.
                outputs = model.generate(
                    input_ids=tokenized["input_ids"],
                    attention_mask=tokenized["attention_mask"],
                    max_length=max_length
                )
                return tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with torch.no_grad():
                predictions = _translate_in_batches(sources, translate_batch, batch_size, progress_label)

        else:
            raise ValueError(f"Unsupported model type: {model_name}")

        return sacre_bleu.corpus_score(predictions, [references]).score

    except Exception as e:
        # Best-effort by design: one failing model must not abort the whole comparison.
        print(f"Error evaluating {model_name}: {e}")
        return 0.0

# 6. Оценка моделей и сравнение

In [None]:
# Draw ONE seeded evaluation subset and reuse it for every model, so the
# BLEU scores are computed on identical sentences and are reproducible.
EVAL_SEED = 42
EVAL_SAMPLE_SIZE = 10000

if len(test_pairs) > EVAL_SAMPLE_SIZE:
    eval_pairs = random.Random(EVAL_SEED).sample(test_pairs, EVAL_SAMPLE_SIZE)
else:
    eval_pairs = test_pairs

bleu_scores = {}

# Score every model on the shared subset; sample_size=len(eval_pairs) stops
# evaluate_bleu_fast from sub-sampling again on its own.
for model_name in models:
    bleu_score = evaluate_bleu_fast(model_name, eval_pairs, sample_size=len(eval_pairs))
    bleu_scores[model_name] = bleu_score
    print(f"Модель {model_name} имеет BLEU: {bleu_score}")

# Rank models from highest to lowest BLEU.
sorted_models = sorted(bleu_scores.items(), key=lambda x: x[1], reverse=True)

# Print the three best-scoring models.
print("\nТоп-3 модели:")
for model, bleu in sorted_models[:3]:
    print(f"Модель: {model}, BLEU: {bleu}")

# The first entry of the ranking is the best model; later cells read best_model.
best_model = sorted_models[0][0]
print(f"\nЛучшая модель: {best_model}")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Device set to use cuda


model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

Processing Helsinki-NLP/opus-mt-en-ru: 100%|██████████| 10000/10000 [00:00<00:00, 2609209.33it/s]


Модель Helsinki-NLP/opus-mt-en-ru имеет BLEU: 50.5747128824216


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Processing google-t5/t5-base: 100%|██████████| 313/313 [05:15<00:00,  1.01s/it]


Модель google-t5/t5-base имеет BLEU: 0.05434727996520763


tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/639k [00:00<?, ?B/s]

vocab-tgt.json:   0%|          | 0.00/776k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-ru and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Processing facebook/wmt19-en-ru:   0%|          | 0/313 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

Processing facebook/wmt19-en-ru: 100%|██████████| 313/313 [02:35<00:00,  2.01it/s]


Модель facebook/wmt19-en-ru имеет BLEU: 42.20461008133821

Топ-3 модели:
Модель: Helsinki-NLP/opus-mt-en-ru, BLEU: 50.5747128824216
Модель: facebook/wmt19-en-ru, BLEU: 42.20461008133821
Модель: google-t5/t5-base, BLEU: 0.05434727996520763

Лучшая модель: Helsinki-NLP/opus-mt-en-ru


# 7. Загрузка и использование лучшей модели

In [None]:
# Instantiate the top-ranked checkpoint and its tokenizer, then move the model
# to the GPU when one is available (CPU otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_tokenizer = AutoTokenizer.from_pretrained(best_model)
best_model_loaded = AutoModelForSeq2SeqLM.from_pretrained(best_model)
best_model_loaded.to(device)


def translate(text):
    """Translate one string with the best model and return the decoded text.

    The input is encoded as a single sequence, generation runs with the
    model's default settings, and special tokens are stripped from the
    first (only) generated sequence.
    """
    encoded = best_tokenizer.encode(text, return_tensors="pt")
    generated = best_model_loaded.generate(encoded.to(device))
    first_sequence = generated[0]
    return best_tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Interactive translation: read English text, print the Russian translation.
# Typing 'exit' (any letter case, surrounding whitespace ignored) ends the loop.
print("\nИнтерактивный перевод (для выхода введите 'exit')")
while True:
    text = input("Введите текст на английском для перевода на русский: ").strip()
    if text.lower() == 'exit':
        break
    if not text:
        # Blank input: nothing to translate, so prompt again instead of
        # sending an empty sequence to the model.
        continue
    translated_text = translate(text)
    print(f"Перевод: {translated_text}\n")


Интерактивный перевод (для выхода введите 'exit')
Введите текст на английском для перевода на русский: What's the weather like outside?
Перевод: Какая погода снаружи?

Введите текст на английском для перевода на русский: This is a model for text translation.
Перевод: Это модель для перевода текста.

Введите текст на английском для перевода на русский: How well does she translate text?
Перевод: Насколько хорошо она переводит смс?

Введите текст на английском для перевода на русский: exit
