In [None]:
# Install the packages this notebook needs on top of the Colab image.
# NOTE(review): none of the versions are pinned, so a fresh run may resolve different releases.
!pip install pymorphy3
# Uninstall and then reinstall torch / torchvision / transformers
# (the recorded output shows the same versions being installed again).
!pip uninstall -y torch torchvision transformers
!pip install datasets torch torchvision transformers scikit-learn imbalanced-learn

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Successfully uninstalled torchvision-0.21.0
Found existing installation: transformers 4.50.3
Uninstalling transformers-4.50.3:
  Successfully uninstalled transformers-4.50.3
Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting transformers
  Using cached transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
Using cached transformers-4.50.3-py3-none-any.whl (10.2 MB)
Installing collected packages: torch, transformers, torchvision
Successfully installed torch-2.6.0 torchvision-0.21.0 transformers-4.50

In [None]:
import nltk
import numpy as np
import os
import pandas as pd
import pymorphy3
import random
import re
import torch
import torch.nn as nn
import transformers
import warnings
from datasets import Dataset, concatenate_datasets, IterableDataset, load_dataset
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification,
                         AutoTokenizer,
                         DataCollatorForLanguageModeling,
                         DataCollatorForSeq2Seq,
                         MBart50Tokenizer,
                         MBart50TokenizerFast,
                         MBartForConditionalGeneration,
                         TrainingArguments,
                         Trainer
)

In [None]:
# Mount Google Drive at /content/drive; later cells read and write under /content/drive/MyDrive/.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Strip the <doc ...> / </doc> wrappers from every wiki_* file found under
# `input_dir` and append what is left to a single `output_file`.
input_dir = "extracted_text"
output_file = "cleaned_text.txt"
doc_tag_pattern = re.compile(r'<doc.*?>|</doc>', re.DOTALL)

with open(output_file, 'w', encoding='utf-8') as outfile:
    for root, _subdirs, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.startswith('wiki_'):
                continue
            with open(os.path.join(root, filename), 'r', encoding='utf-8') as infile:
                # Each file is read whole; a single newline separates consecutive files.
                outfile.write(doc_tag_pattern.sub('', infile.read()) + '\n')


In [None]:
# Split the cleaned text into sentences and write them one per line.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

input_file = "cleaned_text.txt"
output_file = "sentences.txt"
with open(output_file, 'w', encoding='utf-8') as outfile, \
        open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Sentences are detected per input line with the Russian Punkt model.
        outfile.writelines(
            sent.strip() + '\n'
            for sent in nltk.tokenize.sent_tokenize(line, language='russian')
        )

In [None]:
import re

# Input and output paths (on Google Drive) for the Cyrillic-only cleanup below.
# NOTE(review): presumably this is the sentences.txt produced by the previous cell, copied to Drive — confirm.
input_file = "/content/drive/MyDrive/Coursework/Bart_fine-tuning/sentences.txt"
output_file = "/content/drive/MyDrive/Coursework/Bart_fine-tuning/cleaned_russian_text.txt"

def clean_to_russian_only(text):
    """Keep only Cyrillic letters and single spaces.

    Every character that is neither a Russian letter (А-Я, а-я, Ё, ё) nor
    whitespace is removed; the remaining whitespace runs are collapsed to one
    space and leading/trailing whitespace is dropped.

    Args:
        text: Input string.

    Returns:
        The cleaned string (possibly empty).
    """
    cyrillic_and_spaces = re.compile(r'[^А-Яа-яЁё\s]').sub('', text)
    return ' '.join(cyrillic_and_spaces.split())

# Rewrite the sentence file keeping only Cyrillic text; lines that are blank
# before or after cleaning are not written.
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for raw_line in infile:
        trimmed = raw_line.strip()
        if not trimmed:
            continue
        russian_only = clean_to_russian_only(trimmed)
        if not russian_only:
            continue
        outfile.write(russian_only + '\n')

In [None]:
# Load the pretrained mBART-50 tokenizer and model with warnings silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    # NOTE(review): `model` is loaded here but not used anywhere else in this cell.
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    print("model loaded")

# Write every sentence as its space-separated tokenizer pieces, one sentence per line.
input_file = "sentences.txt"
tokenized_file = "tokenized_sentences.txt"
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(tokenized_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        encoded = tokenizer(line.strip(), return_tensors="pt")
        pieces = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
        outfile.write(" ".join(pieces) + "\n")

In [None]:
# Report basic statistics for the cleaned corpus: number of lines, size on
# disk and the mean number of whitespace-separated words per line.
file_path = "/content/drive/MyDrive/Coursework/Bart_fine-tuning/cleaned_russian_text.txt"

num_lines = 0
total_words = 0

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        num_lines += 1
        total_words += len(line.split())  # count the words in this line

# The size is printed in bytes, so take it from the filesystem: summing
# len(line) counts decoded characters, and a Cyrillic letter occupies two
# bytes in UTF-8, which made the old figure far too small.
file_size = os.path.getsize(file_path)

avg_words_per_line = total_words / num_lines if num_lines > 0 else 0

print(f"Количество строк: {num_lines}")
print(f"Размер файла: {file_size} байт")
print(f"Среднее количество слов в строке: {avg_words_per_line:.2f}")


Количество строк: 155462964
Размер файла: 8383157790 байт
Среднее количество слов в строке: 6.81


In [None]:
# Release unused cached GPU memory, then load the pretrained mBART-50 model and its tokenizer.
torch.cuda.empty_cache()
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# The tokenizer's source language is set to Russian.
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="ru_RU")

# Token length used for truncation/padding of inputs and by the data collator below.
max_length = 128

def gen_sentences(input_file):
    """Yield denoising training pairs from a text file with one sentence per line.

    Blank lines and sentences with five or fewer whitespace-separated words are
    skipped. For every remaining sentence roughly 20% of its words (at least
    one), chosen with ``random.sample``, are replaced by ``<mask>``.

    Args:
        input_file: Path to a UTF-8 encoded text file.

    Yields:
        dict: ``{'input_text': <masked sentence>, 'target_text': <original sentence>}``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            stripped = line.strip()
            if not stripped:
                continue
            words = stripped.split()
            if len(words) <= 5:
                continue
            num_to_mask = max(1, int(len(words) * 0.2))
            for idx in random.sample(range(len(words)), num_to_mask):
                words[idx] = "<mask>"
            # No bos/eos is glued onto the raw text: the mBART-50 tokenizer
            # already wraps each sequence as "<lang_code> tokens </s>", so
            # writing "<s>...</s>" into the string would duplicate special
            # tokens in both inputs and labels and would not match plain-text
            # inputs at inference time.
            yield {'input_text': ' '.join(words), 'target_text': stripped}

# Wrap the generator in an IterableDataset so training pairs are produced from the corpus file on demand.
input_file = "/content/drive/MyDrive/Coursework/Bart_fine-tuning/cleaned_russian_text.txt"
dataset = IterableDataset.from_generator(lambda: gen_sentences(input_file))

def tokenize_function(examples):
    """Tokenize a batch of (masked input, original target) sentence pairs.

    Args:
        examples: Batch with ``"input_text"`` and ``"target_text"`` lists of strings.

    Returns:
        The tokenizer output for the inputs with an extra ``"labels"`` tensor
        holding the target token ids, where padding positions are set to -100.
    """
    model_inputs = tokenizer(examples["input_text"], max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
    tokenizer.tgt_lang = "ru_RU"
    labels = tokenizer(text_target=examples["target_text"], max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")["input_ids"]
    # Labels are padded to a fixed length here, so the padding has to be
    # masked explicitly: -100 is the index the loss ignores. Left as the pad
    # id, most of every label row would be padding and would count towards the loss.
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize batches as the stream is consumed; the raw text columns are dropped.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "target_text"],
)
model.to(torch.device("cuda"))

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, max_length=max_length)

training_args = TrainingArguments(
    # Output locations, checkpointing and logging.
    output_dir="/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned",
    overwrite_output_dir=True,
    logging_dir="/content/drive/MyDrive/Coursework/Bart_fine-tuning/logs",
    logging_steps=500,
    save_steps=2500,
    save_total_limit=2,
    # Optimisation: the dataset is a stream, so training length is given in steps.
    max_steps=50000,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    fp16=True,
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, data_collator=data_collator)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Report whether CUDA is available and which device currently holds the model.
cuda_available = torch.cuda.is_available()
print(f"CUDA доступна: {cuda_available}")
device = model.device
print(f"Модель находится на устройстве: {device}")

CUDA доступна: True
Модель находится на устройстве: cuda:0


In [None]:
# Run the denoising fine-tuning, then write the resulting model and tokenizer to the Drive output directory.
trainer.train()
model.save_pretrained("/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33md-d-pushkarev-d-d[0m ([33md-d-pushkarev-d-d-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
500,0.5565
1000,0.1746
1500,0.1502
2000,0.1417
2500,0.1162
3000,0.1364
3500,0.1067
4000,0.1323
4500,0.1285
5000,0.1399




('/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned/added_tokens.json')

In [None]:
# Load the train split of the d0rj/geo-reviews-dataset-2023 review dataset.
dataset = load_dataset('d0rj/geo-reviews-dataset-2023', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_data(example):
    """Turn a review record into a binary sentiment example.

    Args:
        example: Mapping with at least ``'rating'`` and ``'text'``.

    Returns:
        dict: ``{'text': <review text>, 'label': 1}`` when the rating is above 3,
        otherwise the same with ``'label': 0``.
    """
    is_positive = example['rating'] > 3
    return {'text': example['text'], 'label': int(is_positive)}

# Label every review and drop the columns that are not part of the (text, label) examples.
dataset = dataset.map(preprocess_data, remove_columns=['address', 'name_ru', 'rubrics', 'rating'])

In [None]:
def oversample_dataset(dataset):
    """Balance a binary-labelled dataset by repeating its negative examples.

    Rows with ``label == 0`` are repeated cyclically (and cut off) so that
    exactly as many negatives as positives are selected; the result is the
    positives followed by those negatives.

    Args:
        dataset: Dataset with an integer ``'label'`` column (1 = positive, 0 = negative).

    Returns:
        The concatenation of all positive rows and ``len(positive)`` negative rows.

    Raises:
        ValueError: If the dataset has no negative rows to repeat.
    """
    positive = dataset.filter(lambda x: x['label'] == 1)
    negative = dataset.filter(lambda x: x['label'] == 0)
    num_pos = len(positive)
    num_neg = len(negative)
    if num_neg == 0:
        # Previously this surfaced as a bare ZeroDivisionError from the ratio below.
        raise ValueError("oversample_dataset needs at least one example with label 0")
    oversample_factor = int(np.ceil(num_pos / num_neg))
    # Cycle through the negative indices until there are num_pos of them.
    negative_oversampled = negative.select(np.tile(range(num_neg), oversample_factor)[:num_pos])
    return concatenate_datasets([positive, negative_oversampled])

def preprocess_dataset(balanced_dataset, samples_per_class=250000, seed=42):
    """Draw an equally sized random subset of each class and shuffle the result.

    Args:
        balanced_dataset: Dataset with an integer ``'label'`` column (1 / 0).
        samples_per_class: Number of rows to draw, without replacement, from
            each class. Defaults to the previously hard-coded 250000.
        seed: Seed for both the per-class sampling and the final shuffle, so
            the same subset is produced on every run.

    Returns:
        A shuffled dataset with ``samples_per_class`` positive and
        ``samples_per_class`` negative rows.

    Raises:
        ValueError: If either class has fewer than ``samples_per_class`` rows.
    """
    positive = balanced_dataset.filter(lambda x: x['label'] == 1)
    negative = balanced_dataset.filter(lambda x: x['label'] == 0)
    # A seeded generator replaces the unseeded global np.random state, which
    # made the selected subset differ between runs.
    rng = np.random.default_rng(seed)
    pos_indices = rng.choice(len(positive), samples_per_class, replace=False)
    neg_indices = rng.choice(len(negative), samples_per_class, replace=False)
    balanced_positive = positive.select(pos_indices)
    balanced_negative = negative.select(neg_indices)
    return concatenate_datasets([balanced_positive, balanced_negative]).shuffle(seed=seed)

# Hold out the test split BEFORE oversampling. Oversampling repeats every
# negative review several times, so splitting afterwards put copies of the same
# review into both train and test and inflated the test metrics.
# The split is kept under its own name so that the `train_test_split` function
# imported from sklearn is no longer overwritten.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
# Only the training portion is balanced; the test set keeps the original class mix.
balanced_dataset = oversample_dataset(dataset_splits['train'])
train_dataset = preprocess_dataset(balanced_dataset)
test_dataset = dataset_splits['test']

Filter:   0%|          | 0/777015 [00:00<?, ? examples/s]

In [None]:
# Load mBART-50 with a freshly initialised 2-class classification head, plus a matching tokenizer.
# NOTE(review): this loads the public checkpoint, not the denoising-fine-tuned weights
# saved under .../mbart_finetuned earlier in the notebook — confirm that is intended.
model = AutoModelForSequenceClassification.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", num_labels=2)
# NOTE(review): no src_lang is passed here, unlike the ru_RU tokenizer used for the
# denoising step — confirm the default language code is acceptable for Russian reviews.
tokenizer_sentiment = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

def tokenize_function(examples):
    """Encode the ``'text'`` field of a batch as fixed-length classifier inputs.

    Texts are truncated and padded to exactly 128 tokens with ``tokenizer_sentiment``.

    Args:
        examples: Batch with a ``'text'`` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer_sentiment(examples['text'], **encode_options)


# Tokenize both splits, then expose only the model inputs and the label as torch tensors.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for binary sentiment predictions.

    Args:
        pred: Evaluation output with ``predictions`` (class logits, or a tuple
            whose first element is the class logits) and ``label_ids``.

    Returns:
        dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``
        (the last three computed for the positive class).
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Encoder-decoder classifiers such as mBART return further tensors
        # after the logits, so Trainer passes a tuple here; calling .argmax on
        # the tuple itself fails.
        logits = logits[0]
    labels = pred.label_ids
    preds = logits.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at facebook/mbart-large-50-many-to-many-mmt and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/86335 [00:00<?, ? examples/s]

In [None]:
def _keep_class_logits(logits, labels):
    """Reduce the model output to the class logits before Trainer accumulates it.

    mBART's classification output carries the encoder hidden states next to
    the logits. Without this hook Trainer gathers those hidden states for the
    entire evaluation set in memory and hands metrics a tuple, not an array.

    Args:
        logits: Model outputs for one evaluation batch (a tensor or a tuple of tensors).
        labels: Labels of the batch (unused).

    Returns:
        The class-logit tensor for the batch.
    """
    return logits[0] if isinstance(logits, tuple) else logits


model.to(torch.device("cuda"))
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned_sentiment",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="/content/drive/MyDrive/Coursework/Bart_fine-tuning/logs_sentiment",
    logging_steps=500,
    learning_rate=5e-5,
    fp16=True,
    # NOTE(review): no load_best_model_at_end / evaluation strategy is set here,
    # so this setting does not appear to select anything — confirm.
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_keep_class_logits,
)

In [None]:
# Report whether CUDA is available and which device currently holds the classifier.
cuda_available = torch.cuda.is_available()
print(f"CUDA доступна: {cuda_available}")
device = model.device
print(f"Модель находится на устройстве: {device}")

CUDA доступна: True
Модель находится на устройстве: cuda:0


In [None]:
# Release unused cached GPU memory, fine-tune the classifier, then print the
# metrics returned by trainer.evaluate() (the string starts with an ANSI bold escape).
torch.cuda.empty_cache()
trainer.train()
print("\033[1mИтоговые метрики на тестовом наборе:", trainer.evaluate())

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33md-d-pushkarev-d-d[0m ([33md-d-pushkarev-d-d-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.3286
1000,0.2441
1500,0.2462
2000,0.2432
2500,0.239
3000,0.2245
3500,0.2252
4000,0.2033
4500,0.2139
5000,0.2108


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}


In [None]:
# Smoke-test the fine-tuned classifier on two hand-written reviews.
model.eval()
test_reviews = [
    "Отличный сервис, всё быстро и качественно!",
    "Ужасное обслуживание, больше не приду."
]
for review in test_reviews:
    # Encode with the tokenizer the classifier was trained with. The bare
    # `tokenizer` name still refers to the denoising tokenizer (src_lang
    # "ru_RU"), which prefixes a different language-code token than the one
    # seen during sentiment training. The target device is taken from the
    # model itself rather than from a variable set in another cell.
    inputs = tokenizer_sentiment(review, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
    sentiment = "Положительный" if prediction == 1 else "Отрицательный"
    print(f"Отзыв: {review}")
    print(f"Предсказанная тональность: {sentiment}\n")

In [None]:
# Score the held-out split and print the first few reviews with true and predicted sentiment.
predictions = trainer.predict(tokenized_test)
metrics = predictions.metrics
print("Метрики на тестовой выборке:", metrics)

print("\033[1mПервые 5 примеров из тестовой выборки:")
logits = predictions.predictions
if isinstance(logits, tuple):
    # mBART classifiers return extra tensors after the logits; only the first
    # element holds the class scores.
    logits = logits[0]
preds = logits.argmax(-1)
labels = predictions.label_ids
# Guard against a test split with fewer than five rows.
for i in range(min(5, len(labels))):
    text = test_dataset[i]['text']
    true_label = labels[i]
    pred_label = preds[i]
    print(f"Отзыв: {text}")
    print(f"Истинная тональность: {'Положительный' if true_label == 1 else 'Отрицательный'}")
    print(f"Предсказанная тональность: {'Положительный' if pred_label == 1 else 'Отрицательный'}\n")

In [None]:
# Persist the fine-tuned classifier together with the tokenizer it was trained
# with. The previous code saved `tokenizer`, i.e. the denoising tokenizer
# configured with src_lang "ru_RU", into the sentiment model directory.
model.save_pretrained("/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned_sentiment")
tokenizer_sentiment.save_pretrained("/content/drive/MyDrive/Coursework/Bart_fine-tuning/mbart_finetuned_sentiment")

In [None]:
class ReviewSummarizer(nn.Module):
    """Combine a sentiment classifier and a generative model to describe a review.

    Both models and their tokenizers are loaded from the given paths and moved
    to the GPU when one is available, otherwise they stay on the CPU.
    """

    def __init__(self, generative_model_path, classification_model_path, gen_tokenizer_path, class_tokenizer_path, num_labels=2):
        """Load both models and tokenizers.

        Args:
            generative_model_path: Path or hub id of the MBartForConditionalGeneration weights.
            classification_model_path: Path or hub id of the sequence-classification weights.
            gen_tokenizer_path: Path or hub id of the tokenizer for the generative model.
            class_tokenizer_path: Path or hub id of the tokenizer for the classifier.
            num_labels: Number of classes of the classifier.
        """
        super(ReviewSummarizer, self).__init__()
        self.gen_model = MBartForConditionalGeneration.from_pretrained(generative_model_path)
        self.class_model = AutoModelForSequenceClassification.from_pretrained(classification_model_path, num_labels=num_labels)
        self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_tokenizer_path)
        self.class_tokenizer = AutoTokenizer.from_pretrained(class_tokenizer_path)
        self.num_labels = num_labels

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gen_model.to(self.device)
        self.class_model.to(self.device)

    def summarize(self, text, max_length=128, summary_max_length=50):
        """Print the predicted sentiment and a generated summary of ``text``.

        Args:
            text: Review text.
            max_length: Token length both tokenizers truncate/pad the review to.
            summary_max_length: ``max_length`` passed to ``generate``.
        """
        class_inputs = self.class_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            prediction = torch.argmax(self.class_model(**class_inputs).logits, dim=-1).item()
            sentiment = "положительный" if prediction == 1 else "отрицательный"

        gen_inputs = self.gen_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            gen_outputs = self.gen_model.generate(
                **gen_inputs,
                max_length=summary_max_length,
                num_beams=5,
                early_stopping=True
            )
            summary = self.gen_tokenizer.decode(gen_outputs[0], skip_special_tokens=True)
        print(f"Данный отзыв: {sentiment}")
        # Print the decoded generation; the name used here before was never
        # defined and raised NameError.
        print(f"Суммаризация отзыва: {summary}")