In [None]:
%%capture
!pip install transformers datasets evaluate torch pandas scikit-learn

In [None]:
from typing import List, Dict
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
from scipy.stats import entropy
from sklearn.metrics import accuracy_score, f1_score
import json

In [None]:
class CustomTrainer(Trainer):
    """Trainer that fits the model to soft label distributions.

    Instead of the model's built-in loss, the KL divergence between the
    softmax of the logits and the per-example target distribution stored
    under the ``"labels"`` key of the batch is minimised.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the batch-mean KL divergence between predictions and targets.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping. The ``"labels"`` entry is removed from it
                (the mapping is mutated) before the remaining entries are
                forwarded to the model as keyword arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted for compatibility with callers that pass
                additional keyword arguments; ignored here.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        # Labels are popped so the model does not compute its own loss on them.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        # as_tensor converts dtype/device without copy-constructing an existing
        # tensor, which is what torch.tensor(tensor) warned about on every step.
        targets = torch.as_tensor(labels, dtype=torch.float32, device=logits.device)

        # KLDivLoss expects log-probabilities as input and probabilities as target.
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        loss = loss_fct(log_probs, targets)
        return (loss, outputs) if return_outputs else loss

In [None]:
class TopicClassifier:
    """Fine-tunes a pretrained sequence classifier on soft topic distributions.

    The training table is read from an Excel file. Several text columns are
    joined into one input string, and the topic columns are used together as
    the per-row target distribution. Training itself is delegated to
    ``CustomTrainer``.
    """

    def __init__(
        self,
        data_path: str,
        text_columns: List[str],
        topic_columns: List[str],
        maximum_sequence_length: int = 200,
        output_dir: str = "./model",
    ):
        """Load the data table and store the configuration.

        Args:
            data_path: Path to the Excel file with the labelled data.
            text_columns: Columns whose values are joined into the model input.
            topic_columns: Columns holding the target value of each topic;
                their order defines the class ids.
            maximum_sequence_length: Token limit used for truncation/padding.
            output_dir: Directory handed to the trainer for its outputs.

        Raises:
            ValueError: If ``data_path`` does not exist.
        """
        try:
            self.data = pd.read_excel(data_path)
        except FileNotFoundError as err:
            # Chain the original error so the traceback keeps the real cause.
            raise ValueError(f"File {data_path} not found!") from err

        self.model_name = "nikitast/multilang-classifier-roberta"
        self.text_columns = text_columns
        self.topic_columns = topic_columns
        self.maximum_sequence_length = maximum_sequence_length
        self.output_dir = output_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Populated later by __prepare_data__ / __load_model__ / train_model.
        self.id2topic: Dict[int, str] = {}
        self.num_labels: int = 0
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.evaluation_results: Dict[str, float] = {}

    def __prepare_data__(self):
        """Validate the configured columns and derive the helper columns.

        Adds a ``text`` column (non-null values of ``text_columns`` joined by
        a space) and a ``main_topic`` column (name of the topic column with
        the largest value) to ``self.data``, and fills ``num_labels`` and
        ``id2topic``.

        Raises:
            ValueError: If no text column is configured, if a text or topic
                column is missing from the data, or if fewer than two topics
                are configured.
        """
        # Validate everything up front so a typo in a column name produces a
        # clear ValueError instead of a KeyError from pandas indexing.
        if not self.text_columns:
            raise ValueError("At least 1 text column required")

        missing_text = [c for c in self.text_columns if c not in self.data.columns]
        if missing_text:
            raise ValueError(f"Columns {missing_text} not found in data")

        if not set(self.topic_columns).issubset(self.data.columns):
            raise ValueError(f"Columns {self.topic_columns} not found in data")

        self.num_labels = len(self.topic_columns)
        if self.num_labels < 2:
            raise ValueError("At least 2 topics required")

        self.data["text"] = self.data[self.text_columns].apply(
            lambda row: " ".join(row.dropna().astype(str)), axis=1
        )

        # The dominant topic is only used for the stratified split and for
        # argmax-based metrics; the model itself is trained on all columns.
        self.data["main_topic"] = self.data[self.topic_columns].idxmax(axis=1)
        self.id2topic = dict(enumerate(self.topic_columns))

    def __load_model__(self):
        """Load the tokenizer and the pretrained model with a resized head."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            # The checkpoint's head has a different number of outputs; let the
            # mismatched head weights be re-initialised instead of failing.
            ignore_mismatched_sizes=True,
        ).to(self.device)
        # All parameters stay trainable (no encoder layers are frozen).

    def __tokenize_data__(self, df: pd.DataFrame) -> "Dataset":
        """Turn a prepared frame into a tokenized dataset with soft labels.

        Args:
            df: Frame containing the ``text`` column and all topic columns.

        Returns:
            A dataset with ``text``, ``labels`` (one float list per row) and
            the tokenizer outputs, padded to ``maximum_sequence_length``.
        """
        soft_labels = df[self.topic_columns].values.astype(np.float32)

        # Building from a dict keeps the pandas index out of the dataset and
        # avoids the extra table concatenation that add_column performs.
        dataset = Dataset.from_dict(
            {"text": df["text"].tolist(), "labels": soft_labels.tolist()}
        )

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=self.maximum_sequence_length,
            )

        return dataset.map(tokenize_function, batched=True)

    def __compute_metrics__(self, eval_pred) -> Dict[str, float]:
        """Compute classification and distribution metrics for an evaluation.

        Args:
            eval_pred: Pair ``(logits, labels)``; both are indexed as
                ``(examples, topics)`` arrays.

        Returns:
            Accuracy and micro/macro/weighted F1 of the arg-max topic, plus
            the mean squared error and the mean KL divergence (in bits)
            between target and predicted distributions.
        """
        logits, labels = eval_pred
        labels = np.asarray(labels)
        predictions = torch.softmax(torch.tensor(logits), dim=-1).numpy()

        # Hard-label view: the most probable topic on each side.
        pred_labels = np.argmax(predictions, axis=1)
        true_labels = np.argmax(labels, axis=1)

        accuracy = accuracy_score(true_labels, pred_labels)
        f1_micro = f1_score(true_labels, pred_labels, average="micro")
        f1_macro = f1_score(true_labels, pred_labels, average="macro")
        f1_weighted = f1_score(true_labels, pred_labels, average="weighted")

        # Soft-label view. scipy's entropy reduces along axis 0, so the arrays
        # are transposed to obtain one KL value per example.
        mse = np.mean((predictions - labels) ** 2)
        kl_div = np.mean(entropy(labels.T, predictions.T, base=2))

        # Plain floats keep the result independent of numpy scalar types.
        return {
            "accuracy": float(accuracy),
            "f1_micro": float(f1_micro),
            "f1_macro": float(f1_macro),
            "f1_weighted": float(f1_weighted),
            "mse": float(mse),
            "kl_div": float(kl_div),
        }

    def __print_final_metrics__(self):
        """Print the stored evaluation results, skipping loss and epoch.

        Raises:
            ValueError: If no evaluation results have been stored yet.
        """
        if not self.evaluation_results:
            raise ValueError("Model not evaluated yet. Call train_model() first")

        print("\n" + "="*50)
        print("Final Model Evaluation Metrics:")
        print("-"*50)
        for metric, value in self.evaluation_results.items():
            if metric not in ["eval_loss", "epoch"]:
                print(f"{metric.upper():<15}: {value:.4f}")
        print("="*50 + "\n")

    def train_model(self, test_size: float = 0.2, random_state: int = 42):
        """Prepare the data, fine-tune the model and print validation metrics.

        Args:
            test_size: Fraction of rows held out for validation.
            random_state: Seed used for the split, for torch before the model
                is built, and for the training arguments.

        The split is stratified on the dominant topic. The best checkpoint
        according to validation accuracy is restored at the end, and training
        stops early after three evaluations without improvement. The final
        model is not written to ``output_dir`` automatically; call
        ``save_trained_model()`` for that.
        """
        self.__prepare_data__()

        train_df, val_df = train_test_split(
            self.data,
            test_size=test_size,
            random_state=random_state,
            stratify=self.data["main_topic"],
        )

        # Seed torch before the model is built so that the freshly initialised
        # classification head is reproducible across runs.
        torch.manual_seed(random_state)
        self.__load_model__()

        train_dataset = self.__tokenize_data__(train_df)
        val_dataset = self.__tokenize_data__(val_df)

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            lr_scheduler_type="linear",
            warmup_steps=100,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_dir="./logs",
            logging_steps=10,
            report_to="none",
            save_total_limit=1,
            seed=random_state,
        )

        self.trainer = CustomTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.__compute_metrics__,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        )

        self.trainer.train()

        self.evaluation_results = self.trainer.evaluate()
        self.__print_final_metrics__()

    def save_trained_model(self, model_path: str = ""):
        """Write model, tokenizer and id-to-topic mapping to a directory.

        Produces the layout that ``load_trained_model`` reads back.

        Args:
            model_path: Target directory; ``output_dir`` is used when empty.

        Raises:
            ValueError: If no model/tokenizer is loaded yet.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError(
                "Model not loaded. Call train_model() or load_trained_model() first"
            )

        target_dir = model_path or self.output_dir
        self.model.save_pretrained(target_dir)
        self.tokenizer.save_pretrained(target_dir)

        # JSON object keys must be strings; they are converted back on load.
        with open(f"{target_dir}/id2topic.json", "w", encoding="utf-8") as f:
            json.dump(
                {str(k): v for k, v in self.id2topic.items()}, f, ensure_ascii=False
            )

    def load_trained_model(self, model_path: str):
        """Load tokenizer, model and id-to-topic mapping from a directory.

        Args:
            model_path: Directory containing the saved model, tokenizer and
                an ``id2topic.json`` file.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(self.device)

        with open(f"{model_path}/id2topic.json", "r", encoding="utf-8") as f:
            loaded = json.load(f)
        # Keys were stored as strings; restore the integer class ids.
        self.id2topic = {int(k): v for k, v in loaded.items()}

    def predict(self, text: str) -> str:
        """Return the name of the most likely topic for ``text``.

        Args:
            text: Raw input text.

        Returns:
            The topic name mapped to the arg-max logit.

        Raises:
            ValueError: If no model/tokenizer is loaded yet.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError(
                "Model not loaded. Call train_model() or load_trained_model() first"
            )

        self.model.eval()
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.maximum_sequence_length,
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        predicted_id = torch.argmax(logits, dim=-1).item()
        return self.id2topic[predicted_id]

In [None]:
# Fine-tune a topic classifier on the additional news set: three text fields
# form the input and the eight topic_* columns form the target distribution.
classifier_add = TopicClassifier(
    data_path="relative_labeled_add_news.xlsx",
    text_columns=["title", "summary", "content"],
    topic_columns=[
        "topic_0",
        "topic_1",
        "topic_2",
        "topic_3",
        "topic_4",
        "topic_5",
        "topic_6",
        "topic_7",
    ],
    output_dir="./drive/MyDrive/classificator_add",
)
classifier_add.train_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map:   0%|          | 0/13943 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map:   0%|          | 0/3486 [00:00<?, ? examples/s]

  labels = torch.tensor(labels, dtype=torch.float32).to(logits.device)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Mse,Kl Div
1,0.5794,0.534941,0.166954,0.166954,0.035767,0.047771,0.019921,0.771757
2,0.488,0.535679,0.172978,0.172978,0.036867,0.051018,0.019945,0.772821


  labels = torch.tensor(labels, dtype=torch.float32).to(logits.device)
