# Необходимые библиотеки

In [1]:
%%capture
!pip install transformers datasets evaluate torch pandas scikit-learn

In [2]:
import pandas as pd
import numpy as np
import torch
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import evaluate
from sklearn.metrics import f1_score, roc_auc_score
from typing import List, Dict

# Подключение диска

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Класс предсказателя

In [5]:
class TopicClassifier:
    def __init__(
        self,
        data_path: str,
        columns: List[str],
        maximum_sequence_length: int = 200,
        output_dir: str = "./model"
    ):
        try:
            self.data = pd.read_excel(data_path)
        except FileNotFoundError:
            raise ValueError(f"File {data_path} not found!")

        self.model_name = "nikitast/multilang-classifier-roberta"
        self.columns = columns
        self.maximum_sequence_length = maximum_sequence_length
        self.output_dir = output_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.topic2id: Dict[str, int] = {}
        self.id2topic: Dict[int, str] = {}
        self.num_labels: int = 0
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.evaluation_results: Dict[str, float] = {}

    def __prepare_data__(self):
        self.data['text'] = self.data[self.columns].apply(
            lambda x: ' '.join(x.dropna().astype(str)), axis=1
        )

        unique_topics = self.data['topic'].unique()
        self.topic2id = {topic: i for i, topic in enumerate(unique_topics)}
        self.id2topic = {i: topic for i, topic in enumerate(unique_topics)}

        self.num_labels = len(self.topic2id)
        if self.num_labels < 2:
            raise ValueError("At least 2 classes required for classification")

        self.data['label'] = self.data['topic'].map(self.topic2id)

    def __load_model__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            problem_type="single_label_classification",
            ignore_mismatched_sizes=True
        ).to(self.device)

    def __tokenize_data__(self, df: pd.DataFrame) -> Dataset:
        dataset = Dataset.from_pandas(df[['text', 'label']])

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=self.maximum_sequence_length
            )

        return dataset.map(tokenize_function, batched=True)

    def __compute_metrics__(self, eval_pred) -> Dict[str, float]:
        accuracy_metric = evaluate.load("accuracy")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        metrics = {
            "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
            "f1_micro": f1_score(labels, predictions, average="micro"),
            "f1_macro": f1_score(labels, predictions, average="macro"),
            "f1_weighted": f1_score(labels, predictions, average="weighted"),
        }

        try:
            if logits.shape[1] == 2:
                metrics["roc_auc"] = roc_auc_score(labels, logits[:, 1])
            else:
                metrics["roc_auc"] = roc_auc_score(
                    labels, logits, multi_class="ovo", average="macro"
                )
        except ValueError:
            metrics["roc_auc"] = float("nan")

        return metrics

    def __print_final_metrics__(self):
        if not self.evaluation_results:
            raise ValueError("Model not evaluated yet. Call train_model() first")

        print("\n" + "="*50)
        print("Final Model Evaluation Metrics:")
        print("-"*50)
        for metric, value in self.evaluation_results.items():
            if metric not in ["eval_loss", "epoch"]:
                print(f"{metric.upper():<15}: {value:.4f}")
        print("="*50 + "\n")

    def train_model(self):
        self.__prepare_data__()
        train_df, val_df = train_test_split(
            self.data,
            test_size=0.2,
            random_state=42,
            stratify=self.data['topic']
        )

        self.__load_model__()

        train_dataset = self.__tokenize_data__(train_df)
        val_dataset = self.__tokenize_data__(val_df)

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            lr_scheduler_type="linear",
            warmup_steps=100,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=10,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_dir='./logs',
            logging_steps=10,
            report_to="none",
            save_total_limit=1
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.__compute_metrics__,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        self.trainer.train()

        self.evaluation_results = self.trainer.evaluate()
        self.__print_final_metrics__()

        # self.model.save_pretrained(self.output_dir)
        # self.tokenizer.save_pretrained(self.output_dir)

        # with open(f"{self.output_dir}/id2topic.json", "w") as f:
        #     json.dump({str(k): v for k, v in self.id2topic.items()}, f)

    def load_trained_model(self, model_path: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(self.device)

        with open(f"{model_path}/id2topic.json", "r") as f:
            self.id2topic = {int(k): v for k, v in json.load(f).items()}

    def predict(self, text: str) -> str:
        self.model.eval()
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.maximum_sequence_length
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        predicted_id = torch.argmax(logits, dim=-1).item()
        return self.id2topic[predicted_id]

# Обучение различных классификаторов

## Без tfidf слов

In [None]:
classifier = TopicClassifier(
    data_path="labeled_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator"
)
classifier.train_model()

# trained_classifier = TopicClassifier(
#     data_path="labeled_news.xlsx",
#     columns=["title", "summary", "content"]
# )
# trained_classifier.load_trained_model("./my_model")
# print(trained_classifier.predict("Пример текста для классификации"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,1.9455,1.912547,0.207841,0.207841,0.049165,0.071529,
2,1.8637,1.858548,0.273187,0.273187,0.182793,0.216839,
3,1.9089,1.84147,0.291315,0.291315,0.218391,0.252497,
4,1.7863,1.828014,0.280776,0.280776,0.236304,0.260405,
5,1.7026,1.858875,0.283727,0.283727,0.252036,0.271397,
6,1.6544,1.887111,0.288786,0.288786,0.247983,0.271252,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.2913
EVAL_F1_MICRO  : 0.2913
EVAL_F1_MACRO  : 0.2184
EVAL_F1_WEIGHTED: 0.2525
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 27.1463
EVAL_SAMPLES_PER_SECOND: 87.3780
EVAL_STEPS_PER_SECOND: 2.7630



## С tfidf стоп-словами 1 процент

In [None]:
classifier_tfidf_1 = TopicClassifier(
    data_path="labeled_tfidf_1_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_1"
)
classifier_tfidf_1.train_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0091,2.03712,0.177909,0.177909,0.037773,0.053761,
2,2.0388,2.034255,0.181703,0.181703,0.087552,0.117796,
3,2.0425,2.032336,0.174115,0.174115,0.065957,0.085455,
4,2.0436,2.026479,0.18086,0.18086,0.06187,0.081652,
5,1.9633,2.046991,0.18339,0.18339,0.090609,0.118566,
6,2.0408,2.048009,0.191821,0.191821,0.095337,0.12698,
7,1.9431,2.108483,0.175379,0.175379,0.108491,0.135053,
8,1.8957,2.141278,0.171585,0.171585,0.107942,0.134375,
9,1.8665,2.18621,0.172007,0.172007,0.114947,0.1404,



Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1918
EVAL_F1_MICRO  : 0.1918
EVAL_F1_MACRO  : 0.0953
EVAL_F1_WEIGHTED: 0.1270
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 27.3924
EVAL_SAMPLES_PER_SECOND: 86.5930
EVAL_STEPS_PER_SECOND: 2.7380



## С tfidf стоп-словами 2 процента

In [None]:
classifier_tfidf_2 = TopicClassifier(
    data_path="labeled_tfidf_2_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_2"
)
classifier_tfidf_2.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0447,2.036699,0.177909,0.177909,0.037759,0.053742,
2,2.0593,2.032524,0.177909,0.177909,0.061341,0.086616,
3,2.0251,2.034219,0.18339,0.18339,0.065409,0.092102,
4,2.0522,2.033181,0.174536,0.174536,0.073302,0.095642,
5,2.0372,2.030471,0.177909,0.177909,0.047515,0.065268,
6,2.054,2.031509,0.18086,0.18086,0.090016,0.120588,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1834
EVAL_F1_MICRO  : 0.1834
EVAL_F1_MACRO  : 0.0654
EVAL_F1_WEIGHTED: 0.0921
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 28.8402
EVAL_SAMPLES_PER_SECOND: 82.2460
EVAL_STEPS_PER_SECOND: 2.6010



## С tfidf стоп-словами 3 процента

In [None]:
classifier_tfidf_3 = TopicClassifier(
    data_path="labeled_tfidf_3_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_3"
)
classifier_tfidf_3.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0643,2.035522,0.178752,0.178752,0.037911,0.054214,
2,2.0489,2.034549,0.178752,0.178752,0.037911,0.054214,
3,2.0641,2.035202,0.178752,0.178752,0.037911,0.054214,
4,2.0512,2.034507,0.17285,0.17285,0.036844,0.050948,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1788
EVAL_F1_MICRO  : 0.1788
EVAL_F1_MACRO  : 0.0379
EVAL_F1_WEIGHTED: 0.0542
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 27.3235
EVAL_SAMPLES_PER_SECOND: 86.8120
EVAL_STEPS_PER_SECOND: 2.7450



## С tfidf стоп-словами 4 процента

In [None]:
classifier_tfidf_4 = TopicClassifier(
    data_path="labeled_tfidf_4_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_4"
)
classifier_tfidf_4.train_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,1.9371,1.910423,0.198145,0.198145,0.04725,0.065537,
2,1.8963,1.909449,0.198145,0.198145,0.04725,0.065537,
3,1.9444,1.909448,0.198145,0.198145,0.04725,0.065537,
4,1.9225,1.907353,0.198145,0.198145,0.083406,0.105405,



Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1981
EVAL_F1_MICRO  : 0.1981
EVAL_F1_MACRO  : 0.0473
EVAL_F1_WEIGHTED: 0.0655
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 27.1827
EVAL_SAMPLES_PER_SECOND: 87.2610
EVAL_STEPS_PER_SECOND: 2.7590



## С tfidf стоп-словами 5 процентов

In [None]:
classifier_tfidf_5 = TopicClassifier(
    data_path="labeled_tfidf_5_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_5"
)
classifier_tfidf_5.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,1.7736,1.752169,0.200253,0.200253,0.05755,0.069232,
2,1.7854,1.74976,0.206998,0.206998,0.079856,0.099897,
3,1.7288,1.739797,0.230607,0.230607,0.117045,0.14485,
4,1.7361,1.741758,0.214165,0.214165,0.110911,0.136364,
5,1.7625,1.73518,0.235245,0.235245,0.119843,0.148272,
6,1.7431,1.756683,0.216695,0.216695,0.125023,0.154647,
7,1.6637,1.822737,0.209528,0.209528,0.167097,0.188276,
8,1.5818,1.863067,0.201518,0.201518,0.141114,0.167695,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.2352
EVAL_F1_MICRO  : 0.2352
EVAL_F1_MACRO  : 0.1198
EVAL_F1_WEIGHTED: 0.1483
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 24.9828
EVAL_SAMPLES_PER_SECOND: 94.9450
EVAL_STEPS_PER_SECOND: 3.0020



## С tfidf стоп-словами 6 процентов

In [None]:
classifier_tfidf_6 = TopicClassifier(
    data_path="labeled_tfidf_6_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_6"
)
classifier_tfidf_6.train_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,1.91,1.906467,0.194772,0.194772,0.046577,0.063504,
2,1.92,1.907254,0.196459,0.196459,0.081319,0.10633,
3,1.9119,1.905993,0.194772,0.194772,0.046577,0.063504,
4,1.9175,1.906049,0.194772,0.194772,0.046577,0.063504,
5,1.9306,1.905915,0.194772,0.194772,0.046577,0.063504,



Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1965
EVAL_F1_MICRO  : 0.1965
EVAL_F1_MACRO  : 0.0813
EVAL_F1_WEIGHTED: 0.1063
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 24.7515
EVAL_SAMPLES_PER_SECOND: 95.8330
EVAL_STEPS_PER_SECOND: 3.0300



## С tfidf стоп-словами 7 процентов

In [5]:
classifier_tfidf_7 = TopicClassifier(
    data_path="labeled_tfidf_7_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_7"
)
classifier_tfidf_7.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,1.9205,1.908692,0.1914,0.1914,0.045917,0.061519,
2,1.9374,1.908169,0.193508,0.193508,0.085159,0.108745,
3,1.8953,1.90797,0.177487,0.177487,0.043067,0.053507,
4,1.9137,1.908095,0.1914,0.1914,0.0459,0.061497,
5,1.9011,1.907422,0.1914,0.1914,0.0459,0.061497,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1935
EVAL_F1_MICRO  : 0.1935
EVAL_F1_MACRO  : 0.0852
EVAL_F1_WEIGHTED: 0.1087
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 25.5584
EVAL_SAMPLES_PER_SECOND: 92.8070
EVAL_STEPS_PER_SECOND: 2.9340



## С tfidf стоп-словами 8 процентов

In [5]:
classifier_tfidf_8 = TopicClassifier(
    data_path="labeled_tfidf_8_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_8"
)
classifier_tfidf_8.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0623,2.043301,0.166526,0.166526,0.035688,0.047545,
2,2.0419,2.042789,0.159359,0.159359,0.060866,0.077343,
3,2.0455,2.039937,0.166526,0.166526,0.03574,0.047613,
4,2.0223,2.03848,0.166526,0.166526,0.04139,0.055361,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1665
EVAL_F1_MICRO  : 0.1665
EVAL_F1_MACRO  : 0.0357
EVAL_F1_WEIGHTED: 0.0475
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 27.6107
EVAL_SAMPLES_PER_SECOND: 85.9090
EVAL_STEPS_PER_SECOND: 2.7160



## С tfidf стоп-словами 9 процентов

In [6]:
classifier_tfidf_9 = TopicClassifier(
    data_path="labeled_tfidf_9_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_9"
)
classifier_tfidf_9.train_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0444,2.036258,0.179595,0.179595,0.038077,0.054707,
2,2.0402,2.036,0.172428,0.172428,0.050107,0.068128,
3,2.0326,2.03671,0.179595,0.179595,0.038063,0.054687,
4,2.0328,2.036962,0.179595,0.179595,0.038077,0.054707,



Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.1796
EVAL_F1_MICRO  : 0.1796
EVAL_F1_MACRO  : 0.0381
EVAL_F1_WEIGHTED: 0.0547
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 28.1226
EVAL_SAMPLES_PER_SECOND: 84.3450
EVAL_STEPS_PER_SECOND: 2.6670



## С tfidf стоп-словами 10 процентов

In [6]:
classifier_tfidf_10 = TopicClassifier(
    data_path="labeled_tfidf_10_news.xlsx",
    columns=["title", "summary", "content"],
    output_dir="./drive/MyDrive/classificator_tfidf_10"
)
classifier_tfidf_10.train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at nikitast/multilang-classifier-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9488 [00:00<?, ? examples/s]

Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Roc Auc
1,2.0633,2.03619,0.166526,0.166526,0.035701,0.047562,
2,2.0461,2.036884,0.179595,0.179595,0.038691,0.055524,
3,2.0466,2.035558,0.171164,0.171164,0.057058,0.078332,
4,2.0186,2.029102,0.195194,0.195194,0.095443,0.126731,
5,2.0426,2.02283,0.201518,0.201518,0.109185,0.134869,
6,2.0561,2.036783,0.193086,0.193086,0.109644,0.134722,
7,1.9619,2.048548,0.194772,0.194772,0.112306,0.138294,
8,1.9145,2.089578,0.189713,0.189713,0.12426,0.149175,


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Final Model Evaluation Metrics:
--------------------------------------------------
EVAL_ACCURACY  : 0.2015
EVAL_F1_MICRO  : 0.2015
EVAL_F1_MACRO  : 0.1092
EVAL_F1_WEIGHTED: 0.1349
EVAL_ROC_AUC   : nan
EVAL_RUNTIME   : 26.1674
EVAL_SAMPLES_PER_SECOND: 90.6470
EVAL_STEPS_PER_SECOND: 2.8660

