# Решение с обучением BERT
Весь код выполнялся в Google Colab.

In [1]:
from pathlib import Path

# Data folder; the /content/drive prefix is presumably a mounted Google Drive (confirm).
BASE_PATH = Path('/content/drive/MyDrive/ml/Tochka_bank')

# Both parquet files sit directly inside the data folder.
train_path = BASE_PATH.joinpath('train.parquet')
test_path = BASE_PATH.joinpath('test.parquet')

In [2]:
import pandas as pd

# Read both splits from the parquet files configured above.
train_df, test_df = (pd.read_parquet(parquet_file) for parquet_file in (train_path, test_path))

# Report the size of each split with the same wording for both.
for split_label, split_frame in (("Train", train_df), ("Test", test_df)):
    row_count, column_count = split_frame.shape
    print(f"{split_label} DataFrame loaded with {row_count} rows and {column_count} columns.")

# Last expression of the cell: rich preview of the first training rows.
train_df.head()

Train DataFrame loaded with 15000 rows and 7 columns.
Test DataFrame loaded with 5000 rows and 1 columns.


Unnamed: 0_level_0,text,integrity,integrity_reasoning,factuality,factuality_reasoning,truthfulness,truthfulness_reasoning
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dff0d182-5434-46e2-9183-be278f66667f,"Соцветия ромашки, которые продаются в аптеках,...",1.0,The content is an informative text about the u...,1.0,The content provides informative and fact-base...,1.0,The content provides credible information abou...
8268f315-03db-4f12-aa46-0b968c3b1b19,Кто из черниговцев сам будет убирать придомову...,1.0,The content is an informative text about a dec...,1.0,The content is a coherent and informative news...,1.0,The content provides a detailed and credible a...
dc7cd0dd-9eca-418e-8356-9050c4a17cdc,Тамбовчане смогут услышать романсы 20-30 годов...,1.0,This is an informative announcement about a cu...,1.0,The content provides detailed information abou...,1.0,The content provides credible information abou...
e9f43939-22b1-4a4e-ab5a-34a25b269039,Человек может отказаться от ТВ и проигрывателе...,1.0,The content is an informative text discussing ...,1.0,The content is a detailed exploration of super...,0.0,The content is based on superstitions and esot...
412b31bb-ba09-44ae-abd7-49c9b783b005,Простая интеграция в системы PROFINET®\nВозмож...,1.0,The content is an informative text about a PRO...,1.0,The content provides a detailed and coherent d...,1.0,The content provides a detailed and coherent d...


In [3]:
from sklearn.model_selection import train_test_split

# The three target columns that are stratified on and filtered by below.
target_cols = ["integrity", "factuality", "truthfulness"]

# Composite key "<integrity>_<factuality>_<truthfulness>" so the split keeps the
# joint distribution of the three targets.
stratify_col = train_df[target_cols].astype(str).agg("_".join, axis=1)

train_part, val_all = train_test_split(
    train_df,
    test_size=0.2,
    stratify=stratify_col,
    random_state=42,
)

print(f"Val with 0.5: {len(val_all)}")

# Validation keeps only rows in which none of the three targets equals 0.5.
no_half_labels = (val_all[target_cols] != 0.5).all(axis=1)
val_part = val_all[no_half_labels]
print(f"Val:   {len(val_part)}")
print(f"Train: {len(train_part)}")

Val with 0.5: 3000
Val:   1871
Train: 12000


## Датасеты

In [4]:
import torch
from torch.utils.data import Dataset

class TextQualityDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors
    padded/truncated to ``max_len``. Unless ``is_test`` is set, the dict also
    holds a float ``labels`` tensor with the row's ``integrity``,
    ``factuality`` and ``truthfulness`` values, in that order.

    Args:
        df: DataFrame with a ``text`` column and, when ``is_test`` is False,
            the three target columns.
        tokenizer: Callable tokenizer accepting the Hugging Face style keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
        is_test: When True, no ``labels`` entry is produced.
    """

    def __init__(self, df, tokenizer, max_len=512, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        # Order here defines the order of the values in the ``labels`` tensor.
        self.targets = ['integrity', 'factuality', 'truthfulness']

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, item):
        """Tokenize the row at positional index ``item`` and return its tensors."""
        row = self.df.iloc[item]
        text = str(row['text'])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour of
        # ``__call__``, which takes the same arguments and returns the same encoding.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so that
        # batching code can stack the items itself.
        output = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        if not self.is_test:
            # The row is a mixed-type Series, so cast the selected targets to float.
            labels = row[self.targets].values.astype(float)
            output['labels'] = torch.tensor(labels, dtype=torch.float)

        return output

# `tokenizer` (used right below) and `model` (used by the training cell) are not
# defined by any earlier cell, so a fresh-kernel run would stop with NameError.
# Load both here so the notebook runs top to bottom.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# NOTE(review): checkpoint name is taken from the model-loading log shown in this
# notebook's output — confirm it is the one intended.
MODEL_NAME = 'ai-forever/sbert_large_nlu_ru'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# One output logit per target column handled by TextQualityDataset.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

train_dataset = TextQualityDataset(train_part, tokenizer, is_test=False)
val_dataset = TextQualityDataset(val_part, tokenizer, is_test=False)

## Custom Trainer с маскированием Loss
Поэлементный BCE-лосс умножается на маску (labels != 0.5) и усредняется только по незамаскированным меткам. Это позволяет модели обучаться на данных, где часть меток определена (0 или 1), а часть спорная (0.5): спорные значения не участвуют в расчёте градиентов.

In [5]:
import torch.nn as nn
from transformers import Trainer
import numpy as np
from sklearn.metrics import f1_score

class CustomTrainer(Trainer):
    """Trainer whose loss is a BCE-with-logits averaged only over labels different from 0.5."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the masked binary cross-entropy for one batch.

        Args:
            model: Model invoked as ``model(**inputs)``; its output must expose ``logits``.
            inputs: Batch mapping; its ``labels`` entry is the target tensor.
            return_outputs: When True, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted for signature compatibility; not used here.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is True.
        """
        targets = inputs.get("labels")
        model_outputs = model(**inputs)
        scores = model_outputs.get("logits")

        # Keep the per-element losses so individual label slots can be switched off.
        elementwise_bce = nn.BCEWithLogitsLoss(reduction='none')
        per_label_loss = elementwise_bce(scores, targets)

        # 1.0 where the label takes part in the loss, 0.0 where it equals 0.5.
        keep = (targets != 0.5).float()
        # The clamp keeps the division finite when every label in the batch is masked.
        kept_count = keep.sum().clamp(min=1e-6)
        final_loss = (per_label_loss * keep).sum() / kept_count

        if return_outputs:
            return (final_loss, model_outputs)
        return final_loss

def compute_metrics(eval_pred):
    """Return the macro F1 over all label slots whose true value is not 0.5.

    Args:
        eval_pred: Pair ``(logits, labels)`` of NumPy arrays with matching element
            counts (presumably the same shape — confirm against the Trainer).

    Returns:
        Dict with the single key ``'f1_macro'``.
    """
    raw_logits, true_labels = eval_pred

    # Logistic sigmoid, followed by a hard decision at 0.5.
    sigmoid = 1 / (1 + np.exp(-raw_logits))
    hard_predictions = (sigmoid > 0.5).astype(int).flatten()
    flat_truth = true_labels.flatten()

    # All outputs are pooled into one vector; slots labelled 0.5 are left out.
    keep = flat_truth != 0.5
    macro_f1 = f1_score(flat_truth[keep], hard_predictions[keep], average='macro')

    return {'f1_macro': macro_f1}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Обучение

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: effective train batch of 16 per device (8 x 2 accumulation
# steps), mixed precision, cosine schedule with warm-up, and one evaluation plus one
# checkpoint per epoch so the best epoch by macro F1 can be restored at the end.
# NOTE(review): output_dir is a Kaggle-style path although the notebook states it ran
# in Colab — confirm this is where checkpoints should go.
training_args = TrainingArguments(
    output_dir='/kaggle/working/bert_model_output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=True,
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.1,
    # NOTE(review): CustomTrainer.compute_loss builds its own loss and does not call
    # the parent implementation, so this setting appears to have no effect — confirm.
    label_smoothing_factor=0.1,
    warmup_ratio=0.15,
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for Trainer and triggers a warning on this call;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting training...")
trainer.train()

# Score the model once more after training (the best checkpoint has been restored by
# load_best_model_at_end) so the reported final F1 is produced by code in this cell.
final_metrics = trainer.evaluate()
print(f"Final Evaluation F1: {final_metrics['eval_f1_macro']}")

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.2836,0.205344,0.747413
2,0.2389,0.18763,0.801054
3,0.168,0.179224,0.827647
4,0.1138,0.182273,0.835151
5,0.0825,0.188354,0.832565


Final Evaluation F1: 0.8351505191738371


In [None]:
# Store the fine-tuned weights under BASE_PATH.
weights_path = BASE_PATH / 'BERT solve/second_model_5epoch_.pth'
# torch.save does not create missing folders, so make sure the target directory exists.
weights_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), weights_path)