In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score
import torch
import torch.nn.functional as F
from transformers import TrainerCallback
import logging
import numpy as np
from pathlib import Path

In [18]:
# Кастомный коллбек для мониторинга
class CustomCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        logging.info(f"Epoch {state.epoch}: {logs}")
        print(f"Epoch {state.epoch}: {logs}")

In [19]:
DATA_PATH = Path('/kaggle/input/dls-nlp-workshop/NLP DATA')

In [20]:
data = pd.read_csv(DATA_PATH / 'train.csv.csv')

In [21]:
data['tags'] = data['tags'].fillna('')
data['input_text'] = data['tags'] + " " + data['text'].fillna('').apply(lambda x: x.lower().strip())
labels = data[[f'trend_id_res{i}' for i in range(50)]].values  # Assuming 50 classes

In [22]:
X_train, X_test, y_train, y_test = train_test_split(data['input_text'], labels, test_size=0.2)

In [23]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)



In [24]:
train_data = pd.DataFrame({'text': X_train.tolist(), 'labels': list(y_train)})
test_data = pd.DataFrame({'text': X_test.tolist(), 'labels': list(y_test)})

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3698 [00:00<?, ? examples/s]

Map:   0%|          | 0/925 [00:00<?, ? examples/s]

In [25]:
train_dataset = train_dataset.remove_columns(['text'])
test_dataset = test_dataset.remove_columns(['text'])
train_dataset.set_format('torch')
test_dataset.set_format('torch')

In [26]:
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=50)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
class CustomTrainer(Trainer):
    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights
    
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop('labels')
        
        # Move labels and inputs to the same device as the model
        labels = labels.to(model.device)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        outputs = model(**inputs)
        logits = outputs.get('logits')
        
        # Custom loss function with label weights
        loss = F.binary_cross_entropy_with_logits(
            logits, labels.to(torch.float32), pos_weight=self.label_weights
        )
        return (loss, outputs) if return_outputs else loss

In [28]:
label_weights = 1 - labels.sum(axis=0) / labels.sum()
label_weights = torch.from_numpy(label_weights).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
label_weights

tensor([0.8761, 0.9486, 0.9116, 0.9466, 0.9813, 0.9923, 0.9969, 0.9948, 0.9793,
        0.9985, 0.9858, 0.9840, 0.9076, 0.9950, 0.9892, 0.9880, 0.9717, 0.9987,
        0.9658, 0.9467, 0.9805, 0.9853, 0.9963, 0.9904, 0.9985, 0.9981, 0.9960,
        0.9216, 0.9378, 0.9910, 0.9547, 0.9926, 0.9981, 0.9959, 0.9984, 0.9914,
        0.9753, 0.9932, 0.9959, 0.9935, 0.9893, 0.9969, 0.9948, 0.9935, 0.9960,
        0.9982, 0.9990, 0.9954, 0.9957, 0.9982], device='cuda:0',
       dtype=torch.float64)

In [29]:
def compute_metrics(p):
    predictions, labels = p
    f1_micro = f1_score(labels, predictions > 0, average = 'micro')
    f1_macro = f1_score(labels, predictions > 0, average = 'macro')
    f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
    accuracy = accuracy_score(labels, predictions > 0)
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'accuracy': accuracy
    }

In [30]:
training_args = TrainingArguments(
    output_dir='./results',          
    report_to='none',
    evaluation_strategy="epoch",     
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,   
    num_train_epochs=3,              
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    save_steps=500,                  
    save_total_limit=2,              
    metric_for_best_model="accuracy", 
    logging_strategy="epoch",        
)




In [31]:
trainer = CustomTrainer(
    model=model,                          
    args=training_args,                   
    train_dataset=train_dataset,          
    eval_dataset=test_dataset,            
    compute_metrics=compute_metrics,      
    label_weights=label_weights,          
    callbacks=[CustomCallback()]          
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,Accuracy
1,0.1687,0.104126,0.0,0.0,0.0,0.0
2,0.1094,0.102929,0.0,0.0,0.0,0.0


Epoch 1.0: None


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Epoch 2.0: None


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
