In [1]:
# Fine-tuning sdadas/polish-distilroberta for multi-label emotion classification

In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.trainer_callback import EarlyStoppingCallback

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

In [3]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing raw texts with multi-label float targets.

    Each sample is tokenized lazily on access and padded/truncated to
    ``max_length`` tokens.

    Args:
        texts: Indexable, sized collection of texts; every entry is passed
            through ``str`` before tokenization.
        labels: Indexable, sized collection of per-sample label vectors,
            aligned position-by-position with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sample is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # A silent mismatch would either crash mid-epoch or drop labels unnoticed.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of 1-D tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output flattened to one dimension) and ``'labels'`` (float32).
        """
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a batch axis of size 1; drop it so the
            # default collate function can stack samples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor always copies the data and applies the requested
            # dtype; the legacy torch.FloatTensor(...) constructor interprets a
            # bare int as a size and returns uninitialized memory.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float32)
        }


In [4]:
def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw model outputs.

    The predictions are passed through a sigmoid and binarized at 0.5 before
    being compared with the reference labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        dict with ``'hamming_loss'`` (lower is better), ``'f1_macro'`` and
        ``'f1_micro'``.
    """
    logits, labels = eval_pred
    binarized = (torch.sigmoid(torch.tensor(logits)) > 0.5).float()

    def _f1(average):
        # zero_division=0 is passed for both averaging modes.
        return f1_score(labels, binarized, average=average, zero_division=0)

    return {
        'hamming_loss': hamming_loss(labels, binarized),
        'f1_macro': _f1('macro'),
        'f1_micro': _f1('micro')
    }

In [5]:
def prepare_data(df):
    """Split a DataFrame into texts, a float label matrix and the label names.

    Expected columns in ``df``:
    - ``'text'``: the original text (required),
    - emotion columns: Joy, Trust, Anticipation, Surprise, Fear, Sadness,
      Disgust, Anger, Positive, Negative, Neutral.
    Any other columns (e.g. embedding values in columns 0-768) are ignored.

    Returns:
        Tuple ``(texts, labels, emotion_columns)``: a list of texts, the
        emotion columns as a float array, and the list of emotion column names.

    Raises:
        ValueError: If ``df`` has no ``'text'`` column.
    """
    # Fail early when the text column is missing
    if 'text' not in df.columns:
        raise ValueError("Musisz dodać kolumnę 'text' z oryginalnym tekstem do DataFrame!")

    emotion_columns = [
        'Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear',
        'Sadness', 'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral',
    ]

    label_frame = df[emotion_columns]
    labels = label_frame.values.astype(float)
    texts = df['text'].tolist()

    return texts, labels, emotion_columns

In [6]:
def train_emotion_classifier(df, model_name='sdadas/polish-distilroberta', test_size=0.2):
    """Fine-tune a sequence-classification model for multi-label emotion prediction.

    The frame is split into train/test parts, a pretrained model is loaded with
    one output per emotion column, trained with the Hugging Face ``Trainer``,
    saved to ``./emotion_model_final`` and evaluated on the held-out part.

    Args:
        df: DataFrame accepted by ``prepare_data`` (a ``'text'`` column plus
            the emotion columns).
        model_name: Name or path of the pretrained checkpoint to fine-tune.
        test_size: Share of the samples held out, forwarded to
            ``train_test_split``.

    Returns:
        Tuple ``(trainer, model, tokenizer)``.
    """
    # Prepare the data
    texts, labels, emotion_columns = prepare_data(df)

    # Train/test split (fixed seed so the split is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42
    )

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(emotion_columns),
        problem_type="multi_label_classification"
    )

    # Build the datasets
    train_dataset = EmotionDataset(X_train, y_train, tokenizer)
    test_dataset = EmotionDataset(X_test, y_test, tokenizer)

    # Training arguments
    # NOTE(review): warmup_steps / eval_steps / save_steps are absolute step
    # counts; if the run has fewer than 500 optimizer steps no intermediate
    # evaluation or checkpoint happens -- confirm against the dataset size.
    training_args = TrainingArguments(
        output_dir='./emotion_model',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        # The string "none" disables every reporting integration (wandb
        # included). Passing the Python value None does not: in transformers
        # 4.x it falls back to "all" installed integrations.
        report_to="none",
    )

    # Trainer
    # NOTE(review): the held-out split is used both for checkpoint selection /
    # early stopping and for the final evaluation below, so the reported scores
    # are not from data unseen during model selection.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    print("Rozpoczynam fine-tuning...")
    trainer.train()

    # Save the model together with its tokenizer
    trainer.save_model('./emotion_model_final')
    tokenizer.save_pretrained('./emotion_model_final')

    # Evaluate on the held-out set
    print("\nEwaluacja na test set:")
    test_results = trainer.evaluate(test_dataset)
    for key, value in test_results.items():
        print(f"{key}: {value:.4f}")

    return trainer, model, tokenizer

In [7]:
def predict_emotions(texts, model_path='./emotion_model_final', threshold=0.5):
    """Predict emotion probabilities and binary labels for the given texts.

    Args:
        texts: Iterable of texts. A single ``str`` is treated as one text
            rather than being iterated character by character.
        model_path: Directory (or name) the tokenizer and model are loaded from.
        threshold: Probability above which an emotion counts as predicted.

    Returns:
        List with one dict per input text, holding ``'text'``,
        ``'probabilities'`` (emotion name -> sigmoid probability) and
        ``'predicted_labels'`` (emotion name -> 0/1). Empty input yields ``[]``.
    """
    emotion_labels = ['Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear',
                     'Sadness', 'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral']

    # Iterating a bare string would yield one prediction per character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Nothing to predict: skip loading the tokenizer and model altogether.
    if not texts:
        return []

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    predictions = []

    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=512
            )

            outputs = model(**inputs)
            # Drop only the batch axis; a bare squeeze() would also collapse
            # any other axis of size 1.
            probs = torch.sigmoid(outputs.logits).squeeze(0).cpu().numpy()

            pred_labels = (probs > threshold).astype(int)

            predictions.append({
                'text': text,
                'probabilities': dict(zip(emotion_labels, probs)),
                'predicted_labels': dict(zip(emotion_labels, pred_labels))
            })

    return predictions

In [8]:
# Load the raw annotated data set
df = pd.read_csv(filepath_or_buffer='../data/raw/test.csv')

def delete_hashs(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the rows whose ``text`` starts with ``'#'``.

    The ``text`` values are compared through their string form, and the
    result is re-indexed from 0.
    """
    starts_with_hash = df['text'].astype(str).str.startswith('#')
    kept_rows = df.loc[~starts_with_hash]
    return kept_rows.reset_index(drop=True)

def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the emotion columns encoded as 0/1 integers.

    A value becomes 1 when it is truthy and 0 otherwise. Missing values
    (NaN / None / pd.NA) are encoded as 0: NaN is truthy in Python, so a plain
    truthiness test would turn a missing annotation into a positive label.

    The input frame is left unmodified.
    """
    emotion_columns = ['Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear',
                       'Sadness', 'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral']

    # Work on a copy so the caller's frame is not mutated as a side effect.
    encoded = df.copy()
    for col in emotion_columns:
        encoded[col] = encoded[col].map(lambda x: 1 if pd.notna(x) and x else 0)
    return encoded

# Drop the '#'-prefixed rows, then turn the emotion columns into 0/1 labels
df = encode_labels(delete_hashs(df))

In [None]:


# Train the model; keeps the trainer, the fine-tuned model and its tokenizer
trainer, model, tokenizer = train_emotion_classifier(df=df)

# # Example prediction
# sample_texts = ["Jestem bardzo szczęśliwy dzisiaj!", "To mnie denerwuje..."]
# results = predict_emotions(sample_texts)
# for result in results:
#     print(f"Tekst: {result['text']}")
#     print(f"Przewidziane emocje: {result['predicted_labels']}")
#     print("---")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sdadas/polish-distilroberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Rozpoczynam fine-tuning...




Step,Training Loss,Validation Loss
