# Text + Transformers

In [1]:
import pandas as pd

def concate_comments(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite each comment so that it carries the earlier comments of its group.

    Rows are scanned from top to bottom. A row whose text starts with ``'#'``
    is kept verbatim and resets the running context. Every other row becomes
    the texts seen since the last ``'#'`` row (joined with single spaces)
    followed by its own text wrapped in ``[COMMENT_START] ... [COMMENT_END]``.

    Args:
        df: Frame with a ``text`` column holding strings.

    Returns:
        A copy of ``df`` whose ``text`` column contains the rewritten strings.
        The input frame is not modified.
    """
    history = []    # texts seen since the last '#' row
    rewritten = []  # one output string per input row, in order

    for text in df['text']:
        if text.startswith('#'):
            # Marker row: start a fresh context and keep the row untouched.
            history = []
            rewritten.append(text)
            continue

        marked = f"[COMMENT_START] {text} [COMMENT_END]"
        if history:
            # Prefix the current comment with everything that came before it.
            marked = f"{' '.join(history)} {marked}"
        rewritten.append(marked)
        history.append(text)

    result = df.copy()
    result['text'] = rewritten
    return result

def delete_hashs(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row whose ``text`` (viewed as a string) starts with ``'#'``.

    Returns:
        A new frame containing the remaining rows, re-indexed from 0.
    """
    is_hash_row = df['text'].astype(str).str.startswith('#')
    return df.loc[~is_hash_row].reset_index(drop=True)

def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the emotion/sentiment label columns to 0/1 integers.

    Each value is mapped by Python truthiness (``1 if x else 0``), so any
    truthy value -- including ``NaN`` -- becomes 1.

    Args:
        df: Frame containing the capitalised label columns listed below.

    Returns:
        A new frame with the label columns encoded. The input frame is left
        untouched so that re-running a cell never sees half-encoded data.
    """
    label_columns = [
        'Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear', 'Sadness',
        'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral',
    ]

    # Work on a copy: assigning into `df` directly would silently rewrite the
    # caller's frame.
    encoded = df.copy()
    for col in label_columns:
        encoded[col] = encoded[col].apply(lambda x: 1 if x else 0)
    return encoded

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every column name lower-cased.

    The input frame keeps its original column names; only the returned frame
    is renamed.
    """
    # Copy first: reassigning `df.columns` would rename the caller's frame too.
    renamed = df.copy()
    renamed.columns = renamed.columns.str.lower()
    return renamed

def delete_empty(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose ``text`` (viewed as a string) is longer than 2 characters.

    Returns:
        A new frame containing the surviving rows, re-indexed from 0.
    """
    long_enough = df['text'].astype(str).str.len().gt(2)
    return df.loc[long_enough].reset_index(drop=True)

def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Run the raw-to-clean preprocessing pipeline on ``df``.

    The steps are applied in this order: build the comment context, drop the
    ``'#'`` marker rows, encode the labels as 0/1 and lower-case the column
    names.
    """
    pipeline = (concate_comments, delete_hashs, encode_labels, rename_columns)

    result_df = df
    for step in pipeline:
        result_df = step(df=result_df)
    return result_df

In [2]:
import os
from typing import List

def load_data() -> List:
    """Load the train, val and test splits, building the cleaned cache on first use.

    For each split, ``../data/clean/tf_<split>.csv`` is read when it exists.
    Otherwise ``../data/raw/<split>.csv`` is read, passed through
    ``transform`` and written to the clean location for later runs.

    Returns:
        A list ``[train, val, test]`` of DataFrames, in that order.
    """
    data = []

    # `split` rather than `type`, so the builtin is not shadowed.
    for split in ['train', 'val', 'test']:
        clean_path = f'../data/clean/tf_{split}.csv'

        if os.path.exists(clean_path):
            df = pd.read_csv(clean_path, index_col=0)
        else:
            df = pd.read_csv(f'../data/raw/{split}.csv')
            df = transform(df=df)
            # The cache directory may not exist on a fresh checkout; without
            # this, to_csv raises instead of creating the cache.
            os.makedirs(os.path.dirname(clean_path), exist_ok=True)
            df.to_csv(clean_path)

        data.append(df)

    return data

In [3]:
# Load the train/val/test frames (read from the cleaned CSV cache when it exists).
data = load_data()

In [4]:
# load_data() appends the splits in the order train, val, test.
train, val, test = data[0], data[1], data[2]

In [5]:
# Preview the first ten rows of the training split.
train.head(10)

Unnamed: 0,text,joy,trust,anticipation,surprise,fear,sadness,disgust,anger,positive,negative,neutral
0,[COMMENT_START] 2 gwiazdki. [COMMENT_END],0,0,0,1,0,1,0,1,0,1,0
1,2 gwiazdki. [COMMENT_START] Na tyle maksymalni...,0,0,0,0,0,1,1,1,0,1,0
2,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,0,0,0,0,0,0,1,1,0,1,0
3,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,0,0,0,0,0,1,1,1,0,1,0
4,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,1,0,0,0,0,0,0,1,0,1,1
5,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,1,0,0,0,0,0,0,0,1,0,1
6,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,1,0,0,0,0,0,0,0,1,0,1
7,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,0,0,0,1,0,1,0,0,0,1,0
8,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,1,0,0,0,0,0,0,1,1,1,1
9,2 gwiazdki. Na tyle maksymalnie zasługuje ten ...,0,0,0,0,0,0,0,1,0,1,0


In [6]:
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    """Dataset pairing each text with its label vector, tokenized on access.

    Every item is tokenized when it is requested, truncated and padded to
    ``max_length`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Store the inputs as given; nothing is tokenized up front."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its labels.

        Returns:
            A dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a float ``labels`` tensor built from ``labels[idx]``.
        """
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D for each returned field.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.FloatTensor(self.labels[idx])
        return sample


In [7]:
from typing import Dict
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

def compute_metrics(eval_pred) -> Dict:
    """Compute macro and micro F1 from a ``(predictions, labels)`` pair.

    The predictions are passed through a sigmoid and thresholded at 0.5 to
    obtain one 0/1 decision per label before scoring.

    Returns:
        A dict with the keys ``f1_macro`` and ``f1_micro``.
    """
    logits, labels = eval_pred

    probabilities = torch.sigmoid(torch.tensor(logits))
    decisions = (probabilities > 0.5).float()

    scores = {}
    scores['f1_macro'] = f1_score(labels, decisions, average='macro', zero_division=0)
    scores['f1_micro'] = f1_score(labels, decisions, average='micro', zero_division=0)
    return scores

In [8]:
from typing import Tuple

def prepare_data(df: pd.DataFrame) -> Tuple:
    """Split a cleaned frame into texts, a float label matrix and the label names.

    Args:
        df: Frame with a ``text`` column and the lower-case emotion columns
            listed below.

    Returns:
        ``(texts, labels, emotion_columns)``: the ``text`` column as a list,
        the label columns as a float array with one row per sample, and the
        list of label column names in matrix column order.
    """
    emotion_columns = [
        'joy', 'trust', 'anticipation', 'surprise', 'fear', 'sadness',
        'disgust', 'anger', 'positive', 'negative', 'neutral',
    ]

    texts = df['text'].tolist()
    label_matrix = df[emotion_columns].to_numpy().astype(float)
    return texts, label_matrix, emotion_columns

In [9]:
import pandas as pd
import numpy as np

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.trainer_callback import EarlyStoppingCallback

def train_emotion_classifier(train: pd.DataFrame, test: pd.DataFrame, val: pd.DataFrame, model_name: str='sdadas/polish-distilroberta', test_size: float=0.2):
    """Fine-tune a pretrained model for multi-label emotion classification.

    The model is trained on ``train``, evaluated on ``val`` during training
    (with early stopping on ``f1_macro``) and finally evaluated on ``test``.
    The resulting model and tokenizer are saved to
    ``../models/transformers_final``.

    Args:
        train: Training split, as accepted by ``prepare_data``.
        test: Held-out split, evaluated once after training.
        val: Validation split, used for periodic evaluation.
        model_name: Hugging Face checkpoint to start from.
        test_size: Unused; kept so existing callers keep working.

    Returns:
        ``(trainer, model, tokenizer)``.
    """
    train_texts, train_labels, emotion_columns = prepare_data(train)
    test_texts, test_labels, _ = prepare_data(test)
    val_texts, val_labels, _ = prepare_data(val)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(emotion_columns),
        problem_type="multi_label_classification"
    )

    train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
    test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)
    val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

    training_args = TrainingArguments(
        output_dir='./emotion_model',
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        # Mixed precision needs a CUDA device; requesting it on a CPU-only
        # machine makes TrainingArguments raise.
        fp16=torch.cuda.is_available(),
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="steps",
        eval_steps=250,
        save_strategy="steps",
        save_steps=250,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        # Disable wandb and other reporters. This must be the string "none":
        # passing None is treated as "all installed integrations".
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        # Stop once f1_macro has failed to improve for 3 consecutive evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    print("Staring fine-tuning...")
    trainer.train()

    trainer.save_model('../models/transformers_final')
    tokenizer.save_pretrained('../models/transformers_final')

    print("\nEvaluation:")
    test_results = trainer.evaluate(test_dataset)
    for key, value in test_results.items():
        print(f"{key}: {value:.4f}")

    return trainer, model, tokenizer




In [10]:
# Fine-tune from 'sdadas/polish-roberta-base-v2' instead of the function's default checkpoint.
trainer, model, tokenizer = train_emotion_classifier(train=train, test=test, val=val, model_name='sdadas/polish-roberta-base-v2')

tokenizer_config.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sdadas/polish-roberta-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Staring fine-tuning...




Step,Training Loss,Validation Loss,F1 Macro,F1 Micro
250,0.5127,0.395444,0.356632,0.688107
500,0.3009,0.296459,0.556419,0.785477
750,0.2488,0.263826,0.57867,0.80554
1000,0.2056,0.251273,0.592161,0.810029
1250,0.1828,0.248687,0.629284,0.823074
1500,0.1633,0.243865,0.627706,0.824719





Evaluation:




eval_loss: 0.2572
eval_f1_macro: 0.6247
eval_f1_micro: 0.8050
eval_runtime: 236.4382
eval_samples_per_second: 5.3460
eval_steps_per_second: 0.3340
epoch: 8.0000
