# 03 - Baseline Models

This notebook trains traditional ML baseline models:
- Logistic Regression
- Random Forest

Both models are trained on TF-IDF features, with and without random oversampling of the training set, and are evaluated on the held-out test split.

> **Note on GPU Acceleration**: scikit-learn runs on CPU. For GPU-accelerated
> traditional ML, consider [cuML](https://docs.rapids.ai/api/cuml/stable/)
> from NVIDIA RAPIDS, which provides drop-in replacements for sklearn models.
> For this dataset size (~20k samples), CPU performance is sufficient.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

from src.config import RANDOM_SEED, LABEL_TO_ID
from src.data.dataset import load_emotion_data

## Load and Prepare Data

In [None]:
# Load the train / validation / test splits.
# NOTE(review): resample=False presumably returns the splits without any
# resampling applied by the loader -- confirm against src.data.dataset.
train_df, val_df, test_df = load_emotion_data(resample=False)

# Convert string categories to numeric ids.
y_train = train_df['category'].map(LABEL_TO_ID)
y_val = val_df['category'].map(LABEL_TO_ID)
y_test = test_df['category'].map(LABEL_TO_ID)

# Series.map yields NaN for any category that has no entry in LABEL_TO_ID.
# Fail here with a clear message instead of letting a NaN label surface later
# as an opaque scikit-learn error during fitting or scoring.
for split_name, labels in (('train', y_train), ('val', y_val), ('test', y_test)):
    assert labels.notna().all(), (
        f"{split_name} split contains categories missing from LABEL_TO_ID"
    )

# Vectorize text: unigrams + bigrams, English stop words removed, and terms
# that appear in fewer than 0.1% of the training documents dropped.
# The vocabulary and IDF weights are fitted on the training text only and then
# reused for the validation and test text.
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df['text'])
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])

print(f"Feature shape: {X_train.shape}")

## Logistic Regression

In [None]:
# Baseline: logistic regression fitted on the original (non-resampled)
# TF-IDF training features, then used to predict the test split.
lr_model = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000).fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Compute the test-set metrics up front so the reporting below is print-only.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_f1 = f1_score(y_test, y_pred, average='weighted')
lr_report = classification_report(y_test, y_pred)

print("Logistic Regression Results:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"F1 Score: {lr_f1:.4f}")
print("\nClassification Report:")
print(lr_report)

## Logistic Regression with Oversampling

In [None]:
# Apply random oversampling to the training data only: every class except the
# majority class is resampled ('not majority'). The test split is left
# untouched so the scores stay comparable with the non-resampled baseline.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=RANDOM_SEED)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Show the class counts before and after to confirm the rebalancing.
print(f"Before resampling: {Counter(y_train)}")
print(f"After resampling: {Counter(y_train_resampled)}")

lr_model_balanced = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
lr_model_balanced.fit(X_train_resampled, y_train_resampled)

y_pred_balanced = lr_model_balanced.predict(X_test)
print("\nLogistic Regression (Resampled) Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_balanced):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_balanced, average='weighted'):.4f}")
# Per-class precision/recall is where oversampling is expected to make a
# difference, so report it here just like the non-resampled baseline does.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_balanced))

## Random Forest

In [None]:
# Baseline: a 100-tree random forest fitted on the original (non-resampled)
# TF-IDF training features; n_jobs=-1 uses all available CPU cores.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Compute the test-set metrics up front so the reporting below is print-only.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"F1 Score: {rf_f1:.4f}")
print("\nClassification Report:")
print(rf_report)

## Random Forest with Oversampling

In [None]:
# Same random forest configuration as the baseline, but fitted on the
# oversampled training set (X_train_resampled / y_train_resampled) produced in
# the logistic-regression oversampling cell.
rf_model_balanced = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
rf_model_balanced.fit(X_train_resampled, y_train_resampled)

y_pred_rf_balanced = rf_model_balanced.predict(X_test)
print("Random Forest (Resampled) Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_balanced):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf_balanced, average='weighted'):.4f}")
# Per-class precision/recall is where oversampling is expected to make a
# difference, so report it here just like the non-resampled baseline does.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_balanced))

## Model Comparison

In [None]:
# Side-by-side summary of the four baselines on the test split.
# Mapping display name -> test predictions keeps each model's row and its
# metrics tied together in one place.
predictions_by_model = {
    'Logistic Regression': y_pred,
    'LR + Resampling': y_pred_balanced,
    'Random Forest': y_pred_rf,
    'RF + Resampling': y_pred_rf_balanced,
}

# One record per model; key order fixes the column order of the table.
results = pd.DataFrame([
    {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, model_preds),
        'F1 Score': f1_score(y_test, model_preds, average='weighted'),
    }
    for model_name, model_preds in predictions_by_model.items()
])
print(results.to_string(index=False))