# 02 - Feature Engineering

This notebook covers feature extraction and class-imbalance handling, in this order:
- Bag-of-words features: CountVectorizer and TF-IDF
- Handling class imbalance with random oversampling
- BERT [CLS] embeddings

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler

from src.config import DATASET_NAME, EMOTION_LABELS, RANDOM_SEED
from src.data.dataset import load_emotion_data

## Load Data

In [None]:
# Fetch the train / validation / test frames with resample=False.
# NOTE(review): presumably this keeps the original (imbalanced) class
# distribution so it can be inspected below — confirm in src.data.dataset.
train_df, val_df, test_df = load_emotion_data(resample=False)

# Split sizes first, then the per-class row counts of the training frame.
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print(
    f"\nClass distribution (train):",
    train_df['category'].value_counts(),
    sep="\n",
)

## Bag-of-Words Features (CountVectorizer and TF-IDF)

In [None]:
# Settings common to both bag-of-words vectorizers: scikit-learn's built-in
# English stop-word list, and min_df as a float, i.e. a term must occur in at
# least 0.1% of the training documents to enter the vocabulary.
bow_kwargs = {'stop_words': 'english', 'min_df': 0.001}

# Raw term counts over unigrams only.
count_vec = CountVectorizer(ngram_range=(1, 1), **bow_kwargs)
X_train_count = count_vec.fit_transform(train_df['text'])
print(f"CountVectorizer shape: {X_train_count.shape}")

# TF-IDF weights over unigrams and bigrams.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), **bow_kwargs)
X_train_tfidf = tfidf_vec.fit_transform(train_df['text'])
print(f"TF-IDF shape: {X_train_tfidf.shape}")

## Handling Class Imbalance

In [None]:
# Label counts of the training split before any oversampling.
print("Before resampling:", Counter(train_df['category']), sep="\n")

# 'not majority' oversamples every class except the largest one.
# fit_resample expects a 2-D X, hence the reshape of the text column to
# a single-column array.
ros = RandomOverSampler(random_state=RANDOM_SEED, sampling_strategy='not majority')
X_resampled, y_resampled = ros.fit_resample(
    train_df['text'].values.reshape(-1, 1),
    train_df['category'],
)

# Label counts after oversampling.
print(f"\nAfter resampling:", Counter(y_resampled), sep="\n")

## BERT Embeddings (model name from `src.config`)

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from src.config import MODEL_NAME

# Instantiate the pretrained tokenizer and encoder named by MODEL_NAME.
# nn.Module.eval() returns the module itself, so it can be chained here; it
# puts the encoder in evaluation mode for feature extraction.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert_model = BertModel.from_pretrained(MODEL_NAME).eval()

def extract_bert_features(text, model, tokenizer, max_length=512):
    """Extract the [CLS] token embedding from BERT.

    Args:
        text: Input text passed straight to ``tokenizer``.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors accepted by ``model`` as keyword arguments.
        max_length: Maximum number of tokens kept after truncation
            (defaults to 512, the value previously hard-coded).

    Returns:
        NumPy array holding the hidden state at sequence position 0 with all
        size-1 dimensions squeezed out (1-D for a single input text).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)

    # Run the forward pass on whatever device the model lives on; without
    # this, a model that was moved to GPU fails on CPU-resident inputs.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
    # .cpu() is a no-op for CPU tensors and required before .numpy() on GPU.
    return cls_embedding.squeeze().cpu().numpy()

# Smoke test: embed the first training text and show the resulting shape
# alongside the first 100 characters of that text.
sample_text = train_df['text'].iloc[0]
features = extract_bert_features(sample_text, bert_model, tokenizer)
print(
    f"BERT embedding shape: {features.shape}",
    f"Sample text: {sample_text[:100]}...",
    sep="\n",
)