In [1]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple

def load_data() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    train_texts = pd.read_csv('../data/clean/train_texts.csv', index_col=0)
    X_train = train_texts.iloc[:, :768].values
    y_train = train_texts.iloc[:, 769:].values
    
    val_texts = pd.read_csv('../data/clean/train_texts.csv', index_col=0)
    X_val = val_texts.iloc[:, :768].values
    y_val = val_texts.iloc[:, 769:].values
    
    test_texts = pd.read_csv('../data/clean/test_texts.csv', index_col=0)
    X_test = test_texts.iloc[:, :768].values 
    y_test = test_texts.iloc[:, 769:].values
    
    return X_train + X_val, y_train + y_val, X_test, y_test

In [2]:
X_train, y_train, X_test, y_test = load_data()

In [3]:
X_train

array([[ 1.25063754, -0.6882836 , -0.6730092 , ..., -0.18533202,
         0.75368624, -1.16505468],
       [-0.15346272, -0.3789488 , -0.48358848, ...,  1.40171028,
        -0.25717076,  0.9301322 ],
       [-1.09913612,  0.58593976, -0.47562278, ...,  0.06537923,
        -0.34900344, -0.44718388],
       ...,
       [ 0.3631426 ,  0.31029248,  0.01329327, ...,  0.0114778 ,
        -0.27975324, -0.23371865],
       [-0.30254254, -0.03405347,  0.23653449, ...,  0.41180586,
         0.04707385, -0.2015373 ],
       [-0.40055462,  0.4447935 , -0.16510852, ...,  0.3321956 ,
        -0.52933704,  0.3470086 ]])

In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = [
    {
        "name": "Gradient Boosting Classifier",
        "model": GradientBoostingClassifier(),
        "params": {
            "estimator__n_estimators": [100, 200, 300],
            "estimator__learning_rate": [0.01, 0.05, 0.1],
            "estimator__max_depth": [3, 4, 5, 6],
            "estimator__min_samples_split": [2, 5, 10],
            "estimator__min_samples_leaf": [1, 2, 4],
            "estimator__subsample": [0.8, 0.9, 1.0],
        }
    },
    {
        "name": "Random Forest Classifier",
        "model": RandomForestClassifier(),
        "params": {
            "estimator__n_estimators": [100, 200, 300],
            "estimator__max_depth": [10, 20, 30, None],
            "estimator__min_samples_split": [2, 5, 10],
            "estimator__min_samples_leaf": [1, 2, 4],
        }
    }
]

In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> Dict:
    labels = ['Joy', 'Trust', 'Anticipation', 'Surprise', 'Fear', 
              'Sadness', 'Disgust', 'Anger', 'Positive', 'Negative', 'Neutral']
    
    metrics = {}
    
    for i, label in enumerate(labels):
        metrics[f"Precision {label}"] = precision_score(y_true[:, i], y_pred[:, i])
        metrics[f"Recall {label}"] = recall_score(y_true[:, i], y_pred[:, i])
        metrics[f"F1-score {label}"] = f1_score(y_true[:, i], y_pred[:, i])
    
    metrics['F1-score macro texts'] = np.mean([metrics[f"F1-score {label}"] for label in labels])
    
    return metrics

In [6]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

best_models = {
    'models': [],
    'parameters': [],
    'scores': []
}

for model in models:
    print(f"\nModel: {model['name']}")
    base_model = MultiOutputClassifier(model["model"])
    grid_search = GridSearchCV(
        estimator=base_model, 
        param_grid=model["params"], 
        cv=5, 
        n_jobs=-1, 
        scoring='f1_macro',
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    y_pred = grid_search.predict(X_test)
    
    metrics = evaluate_model(y_test, y_pred)
    
    best_models['models'].append(model['name'])
    best_models['parameters'].append(grid_search.best_params_)
    best_models['scores'].append({
        'best_cv_score': grid_search.best_score_,
        'test_metrics': metrics
    })
    
    print(f"\nBest parameters for {model['name']}:")
    print(grid_search.best_params_)
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    print(f"Test set macro F1 score: {metrics['F1-score macro texts']:.4f}")


Model: Gradient Boosting Classifier
Fitting 5 folds for each of 972 candidates, totalling 4860 fits


KeyboardInterrupt: 

In [None]:
results_df = pd.DataFrame({
    'Model': best_models['models'],
    'Best CV Score': [scores['best_cv_score'] for scores in best_models['scores']],
    'Test Macro F1': [scores['test_metrics']['F1-score macro texts'] for scores in best_models['scores']]
})

detailed_metrics = []
for model_name, scores in zip(best_models['models'], best_models['scores']):
    metrics_dict = scores['test_metrics']
    metrics_dict['Model'] = model_name
    detailed_metrics.append(metrics_dict)

detailed_results_df = pd.DataFrame(detailed_metrics)

print("\nSummary Results:")
print(results_df)
print("\nDetailed Results:")
print(detailed_results_df)