In [1]:
import sys
import os

# Prepend the parent of the current working directory to the import path.
# The notebook is presumably launched from a folder one level below the
# project root, which lets project packages be imported further down.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier

from dataset.preprocessing import word_vectorization

In [2]:
def create_mlp_model(hidden_layer_sizes=(128, 64), max_iter=300, alpha=1e-4,
                     learning_rate_init=0.001, random_state=42, verbose=10):
    """
    Creates a Multi-layer Perceptron (MLP) model.

    Every hyperparameter defaults to the value that used to be hard-coded, so
    calling ``create_mlp_model()`` with no arguments builds the same classifier
    as before. Pass keyword arguments to tune the model without editing this
    function.

    Args:
        hidden_layer_sizes: Tuple forwarded to ``MLPClassifier``; one entry per
            hidden layer giving that layer's width.
        max_iter: Upper bound on training iterations.
        alpha: Regularization strength forwarded to ``MLPClassifier``.
        learning_rate_init: Initial learning rate for the optimizer.
        random_state: Seed forwarded to ``MLPClassifier`` for reproducibility.
        verbose: Truthy values make scikit-learn print the loss after every
            iteration; pass ``False`` to keep notebook output short.

    Returns:
        An unfitted ``MLPClassifier`` configured with the 'adam' solver.
    """
    return MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        alpha=alpha,
        solver='adam',
        verbose=verbose,
        random_state=random_state,
        learning_rate_init=learning_rate_init,
    )

In [3]:
def train_and_evaluate_mlp_with_kfold(n_splits=5, model_filename='mlp_model.joblib'):
    """
    Trains and evaluates an MLP model using k-fold cross-validation.

    Steps performed:
      1. Load TF-IDF features and labels via ``word_vectorization('tfidf')``.
      2. Integer-encode the labels with a ``LabelEncoder``.
      3. Run shuffled ``KFold`` cross-validation, training a fresh model from
         ``create_mlp_model()`` per fold and printing the fold accuracy and a
         classification report.
      4. Print the mean and standard deviation of the fold accuracies.
      5. Fit a final model on the full dataset and dump it with ``joblib``.

    Args:
        n_splits: Number of cross-validation folds (default 5).
        model_filename: Path the final fitted model is written to
            (default ``'mlp_model.joblib'``, relative to the working directory).

    Returns:
        None. Results are printed and the final model is written to disk.
    """
    print("Loading and vectorizing data...")
    # The second returned value is not needed here and is discarded.
    tfidf_matrix, _, labels = word_vectorization('tfidf')

    # Convert to a dense numpy array.
    # NOTE(review): densifying may need a lot of memory if the feature matrix
    # is large -- confirm this is acceptable for the dataset in use.
    X = tfidf_matrix.toarray()

    # Encode labels as integers 0..k-1; `encoder.classes_` maps them back.
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels)
    class_names = [str(c) for c in encoder.classes_]
    class_ids = np.arange(len(class_names))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    accuracies = []

    print(f"Starting {n_splits}-fold cross-validation...")
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"--- Fold {fold}/{n_splits} ---")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh, unfitted model per fold so no state leaks between folds.
        model = create_mlp_model()

        print("Training the model...")
        model.fit(X_train, y_train)

        print("Evaluating the model...")
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        print(f"Fold {fold} Accuracy: {accuracy:.4f}")
        # Pass `labels` explicitly: KFold is not stratified, so a fold may lack
        # some class entirely. Without `labels`, the report would then raise
        # because the observed class count would not match `target_names`.
        print(classification_report(y_test, y_pred, labels=class_ids,
                                    target_names=class_names))

    print(f"\nAverage cross-validation accuracy: {np.mean(accuracies):.4f} (+/- {np.std(accuracies):.4f})")

    # Train the final model on the entire dataset and save it
    print("\nTraining the final model on the entire dataset...")
    final_model = create_mlp_model()
    final_model.fit(X, y)
    print("Final model training complete.")

    # NOTE(review): only the classifier is persisted. The fitted LabelEncoder
    # is not saved, so predictions from the reloaded model are integer indices
    # into `encoder.classes_`; persist the encoder too if label names are
    # needed downstream.
    joblib.dump(final_model, model_filename)
    print(f"\nFinal model has been saved to '{model_filename}'")

In [4]:
# Run the whole pipeline: cross-validate, then fit and save the final model.
train_and_evaluate_mlp_with_kfold()

Loading and vectorizing data...
Starting 5-fold cross-validation...
--- Fold 1/5 ---
Training the model...
Iteration 1, loss = 0.16424659
Iteration 2, loss = 0.07531205
Iteration 3, loss = 0.05559672
Iteration 4, loss = 0.03947345
Iteration 5, loss = 0.02676621
Iteration 6, loss = 0.01785873
Iteration 7, loss = 0.01139954
Iteration 8, loss = 0.00766457
Iteration 9, loss = 0.00548807
Iteration 10, loss = 0.00415331
Iteration 11, loss = 0.00391396
Iteration 12, loss = 0.00333140
Iteration 13, loss = 0.00336184
Iteration 14, loss = 0.00318943
Iteration 15, loss = 0.00324766
Iteration 16, loss = 0.00314139
Iteration 17, loss = 0.00365366
Iteration 18, loss = 0.00338320
Iteration 19, loss = 0.00269077
Iteration 20, loss = 0.00265277
Iteration 21, loss = 0.00231274
Iteration 22, loss = 0.00236370
Iteration 23, loss = 0.00243498
Iteration 24, loss = 0.00228500
Iteration 25, loss = 0.00225571
Iteration 26, loss = 0.00223054
Iteration 27, loss = 0.00226637
Iteration 28, loss = 0.00228207
Iterat