# Classical ML Models for Mindfulness Prediction

**Phase 2: Days 5-10**

This notebook implements and compares three classical machine learning models:
1. **Linear Regression with Regularization** (Ridge, Lasso, ElasticNet)
2. **Random Forest Regressor**
3. **XGBoost Regressor**

Each model includes:
- Hyperparameter tuning with GridSearchCV
- Comprehensive evaluation (MSE, MAE, R², RMSE)
- Visualizations (actual vs predicted, residuals, feature importance)

---

## 1. Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
from pathlib import Path

# Add src to path
sys.path.append('../src')

# Import custom modules
from models import (
    LinearRegressionModel, 
    RandomForestModel, 
    XGBoostModel,
    compare_models,
    plot_actual_vs_predicted,
    plot_residuals,
    plot_residual_distribution,
    plot_feature_importance,
    plot_model_comparison
)

# Plot styling: seaborn grid theme plus notebook-wide matplotlib defaults
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Silence every warning category for the rest of the session.
# NOTE(review): this may also hide convergence/deprecation warnings raised
# during model fitting — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG; RANDOM_STATE is also handed to each model below
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("‚úì Imports successful")

## 2. Load Preprocessed Data

Load the train/test splits created in the preprocessing notebook.

In [None]:
# Directory holding the train/test CSV splits written by the preprocessing notebook
data_dir = Path('../data/processed')

# Features are kept as DataFrames (column names are needed later);
# targets are flattened to 1-D arrays.
X_train = pd.read_csv(data_dir / 'S2_X_train.csv')
y_train = pd.read_csv(data_dir / 'S2_y_train.csv').values.ravel()

X_test = pd.read_csv(data_dir / 'S2_X_test.csv')
y_test = pd.read_csv(data_dir / 'S2_y_test.csv').values.ravel()

# Column names are passed to the model wrappers for feature-importance tables
feature_names = X_train.columns.tolist()

# Fail fast on malformed splits. Later cells pass `.values` to the models, which
# drops column names, so a train/test column mismatch would otherwise go unnoticed.
assert len(y_train) == X_train.shape[0], (
    f"y_train has {len(y_train)} values but X_train has {X_train.shape[0]} rows "
    "(does the target CSV contain more than one column?)"
)
assert len(y_test) == X_test.shape[0], (
    f"y_test has {len(y_test)} values but X_test has {X_test.shape[0]} rows "
    "(does the target CSV contain more than one column?)"
)
assert X_test.columns.tolist() == feature_names, (
    "X_test columns differ from X_train columns (names or order)"
)

print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Test set: {X_test.shape[0]} samples, {X_test.shape[1]} features")
print("\nTarget distribution (train):")
print(pd.Series(y_train).describe())
print("\nTarget distribution (test):")
print(pd.Series(y_test).describe())

## 3. Baseline Model: Simple Linear Regression

Establish baseline performance with unregularized linear regression.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit plain least-squares regression as the reference point for later models
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)

# Predict the held-out split
y_pred_baseline = baseline_model.predict(X_test)

# MSE is computed once and reused for RMSE
baseline_mse = mean_squared_error(y_test, y_pred_baseline)
baseline_metrics = {
    'MSE': baseline_mse,
    'RMSE': np.sqrt(baseline_mse),
    'MAE': mean_absolute_error(y_test, y_pred_baseline),
    'R2': r2_score(y_test, y_pred_baseline)
}

# Report: each label carries its own padding so the values line up
baseline_rule = "=" * 60
print("\n" + baseline_rule)
print("BASELINE MODEL: Simple Linear Regression (No Regularization)")
print(baseline_rule)
for metric_label, metric_key in (
    ("R¬≤ Score:  ", 'R2'),
    ("RMSE:      ", 'RMSE'),
    ("MAE:       ", 'MAE'),
    ("MSE:       ", 'MSE'),
):
    print(f"{metric_label}{baseline_metrics[metric_key]:.4f}")
print(baseline_rule)

---

## 4. Model 1: Linear Regression with Regularization

Train and compare Ridge, Lasso, and ElasticNet regularization.

### 4.1 Ridge Regression (L2 Regularization)

In [None]:
# Build the project wrapper configured for ridge (L2-penalised) regression
ridge_model = LinearRegressionModel(model_type='ridge', random_state=RANDOM_STATE)

# Training options: request hyperparameter tuning with 5 CV folds and progress output
ridge_train_kwargs = {
    'feature_names': feature_names,
    'tune_hyperparams': True,
    'cv_folds': 5,
    'verbose': 1,
}
ridge_model.train(X_train.values, y_train, **ridge_train_kwargs)

# Score on the held-out split and keep what evaluate() returns
ridge_metrics = ridge_model.evaluate(X_test.values, y_test, dataset_name="Test")

### 4.2 Lasso Regression (L1 Regularization)

In [None]:
# Build the project wrapper configured for lasso (L1-penalised) regression
lasso_model = LinearRegressionModel(model_type='lasso', random_state=RANDOM_STATE)

# Training options: request hyperparameter tuning with 5 CV folds and progress output
lasso_train_kwargs = {
    'feature_names': feature_names,
    'tune_hyperparams': True,
    'cv_folds': 5,
    'verbose': 1,
}
lasso_model.train(X_train.values, y_train, **lasso_train_kwargs)

# Score on the held-out split and keep what evaluate() returns
lasso_metrics = lasso_model.evaluate(X_test.values, y_test, dataset_name="Test")

### 4.3 ElasticNet (L1 + L2 Regularization)

In [None]:
# Build the project wrapper configured for elastic net (combined L1 + L2 penalty)
elasticnet_model = LinearRegressionModel(model_type='elasticnet', random_state=RANDOM_STATE)

# Training options: request hyperparameter tuning with 5 CV folds and progress output
elasticnet_train_kwargs = {
    'feature_names': feature_names,
    'tune_hyperparams': True,
    'cv_folds': 5,
    'verbose': 1,
}
elasticnet_model.train(X_train.values, y_train, **elasticnet_train_kwargs)

# Score on the held-out split and keep what evaluate() returns
elasticnet_metrics = elasticnet_model.evaluate(X_test.values, y_test, dataset_name="Test")

### 4.4 Compare Linear Models

In [None]:
# Score the three regularised linear models on the test split in one table
linear_models = [ridge_model, lasso_model, elasticnet_model]
linear_comparison = compare_models(linear_models, X_test.values, y_test)

print("\n" + "="*80)
print("LINEAR MODELS COMPARISON")
print("="*80)
print(linear_comparison.to_string(index=False))
print("="*80)

# Select the best linear model (highest R² row).
# idxmax() returns an index *label*, not a list position. Using it directly as
# linear_models[label] is only right when compare_models keeps the input order
# with a default 0..n-1 index. Resolve the winner through the 'Model' column
# instead, and fall back to the original lookup when names cannot be matched.
# NOTE(review): assumes the 'Model' column holds each model's `.name` — confirm
# against compare_models.
# NOTE(review): the winner is chosen on the test split, so the test scores
# reported later are optimistically biased; a validation split would avoid this.
best_linear_idx = linear_comparison['R¬≤'].idxmax()
best_linear_name = linear_comparison.loc[best_linear_idx, 'Model']
linear_models_by_name = {m.name: m for m in linear_models}
if (
    len(linear_models_by_name) == len(linear_models)
    and best_linear_name in linear_models_by_name
):
    best_linear_model = linear_models_by_name[best_linear_name]
else:
    # Names are ambiguous or do not match the table: keep the original behaviour
    best_linear_model = linear_models[best_linear_idx]
print(f"\n‚úì Best Linear Model: {best_linear_model.name}")

### 4.5 Visualize Best Linear Model

In [None]:
# Test-set predictions from the selected linear model
y_pred_linear = best_linear_model.predict(X_test.values)

# One row of three panels: actual-vs-predicted, residuals, residual distribution
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

linear_diagnostic_plots = (
    plot_actual_vs_predicted,
    plot_residuals,
    plot_residual_distribution,
)
for panel, draw_plot in zip(axes, linear_diagnostic_plots):
    draw_plot(y_test, y_pred_linear, best_linear_model.name, ax=panel)

plt.tight_layout()
plt.show()

### 4.6 Feature Importance (Coefficient Magnitudes)

In [None]:
# Ask the selected linear model for its feature-importance table
# (presumably sorted by importance — verify in the models module)
linear_importance = best_linear_model.get_feature_importance()

# Text view: first 20 rows, index column omitted
print("\nTop 20 Most Important Features (by coefficient magnitude):")
linear_top20_text = linear_importance.head(20).to_string(index=False)
print(linear_top20_text)

# Chart view on its own figure
fig, ax = plt.subplots(figsize=(10, 8))
plot_feature_importance(
    linear_importance,
    top_n=20,
    model_name=best_linear_model.name,
    ax=ax,
)
plt.show()

---

## 5. Model 2: Random Forest Regressor

In [None]:
# Build the project's random-forest wrapper with the shared seed
rf_model = RandomForestModel(random_state=RANDOM_STATE)

# Training options: request hyperparameter tuning with 5 CV folds and progress output
rf_train_kwargs = {
    'feature_names': feature_names,
    'tune_hyperparams': True,
    'cv_folds': 5,
    'verbose': 1,
}
rf_model.train(X_train.values, y_train, **rf_train_kwargs)

# Score on the held-out split and keep what evaluate() returns
rf_metrics = rf_model.evaluate(X_test.values, y_test, dataset_name="Test")

### 5.1 Visualize Random Forest Performance

In [None]:
# Test-set predictions from the random forest
y_pred_rf = rf_model.predict(X_test.values)

# One row of three panels: actual-vs-predicted, residuals, residual distribution
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

rf_diagnostic_plots = (
    plot_actual_vs_predicted,
    plot_residuals,
    plot_residual_distribution,
)
for panel, draw_plot in zip(axes, rf_diagnostic_plots):
    draw_plot(y_test, y_pred_rf, rf_model.name, ax=panel)

plt.tight_layout()
plt.show()

### 5.2 Feature Importance (Gini Importance)

In [None]:
# Ask the random forest for its feature-importance table
# (presumably sorted by importance — verify in the models module)
rf_importance = rf_model.get_feature_importance()

# Text view: first 20 rows, index column omitted
print("\nTop 20 Most Important Features (Random Forest):")
rf_top20_text = rf_importance.head(20).to_string(index=False)
print(rf_top20_text)

# Chart view on its own figure
fig, ax = plt.subplots(figsize=(10, 8))
plot_feature_importance(
    rf_importance,
    top_n=20,
    model_name=rf_model.name,
    ax=ax,
)
plt.show()

---

## 6. Model 3: XGBoost Regressor

In [None]:
# Build the project's XGBoost wrapper with the shared seed
xgb_model = XGBoostModel(random_state=RANDOM_STATE)

# Training options: request hyperparameter tuning with 5 CV folds and progress output
xgb_train_kwargs = {
    'feature_names': feature_names,
    'tune_hyperparams': True,
    'cv_folds': 5,
    'verbose': 1,
}
xgb_model.train(X_train.values, y_train, **xgb_train_kwargs)

# Score on the held-out split and keep what evaluate() returns
xgb_metrics = xgb_model.evaluate(X_test.values, y_test, dataset_name="Test")

### 6.1 Visualize XGBoost Performance

In [None]:
# Test-set predictions from XGBoost
y_pred_xgb = xgb_model.predict(X_test.values)

# One row of three panels: actual-vs-predicted, residuals, residual distribution
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

xgb_diagnostic_plots = (
    plot_actual_vs_predicted,
    plot_residuals,
    plot_residual_distribution,
)
for panel, draw_plot in zip(axes, xgb_diagnostic_plots):
    draw_plot(y_test, y_pred_xgb, xgb_model.name, ax=panel)

plt.tight_layout()
plt.show()

### 6.2 Feature Importance (Gain, Weight, Cover)

In [None]:
# Ask XGBoost for its feature-importance table using the 'gain' importance type
# (presumably sorted by importance — verify in the models module)
xgb_importance_gain = xgb_model.get_feature_importance(importance_type='gain')

# Text view: first 20 rows, index column omitted
print("\nTop 20 Most Important Features (XGBoost - Gain):")
xgb_top20_text = xgb_importance_gain.head(20).to_string(index=False)
print(xgb_top20_text)

# Chart view on its own figure; the title notes the importance type
fig, ax = plt.subplots(figsize=(10, 8))
xgb_gain_title = f"{xgb_model.name} (Gain)"
plot_feature_importance(
    xgb_importance_gain,
    top_n=20,
    model_name=xgb_gain_title,
    ax=ax,
)
plt.show()

---

## 7. Model Comparison & Best Model Selection

### 7.1 Comprehensive Comparison Table

In [None]:
# Compare the best linear model against the two tree-based models on the test split
all_models = [best_linear_model, rf_model, xgb_model]
final_comparison = compare_models(all_models, X_test.values, y_test)

print("\n" + "="*90)
print("FINAL MODEL COMPARISON - ALL CLASSICAL MODELS")
print("="*90)
print(final_comparison.to_string(index=False))
print("="*90)

# Identify the best model (highest R² row).
# idxmax() returns an index *label*. The original code fed that label to .iloc
# (positional) and to list indexing, which only agree when the table has a
# default 0..n-1 index in input order. Use .loc for the row lookup and resolve
# the model object through the 'Model' column.
# NOTE(review): assumes the 'Model' column holds each model's `.name` — confirm
# against compare_models.
best_model_idx = final_comparison['R¬≤'].idxmax()
best_row = final_comparison.loc[best_model_idx]
best_model_name = best_row['Model']
best_r2 = best_row['R¬≤']

models_by_name = {m.name: m for m in all_models}
if len(models_by_name) == len(all_models) and best_model_name in models_by_name:
    best_model = models_by_name[best_model_name]
else:
    # Names are ambiguous or do not match the table: keep the original behaviour
    best_model = all_models[best_model_idx]

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   R¬≤ Score: {best_r2:.4f}")
print(f"   Best Hyperparameters: {best_model.best_params_}")

### 7.2 Visual Comparison

In [None]:
# 2x2 grid, one bar chart per comparison-table column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right
comparison_metrics = ('R¬≤', 'RMSE', 'MAE', 'Training Time (s)')
for panel, metric_name in zip(axes.flat, comparison_metrics):
    plot_model_comparison(final_comparison, metric=metric_name, ax=panel)

plt.tight_layout()
plt.show()

### 7.3 Side-by-Side Predictions Comparison

In [None]:
# Side-by-side actual-vs-predicted plots, one panel per model in all_models.
# The panel count follows the list length instead of a hard-coded 3, so adding
# or removing a model cannot cause an IndexError or leave an empty panel.
n_models = len(all_models)

# squeeze=False always returns a 2-D axes array, so axes[0] is the panel row
# even when there is a single model. Width is 6 per panel (18 for three models).
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)

for panel, model in zip(axes[0], all_models):
    y_pred = model.predict(X_test.values)
    plot_actual_vs_predicted(y_test, y_pred, model.name, ax=panel)

plt.tight_layout()
plt.show()

### 7.4 Feature Importance Comparison

In [None]:
# Number of top-ranked features to list per model
TOP_N_FEATURES = 10

print("\n" + "="*90)
print(f"TOP {TOP_N_FEATURES} FEATURES COMPARISON ACROSS MODELS")
print("="*90)

# Importance tables from the earlier cells, keyed by model name
importance_by_model = {
    best_linear_model.name: linear_importance,
    rf_model.name: rf_importance,
    xgb_model.name: xgb_importance_gain,
}

# Wrap each column in a Series so that a model exposing fewer than
# TOP_N_FEATURES features is padded with NaN rather than raising on
# unequal column lengths.
comparison_data = {
    model_name: pd.Series(importance.head(TOP_N_FEATURES)['feature'].tolist())
    for model_name, importance in importance_by_model.items()
}

comparison_df = pd.DataFrame(comparison_data)
# 1-based rank labels sized to the rows actually present (was a fixed 1..10)
comparison_df.index = range(1, len(comparison_df) + 1)
comparison_df.index.name = 'Rank'

print(comparison_df.to_string())
print("="*90)

---

## 8. Save Best Model

In [None]:
# Persist the winning model through its own save() method
model_save_path = '../data/processed/best_classical_model.pkl'
best_model.save(model_save_path)

# Write the final comparison table next to it, without the index column
comparison_results_path = '../data/processed/model_comparison_results.csv'
final_comparison.to_csv(comparison_results_path, index=False)
print(f"\n‚úì Model comparison results saved to: {comparison_results_path}")

---

## 9. Summary & Conclusions

In [None]:
# Count every model fitted in this notebook: the unregularised baseline, every
# regularised linear variant, and the non-linear models. all_models already
# contains the best linear variant, so it is subtracted once to avoid double
# counting. The previous `len(all_models) + 2` under-reported by one.
n_models_trained = 1 + len(linear_models) + (len(all_models) - 1)

print("\n" + "="*90)
print("PHASE 2 SUMMARY: CLASSICAL ML MODELS")
print("="*90)
print(f"\n‚úì Models Trained: {n_models_trained} (including baseline and linear variants)")
print(f"‚úì Best Model: {best_model_name}")
print(f"‚úì Best R¬≤ Score: {best_r2:.4f}")
print("\nKey Findings:")
print(f"  1. Best performing model: {best_model_name}")
# Difference between the best test R² and the unregularised baseline's test R²
print(f"  2. R¬≤ improvement over baseline: {(best_r2 - baseline_metrics['R2']):.4f}")
print("  3. All models successfully trained with hyperparameter tuning")
print("  4. Feature importance analysis completed for all models")
print("\nNext Steps:")
print("  - Proceed to Phase 3: Literature-based methods reproduction")
print("  - Consider ensemble methods combining best models")
print("  - Extend to multi-subject evaluation (LOSO cross-validation)")
print("="*90)