# Train & Save Final Model

This notebook covers:
- Phase 10: Train Final Model on all Data and Save via Pickle

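**Note:** Run `4_pipeline_gridsearch.ipynb` first to get the optimized hyperparameters. This notebook also expects `models/preprocessor.pkl`, `models/top_6_indices.pkl`, and `data/spotify-tracks.csv` to exist.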

In [14]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Set random seed for reproducibility.
# RANDOM_STATE is the single seed constant for this notebook; np.random.seed
# additionally seeds NumPy's legacy global random number generator.
RANDOM_STATE = 777
np.random.seed(RANDOM_STATE)

# Simple confirmation that the import/config cell ran to completion.
print("Libraries imported successfully!")


Libraries imported successfully!


In [15]:
# Load preprocessor and top 6 feature indices, rebuild the engineered feature
# matrix from the raw CSV, and preprocess every row.
print("=" * 60)
print("LOADING PREPROCESSOR AND FEATURE INDICES")
print("=" * 60)

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load artifact files that were produced by this project.
with open('models/preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)
print("‚úÖ Preprocessor loaded")

try:
    with open('models/top_6_indices.pkl', 'rb') as f:
        top_6_indices = pickle.load(f)
    print(f"‚úÖ Top 6 feature indices loaded: {top_6_indices}")
except FileNotFoundError:
    # Print a hint for the user, then re-raise so the notebook stops here.
    print("‚ö†Ô∏è top_6_indices.pkl not found. Run modelling.ipynb first!")
    raise

# Reload and prepare all data
df = pd.read_csv('data/spotify-tracks.csv')
columns_to_drop = ['spotify_id', 'name', 'artists', 'album_name', 'album_release_date',
                   'popular_in_country', 'mode', 'is_explicit', 'release_year',
                   'key', 'time_signature', 'release_month', 'duration_ms', 'popularity']
# errors='ignore' tolerates listed columns that are absent from the CSV.
df_clean = df.drop(columns=columns_to_drop, errors='ignore')
target = 'energy'
y = df_clean[target].copy()
X = df_clean.drop(columns=[target]).copy()

# Feature engineering: pairwise products of existing columns.
X_engineered = X.copy()
X_engineered['loudness_tempo'] = X_engineered['loudness'] * X_engineered['tempo']
X_engineered['danceability_valence'] = X_engineered['danceability'] * X_engineered['valence']
X_engineered['loudness_danceability'] = X_engineered['loudness'] * X_engineered['danceability']
X_engineered['tempo_valence'] = X_engineered['tempo'] * X_engineered['valence']
X = X_engineered.copy()

# Preprocess all data.
# Note: fit_transform RE-FITS the loaded preprocessor on the full data set; the
# re-fitted object is what later cells use for transforming new samples.
X_all_processed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# The indices were loaded from a separate artifact, so make sure they still
# address valid columns of the freshly processed matrix. Failing here gives a
# clear message instead of a bare IndexError in a later cell.
n_processed_features = X_all_processed.shape[1]
if len(top_6_indices) == 0:
    raise ValueError("top_6_indices is empty; no features would be selected.")
out_of_range_indices = [
    int(i) for i in top_6_indices
    if not -n_processed_features <= int(i) < n_processed_features
]
if out_of_range_indices:
    raise IndexError(
        f"top_6_indices contains {out_of_range_indices}, but the preprocessor "
        f"produced only {n_processed_features} features. Regenerate "
        "top_6_indices.pkl with the same preprocessing and feature engineering."
    )

used_original_features = list(X.columns.intersection(df.columns))  # Original CSV columns still in use


LOADING PREPROCESSOR AND FEATURE INDICES
‚úÖ Preprocessor loaded
‚úÖ Top 6 feature indices loaded: [3, 9, 4, 12, 13, 5]


In [16]:
# Column-subset transformer (same as in pipeline_gridsearch.ipynb)
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns of ``X`` listed in ``feature_indices``.

    Parameters
    ----------
    feature_indices
        Column indices used to index the second axis of the input in
        ``transform``.
    """

    def __init__(self, feature_indices):
        self.feature_indices = feature_indices

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured columns."""
        selected_columns = self.feature_indices
        return X[:, selected_columns]

# Select top 6 features
X_all_top6 = X_all_processed[:, top_6_indices]

# Create the final model.
#
# Best parameters reported by the grid search in pipeline_gridsearch.ipynb
# (Random Forest Regressor outperformed Gradient Boosting):
#  - n_estimators: 600
#  - max_depth: None (unlimited)
#  - max_features: 'sqrt'
#  - min_samples_split: 2
#  - min_samples_leaf: 1
#
# ATTENTION: the final model deliberately uses a reduced n_estimators and
# max_depth to save space (model.pkl was 1.3 GB with 600 trees and unlimited
# max_depth).
#
# Hyperparameters actually used are kept in one dict so the constructor call
# has a single place where these values are written down.
final_model_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
}

final_model = RandomForestRegressor(
    **final_model_params,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# shape[0] instead of len(): len() raises TypeError if the preprocessor
# returned a scipy sparse matrix, while shape[0] works for dense and sparse.
n_training_samples = X_all_processed.shape[0]
print(f"\nTraining on all data ({n_training_samples} samples)...")
final_model.fit(X_all_top6, y)

# Evaluate on all data.
# These are in-sample metrics: the predictions are made on the same rows the
# model was just fitted on, so they are not an unbiased performance estimate.
y_pred_all = final_model.predict(X_all_top6)
r2_all = r2_score(y, y_pred_all)
rmse_all = np.sqrt(mean_squared_error(y, y_pred_all))
mae_all = mean_absolute_error(y, y_pred_all)

print("\n" + "=" * 60)
print("‚úÖ FINAL MODEL trained successfully on all data!")
print("=" * 60)




Training on all data (21585 samples)...

‚úÖ FINAL MODEL trained successfully on all data!


In [17]:
# Save model package: bundle the fitted model, preprocessing artifacts and
# metadata into one dict and pickle it to models/model.pkl.
print("\n" + "=" * 60)
print("SAVING MODEL")
print("=" * 60)

# Read the hyperparameters back from the fitted estimator instead of retyping
# them, so the stored metadata cannot disagree with the model that was trained
# (the previous hand-written dict recorded max_depth=20 for a max_depth=10 model).
fitted_params = final_model.get_params()
recorded_hyperparameters = {
    name: fitted_params[name]
    for name in ('n_estimators', 'max_depth', 'max_features',
                 'min_samples_split', 'min_samples_leaf')
}

model_package = {
    'model': final_model,
    'top_6_indices': top_6_indices,
    'feature_names': feature_names,
    'preprocessor': preprocessor,
    'model_type': type(final_model).__name__,
    # r2_score / rmse / mae are the metrics computed in the training cell on
    # the full data set the model was fitted on (in-sample values).
    'r2_score': r2_all,
    'rmse': rmse_all,
    'mae': mae_all,
    'hyperparameters': recorded_hyperparameters,
    # NOTE(review): hard-coded numbers, presumably copied from the grid-search
    # notebook; they are not recomputed here -- confirm they are still current.
    'grid_search_info': {
        'best_cv_r2': 0.6968,
        'test_r2': 0.7131,
        'model_selected': 'RandomForestRegressor'
    }
}

# Make sure the output directory exists before writing.
os.makedirs('models', exist_ok=True)

with open('models/model.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("\n‚úÖ Model saved to 'models/model.pkl'")
print("\nModel package includes:")
print("  - Trained Random Forest model")
print("  - Top 6 feature indices")
print("  - Preprocessor")
print("  - Performance metrics")
print("  - Optimized hyperparameters")
print("  - Grid search results summary")



SAVING MODEL

‚úÖ Model saved to 'models/model.pkl'

Model package includes:
  - Trained Random Forest model
  - Top 6 feature indices
  - Preprocessor
  - Performance metrics
  - Optimized hyperparameters
  - Grid search results summary


In [18]:
# Round-trip check: read the saved package back from disk and pull out the
# pieces needed to make predictions.
print(f"\n{'=' * 60}")
print("TESTING MODEL LOADING")
print(f"{'=' * 60}")

with open('models/model.pkl', 'rb') as f:
    loaded_package = pickle.load(f)

# Unpack the three entries used by the sanity-check cell.
loaded_model, loaded_indices, loaded_preprocessor = (
    loaded_package[key] for key in ('model', 'top_6_indices', 'preprocessor')
)

print("‚úÖ Model loaded successfully")



TESTING MODEL LOADING
‚úÖ Model loaded successfully


In [19]:
# Qualitative sanity check of the reloaded model on a few rows of the data.
print(f"\n{'=' * 60}")
print("PREDICTION SANITY CHECK (QUALITATIVE)")
print(f"{'=' * 60}")
print("Note: The following examples are for demonstration only.")
print("They do NOT represent an unbiased performance evaluation.\n")

# Seeded random sample so the same rows are shown on every run
test_sample = X.sample(n=5, random_state=RANDOM_STATE)
test_indices = test_sample.index

# Same steps as in training: preprocess, then keep the selected columns
test_sample_processed = loaded_preprocessor.transform(test_sample)
test_sample_top6 = test_sample_processed[:, loaded_indices]

predictions = loaded_model.predict(test_sample_top6)

# Absolute errors are computed once and reused for the table and the summary.
actual_values = y.loc[test_indices]
abs_errors = np.abs(actual_values.values - predictions)

print(f"{'Index':<8} {'Actual':<10} {'Predicted':<12} {'Abs Error':<10}")
print("-" * 50)

for idx, actual, pred, abs_err in zip(test_indices, actual_values, predictions, abs_errors):
    print(f"{idx:<8} {actual:<10.4f} {pred:<12.4f} {abs_err:<10.4f}")

print("\nPrediction range check:")
print(f"  Min prediction: {predictions.min():.3f}")
print(f"  Max prediction: {predictions.max():.3f}")

# The check expects every prediction to lie in the closed interval [0, 1].
all_within_unit_interval = np.all((predictions >= 0) & (predictions <= 1))
if all_within_unit_interval:
    print("‚úÖ All predictions within valid range [0, 1]")
else:
    print("‚ö†Ô∏è Some predictions outside expected range")

print("\nSummary (demonstration only):")
print(f"  Mean absolute error: {abs_errors.mean():.4f}")
print(f"  Max absolute error:  {abs_errors.max():.4f}")




PREDICTION SANITY CHECK (QUALITATIVE)
Note: The following examples are for demonstration only.
They do NOT represent an unbiased performance evaluation.

Index    Actual     Predicted    Abs Error 
--------------------------------------------------
14720    0.8210     0.7835       0.0375    
18254    0.7170     0.5851       0.1319    
912      0.7600     0.7313       0.0287    
12732    0.5480     0.5059       0.0421    
2003     0.6720     0.5421       0.1299    

Prediction range check:
  Min prediction: 0.506
  Max prediction: 0.783
‚úÖ All predictions within valid range [0, 1]

Summary (demonstration only):
  Mean absolute error: 0.0740
  Max absolute error:  0.1319


---

## 🎯 Final Model Ready for Deployment

The model has been:

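- ✅ Trained on all available data  
- ✅ Using hyperparameters based on the grid search, with `n_estimators` and `max_depth` reduced to keep the saved model small  
- ✅ Saved to `models/model.pkl`  
- ✅ Reloaded and sanity-checked on a small sample (not an unbiased evaluation)  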

You can now use this model for predictions on new, unseen data.

---
