# Model Usage & Final Model Save

This notebook covers:
- Phase 10: Final Model Save and Usage Examples

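**Note:** Run `pipeline_gridsearch.ipynb` first to get the optimized model. As written, this notebook does not load that model: it reads `models/preprocessor.pkl` and `models/top_10_indices.pkl` (the latter saved by `modelling.ipynb`) and, for demonstration, retrains a `GradientBoostingRegressor` with fixed hyperparameters on all data.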


In [19]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import r2_score

# Single seed shared by every stochastic step in this notebook.
RANDOM_STATE = 777
np.random.seed(seed=RANDOM_STATE)  # seeds NumPy's legacy global generator

print("Libraries imported successfully!")


Libraries imported successfully!


In [20]:
# Load the saved preprocessor, rebuild the full feature matrix from the raw CSV,
# and preprocess every row for final training.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by this project's own notebooks.
with open('models/preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)

# The optimized model from pipeline_gridsearch.ipynb is NOT loaded here.
# In practice, you'd load the grid_search.best_estimator_ from that notebook;
# for demonstration, a simple model is trained in a later cell.

print("=" * 60)
print("FINAL MODEL - TRAINING ON ALL DATA")
print("=" * 60)

# Reload and prepare all data
df = pd.read_csv('data/spotify-tracks.csv')
columns_to_drop = ['spotify_id', 'name', 'artists', 'album_name', 'album_release_date',
                   'popular_in_country', 'mode', 'is_explicit', 'release_year',
                   'key', 'time_signature']
# errors='ignore' tolerates a CSV that lacks some of the columns listed above.
df_clean = df.drop(columns=columns_to_drop, errors='ignore')
target = 'energy'

# Fail early, naming every missing column, instead of stopping at the first
# failed lookup further down.
required_columns = [target, 'loudness', 'tempo', 'danceability', 'valence']
missing_columns = [col for col in required_columns if col not in df_clean.columns]
if missing_columns:
    raise KeyError(
        f"Required columns missing from 'data/spotify-tracks.csv': {missing_columns}"
    )

y = df_clean[target].copy()

# Feature engineering
# Only valid interaction features (no target leakage, no data leakage).
# assign() returns a new frame, so df_clean is left untouched; the new columns
# are appended in the order written here.
X_engineered = df_clean.drop(columns=[target]).assign(
    loudness_tempo=lambda d: d['loudness'] * d['tempo'],
    danceability_valence=lambda d: d['danceability'] * d['valence'],
    loudness_danceability=lambda d: d['loudness'] * d['danceability'],
    tempo_valence=lambda d: d['tempo'] * d['valence'],
)
X = X_engineered.copy()

# Preprocess all data
# fit_transform refits the loaded preprocessor on the full dataset; this
# refitted object is what later cells store alongside the model.
# NOTE(review): column indices saved by modelling.ipynb are applied to this
# output later, which assumes the refit keeps the same output column order -
# confirm.
X_all_processed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# shape[0] instead of len(): len() raises TypeError when the preprocessor
# returns a SciPy sparse matrix.
print(f"Training on all {X_all_processed.shape[0]} samples")


FINAL MODEL - TRAINING ON ALL DATA
Training on all 21585 samples


In [21]:
# Train the final model on all data, restricted to the selected feature columns.
# Note: In practice, load the optimized model from pipeline_gridsearch.ipynb
# For demonstration, we'll use GradientBoostingRegressor with good defaults
from sklearn.ensemble import GradientBoostingRegressor

n_features_available = X_all_processed.shape[1]

# Load top 10 indices from modelling.ipynb
# NOTE(security): pickle.load can execute arbitrary code; only load files
# written by this project's own notebooks.
try:
    with open('models/top_10_indices.pkl', 'rb') as f:
        top_10_indices = pickle.load(f)
    print(f"✅ Loaded top_10_indices: {top_10_indices}")
except FileNotFoundError:
    print("⚠️ top_10_indices.pkl not found. Run modelling.ipynb first!")
    print("   Using first 10 features as fallback...")
    # Clamp to the real column count so the fallback cannot index out of bounds
    # when the processed matrix has fewer than 10 columns.
    top_10_indices = list(range(min(10, n_features_available)))

# Integer indices that do not fit the current matrix would raise an opaque
# IndexError in the slice below; raise the same error type with a message that
# points at the likely cause (a stale indices file).
index_array = np.asarray(top_10_indices)
if index_array.dtype.kind in 'iu':
    index_array = index_array.astype(np.int64)
    invalid = index_array[
        (index_array < -n_features_available) | (index_array >= n_features_available)
    ]
    if invalid.size:
        raise IndexError(
            f"top_10_indices contains {invalid.tolist()}, but the processed data "
            f"has only {n_features_available} columns. Re-run modelling.ipynb to "
            "regenerate models/top_10_indices.pkl."
        )

X_all_top10 = X_all_processed[:, top_10_indices]

# Train final model
final_model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.9,
    random_state=RANDOM_STATE
)
final_model.fit(X_all_top10, y)

# Verify it works
# This R² is computed on the same rows the model was just fitted on, so it is
# an in-sample sanity check, not an estimate of performance on unseen data.
y_pred_all = final_model.predict(X_all_top10)
r2_all = r2_score(y, y_pred_all)

print("\nFinal Model Performance (all data):")
print(f"  R²: {r2_all:.4f}")


✅ Loaded top_10_indices: [3, 11, 4, 14, 15, 6, 9, 5, 0, 1]

Final Model Performance (all data):
  R²: 0.7793


In [22]:
# Bundle the trained model with everything needed to reuse it, then pickle it.
import os

rule = "=" * 60
print("\n" + rule)
print("SAVING MODEL")
print(rule)

# The output directory must exist before the file is opened for writing.
os.makedirs('models', exist_ok=True)

# One dictionary holds the estimator, the column selection, the fitted
# preprocessor and a little metadata, so a single file restores the pipeline.
model_package = {
    'model': final_model,
    'top_10_indices': top_10_indices,
    'feature_names': feature_names,
    'preprocessor': preprocessor,
    'model_type': type(final_model).__name__,
    'r2_score': r2_all,
}

with open('models/model.pkl', 'wb') as model_file:
    pickle.dump(model_package, model_file)

print("✅ Model saved to 'models/model.pkl'")



SAVING MODEL
✅ Model saved to 'models/model.pkl'


In [23]:
# Round-trip check: read the saved package back and unpack its parts.
rule = "=" * 60
print("\n" + rule)
print("TESTING MODEL LOADING")
print(rule)

with open('models/model.pkl', 'rb') as model_file:
    loaded_package = pickle.load(model_file)

# Pull out the three objects needed for prediction in one step.
loaded_model, loaded_indices, loaded_preprocessor = (
    loaded_package[key] for key in ('model', 'top_10_indices', 'preprocessor')
)

print("✅ Model loaded successfully")
print(f"   Model type: {loaded_package['model_type']}")
print(f"   R² score: {loaded_package['r2_score']:.4f}")



TESTING MODEL LOADING
✅ Model loaded successfully
   Model type: GradientBoostingRegressor
   R² score: 0.7793


In [24]:
# Smoke-test the reloaded pipeline end to end on a handful of rows.
rule = "=" * 60
print("\n" + rule)
print("TESTING PREDICTIONS")
print(rule)

# Take the first rows of the dataset, run them through the reloaded
# preprocessor, keep the selected columns, and score them with the reloaded
# model. These rows were part of the fitting data, so this checks the plumbing
# rather than accuracy.
n_preview = 5
test_sample = X.head(n_preview).copy()
test_sample_processed = loaded_preprocessor.transform(test_sample)
test_sample_top10 = test_sample_processed[:, loaded_indices]

predictions = loaded_model.predict(test_sample_top10)

print("\nSample predictions:")
print(f"Actual values:    {y.head(n_preview).values}")
print(f"Predicted values: {predictions}")
print("\nRange check:")
lowest, highest = predictions.min(), predictions.max()
print(f"  Min prediction: {lowest:.3f}")
print(f"  Max prediction: {highest:.3f}")

# Collect anything that is not inside [0, 1]; the negated chained comparison
# also catches NaN values.
outside_unit_interval = [p for p in predictions if not (0 <= p <= 1)]
if outside_unit_interval:
    print("⚠️ Some predictions outside [0, 1] range")
    print("   Consider clipping predictions")
else:
    print("✅ All predictions in valid range [0, 1]")
    



TESTING PREDICTIONS

Sample predictions:
Actual values:    [0.666 0.639 0.802 0.917 0.592]
Predicted values: [0.6616102  0.65444796 0.81849333 0.86394187 0.54749   ]

Range check:
  Min prediction: 0.547
  Max prediction: 0.864
✅ All predictions in valid range [0, 1]
