# Preprocessing

This notebook covers:
- Phase 4: Preprocessing Pipeline

**Note:** Run `exploration.ipynb` first to prepare the data.


In [17]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
RANDOM_STATE = 777
np.random.seed(RANDOM_STATE)

print("Libraries imported successfully!")


Libraries imported successfully!


In [18]:
# Load and prepare data (same as exploration.ipynb)
df = pd.read_csv('data/spotify-tracks.csv')

# Remove non-predictive columns
columns_to_drop = ['spotify_id', 'name', 'artists', 'album_name', 'album_release_date',
                   'popular_in_country', 'mode', 'is_explicit', 'release_year', 
                   'key', 'time_signature']
df_clean = df.drop(columns=columns_to_drop, errors='ignore')

# Define target
target = 'energy'
y = df_clean[target].copy()
X = df_clean.drop(columns=[target]).copy()

# Feature engineering (from exploration.ipynb)
# Only valid interaction features (no target leakage, no data leakage)
X_engineered = X.copy()

# Interaction features
X_engineered['loudness_tempo'] = X_engineered['loudness'] * X_engineered['tempo']
X_engineered['danceability_valence'] = X_engineered['danceability'] * X_engineered['valence']
X_engineered['loudness_danceability'] = X_engineered['loudness'] * X_engineered['danceability']
X_engineered['tempo_valence'] = X_engineered['tempo'] * X_engineered['valence']

X = X_engineered.copy()

print(f"Data loaded: {X.shape[0]} samples, {X.shape[1]} features")


Data loaded: 21585 samples, 17 features


## Phase 4: Preprocessing Pipeline


In [19]:
# Define feature types
numeric_features = [
    'danceability', 'liveness', 'tempo', 'loudness', 'speechiness',
    'duration_ms', 'instrumentalness', 'popularity', 'dynamic_range',
    'valence', 'rhythmic_complexity', 'acousticness',
    'release_month'
]

categorical_features = [
    # Note: Most categorical features were dropped
]

# Get all numeric columns (including engineered features)
numeric_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64'] 
                and col not in categorical_features]

categorical_cols = [col for col in categorical_features if col in X.columns]

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")


Numeric columns: 16
Categorical columns: 0


In [20]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=None
)

print("=" * 60)
print("TRAIN/TEST SPLIT")
print("=" * 60)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")


TRAIN/TEST SPLIT
Training set: 17268 samples
Test set: 4317 samples
Features: 17


In [21]:
# Create preprocessing pipelines
print("=" * 60)
print("PREPROCESSING PIPELINE")
print("=" * 60)

# Numeric pipeline: Median imputation + StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: Most frequent imputation + OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

print("✅ Preprocessing pipelines created")


PREPROCESSING PIPELINE
✅ Preprocessing pipelines created


In [22]:
# Fit and transform
print("\nFitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"\n✅ Preprocessing complete")
print(f"Training shape: {X_train_processed.shape}")
print(f"Test shape: {X_test_processed.shape}")
print(f"Total features after preprocessing: {X_train_processed.shape[1]}")

# Get feature names
feature_names = preprocessor.get_feature_names_out()
print(f"\nFirst 10 feature names: {feature_names[:10].tolist()}")



Fitting preprocessor on training data...

✅ Preprocessing complete
Training shape: (17268, 16)
Test shape: (4317, 16)
Total features after preprocessing: 16

First 10 feature names: ['num__danceability', 'num__liveness', 'num__tempo', 'num__loudness', 'num__speechiness', 'num__duration_ms', 'num__instrumentalness', 'num__popularity', 'num__dynamic_range', 'num__valence']


In [23]:
# Verify no data leakage
print("=" * 60)
print("DATA LEAKAGE CHECK")
print("=" * 60)
print("✅ Preprocessor fitted only on training data")
print("✅ Test set transformed separately")
print("✅ No target variable in preprocessing pipeline")


DATA LEAKAGE CHECK
✅ Preprocessor fitted only on training data
✅ Test set transformed separately
✅ No target variable in preprocessing pipeline


In [24]:
# Save preprocessor
import os
os.makedirs('models', exist_ok=True)

with open('models/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print("✅ Preprocessor saved to 'models/preprocessor.pkl'")


✅ Preprocessor saved to 'models/preprocessor.pkl'


## Summary

In this notebook we:
1. ✅ Split data into train/test sets (80/20)
2. ✅ Created preprocessing pipelines:
   - Numeric: Median imputation + StandardScaler
   - Categorical: Most frequent imputation + OneHotEncoder
3. ✅ Fitted preprocessor on training data
4. ✅ Transformed train and test sets
5. ✅ Verified no data leakage
6. ✅ Saved preprocessor for later use

**Next:** Move to `modelling.ipynb` for model training and evaluation.
