In [4]:
import pandas as pd
import numpy as np
import pickle

# Restore the cleaned DataFrame that the previous notebook pickled to step2.pkl.
# Unpickling runs arbitrary code, so only point this at a file you produced yourself.
with open('step2.pkl', 'rb') as step2_file:
    df = pickle.load(step2_file)

# Report the shape as a quick sanity check that the expected data came back.
print("Cleaned data loaded from step2.pkl:", df.shape)

Cleaned data loaded from step2.pkl: (8536, 9)


# Preprocessing & Feature Engineering
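This notebook encodes the categorical variables and scales the numeric features at the event level, then aggregates events into customer-level features (total and average spend, event count, and the number of distinct event types, categories, and brands) and saves everything as a structured dataset ready for clustering.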

In [5]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [6]:
# Columns treated as categorical (one-hot encoded by the preprocessor).
categorical_cols = [
    'brand',
    'category_code',
    'event_type',
]
# Columns treated as numeric (standardised by the preprocessor).
numeric_cols = ['price']

In [7]:
# Event-level preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones in a single ColumnTransformer.
# handle_unknown='ignore' keeps the fitted encoder usable on categories it
# did not see during fit instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ]
)

# Fit on, and transform, only the columns the transformer was configured for.
X = preprocessor.fit_transform(
    df[categorical_cols + numeric_cols]
)

## Create Customer-Level Features

In [8]:
# Collapse the event-level rows of `df` into one row per user_id.
# Named aggregation binds each output column name to its source column and
# aggregation function, so the labels cannot drift out of step with the
# aggregations the way a positional `.columns = [...]` rename can.
# NOTE(review): purchase_count counts every row with a non-null price for the
# user, whatever its event_type -- confirm this is the intended "frequency".
customer_features = (
    df.groupby('user_id')
    .agg(
        total_spend=('price', 'sum'),
        avg_spend=('price', 'mean'),
        purchase_count=('price', 'count'),
        event_type_count=('event_type', 'nunique'),
        category_count=('category_code', 'nunique'),
        brand_count=('brand', 'nunique'),
    )
    # Turn user_id back into an ordinary column instead of the index.
    .reset_index()
)

In [9]:
# Persist everything the next notebook needs in a single pickle file.
import pickle

# Bundle contents:
#   X                 - output of preprocessor.fit_transform on the event rows
#   preprocessor      - the fitted ColumnTransformer
#   customer_features - the per-user aggregate table
#   df                - the cleaned event-level DataFrame loaded from step2.pkl
data_bundle = dict(
    X=X,
    preprocessor=preprocessor,
    customer_features=customer_features,
    df=df,
)

with open('step3.pkl', 'wb') as step3_file:
    pickle.dump(data_bundle, step3_file)

print("Preprocessed data saved as step3.pkl")

Preprocessed data saved as step3.pkl
