### Case Study 2: Customer Segmentation Pipeline
Segment customers into behavioral clusters from session logs and order transactions. The resulting segments are intended for use in:

- Recommender systems
- Email targeting
- Churn prediction
- Lifetime value modeling

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
def load_data(data_dir="."):
    """Read the users, orders and sessions CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default "."
        Directory containing ``users.csv``, ``orders.csv`` and
        ``sessions.csv``. The default resolves against the current working
        directory, which is where the files were previously hard-coded to be.

    Returns
    -------
    tuple of (users, orders, sessions)
        ``signup_date`` in users and ``timestamp`` in orders and sessions are
        passed to ``parse_dates`` so they load as datetimes.
    """
    base = Path(data_dir)
    users = pd.read_csv(base / "users.csv", parse_dates=["signup_date"])
    orders = pd.read_csv(base / "orders.csv", parse_dates=["timestamp"])
    sessions = pd.read_csv(base / "sessions.csv", parse_dates=["timestamp"])
    return users, orders, sessions

In [3]:
# Load the three source tables, then print the first rows of each one
# (users, orders, sessions — in that order) as a quick sanity check.
users, orders, sessions = load_data()
print(users.head(), orders.head(), sessions.head(), sep="\n")

  user_id                     email     city signup_date
0    U001      john.doe@example.com   Berlin  2022-01-15
1    U002    jane.smith@example.com   Munich  2022-03-22
2    U003     ali.ahmed@example.com  Hamburg  2023-02-10
3    U004  marie.dupont@example.com    Paris  2021-11-05
4    U005        li.wei@example.com   Berlin  2022-08-18
  user_id  timestamp   price  items
0    U001 2023-12-01  120.50      3
1    U001 2024-01-15  250.00      5
2    U002 2024-03-20   80.00      1
3    U003 2024-04-05   35.00      2
4    U001 2025-05-01   99.99      2
  user_id   event_type  timestamp
0    U001        login 2025-07-01
1    U001  add_to_cart 2025-07-01
2    U001     purchase 2025-07-01
3    U002        login 2025-06-30
4    U002       browse 2025-06-30


In [4]:
def build_features(users, orders, sessions, as_of=None):
    """Build one feature row per user from order and session history.

    Parameters
    ----------
    users : DataFrame
        Must contain ``user_id``; all other columns are carried through.
    orders : DataFrame
        Must contain ``user_id``, ``price`` and a datetime ``timestamp``.
    sessions : DataFrame
        Must contain ``user_id`` and ``event_type``.
    as_of : timestamp-like, optional
        Reference date used for ``days_since_last_order``. Defaults to
        ``pd.Timestamp.today()``; pass a fixed value to make the output
        reproducible across runs.

    Returns
    -------
    DataFrame
        Indexed by ``user_id`` with the user columns, ``total_spent``,
        ``num_orders``, ``avg_order``, ``last_order``,
        ``days_since_last_order`` and one ``event_<type>`` count column per
        event type seen in ``sessions``. Rows that still contain a missing
        value (e.g. users with no orders, or with a missing user attribute)
        are dropped.
    """
    reference = pd.Timestamp.today() if as_of is None else pd.Timestamp(as_of)

    # Named aggregation keeps each output column tied to its definition
    # instead of relying on the positional order of a flattened MultiIndex.
    order_agg = orders.groupby("user_id").agg(
        total_spent=("price", "sum"),
        num_orders=("price", "count"),
        avg_order=("price", "mean"),
        last_order=("timestamp", "max"),
    )
    order_agg["days_since_last_order"] = (reference - order_agg["last_order"]).dt.days

    sess_agg = sessions.groupby("user_id").event_type.value_counts().unstack().fillna(0)
    sess_agg.columns = [f"event_{col}" for col in sess_agg.columns]

    df = users.set_index("user_id").join(order_agg).join(sess_agg)

    # A user absent from `sessions` has zero events of every type. Without
    # this fill the left join leaves NaN there and dropna() below would
    # discard customers who have orders but no recorded sessions.
    event_cols = list(sess_agg.columns)
    if event_cols:
        df[event_cols] = df[event_cols].fillna(0)

    df = df.dropna()
    return df

In [5]:
# Assemble the per-user feature table and print its first rows for inspection.
df = build_features(users, orders, sessions)
print(df.head())

                            email     city signup_date  total_spent  \
user_id                                                               
U001         john.doe@example.com   Berlin  2022-01-15       470.49   
U002       jane.smith@example.com   Munich  2022-03-22        80.00   
U003        ali.ahmed@example.com  Hamburg  2023-02-10       105.00   
U004     marie.dupont@example.com    Paris  2021-11-05       510.00   
U005           li.wei@example.com   Berlin  2022-08-18        45.00   

         num_orders  avg_order last_order  days_since_last_order  \
user_id                                                            
U001              3     156.83 2025-05-01                     79   
U002              1      80.00 2024-03-20                    486   
U003              2      52.50 2025-06-01                     48   
U004              2     255.00 2025-02-14                    155   
U005              1      45.00 2025-06-10                     39   

         event_add_to_car

In [6]:
from sklearn.preprocessing import StandardScaler
def normalize_features(df, features):
    """Fit a ``StandardScaler`` on selected columns and transform them.

    Parameters
    ----------
    df : DataFrame
        Source table.
    features : list of str
        Column names of ``df`` to scale.

    Returns
    -------
    tuple
        ``(scaled_values, fitted_scaler)`` — the output of
        ``fit_transform`` on ``df[features]`` and the scaler that produced it,
        so the same transformation can be reused later.
    """
    feature_frame = df[features]
    fitted_scaler = StandardScaler()
    scaled_values = fitted_scaler.fit_transform(feature_frame)
    return scaled_values, fitted_scaler

In [7]:
# Columns fed to the scaler: only the four order-derived columns are listed
# here; no event_* column is included.
features = ['total_spent', 'num_orders', 'avg_order', 'days_since_last_order']
X_scaled, scaler = normalize_features(df, features)

In [8]:
from sklearn.cluster import KMeans

def cluster_customers(X_scaled, df, n_clusters=4, random_state=42):
    """Assign a KMeans cluster id and a rule-based segment label to each row.

    Parameters
    ----------
    X_scaled : array-like
        Feature matrix passed to ``KMeans.fit_predict``; must have one row
        per row of ``df``.
    df : DataFrame
        Must contain ``total_spent``, ``num_orders`` and
        ``days_since_last_order``. It is not modified.
    n_clusters : int, default 4
        Number of clusters to fit.
    random_state : int, default 42
        Seed forwarded to ``KMeans``.

    Returns
    -------
    tuple
        ``(labelled_df, kmeans)`` where ``labelled_df`` is a copy of ``df``
        with two added columns: ``cluster`` (KMeans label) and ``segment``
        (``'VIP'``, ``'Churn Risk'`` or ``'Standard'``). The segment label is
        computed from the three columns above only and does not depend on
        the cluster id.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell always starts from the same input.
    df = df.copy()

    # NOTE(review): n_init is left at the library default, which is believed
    # to differ between scikit-learn releases — confirm and pin if
    # cross-version reproducibility matters.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df['cluster'] = kmeans.fit_predict(X_scaled)

    is_vip = ((df['total_spent'] > 1000) & (df['num_orders'] > 10)).to_numpy()
    is_churn_risk = (df['days_since_last_order'] > 60).to_numpy()

    # np.select takes the first matching condition, so VIP wins over
    # Churn Risk exactly as in an if/elif chain.
    df['segment'] = np.select(
        [is_vip, is_churn_risk],
        ['VIP', 'Churn Risk'],
        default='Standard',
    )
    return df, kmeans

In [9]:
# Rebind `df` to the frame returned by cluster_customers (it carries the
# 'cluster' and 'segment' columns) and keep the fitted KMeans model.
df, kmeans = cluster_customers(X_scaled, df)

In [10]:
def save_segments(df, path="output/segmented_customers.csv"):
    """Write each customer's email and segment label to a CSV file.

    Parameters
    ----------
    df : DataFrame
        Must contain ``email`` and ``segment`` columns.
    path : str or path-like, default "output/segmented_customers.csv"
        Destination file. Missing parent directories are created first so
        the write does not fail on a fresh checkout.

    Notes
    -----
    Only the ``email`` and ``segment`` columns are written and the index is
    omitted. The file contains email addresses, so treat it as personal data.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    df[['email', 'segment']].to_csv(target, index=False)
# save_segments(df)

In [12]:
import joblib
def save_models(scaler, kmeans, scaler_path='scaler.pkl', kmeans_path='kmeans'):
    """Persist the fitted scaler and KMeans model with ``joblib.dump``.

    Parameters
    ----------
    scaler : object
        Fitted scaler to serialize.
    kmeans : object
        Fitted clustering model to serialize.
    scaler_path : str or path-like, default 'scaler.pkl'
        Destination for the scaler.
    kmeans_path : str or path-like, default 'kmeans'
        Destination for the KMeans model.
    """
    # NOTE(review): the default KMeans file name has no '.pkl' extension,
    # unlike the scaler's. It is kept as-is so anything loading 'kmeans'
    # keeps working — confirm whether 'kmeans.pkl' was intended.
    joblib.dump(scaler, scaler_path)
    joblib.dump(kmeans, kmeans_path)

In [13]:
# Persist the fitted scaler and KMeans model via save_models.
save_models(scaler, kmeans)