In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('../scripts')  
from load_data import load_and_clean_data, define_customer_df_features

In [None]:
# Build the two base frames used by the rest of the notebook, using the helpers
# imported from load_data (found via the '../scripts' path entry added above).
# load_and_clean_data() is called with no arguments and its result is passed
# straight into define_customer_df_features().
# NOTE(review): later cells read the columns total_orders, total_spent,
# recency_days and aov from customer_df; presumably it holds one row per
# customer — confirm against load_data.py.
df = load_and_clean_data()
customer_df = define_customer_df_features(df)

In [None]:
# Standardize features
# Columns fed to the clustering model.
features = ['total_orders', 'total_spent', 'recency_days', 'aov']

# Keep only rows where every clustering feature is present. A boolean mask is
# used instead of re-selecting by index label afterwards: label-based
# re-selection repeats rows when the index is not unique, which would leave the
# filtered frame out of step with the scaled matrix built below.
complete_rows = customer_df[features].notna().all(axis=1)

# Explicit copy: columns added to this frame later must not write through to
# (or raise chained-assignment warnings about) customer_df.
customer_df_filtered = customer_df.loc[complete_rows].copy()

# Feature matrix, row-for-row aligned with customer_df_filtered.
X = customer_df_filtered[features]

# Put all features on a common scale so that no single one dominates the
# Euclidean distances K-Means relies on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method to find the optimal K

# Candidate cluster counts to evaluate (1 through 9).
K_range = range(1, 10)

# Fit one model per candidate k and record its inertia (total within-cluster
# squared Euclidean distance; lower is better).
# random_state fixes the seed for reproducibility. n_init is pinned explicitly
# so the number of restarts, and therefore the curve, does not depend on the
# default of whichever scikit-learn version is installed.
inertia = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in K_range
]

fig, ax = plt.subplots()
ax.plot(K_range, inertia, marker='o')
ax.set_title('Elbow Method For Optimal K')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia')
ax.set_xticks(list(K_range))  # k is an integer; avoid fractional tick labels
fig.tight_layout()
plt.show()

In [None]:
# Final K-Means segmentation
N_CLUSTERS = 4  # based on the elbow method result

# n_init is pinned explicitly so the fit does not depend on the default of the
# installed scikit-learn version; random_state fixes the seed.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)

# assign() returns a new frame instead of mutating the existing one in place,
# so re-running this cell is idempotent and cannot trigger a
# chained-assignment warning. X_scaled is row-aligned with customer_df_filtered.
customer_df_filtered = customer_df_filtered.assign(
    cluster=kmeans.fit_predict(X_scaled)
)

In [None]:
# Visualize clusters: pairwise scatter plots of the clustering features,
# coloured by assigned cluster.
pair_grid = sns.pairplot(
    customer_df_filtered,
    vars=features,
    hue='cluster',
    palette='Set2',
)
# pairplot leaves its figure current, so this titles the grid's figure;
# y > 1 lifts the title above the top row of axes.
plt.suptitle('Customer Segments', y=1.02)
# Use the grid's own tight_layout(): it keeps the margin seaborn reserved for
# the outside legend, whereas plt.tight_layout() lets the axes expand under it.
pair_grid.tight_layout()
plt.show()

In [None]:
# Cluster profiles: mean of every clustering feature within each cluster.
cluster_summary = (
    customer_df_filtered
    .groupby('cluster')[features]
    .mean()
)
print(cluster_summary)