### 1. Setup & Imports

In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score


### 2. Load Dataset

In [6]:
# Relative path of the input CSV that the rest of the notebook works from.
CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)

# Preview the first rows, then list every column name.
print(f"Dataset Snapshot:\n{df.head()}")
print(f"\nColumns: {df.columns.tolist()}")

Dataset Snapshot:
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  

Columns: ['CustomerID', 'Age', 'Income_$', 'SpendingScore', 'VisitsPerMonth', 'OnlinePurchases', 'Gender', 'Region']


### 3. Prepare Features

In [3]:
# Columns used as clustering inputs; work on a copy so df itself is untouched.
FEATURES = ["Income_$", "SpendingScore"]
X = df.loc[:, FEATURES].copy()

# Median-impute any feature column that contains gaps, reporting each fill.
for col in FEATURES:
    if not X[col].isna().any():
        continue  # nothing missing in this column
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    print(f"Filled missing values in {col} with median: {median_val}")

# Fit the scaler on the features, then transform those same features.
# The fitted scaler is kept so cluster centers can be mapped back later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print(f"Features scaled successfully. Shape: {X_scaled.shape}")

Features scaled successfully. Shape: (200, 2)


### 4. Elbow Check (SSE)

In [25]:
# Largest cluster count to try in the elbow sweep.
MAX_K = 10
# KMeans rejects n_clusters > n_samples, so clamp the sweep to the number of
# rows available; for data with at least MAX_K rows this is exactly 1..MAX_K.
max_k = min(MAX_K, X_scaled.shape[0])

print("ELBOW METHOD (SSE per k) :")
for k in range(1, max_k + 1):
    # Same seed for every k so SSE differences come from k, not initialization.
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X_scaled)
    # inertia_ is the within-cluster sum of squared distances (SSE).
    print(f"k={k} → SSE={km.inertia_:.2f}")

ELBOW METHOD (SSE per k) :
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52




### 5. Model Training (Pick K)

In [11]:
# Cluster count picked from the elbow sweep: SSE falls sharply up to k=4
# (79.37 -> 21.37) and only marginally afterwards (19.09 at k=5).
K_CHOSEN = 4

# Fit once and read the per-row assignments off the fitted model.
kmeans = KMeans(n_clusters=K_CHOSEN, n_init="auto", random_state=42).fit(X_scaled)
labels = kmeans.labels_

# Attach the assignment to the original frame as an integer column.
df["Cluster"] = labels.astype(int)
print(f"Model trained with K={K_CHOSEN}.")

Model trained with K=4.




### 6. Evaluate Clustering

In [13]:
# Score the fitted labels against the scaled features with two metrics.
dbi = davies_bouldin_score(X_scaled, labels)
sil = silhouette_score(X_scaled, labels)

# Emit the report in one call; each argument lands on its own line.
print(
    "METRICS :",
    f"Silhouette Score : {sil:.3f}",
    f"Davies–Bouldin   : {dbi:.3f}",
    sep="\n",
)

METRICS :
Silhouette Score : 0.729
Davies–Bouldin   : 0.387


### 7. Cluster Centers (Original Units)

In [14]:
# Centers live in scaled space; undo the scaling to read them in the
# original feature units.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature, with the index named "Cluster".
centers_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis("Cluster")

print("CLUSTER CENTERS (Original Units) :", centers_df.round(2), sep="\n")

CLUSTER CENTERS (Original Units) :
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


### 8. Sanity Check

In [20]:
# Spot-check three rows (fixed seed, so the same rows every run) next to
# the cluster label each one received.
print("SANITY CHECK (Sample Rows) :")
print(
    df.loc[:, ["Income_$", "SpendingScore", "Cluster"]]
    .sample(n=3, random_state=42)
)

SANITY CHECK (Sample Rows) :
    Income_$  SpendingScore  Cluster
95        53             61        0
15        19             86        2
30        27             80        2


### 9. Save Output

In [22]:
# Destination for the frame with its added "Cluster" column; the row index
# is omitted from the file.
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUT_PATH, index=False)
print("Labeled dataset saved to:", OUT_PATH)

Labeled dataset saved to: spending_labeled_clusters.csv
