In [8]:
# ==========================================
# Spending Pattern Analysis (Clustering)
# - K-Means + Clustering Metrics
# ==========================================

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score


# --------------------------------
# 1) Load dataset
# --------------------------------
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# cell only runs where that exact file exists. The raw-string prefix keeps
# the backslashes literal.
CSV_PATH = r"C:\Users\MR HIIL\Desktop\DS and Machine learning\ds-ml-bootcamp\submissions\Abdalla-hiil\Assignment_6\spending_labeled_clusters.csv"
df = pd.read_csv(CSV_PATH)

# Print a banner followed by the first rows as a quick look at the data.
print("\n=== INITIAL SNAPSHOT ===", df.head(), sep="\n")

# --------------------------------
# 2) Select features for clustering
# --------------------------------
FEATURES = ["Income_$", "SpendingScore"]

# Work on a copy so the imputation below never modifies df itself.
X = df[FEATURES].copy()

# Median-impute only those feature columns that actually contain NaNs;
# columns without missing values are left untouched.
for col in FEATURES:
    column = X[col]
    if not column.isna().any():
        continue
    X[col] = column.fillna(column.median())

print("\n=== FEATURES HEAD ===", X.head(), sep="\n")

# --------------------------------
# 3) Scale features
# --------------------------------
# Learn the per-feature scaling statistics from X, then apply them to X.
# The fitted scaler is kept because step 7 uses it to map the cluster
# centers back to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("\nScaled shape:", X_scaled.shape)

# --------------------------------
# 4) Elbow method (print SSE)
# --------------------------------
# Fit K-Means once per candidate k and print the within-cluster sum of
# squared distances (inertia_), so the "elbow" can be picked by eye.
#
# KMeans raises ValueError when n_clusters exceeds the number of samples,
# so the sweep is capped at the row count. For data with at least MAX_K
# rows the printed output is exactly what the fixed 1..10 sweep produced.
MAX_K = 10
max_k = min(MAX_K, X_scaled.shape[0])

print("\n=== ELBOW METHOD (SSE per k) ===")
for k in range(1, max_k + 1):
    # Fixed random_state keeps the SSE values reproducible between runs.
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X_scaled)
    print(f"k={k} → SSE={km.inertia_:.2f}")

# --------------------------------
# 5) Fit K-Means with chosen k
# --------------------------------
# K is chosen by hand from the SSE trend printed in step 4 (3 or 4 were the
# candidates considered).
# NOTE(review): the SSE values recorded with this cell (79.37 at k=3,
# 21.37 at k=4, 19.09 at k=5) flatten only after k=4 -- confirm K=3 is the
# intended choice.
K = 3

# Fit once and read the per-row assignments off the fitted estimator.
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42).fit(X_scaled)
labels = kmeans.labels_

# Attach the assignments to df; an existing "Cluster" column is overwritten.
df["Cluster"] = labels.astype(int)
print(f"\n=== SAMPLE WITH {K} CLUSTERS ===", df.head(), sep="\n")

# --------------------------------
# 6) Evaluate clustering
# --------------------------------
# Both metrics are computed on the scaled features that K-Means was fitted
# on, using the labels it assigned.
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)

# One print call, one line per item, each metric rounded to 3 decimals.
print(
    "\n=== METRICS ===",
    f"Silhouette Score : {sil:.3f}",
    f"Davies–Bouldin   : {dbi:.3f}",
    sep="\n",
)

# --------------------------------
# 7) Cluster centers (back to original units)
# --------------------------------
# The centers live in scaled space; undo the scaling with the same fitted
# scaler so they can be read in the units of the FEATURES columns.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the index is named "Cluster".
centers_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis(
    index="Cluster"
)

print(
    "\n=== CLUSTER CENTERS (Original Units) ===",
    centers_df.round(2),
    sep="\n",
)

# --------------------------------
# 8) Sanity check (3 customers)
# --------------------------------
# Three hand-picked index labels; .loc selects by label, so these must be
# present in df's index.
sample_idx = [0, 5, 10]

# Show only the clustering inputs next to the assigned cluster.
sanity_cols = [*FEATURES, "Cluster"]
sanity = df.loc[sample_idx, sanity_cols]
print("\n=== SANITY CHECK (3 Customers) ===", sanity, sep="\n")



=== INITIAL SNAPSHOT ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        2  
2    Male  South        2  
3  Female   West        2  
4    Male   West        2  

=== FEATURES HEAD ===
   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88

Scaled shape: (200, 2)

=== ELBOW METHOD (SSE per k) ===
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
