# In [None]:
# 01_customer_segmentation.ipynb

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv('/mnt/data/ecommerce_customer_data_large.csv')

# Preprocessing
# Fill missing numeric values with the column mean. numeric_only=True is
# needed because on pandas >= 2.0 DataFrame.mean() no longer skips non-numeric
# columns and can raise TypeError if the CSV contains any; non-numeric columns
# are left untouched, matching the behaviour of older pandas versions.
df.fillna(df.mean(numeric_only=True), inplace=True)
features = ['age', 'annual_income', 'spending_score']  # Select relevant features

# Standardize features so each one has zero mean and unit variance; K-Means
# is distance based, so unscaled features would otherwise dominate.
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

# Apply K-Means Clustering
# n_init is set explicitly: its default changed between scikit-learn releases
# (10 -> 'auto'), so relying on the default yields a FutureWarning on some
# versions and different clusterings on others even with a fixed random_state.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Evaluate clustering using silhouette score (computed on the scaled features
# the model was actually fitted on).
sil_score = silhouette_score(X, df['cluster'])
print(f'Silhouette Score: {sil_score}')

# Visualize clusters on the first two standardized features. Columns of X
# follow the order of `features`, so the axis labels are derived from it.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=df['cluster'], palette='viridis', s=100)
plt.xlabel(f'{features[0]} (standardized)')
plt.ylabel(f'{features[1]} (standardized)')
plt.title('Customer Segments using K-Means Clustering')
plt.legend(title='cluster')
plt.show()
