In [None]:
# customer_segmentation.ipynb

# Importing the necessary libraries for data analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

In [None]:
# Load the customer data from the CSV file
data = pd.read_csv('data/customer_data.csv')

# Preview the first few rows to understand the dataset's structure.
# Only the final expression of a cell is rendered automatically, so a bare
# `data.head()` in the middle of this cell would show nothing; print it instead.
print(data.head())

# Data Preprocessing

# Check for missing values (per-column count of nulls)
print("Missing Values:", data.isnull().sum())

# Convert 'Gender' to binary (Male = 1, Female = 0).
# Series.map turns every value that is not a key of the mapping into NaN, so
# report any non-null label that failed to map instead of losing it silently.
gender_codes = {'Male': 1, 'Female': 0}
gender_raw = data['Gender']
data['Gender'] = gender_raw.map(gender_codes)
gender_unmapped = gender_raw.notna() & data['Gender'].isna()
if gender_unmapped.any():
    print("Warning: unmapped Gender values:",
          sorted(gender_raw[gender_unmapped].astype(str).unique()))

# Checking the basic statistics of the data
print(data.describe())


In [None]:
# Standardize the numeric features that feed the clustering step.
# The scaler is fitted on the selected columns and then applied to the same columns.
clustering_features = data[['Age', 'AnnualIncome', 'SpendingScore']]
scaler = StandardScaler()
scaler.fit(clustering_features)
data_scaled = scaler.transform(clustering_features)


In [None]:
# K-Means customer segmentation: choose the number of clusters with the Elbow Method.

# Fit one model per candidate cluster count (1 through 10) and record its
# within-cluster sum of squares (WCSS), read from the fitted model's `inertia_`.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    )
    wcss.append(kmeans.fit(data_scaled).inertia_)

# Plot WCSS against the cluster count; the "elbow" of the curve suggests the optimum.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_counts, wcss)
ax.set_title('Elbow Method for Optimal Clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Based on the elbow method, 3 clusters are chosen as the optimal number.
# Fit the final model on the scaled features and store each row's label in `data`.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
kmeans.fit(data_scaled)
data['Cluster'] = kmeans.labels_

# Scatter plot of the segments: Age vs. Annual Income, colored by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Age',
    y='AnnualIncome',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segments (Clustered)')
ax.set_xlabel('Age')
ax.set_ylabel('Annual Income')
ax.legend(title='Cluster', loc='best')
plt.show()

In [None]:
# Cluster Profile Analysis (describe each segment)
# Per cluster: mean and standard deviation of the three clustering features,
# the mean of LoyaltyStatus, and the most frequent ProductCategory.
# NOTE(review): taking the mean of 'LoyaltyStatus' assumes the column is numeric - confirm.
cluster_profile = data.groupby('Cluster').agg({
    'Age': ['mean', 'std'],
    'AnnualIncome': ['mean', 'std'],
    'SpendingScore': ['mean', 'std'],
    'LoyaltyStatus': ['mean'],
    # Most frequent product category per cluster. Series.mode() returns an
    # empty Series when every value in the group is missing, which made the
    # former `x.mode()[0]` raise; fall back to NaN in that case. On ties the
    # first (lowest-sorted) mode is taken, exactly as before.
    'ProductCategory': lambda x: next(iter(x.mode()), np.nan)
})

print(cluster_profile)