# Clustering and dimensionality reduction of customers

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Set the seaborn plot style to 'whitegrid' for the figures drawn in this notebook
sns.set(style='whitegrid')

Each time we load Colab, we need to upload our kaggle.json file to access the dataset. 

In [None]:
# Then, we need to copy the uploaded kaggle.json file from the working directory
# to the expected location (~/.kaggle/) and restrict it to owner read/write only.

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Load and view the data

In [None]:
# Download the Mall Customers dataset
!kaggle datasets download -d vjchoudhary7/customer-segmentation-tutorial-in-python

# Unzip the downloaded dataset
!unzip customer-segmentation-tutorial-in-python.zip

In [None]:
# Read the unzipped Mall Customers CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

In [None]:
# Preview the top five rows of the dataset
df.head(n=5)

In [None]:
# Print a concise summary of the dataset: column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

## Preprocessing

In [None]:
# Count the missing entries in each column
df.isnull().sum(axis=0)

No missing values - this dataset looks clean! Do you want to check anything else?

In [None]:
# In df.info(), we can see that the column 'Gender' has dtype 'object', which means it's non numerical.
# Apply one-hot-encoding to make this data numerical.
# - drop_first=True keeps a single indicator column instead of two redundant ones.
# - dtype=int gives 0/1 values on every pandas version (pandas >= 2.0 would otherwise produce booleans).
# - The 'Gender' check makes this cell safe to re-run: after encoding, the column no longer
#   exists, and asking get_dummies for it a second time would raise a KeyError.

if 'Gender' in df.columns:
    df = pd.get_dummies(df, columns=['Gender'], drop_first=True, dtype=int)

In [None]:
# We will drop the column 'CustomerID' because it is an identifier, not a real/useful feature.
# errors='ignore' makes this cell safe to re-run once the column has already been removed.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# Correlation heatmap: every cell shows the pairwise correlation between two
# columns of df, annotated with its numeric value
correlations = df.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(data=correlations, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

We can see from the correlation matrix that Age and Spending Score are negatively correlated (-0.33). Otherwise, there are not many correlations.

In [None]:
# Columns used as clustering inputs. Try different combinations - three columns
# are fine too (see the optional 3D visualisation further down).
features = [
    'Annual Income (k$)',
    'Spending Score (1-100)',
]

In [None]:
# Standardise the selected columns (zero mean, unit variance) so that neither
# feature dominates the distance calculations used by the clustering models
scaler = StandardScaler()
scaler.fit(df[features])
df_scaled = scaler.transform(df[features])

## K-means clustering

In [None]:
# Choose the number of clusters, k, that K-Means is asked to find
k = 5

In [None]:
# K-Means Clustering
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto' in version 1.4
# (and emits a FutureWarning in 1.2/1.3), so pinning it keeps the clustering the same across versions.
kmeans = KMeans(n_clusters=k,
                n_init=10,        # Run 10 different initialisations and keep the best result
                random_state=42)  # Set random state for reproducibility

kmeans_labels = kmeans.fit_predict(df_scaled) # Fit the model and predict the clusters

df['KMeans_Labels'] = kmeans_labels # Add the labels to the original dataset

In [None]:
# Scatter plot of income against spending score, coloured by K-Means cluster
plt.figure(figsize=(10, 6))

sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=kmeans_labels,
    palette='viridis',
)

# Axis labels, title and legend
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('K-Means Clustering')
plt.legend(title='Clusters')

plt.show()

In [None]:
# # Optional - if you chose to use 3 features - visualize the clusters in 3D (remove # on each line to run this bit of code)
# fig = plt.figure(figsize=(10, 6))
# ax = fig.add_subplot(111, projection='3d')
# 
# # Change 'Annual Income (k$)', 'Spending Score (1-100)', 'Age' to whichever 3 features you chose
# ax.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'], df['Age'], c=kmeans_labels, cmap='viridis')
# 
# ax.set_xlabel('Annual Income (k$)')
# ax.set_ylabel('Spending Score (1-100)')
# ax.set_zlabel('Age')
# 
# plt.title('K-Means Clustering in 3D')
# plt.show()

## Hierarchical clustering

In [None]:
# Set number of clusters for Agglomerative (hierarchical) Clustering
n = 5 # Number of clusters the model is asked to find

In [None]:
# Agglomerative (hierarchical) clustering of the scaled features into n clusters
hierarchical = AgglomerativeClustering(n_clusters=n)
hierarchical.fit(df_scaled)

# After fitting, labels_ holds the cluster index assigned to each row
hierarchical_labels = hierarchical.labels_
df['Hierarchical_Labels'] = hierarchical_labels

In [None]:
# Scatter plot of income against spending score, coloured by hierarchical cluster

plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=hierarchical_labels,
    palette='viridis',
)
plt.title('Hierarchical Clustering')
plt.show()

If you compare the clustering from k-means and hierarchical, can you see any difference?

## DBSCAN clustering

In [None]:
# DBSCAN (density-based) clustering of the scaled features.
# eps is the neighbourhood radius; min_samples is the number of neighbours
# a point needs within that radius to count as a core point.
dbscan = DBSCAN(min_samples=5, eps=0.5)
dbscan.fit(df_scaled)

# After fitting, labels_ holds the cluster index assigned to each row
dbscan_labels = dbscan.labels_
df['DBSCAN_Labels'] = dbscan_labels

In [None]:
# Scatter plot of income against spending score, coloured by DBSCAN label
# (DBSCAN gives the label -1 to points it treats as noise)
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=dbscan_labels,
    palette='viridis',
)
plt.title('DBSCAN Clustering')
plt.show()

This clustering doesn't look as good. Can you change the parameters eps and min_samples to improve it? 

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

Why might DBSCAN not perform so well for this data?

## Clustering evaluation

We can use the silhouette score, which measures how similar a data point is to its own cluster compared to other clusters. Values range from -1 to 1, with higher values indicating better-defined clusters.

In [None]:
# silhouette_score is imported with the other libraries in the first code cell.

# Calculate the silhouette score for K-Means Clustering
silhouette_kmeans = silhouette_score(df_scaled, kmeans_labels)
print(f'Silhouette Score for K-Means Clustering: {silhouette_kmeans}')

# Calculate the silhouette score for Hierarchical Clustering
silhouette_hierarchical = silhouette_score(df_scaled, hierarchical_labels)
print(f'Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical}')

# Calculate the silhouette score for DBSCAN Clustering
# DBSCAN labels noise points as -1. Noise is not a cluster, so those points are left out
# of the score; scoring them as if they formed one cluster would distort the result.
# Note that this means the DBSCAN score covers fewer points than the other two scores.
dbscan_clustered = dbscan_labels != -1
n_dbscan_clusters = len(np.unique(dbscan_labels[dbscan_clustered]))

# silhouette_score needs at least 2 clusters and raises a ValueError otherwise, which
# can easily happen while experimenting with eps and min_samples.
if n_dbscan_clusters >= 2:
    silhouette_dbscan = silhouette_score(df_scaled[dbscan_clustered], dbscan_labels[dbscan_clustered])
    print(f'Silhouette Score for DBSCAN Clustering: {silhouette_dbscan} '
          f'({n_dbscan_clusters} clusters, {int(np.sum(~dbscan_clustered))} noise points excluded)')
else:
    silhouette_dbscan = np.nan
    print(f'Silhouette Score for DBSCAN Clustering: undefined '
          f'(only {n_dbscan_clusters} cluster(s) found - try other eps / min_samples values)')

## Dimensionality reduction
Let's take all the features, and use PCA (Principal Component Analysis) to find the two axes of greatest variation and project the data onto 2D. 

In [None]:
# df_scaled only contains the columns listed in `features`, so running PCA on it would
# merely rotate that 2D data. To reduce *all* the customer attributes to 2D, build a
# separate scaled matrix from every original column. Columns this notebook adds itself
# (cluster labels, PCA components) are left out, which also keeps the cell safe to re-run.
pca_input_columns = [col for col in df.columns
                     if not (col.endswith('_Labels') or col in ('PCA1', 'PCA2'))]

# Standardise so that every attribute contributes on the same scale
df_all_scaled = StandardScaler().fit_transform(df[pca_input_columns].astype(float))

pca = PCA(n_components=2) # Create an instance of the PCA model with 2 components
pca_components = pca.fit_transform(df_all_scaled) # Fit the model and transform the data

df['PCA1'] = pca_components[:, 0] # Add the first component to the original dataset
df['PCA2'] = pca_components[:, 1] # Add the second component to the original dataset

In [None]:
# Scatter plot of the customers in the space of the two principal components
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='PCA1', y='PCA2')
plt.title('PCA')
plt.show()

After using PCA, our data (with all features) has been projected onto 2D which will make clustering and visualisation much easier. 

Let's do k-means clustering on this 2D projection of our data. 

In [None]:
# K-means on PCA1 and PCA2 as features
pca_features = ['PCA1', 'PCA2']
pca_df = df[pca_features]

# Standardise the two components so that both contribute equally to the distances
pca_scaler = StandardScaler()
pca_scaled = pca_scaler.fit_transform(pca_df)

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto' in version 1.4
# (and emits a FutureWarning in 1.2/1.3), so pinning it keeps the clustering the same across versions.
pca_kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
pca_kmeans_labels = pca_kmeans.fit_predict(pca_scaled)

df['PCA_KMeans_Labels'] = pca_kmeans_labels # Add the labels to the original dataset

In [None]:
# Visualise the PCA projection, coloured by the k-means labels found on the PCA components
plt.figure(figsize=(10, 6)) # Same figure size as the other plots in this notebook
sns.scatterplot(x=df['PCA1'], y=df['PCA2'], hue=pca_kmeans_labels, palette='viridis') # Scatter plot coloured by k_means labels
plt.title('K-Means Clustering on PCA Components') # Distinct from the earlier 'K-Means Clustering' plot on the raw features
plt.legend(title='Clusters')
plt.show()

The data splits nicely into 5 clusters when considering all features. Let's look at which type of customer these clusters represent.

## Customer profiles

In [None]:
# Get mean values of each column, grouped by KMeans_Labels cluster number
df.groupby('KMeans_Labels').mean()