# Clustering and dimensionality reduction of customers

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

In [None]:
# Apply seaborn's white-grid theme to every plot drawn below
sns.set_theme(style='whitegrid')

Each time we start a new Colab session, we need to upload our `kaggle.json` API token file so that the Kaggle command-line tool can access the dataset.

In [None]:
# Then, we need to put the uploaded kaggle.json file in the expected location:
#   1. create the ~/.kaggle directory (no error if it already exists),
#   2. copy kaggle.json from the current working directory into it,
#   3. restrict the copy to owner read/write only (mode 600), since it holds credentials.

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Load and view the data

In [None]:
# Download the Mall Customers dataset
!kaggle datasets download -d vjchoudhary7/customer-segmentation-tutorial-in-python

# Unzip the downloaded dataset
!unzip customer-segmentation-tutorial-in-python.zip

In [None]:
# Preview the top five rows to get a feel for the columns and values
df.head(n=5)

In [None]:
# Check the structure of the dataset: prints a concise summary with each
# column's name, non-null count and dtype, plus overall memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

## Preprocessing

In [None]:
# Count the missing (NaN) entries in every column
df.isna().sum()

In [None]:
# Get % of missing values in each column
missing = df.isnull().sum()
missing = missing[missing > 0] # Only look at columns with missing values
missing_percentage = missing / df.shape[0] * 100

# Show the result: an empty Series means no column has missing values.
# (Previously the percentages were computed but never displayed.)
missing_percentage

In [None]:
# Remove customers with more than 50% missing values
# Fraction of missing cells per customer (row): mean of the boolean null mask along columns.
row_missing_fraction = df.isnull().mean(axis=1)

# Keep rows where at most half of the values are missing. The index is reset so
# that later positional label arrays (cluster labels) line up with the rows.
df = df.loc[row_missing_fraction <= 0.5].reset_index(drop=True)

# Report how many customers remain after filtering
df.shape


In [None]:
# Visualise correlation matrix using heatmap
# This shows which features are related to each other.
# Correlation is only defined for numeric columns; restricting to them up front
# avoids the error pandas >= 2.0 raises when a text column (e.g. Gender) is present.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Columns used as clustering inputs (annual income and spending score)
features = [
    'Annual Income (k$)',
    'Spending Score (1-100)',
]

In [None]:
# Standardise the selected features (zero mean, unit variance) so that
# neither column dominates the distance computations used for clustering
scaler = StandardScaler()
scaler.fit(df[features])
df_scaled = scaler.transform(df[features])

## K-means clustering

In [None]:
# Choose the number of clusters, k (used as n_clusters for K-Means)
# NOTE(review): 5 is hard-coded; presumably picked by inspecting the data —
# confirm with e.g. an elbow plot or silhouette analysis.
k = 5

In [None]:
# K-Means Clustering
kmeans = KMeans(
    n_clusters=k,
    n_init=10,        # Explicit number of restarts: the default differs between scikit-learn versions
    random_state=42,  # Set random state for reproducibility
)

kmeans_labels = kmeans.fit_predict(df_scaled) # Fit the model and predict the clusters

df['KMeans_Labels'] = kmeans_labels # Add the labels to the original dataset

In [None]:
# Scatter plot of income vs. spending score, coloured by K-Means cluster
fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=kmeans_labels,
    palette='viridis',
    ax=ax,
)

ax.set_title('K-Means Clustering')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.legend(title='Clusters')

plt.show()

In [None]:
# 3D view of the K-Means clusters: income and spending score, with age on the z-axis
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(
    df['Annual Income (k$)'],
    df['Spending Score (1-100)'],
    df['Age'],
    c=kmeans_labels,
    cmap='viridis',
)

ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.set_zlabel('Age')
ax.set_title('K-Means Clustering in 3D')

plt.show()

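Here, we have clustered on 2 features (annual income and spending score), so our clusters are in 2D and easy to visualise; the 3D plot above simply adds age as a third axis for context. If you want to use more than 2 features, you can visualise them in a more limited way by plotting two or three features at a time, or by using a dimensionality reduction technique such as PCA (see the Dimensionality reduction section below) to project the data onto 2D.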

## Hierarchical clustering

In [None]:
# Set number of clusters for Agglomerative Clustering
# NOTE(review): same value as k used for K-Means, presumably so the two
# methods can be compared like for like — confirm.
n = 5 # Number of clusters

In [None]:
# Agglomerative (hierarchical) clustering into n clusters on the scaled features
hierarchical = AgglomerativeClustering(n_clusters=n)

# Fit the model, then read the per-sample cluster assignment from the fitted estimator
hierarchical.fit(df_scaled)
hierarchical_labels = hierarchical.labels_

# Store the assignment alongside the original customer data
df['Hierarchical_Labels'] = hierarchical_labels

In [None]:
# Scatter plot of income vs. spending score, coloured by hierarchical cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=hierarchical_labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('Hierarchical Clustering')
plt.show()

## DBSCAN clustering

In [None]:
# Density-based clustering on the scaled features.
# eps is the neighbourhood radius; min_samples is the number of neighbours
# needed for a point to count as a core point.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
)

# Fit the model, then read the per-sample labels from the fitted estimator
dbscan.fit(df_scaled)
dbscan_labels = dbscan.labels_

# Store the labels alongside the original customer data
df['DBSCAN_Labels'] = dbscan_labels

In [None]:
# Scatter plot of income vs. spending score, coloured by DBSCAN label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=dbscan_labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('DBSCAN Clustering')
plt.show()

## Clustering evaluation

We can use the silhouette score, which measures how similar a data point is to its own cluster compared to other clusters. Values range from -1 to 1, with higher values indicating better-defined clusters.

In [None]:
# Silhouette scores for the three clusterings, all computed on the scaled
# features that the models were fitted on.

# Calculate the silhouette score for K-Means Clustering
silhouette_kmeans = silhouette_score(df_scaled, kmeans_labels)
print(f'Silhouette Score for K-Means Clustering: {silhouette_kmeans}')

# Calculate the silhouette score for Hierarchical Clustering
silhouette_hierarchical = silhouette_score(df_scaled, hierarchical_labels)
print(f'Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical}')

# Calculate the silhouette score for DBSCAN Clustering.
# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# excluded before scoring; otherwise it would be treated as one cluster.
dbscan_clustered = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[dbscan_clustered]))
n_dbscan_noise = int((~dbscan_clustered).sum())

if n_dbscan_clusters >= 2:
    silhouette_dbscan = silhouette_score(df_scaled[dbscan_clustered],
                                         dbscan_labels[dbscan_clustered])
else:
    # silhouette_score needs at least 2 clusters; report NaN instead of raising
    silhouette_dbscan = np.nan

print(f'Silhouette Score for DBSCAN Clustering: {silhouette_dbscan}')
print(f'DBSCAN found {n_dbscan_clusters} clusters; {n_dbscan_noise} noise points were excluded from the score')

## Dimensionality reduction
Let's take all the features, and use PCA (Principal Component Analysis) to find the axes of greatest variation (the principal components) and project the data onto 2D.

In [None]:
# PCA on all numeric customer features, not just the two used for clustering.
# Running a 2-component PCA on the 2-column df_scaled only rotates the data
# and reduces nothing.
pca_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Standardise first so that each feature contributes on the same scale
df_pca_scaled = StandardScaler().fit_transform(df[pca_features])

pca = PCA(n_components=2) # Create an instance of the PCA model with 2 components
pca_components = pca.fit_transform(df_pca_scaled) # Fit the model and project the data onto 2D

df['PCA1'] = pca_components[:, 0] # Add the first component to the original dataset
df['PCA2'] = pca_components[:, 1] # Add the second component to the original dataset

# Fraction of the total variance captured by each of the two components
pca.explained_variance_ratio_

In [None]:
# Scatter plot of the two principal components, coloured by K-Means cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='PCA1',
    y='PCA2',
    hue=kmeans_labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('PCA - K-Means Clustering')
plt.show()