In [30]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
import numpy as np

In [46]:
# Load the raw customer data. read_csv already returns a DataFrame, so the
# extra pd.DataFrame(...) wrapper the cell used to have was redundant.
file_path = "shopping_data.csv"
df_shopping = pd.read_csv(file_path)

# NOTE: this cell used to contain a bare `df_shopping.dropna` (no call
# parentheses), which only looked up the method and removed nothing. It is
# deleted rather than turned into a real dropna() so the row set stays exactly
# as it was; missing values are filled later by replace_missing_value().

# Rescale Annual Income by 1/1000 (the preview below then shows values such
# as 15.0, 16.0, ...).
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)
df_shopping.head(10)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15.0,39.0
1,2,Yes,21.0,15.0,81.0
2,3,No,20.0,16.0,6.0
3,4,No,23.0,16.0,77.0
4,5,No,31.0,17.0,40.0
5,6,No,22.0,17.0,76.0
6,7,No,35.0,18.0,6.0
7,8,No,23.0,18.0,94.0
8,9,Yes,64.0,19.0,3.0
9,10,No,30.0,19.0,72.0


In [47]:
# Encode the Yes/No "Card Member" text flag as 1/0 in a new numeric column,
# then remove the original text column from the frame.
a = pd.Series(
    data=np.where(df_shopping["Card Member"].values == "Yes", 1, 0),
    index=df_shopping.index,
)

df_shopping["Card members 2"] = a.values
df_shopping.drop(columns=["Card Member"], inplace=True)
df_shopping.head()

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score (1-100),Card members 2
0,1,19.0,15.0,39.0,1
1,2,21.0,15.0,81.0,1
2,3,20.0,16.0,6.0,0
3,4,23.0,16.0,77.0,0
4,5,31.0,17.0,40.0,0


In [48]:
# Baseline view before any clustering: scatter of every customer with
# "Annual Income" on the x-axis and "Spending Score (1-100)" on the y-axis.
df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)")

At first glance, the right number of clusters may seem obvious, but let's see what happens when we start to cluster.

First, let's create a function so we can quickly run K-means on the DataFrame with different numbers of clusters by entering the following code:

In [49]:
# This function is needed so that NaN values can be replaced (here, with the column median).

from sklearn.impute import SimpleImputer
def replace_missing_value(df, number_features):
    """Return the selected columns of ``df`` with NaNs replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns to impute. It is not modified.
    number_features : list
        Labels of the columns to keep and impute.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``number_features``, with the same column
        order and the same row index as ``df``, built from the imputer's
        output array.
    """
    df_num = df[number_features]

    imputer = SimpleImputer(strategy="median")
    filled = imputer.fit(df_num).transform(df_num)

    # The imputer hands back a bare array, so the row labels must be passed
    # along explicitly; otherwise a non-default index (e.g. after filtering
    # rows) would be silently reset to 0..n-1.
    return pd.DataFrame(filled, columns=df_num.columns, index=df_num.index)

In [50]:
# Columns carried forward into clustering; NaNs in them are median-filled.
# NOTE(review): CustomerID looks like a row identifier rather than a customer
# attribute -- confirm it is really meant to be a clustering feature.
number_features = [
    "CustomerID",
    "Age",
    "Annual Income",
    "Spending Score (1-100)",
    "Card members 2",
]
df_shopping = replace_missing_value(df=df_shopping, number_features=number_features)

In [51]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column
    df["class"] = model.labels_

In [52]:
# test_cluster_amount takes a DataFrame and the number of clusters to make.
# It returns nothing; the labels are written into df_shopping["class"].
# Start with two clusters, then colour the income/spending scatter by label:

test_cluster_amount(df_shopping, 2)
df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [53]:
# 3D view of the two-cluster result: income (x), spending score (y), age (z),
# with both marker colour and marker symbol driven by the "class" label.
# update_layout returns the figure, so the legend position (x=0, y=1) is set
# in the same expression.
fig = px.scatter_3d(
    df_shopping,
    x="Annual Income",
    y="Spending Score (1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
).update_layout(legend={"x": 0, "y": 1})
fig.show()

In [54]:
# Use the Elbow Curve
# Let's walk through an example of how to use the elbow curve.
# This time, we'll answer the question of how many clusters would be ideal
# for the customer dataset.

# Initial imports
# NOTE(review): these four imports repeat the ones in the notebook's first
# cell; they are redundant here and could be dropped.
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

# Preview the working frame. It now also carries the "class" column that
# test_cluster_amount wrote when it was run with two clusters.
df_shopping.head(10)

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score (1-100),Card members 2,class
0,1.0,19.0,15.0,39.0,1.0,0
1,2.0,21.0,15.0,81.0,1.0,0
2,3.0,20.0,16.0,6.0,0.0,0
3,4.0,23.0,16.0,77.0,0.0,0
4,5.0,31.0,17.0,40.0,0.0,0
5,6.0,22.0,17.0,76.0,0.0,0
6,7.0,35.0,18.0,6.0,0.0,0
7,8.0,23.0,18.0,94.0,0.0,0
8,9.0,64.0,19.0,3.0,1.0,0
9,10.0,30.0,19.0,72.0,0.0,0


In [55]:
# To create the elbow curve we need two values:
# a list of K values and a list of inertia values.
# Inertia is the objective function that the K values are plotted against.
# We loop through 10 values of K and record the inertia for each.

inertia = []
k = list(range(1, 11))

# Cluster on the features only. df_shopping still carries the "class" labels
# written by test_cluster_amount; fitting on them would count old cluster
# assignments as a feature and distort every inertia value.
elbow_features = df_shopping.drop(columns=["class"], errors="ignore")

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [56]:
# Sanity check before plotting: the elbow DataFrame is built from both lists,
# so there must be exactly one inertia value per K.
print(len(k), len(inertia)) # Print the lengths to make sure they are the same!

10 10


In [57]:
# Plot inertia against K; one x-tick per tested K keeps the elbow easy to read.

elbow_data = dict(k=k, inertia=inertia)

df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [58]:
# This elbow curve doesn't have as obvious an elbow as the one seen previously.
# -------> Remember, we're looking for the break where the vertical direction shifts to a strong horizontal direction.

# Let's create a K-means function again so that we can reuse the clustering code.
# As you may recall, functions allow us to save time because we don't need to write the code 
# contained in the function more than once:

def get_clusters(k, data):
    """Return a copy of ``data`` with a "class" column of K-means labels.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.
    data : pandas.DataFrame
        Feature frame. It is left untouched; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose "class" column holds the fitted model's labels
        (an existing "class" column is overwritten in the copy).
    """
    # Create a copy of the DataFrame so the caller's frame is not modified
    data = data.copy()

    # Fit on the features only: a "class" column left over from an earlier
    # clustering run must not be fed back in as if it were a feature.
    features = data.drop(columns=["class"], errors="ignore")

    # Initialize and fit the K-Means model. fit() already assigns every
    # training row to a cluster (labels_), so no separate predict() is needed.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # Create return DataFrame with the assigned clusters
    data["class"] = model.labels_

    return data
    
    

In [59]:
# Cluster the customers into 5 groups and preview the labelled copy
five_clusters = get_clusters(k=5, data=df_shopping)
five_clusters.head()

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score (1-100),Card members 2,class
0,1.0,19.0,15.0,39.0,1.0,4
1,2.0,21.0,15.0,81.0,1.0,4
2,3.0,20.0,16.0,6.0,0.0,0
3,4.0,23.0,16.0,77.0,0.0,4
4,5.0,31.0,17.0,40.0,0.0,0


In [64]:
# Cluster the customers into 6 groups and preview the labelled copy
six_clusters = get_clusters(k=6, data=df_shopping)
six_clusters.head()

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score (1-100),Card members 2,class
0,1.0,19.0,15.0,39.0,1.0,2
1,2.0,21.0,15.0,81.0,1.0,5
2,3.0,20.0,16.0,6.0,0.0,2
3,4.0,23.0,16.0,77.0,0.0,5
4,5.0,31.0,17.0,40.0,0.0,2


In [61]:
# Cluster the customers into 2 groups and preview the labelled copy
two_clusters = get_clusters(k=2, data=df_shopping)
two_clusters.head()

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score (1-100),Card members 2,class
0,1.0,19.0,15.0,39.0,1.0,1
1,2.0,21.0,15.0,81.0,1.0,1
2,3.0,20.0,16.0,6.0,0.0,1
3,4.0,23.0,16.0,77.0,0.0,1
4,5.0,31.0,17.0,40.0,0.0,1


In [62]:
# Plot the 5-cluster result: income vs. spending score, one colour per
# "class" label produced by get_clusters(5, ...).
five_clusters.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [66]:
# Plot the 6-cluster result: income vs. spending score, one colour per
# "class" label produced by get_clusters(6, ...).
six_clusters.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [65]:
# Plot a 3D graph for K = 5.
# Axes as actually drawn: x="Age", y="Spending Score (1-100)",
# z="Annual Income". Colour and marker symbol both follow the "class" label.
# update_layout returns the figure, so the legend position (x=0, y=1) is set
# in the same expression.
fig = px.scatter_3d(
    five_clusters,
    x="Age",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
).update_layout(legend={"x": 0, "y": 1})
fig.show()

Recall, in the trial-and-error method, both graphs displayed multiple clusters. We're still applying some trial and error here, but the elbow curve helps narrow down the number of clusters.

Now, the important question: So do we use five or six groups? This depends on what insights you can take away from the data. One might conclude that six groups would be most useful because they could be broken down like so:

Cluster 0: medium income, low annual spend
Cluster 1: low income, low annual spend
Cluster 2: high income, low annual spend
Cluster 3: low income, high annual spend
Cluster 4: medium income, high annual spend
Cluster 5: very high income, high annual spend
If we chose five groups instead, the groups would have to be defined differently and would not fit as well with what we're looking for, which is grouping types of customers based on their spending habits. Remember, unsupervised learning can help us make decisions about the data up to a point; after that, it is up to you, the expert, to make the final call.