# Clustering Customer Data

Load the required libraries

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

Get the data

In [None]:
# Read the retail customer records from the CSV file in the working directory.
customer_data = pd.read_csv(filepath_or_buffer="Retail_data.csv")

In [None]:
# Show the first five rows as a quick sanity check of the load.
print(customer_data.head(5))

## Clean the Data

Is there any missing data?

In [None]:
# Total number of missing cells across the whole table: the first sum() counts
# per column, the second adds those counts together.
print(customer_data.isna().sum().sum())

Convert categorical data to numeric values (one-hot encoding)

In [None]:
# One-hot encode 'Gender': pop() removes the column from customer_data in place
# and hands it to get_dummies(); the indicator columns are then joined back on
# the left-hand side of the remaining columns.
column_name = 'Gender'
one_hot = pd.get_dummies(customer_data.pop(column_name))
customer_data = one_hot.join(customer_data)

In [None]:
# One-hot encode 'Married': pop() removes the column from customer_data in
# place and hands it to get_dummies(); the indicator columns are then joined
# back on the left-hand side of the remaining columns.
column_name = 'Married'
one_hot = pd.get_dummies(customer_data.pop(column_name))
customer_data = one_hot.join(customer_data)

In [None]:
# Show the first five rows again to confirm the encoded columns are present.
print(customer_data.head(5))

Chart some of the data just to see how it looks

In [None]:
# Quick visual check of the raw data: Age on the x-axis, Salary on the y-axis.
customer_data.plot.scatter(x='Age', y='Salary')

Scale (standardize) the data so that every feature is on a comparable scale

In [None]:
# preprocessing.scale() returns a plain NumPy array, so the column labels are
# lost; wrap the result in a DataFrame that reuses the original column names.
standardized_customer_data = preprocessing.scale(customer_data)
standardized_customer_data_df = pd.DataFrame(
    data=standardized_customer_data,
    columns=customer_data.columns,
)

Take a look at the standardized data

In [None]:
# Same Age/Salary scatter as before, this time on the standardized values.
standardized_customer_data_df.plot.scatter(x='Age', y='Salary')


It's the same 'shape' ... but on a different scale

## Cluster the data (Build the model)

In [None]:
# Create the k-means model with four clusters. A fixed random_state makes the
# centroid initialisation repeatable, so the cluster numbers referred to in the
# cells below (e.g. "group 2") stay the same from run to run.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the model and label every row in a single pass. fit_predict() already
# fits the model, so calling fit() first would only run the clustering twice.
y_km = kmeans.fit_predict(standardized_customer_data_df)

## Review the Results

In [None]:
# Cluster label assigned to each of the first 20 customers.
print(y_km[:20])

Look at particular clusters of data .. starting with those in group 2

In [None]:
# First ten customers (original, unscaled values) that were assigned to cluster 2.
print(customer_data[y_km == 2].head(10))

Look at the clusters in chart form

.. First plotting Age Vs Salary

In [None]:
# Age vs Salary, one colour per cluster (colour order = cluster label 0..3).
# Each cluster's rows are selected once and reused for both axes.
plt.figure(figsize=(10, 10))
for cluster_id, colour in enumerate(('red', 'black', 'blue', 'cyan')):
    members = customer_data[y_km == cluster_id]
    plt.scatter(members['Age'], members['Salary'], s=15, c=colour, alpha=.5)
plt.xlabel('Age')
plt.ylabel('Salary')

Then plot marital status (Single / Married) against salary

In [None]:
# 'Single' indicator vs Salary, one colour per cluster (colour order = cluster
# label 0..3). Each cluster's rows are selected once and reused for both axes.
plt.figure(figsize=(7, 7))
for cluster_id, colour in enumerate(('red', 'black', 'blue', 'cyan')):
    members = customer_data[y_km == cluster_id]
    plt.scatter(members['Single'], members['Salary'], s=15, c=colour, alpha=.5)
plt.xlabel('Single')
plt.ylabel('Salary')

Then plot gender against salary

In [None]:
# 'Female' indicator vs Salary, one colour per cluster (colour order = cluster
# label 0..3). Each cluster's rows are selected once and reused for both axes.
plt.figure(figsize=(7, 7))
for cluster_id, colour in enumerate(('red', 'black', 'blue', 'cyan')):
    members = customer_data[y_km == cluster_id]
    plt.scatter(members['Female'], members['Salary'], s=15, c=colour, alpha=.5)
plt.xlabel('Female')
plt.ylabel('Salary')

We can even take a '3D' view of the data

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

In [None]:
%matplotlib
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(20, 20)
ax.set_xlabel('Annual Spend')
ax.set_ylabel('Salary')
ax.set_zlabel('Age')
ax.scatter(customer_data[y_km ==0]['Annual Spend'], customer_data[y_km == 0]['Salary'], customer_data[y_km == 0]['Age'],s=15, c='red', alpha=.3)
ax.scatter(customer_data[y_km ==1]['Annual Spend'], customer_data[y_km == 1]['Salary'], customer_data[y_km == 1]['Age'],s=15, c='black', alpha=.3)
ax.scatter(customer_data[y_km ==2]['Annual Spend'], customer_data[y_km == 2]['Salary'], customer_data[y_km == 2]['Age'],s=15, c='blue', alpha=.3)
ax.scatter(customer_data[y_km ==3]['Annual Spend'], customer_data[y_km == 3]['Salary'], customer_data[y_km == 3]['Age'],s=15, c='cyan', alpha=.3)
