## Imports

In [1]:
import os

#Working directory for the notebook: the relative CSV path and the local
#KMeansCluster import below are resolved against it.
#Set the MALL_CUSTOMER_DATA_DIR environment variable to point somewhere else;
#the default is the original author's local checkout.
ProjectDir = os.environ.get(
    "MALL_CUSTOMER_DATA_DIR"
    , r"C:\Users\tom.dawson\Documents\Git Projects\MallCustomerData")
#Only change directory when the path exists, so the notebook still runs when it
#is launched from inside the project folder on another machine.
if os.path.isdir(ProjectDir):
    os.chdir(ProjectDir)
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import math
import plotly.figure_factory as ff
from KMeansCluster import KMeansCluster
import plotly.express as px
from sklearn.cluster import KMeans

## Data Processing

In [2]:
#Load the customer file, encode Gender as a binary flag (Male = 1, anything else = 0)
#and replace the verbose income / spending column names with short ones
Data = (
    pd.read_csv(r"Mall_Customers.csv")
    .assign(
        Gender = lambda Df: Df["Gender"].eq("Male").astype("int64")
        , AnnualIncome = lambda Df: Df["Annual Income (k$)"]
        , SpendingScore = lambda Df: Df["Spending Score (1-100)"])
    .drop(columns=["Annual Income (k$)", "Spending Score (1-100)"])
    #Constant column of ones, used as the shared x position in the violin plots
    .assign(All = lambda Df: np.ones(len(Df)))
)

PerVars = ["SpendingScore"] #Variables plotted with a "%" tick suffix
MonVars = ["AnnualIncome"] #Variables plotted with a "$" tick prefix

In [3]:
#Number of missing values in each column
Data.isna().sum()

CustomerID       0
Gender           0
Age              0
AnnualIncome     0
SpendingScore    0
All              0
dtype: int64

## Functions

In [34]:
def scatter(Df, PlotVar, Hue, Y, Title):

    '''
    Scatter plot of Y against PlotVar, drawn as one marker trace per value of a binary column.
    Df is the dataframe the data is pulled from
    PlotVar is the independent variable (x axis); its values are multiplied by 1000 when the name contains "Income"
    Hue is the binary (1 / 0) column used to split the points into two traces
    Y is the dependent variable (y axis)
    Title is the title of the plot
    Axis ticks get a "%" suffix for variables listed in PerVars and a "$" prefix for variables listed in MonVars.
    The figure is displayed with fig.show(); nothing is returned.
    '''

    Scale = 1000 if "Income" in PlotVar else 1

    def Affixes(Var):
        #(tick suffix, tick prefix) for an axis that shows Var
        return ("%" if Var in PerVars else "", "$" if Var in MonVars else "")

    XSuffix, XPrefix = Affixes(PlotVar)
    YSuffix, YPrefix = Affixes(Y)

    fig = go.Figure()

    #Hue == 1 is added first (red), then Hue == 0 (blue)
    for Level, Colour in ((1, 'red'), (0, 'blue')):
        Subset = Df[Df[Hue] == Level]
        Label = Hue + " = " + str(Level)
        fig.add_trace(
            go.Scatter(
                x = Subset[PlotVar]*Scale
                , y = Subset[Y]
                , legendgroup = Label
                , name = Label
                , mode = 'markers'
                , line = dict(color=Colour)
                , marker = dict(size=10, opacity=0.9)
                , showlegend = True))

    #Axis ranges extend from 95% of the minimum to 105% of the maximum
    XRange = [0.95*Scale*np.min(Df[PlotVar]), 1.05*Scale*np.max(Df[PlotVar])]
    YRange = [0.95*np.min(Df[Y]), 1.05*np.max(Df[Y])]

    fig.update_xaxes(
        zeroline = True
        , showgrid = True
        , title = PlotVar
        , ticksuffix = XSuffix
        , tickprefix = XPrefix
        , range = XRange)

    fig.update_yaxes(
        zeroline = True
        , showgrid = True
        , ticksuffix = YSuffix
        , tickprefix = YPrefix
        , range = YRange
        , title = Y)

    fig.update_layout(
        title = dict(text=Title, font=dict(size=17)))

    fig.update_annotations(
        font = dict(size=14))

    fig.show()

In [28]:
def DistributionPlot(Df, PlotVar, BinSize=3.5):
    '''
    Plots the distribution of a given variable in a dataframe
    Df is the dataframe the data is pulled from
    PlotVar is the column whose values are plotted
    BinSize is the histogram bin width passed to create_distplot (defaults to 3.5)
    The figure is displayed with fig.show(); nothing is returned.
    '''
    PlotTitle = str(PlotVar) + " Distribution"

    #create_distplot takes a list of samples with one label per sample;
    #only a single sample is plotted here
    fig = ff.create_distplot(
            hist_data = [Df[PlotVar].values.tolist()]
            , group_labels = [PlotTitle]
            , bin_size=BinSize)

    fig.update_xaxes(
        zeroline = True
        , showgrid = True
        , title=PlotVar)

    fig.update_yaxes(
        zeroline=True
        , showgrid=True
        , title="Distribution")

    fig.update_layout(
        title = dict(text=PlotTitle
                     , font=dict(color="Black", size=20))
        , font = dict(color="Black", size=10)
        , height = 700
        , width = 1100
        , legend_title='Legend')

    fig.show()

In [37]:
def Plot(Df, Y, Hue):

    '''
    Generates a split violin plot with data pulled from a specified dataframe.
    Df must contain an "All" column, which is used as the shared x position of both halves
    Y is the dependent variable; its values are multiplied by 1000 when the name contains "Income"
    Hue is a binary (1 / 0) column that splits Y into two populations
    (can be Gender, Retired etc.). Hue = 0 is drawn on the positive side, Hue = 1 on the negative side.
    Y axis ticks get a "%" suffix for variables listed in PerVars and a "$" prefix for variables listed in MonVars.
    The figure is displayed with fig.show(); nothing is returned.
    '''

    Multiplier = 1000 if "Income" in Y else 1

    fig = go.Figure()

    for Level, Side in ((0, "positive"), (1, "negative")):
        InGroup = Df[Hue] == Level
        fig.add_trace(
            go.Violin(
                #x is filtered with the same mask as y so both arrays have the same length
                x = Df.loc[InGroup, "All"]
                , y = Df.loc[InGroup, Y]*Multiplier
                , name = Hue + " = " + str(Level)
                , side = Side
                , showlegend = True
                , box = dict(visible=True)
                , meanline = dict(visible=True)))

    fig.update_xaxes(
        zeroline = True
        , showgrid = True
        , showticklabels = False)

    fig.update_yaxes(
        zeroline=True
        , showgrid=True
        , ticksuffix = "%" if Y in PerVars else ""
        , tickprefix = "$" if Y in MonVars else ""
        , title=Y)

    fig.update_layout(
        title = dict(text=Y+" Split By "+ Hue)
        , height = 600
        , width = 900)

    fig.update_annotations(
        font = dict(size=14))

    fig.show()

In [38]:
def InertiaPlot(DataFrame, Vars, MaxK=9):

    '''
    Fits a KMeansCluster model for every K from 1 to MaxK and plots the inertia of each model,
    allowing the user to find the optimal K.
    DataFrame is the dataframe the data is pulled from
    Vars is the list of columns to cluster on
    MaxK is the largest K tried (defaults to 9)
    The figure is displayed with fig.show(); nothing is returned.
    '''

    #One list of K values drives both the fitting loop and the plot axes,
    #so every inertia value is plotted against the K that produced it
    Ks = list(range(1, MaxK + 1))

    Inertia = {}

    for N in Ks:
        #First argument is presumably the maximum number of iterations - TODO confirm against KMeansCluster
        Mod = KMeansCluster(300, N)

        Mod = Mod.fit(DataFrame[Vars])

        #The predictions are not used here. NOTE(review): the call is kept in case
        #KMeansCluster only sets Inertia during Predict - confirm before removing
        Mod.Predict(DataFrame[Vars])

        Inertia[N] = Mod.Inertia

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x = Ks
            , y=[Inertia[K] for K in Ks]
            , legendgroup="Inertia"
            , name="Inertia"
            , mode='lines+markers'
            , marker=dict(size=10, opacity=0.9)
            , showlegend= True))

    fig.update_xaxes(
        zeroline = True
        , showgrid = True
        , title = "K")

    fig.update_yaxes(
        zeroline=True
        , showgrid=True
        , title="Inertia")

    fig.update_layout(
        title = dict(text="Inertia vs. K", font=dict(size=17)))

    fig.update_annotations(
        font = dict(size=14))

    fig.show()

## EDA

In [39]:
#Small number of data points (200 rows): AvAge = 39, AvInc = 60 (k$), AvSpen = 50%
#44% of population is Male (Gender is encoded Male = 1 and its mean is 0.44), so 56% is Female
Data.describe()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore,All,Clusters
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,100.5,0.44,38.85,60.56,50.2,1.0,2.69
std,57.879185,0.497633,13.969007,26.264721,25.823522,0.0,1.576811
min,1.0,0.0,18.0,15.0,1.0,1.0,1.0
25%,50.75,0.0,28.75,41.5,34.75,1.0,1.0
50%,100.5,0.0,36.0,61.5,50.0,1.0,3.0
75%,150.25,1.0,49.0,78.0,73.0,1.0,4.0
max,200.0,1.0,70.0,137.0,99.0,1.0,5.0


In [40]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
CustomerID       200 non-null int64
Gender           200 non-null int64
Age              200 non-null int64
AnnualIncome     200 non-null int64
SpendingScore    200 non-null int64
All              200 non-null float64
Clusters         200 non-null int64
dtypes: float64(1), int64(6)
memory usage: 11.0 KB


In [41]:
#Can see a clear group forming in the center where Annual income ($) is similar to spending score (%)
scatter(Data, "AnnualIncome", "Gender", "SpendingScore", "Annual Income vs. Spending Score for Male (1) and Female (0)")

In [42]:
#Surprising to see most of the higher earners are below 40
scatter(Data, "AnnualIncome", "Gender", "Age", "Annual Income vs. Age for Male (1) and Female (0)")

In [43]:
#Sharp fall-off in the number of incomes above 80k
DistributionPlot(Data, "AnnualIncome")

In [44]:
#Positively skewed in terms of age
DistributionPlot(Data, "Age")

In [45]:
#Very uneven distribution for spending score
DistributionPlot(Data, "SpendingScore")

In [46]:
#Male population is slightly older than Female
Plot(Data, "Age", "Gender")

In [47]:
#Male population earning more than Female population by $1,000 or so
Plot(Data, "AnnualIncome", "Gender")

In [48]:
#Female population have a marginally higher spending score than Male
Plot(Data, "SpendingScore", "Gender")

## Modelling

## Annual Income and Spending Score

In [18]:
#Inertia vs. K when clustering on income and spending score
#NOTE(review): the recorded output shows numpy "Mean of empty slice" / "invalid value" warnings -
#presumably a cluster ended up with no points inside KMeansCluster; confirm
InertiaPlot(Data, ["AnnualIncome", "SpendingScore"])


Mean of empty slice.


invalid value encountered in true_divide



In [19]:
#Fit the custom K-Means with 4 clusters on income and spending score and label every customer
ClusterVars = ["AnnualIncome", "SpendingScore"]
Mod = KMeansCluster(500, 4)
Mod = Mod.fit(Data[ClusterVars])
Data["Clusters"] = Mod.Predict(Data[ClusterVars])


fig = go.Figure()

#One marker trace per cluster label, in ascending label order
for i in Data["Clusters"].sort_values().unique():
    InCluster = Data[Data["Clusters"] == i]
    fig.add_trace(
        go.Scatter(
            x = InCluster["AnnualIncome"]*1000
            , y = InCluster["SpendingScore"]
            , legendgroup="Group" + " = " + str(i)
            , name="Group" + " = " + str(i)
            , mode='markers'
            , marker=dict(size=10, opacity=0.9)
            , showlegend= True))


#Centroids are overlaid in black; element 0 is plotted as income and element 1 as spending score
#(assumes centroid coordinates follow the column order passed to fit - TODO confirm)
fig.add_trace(
    go.Scatter(
        x = [Centroid[0]*1000 for Centroid in Mod.Centroid]
        , y = [Centroid[1] for Centroid in Mod.Centroid]
        , name = "Centroid"
        , mode='markers'
        , marker=dict(size=15, opacity=0.9, color="Black")
        , showlegend= True))


#Income is shown in dollars, matching the "$" prefix used by the other plots in this notebook
fig.update_xaxes(
    zeroline = True
    , showgrid = True
    , title = "AnnualIncome"
    , tickprefix="$")

fig.update_yaxes(
    zeroline=True
    , showgrid=True
    , title="SpendingScore"
    , ticksuffix="%")


fig.update_layout(
    title = dict(text="Clusters", font=dict(size=17)))

fig.update_annotations(
    font = dict(size=14))

fig.show()

## Annual Income, Spending Score and Age

In [20]:
#Inertia vs. K again, now with Age as a third clustering variable
InertiaPlot(Data, ["AnnualIncome", "SpendingScore", "Age"])

In [21]:
#Five-cluster fit of the custom K-Means on income, spending score and age;
#the labels overwrite the Clusters column of Data
Vars3D = ["AnnualIncome", "SpendingScore", "Age"]
Mod = KMeansCluster(500, 5).fit(Data[Vars3D])
Data["Clusters"] = Mod.Predict(Data[Vars3D])

#3-D scatter of the customers, coloured by cluster label
fig = px.scatter_3d(
    Data
    , x='AnnualIncome'
    , y='SpendingScore'
    , z='Age'
    , color='Clusters'
    , title="Clusters")
fig.show()

## Test against scikit-learn's K-Means

In [23]:
#Reference fit with scikit-learn's KMeans on the same three variables.
#random_state makes the run reproducible; n_init is set explicitly because
#its default differs between scikit-learn versions.
SklearnVars = ["AnnualIncome", "SpendingScore", "Age"]
Mod = KMeans(n_clusters=5, max_iter=500, n_init=10, random_state=42).fit(Data[SklearnVars])

#Separate frame so the Clusters column of Data (from the custom model) is not overwritten
Test = Data[SklearnVars].copy()
Test["Clusters"] = Mod.predict(Data[SklearnVars])



fig = px.scatter_3d(Test, x='AnnualIncome', y='SpendingScore', z='Age',
              color='Clusters', title="Clusters")
fig.show()

In [24]:
#Inertia of the scikit-learn model fitted in the cell above
Mod.inertia_

75350.77917248776