In [110]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib notebook 

In [134]:
# Load the mall-customer table; header='infer' takes column names from the file's first row.
data = pd.read_csv(
    '/Users/mac/Desktop/Supermarket Work/Mall_Customers.csv',
    header='infer',
)
data.head(6)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76


In [135]:
# Count missing entries per column (the saved output shows zero everywhere).
data.isna().sum()

CustomerID                0
Genre                     0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [136]:
#data.CustomerID.nunique(dropna=True) We're going to rerun with gender and customer id column 

In [117]:
# Dimensions of the frame as (rows, columns).
# NOTE(review): the saved output of this cell comes from an earlier run
# (execution count 117 sits between 136 and 137); re-run to refresh it.
data.shape

(200, 4)

In [137]:
# Per-column dtypes; Genre is the only non-numeric (object) column in the saved output.
data.dtypes

CustomerID                 int64
Genre                     object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

In [138]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [139]:
# Genre has only two distinct values; instead of discarding it, expand it
# into indicator columns (one per value) and append them to the frame.
# NOTE(review): re-running this cell appends the indicator columns again.
dummy = pd.get_dummies(data['Genre'])
data = pd.concat(
    [data, dummy],
    axis=1,
)
data.head(2)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),Female,Male
0,1,Male,19,15,39,0,1
1,2,Male,21,15,81,0,1


In [140]:
# The text column is no longer needed once the indicator columns exist.
data.drop(columns='Genre', inplace=True)

In [141]:
# Check the frame after the Genre column has been removed.
data.head(2)

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Female,Male
0,1,19,15,39,0,1
1,2,21,15,81,0,1


In [142]:
# Standardization: rescale every column of `data` to zero mean / unit variance.
# NOTE(review): CustomerID is scaled (and later clustered on) along with the
# real features -- confirm that using the identifier as a feature is intended.
from sklearn.preprocessing import StandardScaler

standardizer = StandardScaler()
std_data = standardizer.fit_transform(data)

# Wrap the scaled array back into a frame carrying the original column names.
stdata = pd.DataFrame(
    std_data,
    columns=data.columns,
)

In [143]:
# Summary statistics of the standardized frame (means ~0 in the saved output).
stdata.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Female,Male
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,-6.661338e-18,-9.603429000000001e-17,-6.128431e-16,-1.121325e-16,-8.437695000000001e-17,1.554312e-17
std,1.002509,1.002509,1.002509,1.002509,1.002509,1.002509
min,-1.723412,-1.496335,-1.738999,-1.910021,-1.128152,-0.8864053
25%,-0.861706,-0.7248436,-0.7275093,-0.5997931,-1.128152,-0.8864053
50%,0.0,-0.2045351,0.03587926,-0.007764312,0.8864053,-0.8864053
75%,0.861706,0.7284319,0.6656748,0.8851316,0.8864053,1.128152
max,1.723412,2.235532,2.917671,1.894492,0.8864053,1.128152


In [144]:
from sklearn.cluster import KMeans

In [153]:
## First, use the elbow method to gauge a suitable number of clusters.
# wss maps each candidate k (1..20) to the inertia_ of a KMeans model fitted
# on the standardized data. The per-k labels were assigned to an unused
# variable in the original loop, so that assignment has been removed.
wss = {}
for k in range(1, 21):
    kmeans_loop = KMeans(
        n_clusters=k,
        n_init=20,
        max_iter=200,
        random_state=1234,  # fixed seed so the curve is reproducible
        verbose=0,
    ).fit(stdata)
    wss[k] = kmeans_loop.inertia_  # sum of squared distances to the closest centroid

In [154]:
# Candidate cluster counts that were evaluated (iterating a dict yields its keys).
list(wss)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [155]:
# Inertia recorded for each k, in the order the loop inserted them (k = 1..20).
wss.values()

dict_values([1200.0, 797.2883426603587, 638.6589248711804, 510.57217511559406, 435.44469860374465, 368.9735492181911, 304.31423972380156, 254.41521729352556, 222.6225050023683, 191.37365582186786, 170.1866701513118, 152.23132550285453, 140.51556647543623, 133.34586947506807, 124.36737612494179, 118.07205526230456, 111.97623325989551, 104.37302587367108, 99.25665449015307, 92.52523029279693])

In [156]:
# Elbow curve: within-cluster sum of squares against the number of clusters.
fig, ax = plt.subplots()
ax.plot(list(wss), list(wss.values()))
ax.grid()
ax.set_title('The elbow method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Total within sum of squares')
plt.show()

<IPython.core.display.Javascript object>

In [150]:
##Second using Silhouette score to select best K 
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm 

In [157]:
# Silhouette analysis: one silhouette plot per candidate number of clusters.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; it returns the same NumPy array.
X_matrix = stdata.values

range_n_clusters = list(range(2, 16))  # candidate k: 2, 3, ..., 15

for n_clusters in range_n_clusters:
    fig, ax1 = plt.subplots()

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X_matrix)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X_matrix, cluster_labels)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_matrix, cluster_labels)

    # Stack the clusters vertically: each cluster occupies the band
    # [y_lower, y_upper) and bands are separated by a gap of 10 rows.
    y_lower = 0
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("For %d clusters, silhouette avg coeff = %f " % (n_clusters,silhouette_avg)),
                 fontsize=14, fontweight='bold')
    plt.show()

  """Entry point for launching an IPython kernel.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [60]:
# NOTE(review): the saved output of this cell is from an earlier run
# (execution count 60) and lacks the Female/Male columns; re-run to refresh it.
data.head(2)  ## After re-running with the dummified gender columns we're down to 10 clusters

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
0,1,19,15,39
1,2,21,15,81


In [158]:
# Keep the customer identifiers aside so cluster labels can be re-attached to them later.
ID = data['CustomerID']
ID.head(2)

0    1
1    2
Name: CustomerID, dtype: int64

In [159]:
# Fit the final 10-cluster KMeans model on the standardized data and keep
# the per-row cluster labels.
bestk = KMeans(
    n_clusters=10,
    random_state=1234,
)
best_means = bestk.fit_predict(stdata)

In [160]:
# Pair every customer id with the cluster label assigned to it.
results = pd.DataFrame(
    {
        'CustomerID': ID,
        'Clusters': best_means,
    }
)

In [162]:
## Merge the cluster labels back onto the initial data
info = pd.read_csv(
    '/Users/mac/Desktop/Supermarket Work/Mall_Customers.csv',
    header='infer',
    index_col=0,
)
final = info.merge(results, on='CustomerID')
final.head(5)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),Clusters
0,1,Male,19,15,39,7
1,2,Male,21,15,81,7
2,3,Female,20,16,6,2
3,4,Female,23,16,77,9
4,5,Female,31,17,40,2


In [163]:
# Persist the labelled customers without writing the row index.
final.to_csv('MallClusters10.csv', index=False)

In [166]:
## Save the centroids for deployment
# The model was fitted on standardized data, so its cluster centres are in
# standardized units; inverse_transform maps them back to the original scale.
centroids_original_units = standardizer.inverse_transform(bestk.cluster_centers_)
mall_centroids = pd.DataFrame(
    centroids_original_units,
    columns=stdata.columns,
)
mall_centroids.to_csv("best_mall_cluster_centroids10.csv")

In [167]:
# Display the centroids expressed in the original (un-standardized) units.
mall_centroids

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Female,Male
0,163.333333,32.190476,86.047619,81.666667,1.0,-1.110223e-16
1,158.368421,32.947368,86.052632,81.263158,0.0,1.0
2,23.461538,41.538462,26.538462,20.692308,1.0,-1.110223e-16
3,69.692308,58.115385,48.038462,41.269231,0.0,1.0
4,94.416667,27.833333,58.125,46.541667,1.0,0.0
5,159.5,39.5,85.15,14.05,0.0,1.0
6,80.84,54.08,53.24,49.52,1.0,0.0
7,50.521739,25.043478,38.652174,59.608696,0.0,1.0
8,171.0,44.6,92.333333,21.6,1.0,-1.665335e-16
9,24.285714,25.857143,26.642857,79.142857,1.0,-1.110223e-16


In [168]:
# Number of customers assigned to each KMeans cluster.
final['Clusters'].value_counts()

3    26
6    25
4    24
7    23
0    21
5    20
1    19
8    15
9    14
2    13
Name: Clusters, dtype: int64

In [None]:
#analysis = final.groupby(['Clusters']).mean().reset_index()
#analysis

In [None]:
#Assuming we need to perform Agglomerative Clustering 

In [169]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Build the Ward linkage matrix from the standardized data.
linkage_matrix = linkage(stdata, method='ward', metric='euclidean')

# Draw the dendrogram with customer ids as leaf labels.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; it returns the same NumPy array.
fig, axs = plt.subplots()
dendrogram(linkage_matrix, labels=ID.values)
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

  import sys


In [103]:
from sklearn.cluster import AgglomerativeClustering

## Instantiating object
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed `metric`). Ward linkage only accepts Euclidean distances, which is
# also the default, so omitting the keyword gives the same model on every version.
agg_clust = AgglomerativeClustering(n_clusters=6, linkage='ward')

## Training model and return class labels
agg_clusters = agg_clust.fit_predict(stdata)

## Label - Cluster: one row per customer id, ordered by assigned cluster
agg_result = pd.DataFrame({"CustomerID":ID,"agg_cluster":agg_clusters}).sort_values('agg_cluster')

In [105]:
# First rows of the label table, which was sorted by agg_cluster when it was built.
agg_result.head(3)

Unnamed: 0,CustomerID,agg_cluster
71,72,0
90,91,0
89,90,0


In [101]:
# Number of customers assigned to each agglomerative cluster.
agg_result['agg_cluster'].value_counts()

0    52
3    39
1    35
2    30
5    22
4    22
Name: agg_cluster, dtype: int64

In [109]:
### redo the process with Gender variable but view in tableau shows no need 
### Agglomerative has gender but doesn't change a thing 
# The save used to sit in a cell placed before the one that builds `Aggdone`,
# which raises NameError on a fresh top-to-bottom run. Build the merged frame
# first, then write it out.
info = pd.read_csv('/Users/mac/Desktop/Supermarket Work/Mall_Customers.csv', header='infer', index_col=0)
Aggdone = pd.merge(info,agg_result,on='CustomerID')
Aggdone.to_csv('Aggdone.csv', index=None)

In [108]:
# Preview the customer table with the agglomerative cluster label attached.
Aggdone.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),agg_cluster
0,1,Male,19,15,39,4
1,2,Male,21,15,81,5
2,3,Female,20,16,6,4
3,4,Female,23,16,77,5
4,5,Female,31,17,40,4
