# Scikit-learn Clustering the Walmart data

Let's look at a clustering example in Scikit-learn

Here, we are going to load the walmart dataset. 

In [1]:
%matplotlib inline
import pandas as pd
from sklearn.cluster import KMeans


In [2]:
dataset = pd.read_csv("/data/walmart-triptype/train-transformed.csv.gz")
dataset

Unnamed: 0,VisitNumber,TripType,Weekday,NumItems,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,5,999,5,-1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,30,5,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,26,5,28,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,8,5,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,8,5,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,11,35,5,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,12,41,5,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,15,21,5,9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,17,6,5,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,19,42,5,9,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating Vectors

We'll again use the VectorAssembler class to create features from the data..

In [3]:
columns = dataset.columns
columns = columns.drop('VisitNumber')
columns = columns.drop('TripType')

features_v = dataset[columns]
features_v

Unnamed: 0,Weekday,NumItems,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,5,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,5,28,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,5,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5,9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Step 3: Running Kmeans

We know there are 39 triptypes.  So that makes a good "natural" value of k.

**=> TODO: set K = 39, and then run kmeans

In [5]:
k = 39
cluster = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)
kmeans = cluster.fit(features_v)
kmeans.cluster_centers_


array([[  1.97751913e+00,   1.21930804e+00,   1.48198342e-01, ...,
         -3.18877551e-04,   1.21970663e-02,   1.84151786e-02],
       [  4.47101449e+00,   4.29855072e+01,   1.92028986e-01, ...,
          3.26086957e-02,   7.64492754e-01,   3.62318841e-02],
       [  4.47414741e+00,   1.78294829e+01,   9.13091309e-02, ...,
          2.20022002e-03,   6.71067107e-02,   1.10011001e-02],
       ..., 
       [  4.57673267e+00,   3.50544554e+01,   1.85643564e-01, ...,
          4.95049505e-03,   1.21287129e-01,   2.47524752e-03],
       [  4.52941176e+00,   3.16974790e+01,   1.38655462e-01, ...,
          2.10084034e-02,   1.97478992e-01,   4.20168067e-03],
       [  4.58669834e+00,   2.22755344e+01,   8.78859857e-02, ...,
          4.75059382e-03,   4.03800475e-02,   7.12589074e-03]])

In [7]:
clustered_v = kmeans.predict(features_v)
clustered_v
dataset['cluster'] = clustered_v

Let's take a look at the transformed dataset.  let's look at a distribution of our transformed dataset

In [6]:
# TODO: Do a group by 'cluster', and sort by cluster, and count number in each cluster.
histogram = ???
histogram

SyntaxError: invalid syntax (<ipython-input-6-6d049604aa04>, line 3)

In [None]:
# TODO: Show a pandas bar plot for the histogram.

**=> Do a matplotlib or Pandas plot showing the histogram**

In [None]:
# TODO : Do a bar plot
histogram ????  # Show bar plot illustrating histogram

## Step 4: Relate Cluster Numbers to Trip Types

Is there a relationship here? Discuss the results.

Remember, clustering is trying to find "natural" patterns -- it is not a classifier, and if we are trying to classify trip type we should use a classification algorithm and not k-means.

In [None]:

for i in (range(0,38)):
    print('Cluster #' + str(i) + ':')
    # TODO Count and print the top 5 trip types for each cluster. 

**=> TODO Analyze and discuss your results  Does there appear to be a relationship between cluster and triptype**

**=> TODO: Are there any outliers that don't seem to fit into any cluster? Look at the outliers**