In [1]:
# import block
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance

import time

from coding_1 import *

## The Dataset

In [2]:
# import data
records = pd.read_csv("Ecommerce.csv")
print("shape of records is", records.shape)
records.head()

shape of records is (541909, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 8
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom,
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom,
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom,
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom,
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom,


In [3]:
# get rid of meaningless columns
# drop lines with NaN values
records_small = records[["CustomerID", "Quantity", "UnitPrice", "Country"]].dropna()
records_small["CustomerID"] = records_small["CustomerID"].astype(int)

# calculate total pay for each invoice
records_small["Total"] = records_small["Quantity"] * records_small["UnitPrice"]
records_small

# convert the country column to numerical
records_small["Country"] = pd.Categorical(records_small["Country"])
records_small["Country"] = records_small["Country"].cat.codes

print("shape of records_small is", records_small.shape)
records_small.head(10)

shape of records_small is (406829, 5)


Unnamed: 0,CustomerID,Quantity,UnitPrice,Country,Total
0,17850,6,2.55,35,15.3
1,17850,6,3.39,35,20.34
2,17850,8,2.75,35,22.0
3,17850,6,3.39,35,20.34
4,17850,6,3.39,35,20.34
5,17850,2,7.65,35,15.3
6,17850,6,4.25,35,25.5
7,17850,6,1.85,35,11.1
8,17850,6,1.85,35,11.1
9,13047,32,1.69,35,54.08


In [4]:
# get the total money each customer spent and the total quantity each customer bought

# code consultation:
# https://stackoverflow.com/questions/49783178/python-keep-other-columns-when-using-sum-with-groupby

customer = records_small.groupby(["CustomerID"], as_index=False)['Quantity', 'Total', 'Country'].agg({'Quantity': 'sum', 'Total': 'sum', 'Country': 'first'})
print("shape of customer is", customer.shape)
customer.head(10)

shape of customer is (4372, 4)
  customer = records_small.groupby(["CustomerID"], as_index=False)['Quantity', 'Total', 'Country'].agg({'Quantity': 'sum', 'Total': 'sum', 'Country': 'first'})


Unnamed: 0,CustomerID,Quantity,Total,Country
0,12346,0,0.0,35
1,12347,2458,4310.0,16
2,12348,2341,1797.24,12
3,12349,631,1757.55,18
4,12350,197,334.4,24
5,12352,470,1545.41,24
6,12353,20,89.0,2
7,12354,530,1079.4,30
8,12355,240,459.4,2
9,12356,1591,2811.43,26


In [5]:
# convert the dataset to a numpy array for later use
customer_np = customer[['Quantity', 'Total', 'Country']].to_numpy()
customer_np

array([[   0.  ,    0.  ,   35.  ],
       [2458.  , 4310.  ,   16.  ],
       [2341.  , 1797.24,   12.  ],
       ...,
       [  98.  ,  176.6 ,   35.  ],
       [1397.  , 2094.88,   35.  ],
       [1586.  , 1837.28,   35.  ]])

## Looping KMeans and Efficacy Comparison

The hyperparameter here is k, the number of centers. I will find the best k through elbowology. I want to compare the efficacy of my KMeans implementation and that of KMeans in sklearn. Therefore, I will implement a looping-KMeans from k=1 to k=16 first with my KMeans and then with sklearn's KMeans and compare the their tuntimes.

In [6]:
start_time = time.time()

# looping k-means with implementation from sklearn
SSE_list_sklearn = looping_kmeans_sklearn(customer_np, range(2, 16))

# Stop the clock and determine the length of time
stop_time = time.time()

print("This took %s seconds to run" % (stop_time - start_time))

This took 5.720094442367554 seconds to run


In [7]:
start_time = time.time()

# looping k-means with my implementation of KMeans
SSE_list_my_kmeans = looping_kmeans_my(customer_np, range(2, 16))

# Stop the clock and determine the length of time
stop_time = time.time()

print("This took %s seconds to run" % (stop_time - start_time))

This took 22.070647954940796 seconds to run


## Elbowology: Choosing the Best k

## Conclusion