Data from here: http://archive.ics.uci.edu/ml/datasets/online+retail#

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import os

ModuleNotFoundError: No module named 'skopt'

## explore

In [None]:
# Load the Online Retail workbook from the working directory into a DataFrame
# and preview its first rows.
raw = pd.read_excel('online_retail.xlsx')
raw.head()

In [None]:
# (rows, columns) of the raw transaction table.
raw.shape

In [None]:
# Distinct CustomerID values (a missing ID counts as one value) per row of raw.
raw['CustomerID'].nunique(dropna=False) / len(raw)

In [None]:
# Summary statistics for the columns of raw that describe() covers by default.
raw.describe()

In [None]:
# Distinct Description values (a missing description counts as one value) per row of raw.
raw['Description'].nunique(dropna=False) / len(raw)

In [None]:
# Preview the transactions for one specific product description.
is_lantern = raw['Description'] == 'WHITE METAL LANTERN'
raw.loc[is_lantern].head()

In [None]:
# Column dtypes as inferred by read_excel.
raw.dtypes

In [None]:
# Earliest invoice timestamp in the data.
raw['InvoiceDate'].min()

In [None]:
# Latest invoice timestamp in the data.
raw['InvoiceDate'].max()

In [None]:
# Distinct Country values, in order of first appearance.
pd.unique(raw['Country'])

## build rankings matrix

should this be the spec?:

|user|countryonehot|item1|item2|...|itemN|
|---|---|---|---|---|---|
|user1|cntry|numItem1|numItem2|...|numItemN|
|user2|cntry|numItem1|numItem2|...|numItemN|
|.|.|.|.|...|.|
|.|.|.|.|...|.|
|.|.|.|.|...|.|
|userM|cntry|numItem1|numItem2|...|numItemN|

In [None]:
# Count invoice rows per (customer, stock code) pair, then pivot the stock
# codes into columns so that each customer becomes a single row.
purchase_counts = raw.groupby(['CustomerID', 'StockCode'])['InvoiceNo'].count()
rankings_matrix = purchase_counts.unstack().reset_index()
rankings_matrix.head()

In [None]:
# (customers, CustomerID column + one column per stock code).
rankings_matrix.shape

In [None]:
# Build a CustomerID -> Country lookup with one row per known customer.
#
# Rows without a CustomerID are removed by value (dropna) rather than by
# deleting a hard-coded position from the unique() array, which silently breaks
# as soon as the input rows change. The first Country recorded for each
# customer is taken with drop_duplicates(keep='first'), which gives the same
# answer as filtering `raw` once per customer but in a single pass.
new = (
    raw.dropna(subset=['CustomerID'])
       .drop_duplicates(subset='CustomerID', keep='first')[['CustomerID', 'Country']]
       .reset_index(drop=True)
)

# Kept as arrays under their original names in case other cells rely on them.
customers = new['CustomerID'].values
countries = new['Country'].values

new.head()

In [None]:
# Attach each customer's country, then turn every missing cell (items the
# customer never bought) into 0.
rankings_matrix = (
    rankings_matrix
    .merge(new, how='left', on='CustomerID')
    .fillna(value=0.)
)
rankings_matrix.head()

In [None]:
# Replace the Country column with one indicator column per country value.
rankings_matrix = pd.get_dummies(rankings_matrix, columns=['Country'])
rankings_matrix.head()

In [None]:
# Measure how much of the feature matrix is populated. CustomerID is an
# identifier rather than a feature, so it is excluded first.
temp = rankings_matrix.drop(['CustomerID'], axis=1)

# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
nonz = np.flatnonzero(temp.values)

# NOTE: the number printed is the percentage of NON-zero cells (the density);
# the label is kept unchanged so the cell's output stays comparable.
print('sparsity = ', 100 * nonz.shape[0] / (temp.shape[0] * temp.shape[1]), '%')

In [None]:
# Persist the customer x feature matrix to the working directory, without the
# row index.
rankings_matrix.to_csv('rankings.csv', index=False)

## cluster

In [None]:
# 20 clusters, keeping the best of 50 random initialisations.
# n_jobs is no longer passed: KMeans deprecated it in scikit-learn 0.23 and
# removed it in 1.0, where passing it raises TypeError.
# random_state pins the initialisation so the cluster labels (and every plot
# coloured by them) are reproducible across kernel restarts.
km = KMeans(n_clusters=20, n_init=50, random_state=0)

In [None]:
%%time

km.fit(rankings_matrix)

rankings_matrix['cluster'] = km.labels_

os.system("spd-say 'Your calculations are complete.' -r -50 -t female3")

In [None]:
# Distribution of cluster sizes (number of customers per cluster).
rankings_matrix.groupby('cluster')['CustomerID'].count().describe()

In [None]:
# Number of customers in each cluster, ordered by cluster label.
cluster_counts = list(rankings_matrix.groupby('cluster')['CustomerID'].count())

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(range(len(cluster_counts)), cluster_counts)
ax.grid()

plt.show()

In [None]:
# Preview the matrix, which now carries the 'cluster' label column.
rankings_matrix.head()

## covariance

In [None]:
# Feature-only view: drop the identifier and the cluster label; the original
# frame is left untouched.
non_feature_columns = ['cluster', 'CustomerID']
vals = rankings_matrix.drop(columns=non_feature_columns)

vals.shape

In [None]:
# Covariance between feature columns (items): np.cov treats each row of its
# input as one variable, so the user x item matrix is transposed first.
# Passing `vals` untransposed would give the user x user covariance instead.
by_item = np.transpose(vals)
cov = np.cov(by_item)

cov.shape

In [None]:
# Scalar summaries taken over every entry of the covariance matrix.
for label, statistic in (
    ('mean:', np.mean),
    ('std:', np.std),
    ('max: ', np.amax),
    ('min: ', np.amin),
):
    print(label, statistic(cov))

In [None]:
# Magnitude of each covariance, rescaled row by row with sklearn's normalize
# (default settings). `cov` itself is not modified.
# (Zeroing entries above a 0.1 threshold before normalising was tried and is
# currently disabled.)
temp = normalize(np.abs(cov))

for label, statistic in (
    ('mean:', np.mean),
    ('std:', np.std),
    ('max: ', np.amax),
    ('min: ', np.amin),
):
    print(label, statistic(temp))

In [None]:
# Heat map of the normalised absolute covariance matrix.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap = ax.imshow(temp, cmap='viridis')
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Covariance of the feature at position 3000 against every other feature.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(cov[3000])

plt.show()

## PCA viz

In [None]:
# (customers, features) going into PCA.
vals.shape

In [None]:
%%time
pca = PCA(n_components=2)

# items
#components = pca.fit_transform(np.transpose(vals))

# users
components = pca.fit_transform(vals)
print(components.shape)

In [None]:
# Users in the first two principal components, coloured by k-means cluster.
fig, ax = plt.subplots(figsize=(10, 10))

points = ax.scatter(
    components[:, 0],
    components[:, 1],
    c=rankings_matrix['cluster'],
    marker='.',
)
# Fixed window on the plot; points outside these limits are not shown.
ax.set_xlim(-5, 50)
ax.set_ylim(-20, 10)

fig.colorbar(points, ax=ax)
plt.show()