In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import datetime as dt

from src import config

### Combine the datasets into one master dataframe

In [None]:
# Load the raw orders table.
orders_df = pd.read_csv(config.RAW_FILE_PATH / 'olist_orders_dataset.csv')

# Drop orders whose status mentions "unavailable" or "canceled".
# na=True flags a missing status as excluded too, which matches what the
# previous `== False` comparison did (NaN == False evaluates to False).
is_excluded = orders_df['order_status'].str.contains('unavailable|canceled', na=True)
orders_df = orders_df.loc[~is_excluded]

In [None]:
# Load the raw payments table.
payments_df = pd.read_csv(config.RAW_FILE_PATH / 'olist_order_payments_dataset.csv')

# An order can be settled through several payment rows. Simply dropping
# duplicate order_ids would keep only the first row and silently discard the
# rest of the amount paid, understating payment_value. Collapse to one row per
# order instead: total payment_value and keep the first value of every other
# column, so the frame keeps its columns and stays unique on order_id for the
# later merge. sort=False keeps orders in first-appearance order.
payment_agg = {col: 'first' for col in payments_df.columns if col != 'order_id'}
payment_agg['payment_value'] = 'sum'
payments_df = payments_df.groupby('order_id', as_index=False, sort=False).agg(payment_agg)

In [None]:
# Load the remaining raw tables: order items, products, and the category-name
# translation table.
raw_dir = config.RAW_FILE_PATH
order_items_df = pd.read_csv(raw_dir / 'olist_order_items_dataset.csv')
products_df = pd.read_csv(raw_dir / 'olist_products_dataset.csv')
product_cat_df = pd.read_csv(raw_dir / 'product_category_name_translation.csv')

In [None]:
# Join orders -> payments -> order items -> products -> category translation.
# merge() defaults to an inner join, so a row survives only when its key is
# present in every table.
master_df = (
    orders_df
    .merge(payments_df, on='order_id')
    .merge(order_items_df, on='order_id')
    .merge(products_df, on='product_id')
    .merge(product_cat_df, on='product_category_name')
)

# Reduce the purchase timestamp to a calendar date (datetime.date objects).
purchase_timestamps = pd.to_datetime(master_df['order_purchase_timestamp'])
master_df['order_purchase_timestamp'] = purchase_timestamps.dt.date

# Cell output: earliest and latest purchase date in the merged data.
master_df['order_purchase_timestamp'].min(), master_df['order_purchase_timestamp'].max()

In [None]:
# subset to only 1 year of transactions: keep purchases made strictly after
# the cutoff date
cutoff_date = dt.date(2017, 9, 3)
is_after_cutoff = master_df['order_purchase_timestamp'] > cutoff_date
master_df = master_df.loc[is_after_cutoff]

In [None]:
# create dataframe for customers and product category
# NOTE: in the Olist order-items schema, order_item_id is the item's sequence
# number within its order (1, 2, 3, ...). Summing it overstates the
# interaction (three items would give 1 + 2 + 3 = 6), so count the item rows
# instead. The column keeps the name 'order_item_id' for downstream code but
# now holds the number of items bought per customer and category.
customer_category_df = (
    master_df
    .groupby(['customer_id', 'product_category_name_english'])['order_item_id']
    .agg('count')
    .reset_index()
)

# create dataframe for clustering
# one row per customer, one column per product category, 0 where the customer
# bought nothing from that category
df = pd.pivot_table(customer_category_df,
                    values='order_item_id',
                    index='customer_id',
                    columns='product_category_name_english',
                    fill_value=0)

# amount spent per customer
# payment_value comes from the per-order payments table and is therefore
# repeated on every item row of master_df. Keep one row per order before
# summing, otherwise an order with n items is counted n times.
customer_spent_df = (
    master_df
    .drop_duplicates(subset=['customer_id', 'order_id'])
    .groupby('customer_id')['payment_value']
    .agg('sum')
    .reset_index()
)
df = df.merge(customer_spent_df, on='customer_id')

In [None]:
# visualise outliers: box plot of customer spend, restricted to customers
# whose total payment_value is below 500
spent_under_500 = customer_spent_df['payment_value'] < 500
plt.boxplot(customer_spent_df.loc[spent_under_500, 'payment_value'])

In [None]:
# percentage of customers who spent below $500
n_below_500 = int((customer_spent_df['payment_value'] < 500).sum())
n_below_500 / len(customer_spent_df) * 100

Since 93% of the customers spent less than $500, we will focus on this group for clustering. (Too many outliers may hinder the clustering steps.)

In [None]:
# exclude customers who spend more than 500 to avoid poor clustering in kmeans
# (the comparison is strict, so a spend of exactly 500 is excluded as well).
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['payment_value'] < 500].copy()

# z-score the spend so it is centred on 0 with unit sample standard deviation
df['payment_standardised'] = (df['payment_value'] - df['payment_value'].mean()) / df['payment_value'].std()

# Index by customer. The check keeps the cell re-runnable: on a second run
# customer_id is already the index and set_index would raise a KeyError.
if df.index.name != 'customer_id':
    df = df.set_index('customer_id')

# feature matrix for clustering: category columns plus the standardised spend
# (the raw payment_value stays available on df)
final_df = df.drop(columns=['payment_value'])

### Dimensionality reduction with PCA

In [None]:
# Fit a PCA with no component limit; its explained-variance ratios are used
# to decide how many components to keep.
pca = PCA().fit(final_df)
pca

In [None]:
# Cumulative share of variance explained by the first k principal components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Take the x-axis length from the fitted PCA rather than a fixed number, so
# the plot still works when the number of features changes.
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10,5))
plt.plot(component_counts, cumulative_variance, marker = 'o', linestyle = '--')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')

Since 11 components can explain about 80% of the variance, we will use 11 principal components for the next step.

In [None]:
# Project the feature matrix onto 11 principal components; the seed is fixed
# for reproducibility.
pca = PCA(
    n_components=11,
    random_state=42,
)
X_pca = pca.fit_transform(final_df)

### Performing KMeans

In [None]:
# Fit KMeans on the PCA-transformed data for every cluster count from 2 to 29
# and record the inertia of each fit.
cluster_counts = range(2, 30)
inertia = []
for n_clusters in cluster_counts:
    kmeans_pca = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    kmeans_pca.fit(X_pca)
    inertia.append(kmeans_pca.inertia_)

# Inertia against number of clusters, to look for an elbow.
plt.figure(figsize=(10, 10))
plt.plot(cluster_counts, inertia, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')

Since no obvious "elbow" is observed in the inertia plot, we will use 15 clusters.

In [None]:
# Final KMeans model: 15 clusters on the PCA-transformed data, fixed seed.
kmeans_pca = KMeans(
    n_clusters=15,
    init='k-means++',
    random_state=42,
).fit(X_pca)
kmeans_pca

### Results from KMeans

In [None]:
# examine the category of products bought by customers in each cluster
# reset_index() already returns a new frame, so no extra deep copy is needed.
kmeans_df = df.reset_index()
kmeans_df['cluster'] = kmeans_pca.labels_

# Take the cluster count from the fitted model so this loop stays in step
# with the KMeans configuration instead of repeating the number.
for i in range(kmeans_pca.n_clusters):
    print('-'*40)
    print('Cluster:',i)
    print('-'*40)
    # customers assigned to this cluster, then their customer-category rows
    clusterx_customers = kmeans_df.loc[kmeans_df['cluster'] == i, 'customer_id'].unique().tolist()
    clusterx_products = customer_category_df[customer_category_df['customer_id'].isin(clusterx_customers)]

    # number of customers in the cluster who bought from each category
    print(clusterx_products['product_category_name_english'].value_counts())

Clusters 1, 3, 4, and 6 have a higher percentage of products from multiple categories, unlike the remaining clusters, where the majority of customers bought from a single product category. Further analysis will be performed on these clusters.

In [None]:
# For each selected KMeans cluster, list the five most common category
# combinations among customers who bought from more than one category.
clusters = [1, 3, 4, 6]
for c in clusters:
    # customer-category rows for the customers in this cluster
    in_cluster = kmeans_df['cluster'] == c
    customer_list = kmeans_df.loc[in_cluster, 'customer_id'].unique().tolist()
    clusterx_df = customer_category_df[customer_category_df['customer_id'].isin(customer_list)]

    # subset to find customers who bought from more than 1 category
    subset_df = (
        clusterx_df
        .groupby(['customer_id'])['product_category_name_english']
        .agg('count')
        .reset_index(name="count")
    )
    subset_list = subset_df.loc[subset_df['count'] > 1, 'customer_id'].unique()

    # one comma-separated category string per customer, then the number of
    # customers sharing each string, most frequent first
    multi_c_df = (
        customer_category_df[customer_category_df['customer_id'].isin(subset_list)]
        .rename(columns={'product_category_name_english': 'product_category'})
        .groupby('customer_id')
        .agg({'product_category': ', '.join})
        .reset_index()
        .groupby('product_category')['customer_id']
        .agg('count')
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    print('-'*60)
    print('Cluster:', c)
    print('-'*60)
    print(multi_c_df.head(5))
    print(" ")

In [None]:
# Mean payment_value of the customers in each selected KMeans cluster,
# rounded to two decimals.
print('Average spending per cluster')

clusters = [1, 3, 4, 6]
for c in clusters:
    clusterx = kmeans_df.loc[kmeans_df['cluster'] == c]
    mean_spend = clusterx['payment_value'].mean()
    print('Cluster {0}: ${1}'.format(c, round(mean_spend, 2)))
    

### Performing DBSCAN 

In [None]:
# Cluster the PCA-transformed data with DBSCAN at its default settings.
dbs = DBSCAN()
dbs.fit(X_pca)
labels = dbs.labels_

# Label -1 is treated as noise: leave it out of the cluster count and count
# the points carrying it separately.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_ = int((labels == -1).sum())

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

### Results from DBSCAN

In [None]:
# examine the category of products bought by customers in each cluster
# reset_index() already returns a new frame, so no extra deep copy is needed.
dsb_df = final_df.reset_index()
dsb_df['cluster'] = dbs.labels_

# Iterate over the cluster labels DBSCAN actually produced (noise label -1
# excluded) rather than a fixed count, so the loop follows the fitted result
# when the data or DBSCAN parameters change.
dbscan_cluster_ids = [int(label) for label in np.unique(dbs.labels_) if label != -1]
for i in dbscan_cluster_ids:
    print('-'*40)
    print('Cluster:',i)
    print('-'*40)
    # customers assigned to this cluster, then their customer-category rows
    clusterx_customers = dsb_df.loc[dsb_df['cluster'] == i, 'customer_id'].unique().tolist()
    clusterx_products = customer_category_df[customer_category_df['customer_id'].isin(clusterx_customers)]

    # number of customers in the cluster who bought from each category
    print(clusterx_products['product_category_name_english'].value_counts())

In [None]:
# For the selected DBSCAN cluster, list the five most common category
# combinations among customers who bought from more than one category.
cluster = [0]
for c in cluster:
    # customer-category rows for the customers in this cluster
    in_cluster = dsb_df['cluster'] == c
    customer_list = dsb_df.loc[in_cluster, 'customer_id'].unique().tolist()
    clusterx_df = customer_category_df[customer_category_df['customer_id'].isin(customer_list)]

    # subset to find customers who bought from more than 1 category
    subset_df = (
        clusterx_df
        .groupby(['customer_id'])['product_category_name_english']
        .agg('count')
        .reset_index(name="count")
    )
    subset_list = subset_df.loc[subset_df['count'] > 1, 'customer_id'].unique()

    # one comma-separated category string per customer, then the number of
    # customers sharing each string, most frequent first
    multi_c_df = (
        customer_category_df[customer_category_df['customer_id'].isin(subset_list)]
        .rename(columns={'product_category_name_english': 'product_category'})
        .groupby('customer_id')
        .agg({'product_category': ', '.join})
        .reset_index()
        .groupby('product_category')['customer_id']
        .agg('count')
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    print('-'*60)
    print('Cluster:', c)
    print('-'*60)
    print(multi_c_df.head(5))
    print(" ")