In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from src import config

### Combine datasets into one master dataframe

In [None]:
# Load the raw orders table.
orders_df = pd.read_csv(config.RAW_FILE_PATH / 'olist_orders_dataset.csv')

# Drop unavailable and cancelled orders.
# na=True marks rows with a missing status for exclusion as well and keeps the
# mask strictly boolean so it can be negated with `~`.
excluded_status = orders_df['order_status'].str.contains('unavailable|canceled', na=True)
orders_df = orders_df[~excluded_status]

In [None]:
# Load the raw payments table. It can hold several rows per order_id
# (presumably one per payment record of the order -- confirm against the data
# dictionary), so keeping only the first row per order under-reports the
# amount paid for such orders.
payments_df = pd.read_csv(config.RAW_FILE_PATH / 'olist_order_payments_dataset.csv')

# Collapse to one row per order: payment_value becomes the order total, every
# other column keeps the first non-null value seen for the order (the row that
# drop_duplicates used to retain). sort=False keeps the orders in file order.
payment_agg = {col: 'first' for col in payments_df.columns if col != 'order_id'}
payment_agg['payment_value'] = 'sum'
payments_df = payments_df.groupby('order_id', as_index=False, sort=False).agg(payment_agg)

In [None]:
# Read the order-item, product and category-translation tables from the raw
# data folder, in that order.
order_items_df, products_df, product_cat_df = (
    pd.read_csv(config.RAW_FILE_PATH / csv_name)
    for csv_name in (
        'olist_order_items_dataset.csv',
        'olist_products_dataset.csv',
        'product_category_name_translation.csv',
    )
)

In [None]:
# Join orders with their payments, items, products and English category names.
# merge() defaults to an inner join, so rows without a match in any of the
# tables are dropped from the master frame.
master_df = (
    orders_df
    .merge(payments_df, on='order_id')
    .merge(order_items_df, on='order_id')
    .merge(products_df, on='product_id')
    .merge(product_cat_df, on='product_category_name')
)

In [None]:
# Items bought per customer and product category.
# In the Olist schema order_item_id is the item's sequence number inside its
# order (1, 2, 3, ...), so summing it yields 1 + 2 + 3 = 6 for three items;
# counting the rows yields the quantity (3). The column keeps its original
# name so code that reads customer_category_df keeps working.
customer_category_df = (
    master_df
    .groupby(['customer_id', 'product_category_name_english'])['order_item_id']
    .agg('count')
    .reset_index()
)

# Feature matrix for clustering: one row per customer, one column per product
# category holding the number of items bought (0 where nothing was bought).
final_df = pd.pivot_table(
    customer_category_df,
    values='order_item_id',
    index='customer_id',
    columns='product_category_name_english',
    fill_value=0,
)

# Amount spent per customer. master_df has one row per order item, so the
# order-level payment_value is repeated on every item row of an order; keep a
# single row per order before summing so multi-item orders are not counted
# once per item.
customer_spent_df = (
    master_df
    .drop_duplicates(subset='order_id')
    .groupby('customer_id')['payment_value']
    .agg('sum')
    .reset_index()
)
final_df = final_df.merge(customer_spent_df, on='customer_id')

In [None]:
# Box plot of per-customer spend to eyeball outliers.
# Raise the 500 cut-off (e.g. to 1000 or 10000) to look further into the tail.
spend = customer_spent_df['payment_value']
plt.boxplot(spend[spend < 500])

In [None]:
# Keep only customers who spent less than 500 so extreme spenders do not
# distort the k-means clustering.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame (SettingWithCopyWarning).
final_df = final_df[final_df['payment_value'] < 500].copy()

# Standardise spend to zero mean and unit (sample) standard deviation.
spend = final_df['payment_value']
final_df['payment_value'] = (spend - spend.mean()) / spend.std()

# customer_id becomes the index so only feature columns are passed to PCA.
final_df = final_df.set_index('customer_id')

### Dimensionality reduction with PCA

In [None]:
# Fit PCA with every component retained; the cumulative explained-variance
# curve of this fit is used to decide how many components to keep.
pca = PCA().fit(final_df)
pca

In [None]:
# Cumulative explained variance against the number of components kept.
# The x-axis length comes from the fitted PCA instead of a hard-coded 1..72,
# so the plot does not fail with a shape mismatch when the number of feature
# columns changes.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.figure(figsize=(10,5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker = 'o', linestyle = '--')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
Since 11 components can explain about 80% of the variance, we will use 11 principal components. 

In [None]:
# Perform PCA with 11 components.
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so the scores (and the clusters built on them) are reproducible
# across kernel restarts; 42 is the same seed the KMeans calls use.
pca = PCA(n_components = 11, random_state = 42)
pca.fit(final_df)

# Coordinates of every customer in the 11-dimensional component space.
scores_pca = pca.transform(final_df)

### Performing KMeans

In [None]:
# Fit k-means on the PCA scores for 3 to 20 clusters and record the inertia of
# each fit for the elbow plot.
cluster_range = range(3, 21)
inertia = []
for k in cluster_range:
    kmeans_pca = KMeans(n_clusters=k, init='k-means++', random_state=42).fit(scores_pca)
    inertia.append(kmeans_pca.inertia_)

# Inertia against the number of clusters.
plt.figure(figsize=(10, 10))
plt.plot(cluster_range, inertia, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')

Since there is no obvious kink observed in the plot, we will use 15 clusters.

In [None]:
# Final model: k-means with 15 clusters fitted on the PCA scores.
kmeans_pca = KMeans(n_clusters=15, init='k-means++', random_state=42).fit(scores_pca)
kmeans_pca

### Analysing results from clustering

In [None]:
# Examine the category of products bought by customers in each cluster.
# customer_id is moved from the index back to a column; the guard keeps the
# cell re-runnable (a second unconditional reset would add a stray 'index'
# column and a third would raise).
if final_df.index.name == 'customer_id':
    final_df = final_df.reset_index()
final_df['cluster'] = kmeans_pca.labels_

# Iterate over the clusters of the fitted model rather than a hard-coded 15,
# so the report stays in sync if n_clusters is changed.
for i in range(kmeans_pca.n_clusters):
    print('-'*40)
    print('Cluster:',i)
    print('-'*40)
    clusterx_customers = final_df.loc[final_df['cluster'] == i, 'customer_id'].unique().tolist()
    clusterx_products = customer_category_df[customer_category_df['customer_id'].isin(clusterx_customers)]
    # number of customer/category rows per category within this cluster
    print(clusterx_products['product_category_name_english'].value_counts())

Clusters 0, 1, and 4 contain a higher percentage of products from multiple categories, unlike the remaining clusters, where the majority of customers bought from a single product category. Further analysis will be performed for clusters 0, 1, and 4. 

In [None]:
# Cluster 1 -- the filter and the variable names below all select cluster 1.
# NOTE(review): the previous header said "Cluster 0"; if cluster 0 was the
# intended target, change the `== 1` filter instead.
cluster1_customers = final_df.loc[final_df['cluster'] == 1, 'customer_id'].unique().tolist()
cluster1_df = customer_category_df[customer_category_df['customer_id'].isin(cluster1_customers)]

# Number of categories each cluster-1 customer bought from
# (customer_category_df holds one row per customer/category pair).
category_counts_df = cluster1_df.groupby(['customer_id'])['product_category_name_english'].agg('count').reset_index(name="count")

# subset to find customers who bought from more than 1 category
multi_c_users = category_counts_df[category_counts_df['count']>1]['customer_id'].unique()

# customer/category rows of those multi-category customers
multi_c_df = customer_category_df[customer_category_df['customer_id'].isin(multi_c_users)]
multi_c_df

In [None]:
# One row per multi-category customer, with the categories the customer bought
# from joined into a single "cat_a, cat_b" string.
multi_c_df = (
    multi_c_df
    .groupby('customer_id')
    .agg({'product_category_name_english': ', '.join})
    .reset_index()
)

# Number of customers per category combination.
# groupby(...).value_counts() counted distinct (combination, customer_id) rows,
# which is 1 for every customer, so the `count > 1` filter below could never
# match; size() counts the customers sharing each combination instead.
ch = multi_c_df.groupby('product_category_name_english').size().reset_index(name = 'count')
ch[ch['count']>1]