In [None]:
import sys
sys.path.append('../..')
import pandas as pd
import numpy as np
import src.cleaning as clean
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw sales export. The path is relative, so it resolves against the
# kernel's working directory (presumably this notebook's folder - confirm).
sales_data = pd.read_csv('../../data/anon_data.csv')
# Run the project's cleaning routine (src/cleaning.py, not shown here) and keep
# the raw frame under its own name so the cleaned result can be compared to it.
cleaned_data = clean.clean_data(sales_data)


In [None]:
# Look at customers based on average profit on an order, average quantity bought in an order, and number of orders. 
# Ignore sales_team_name as this is already a grouping that will be correlated with the values above, and KMeans does not handle categorical variables easily because the distance/difference between strings cannot be expressed as a simple number.
# Keeping in returns and refunds as a customer that orders a lot but returns most of it is not the same as a customer that just orders a lot.


In [None]:
# Total profit, quantity and revenue per customer. Returns/refunds are still in
# cleaned_data at this point, so they net off against the customer's sales.
grouped_by_customer = cleaned_data.groupby('customer_id')
summed_values = (
    grouped_by_customer[['profit', 'quantity', 'revenue']]
    .sum()
    .add_suffix('_by_customer')  # profit -> profit_by_customer, etc.
)


In [None]:
# Count distinct orders and distinct ship dates per customer, using sale rows only
# (quantity > 0). A return that cancels an order leaves that order with zero profit,
# but it is still one order - counting the return's sales_order_number as well would
# double count it. Removing cancelled orders entirely is not attempted because one
# return can cover several orders.
only_sales_not_returns = cleaned_data.loc[cleaned_data['quantity'] > 0]
only_sales_not_returns_grouped = only_sales_not_returns.groupby('customer_id')

# nunique() is the vectorised form of .unique().apply(len). dropna=False keeps the
# earlier counting, where a missing value counts as one distinct value.
orders_by_customer = only_sales_not_returns_grouped['sales_order_number'].nunique(dropna=False)
shipments_by_customer = only_sales_not_returns_grouped['ship_date'].nunique(dropna=False)

In [None]:
# Attach the order and shipment counts to the per-customer totals. The join is a
# left join on the customer_id index, so customers with no sale rows keep their
# totals and get NaN for both counts.
order_and_shipment_counts = pd.DataFrame({
    'orders_by_customer': orders_by_customer,
    'shipments_by_customer': shipments_by_customer,
})
df_with_orders_and_shipments = summed_values.join(order_and_shipment_counts)

In [None]:
# Average profit and average quantity per order for each customer.
# These metrics were not used in the end (the default feature_columns use totals).
orders_per_customer = df_with_orders_and_shipments['orders_by_customer']
df_values_per_order = df_with_orders_and_shipments.assign(
    profit_per_order_by_customer=df_with_orders_and_shipments['profit_by_customer'].div(orders_per_customer),
    quantity_per_order_by_customer=df_with_orders_and_shipments['quantity_by_customer'].div(orders_per_customer),
)
df_values_per_order


In [None]:
# Specify feature columns to use for clustering.
# Every name must be a column of df_values_per_order; the first two entries are
# also used as the x and y axes of the 2D scatter plots further down.
feature_columns = ['profit_by_customer', 'quantity_by_customer']

In [None]:
# Keep only the chosen features and discard any customer with a missing value in
# at least one of them.
feature_dataframe = df_values_per_order[feature_columns].dropna(axis=0, how='any')

# Number of customers lost to the NaN filter
len(df_values_per_order) - len(feature_dataframe)

### KMeans clustering is sensitive to outliers, so outliers need to be removed

In [None]:
# First and third quartile of every feature, and the interquartile range between them
lower_quartiles, upper_quartiles = (
    feature_dataframe.quantile(q) for q in (0.25, 0.75)
)
iqr = upper_quartiles - lower_quartiles
iqr

In [None]:
# Tukey fences: anything further than 1.5 * IQR beyond a quartile is an outlier
whisker_length = 1.5 * iqr
upper_bounds = upper_quartiles + whisker_length
lower_bounds = lower_quartiles - whisker_length
upper_bounds

In [None]:
# A customer is an IQR outlier if ANY feature lies outside its fences
above_upper = feature_dataframe.gt(upper_bounds, axis='columns')
below_lower = feature_dataframe.lt(lower_bounds, axis='columns')
outliers = (above_upper | below_lower).any(axis=1)

# Number of customers flagged
outliers.sum()

In [None]:
# The IQR rule flags a lot of customers. Alternative: trim only the top and bottom
# 5% of each feature, as these are more likely to be fake orders.
upper_cutoffs = feature_dataframe.quantile(0.95)
lower_cutoffs = feature_dataframe.quantile(0.05)
in_upper_tail = feature_dataframe > upper_cutoffs
in_lower_tail = feature_dataframe < lower_cutoffs

# Flag a customer when ANY feature falls in either tail
quantile_outliers = (in_upper_tail | in_lower_tail).any(axis=1)
quantile_outliers.sum()
# The 5% trim also removes the negative values; the IQR rule only removes a few of them

In [None]:
# Keep the customers that were NOT flagged by the 5% / 95% trim
removed_outliers = feature_dataframe.loc[~quantile_outliers]
removed_outliers

### Notes
Given this large number of outliers, consider k-medoids.
Using KMeans, but with the upper and lower 5% of the data removed. Having explored the data, these are likely to be outliers caused by returns, or by low revenue and negative profit, resulting in questionable data points.

In [None]:
# Quick look at the trimmed data in feature space before clustering.
# The frame is passed as data= (keyword): positional data is only accepted by
# seaborn >= 0.12, and the keyword form matches the final cluster plot below.
sns.scatterplot(data=removed_outliers, x=feature_columns[0], y=feature_columns[1])

### Notes
There do not appear to be any clear-cut clusters: there is one main cluster around (50, 25), but everything else is sparse.

In [None]:
# Rescale every feature to [0, 1] so that no feature dominates the Euclidean
# distances KMeans uses just because of its units.
scaler = MinMaxScaler()
# fit_transform returns a bare ndarray; wrap it straight back into a DataFrame that
# keeps the customer_id index, so scaled rows can still be traced to customers
# (previously the index was replaced by 0..n-1).
scaled_data = pd.DataFrame(
    scaler.fit_transform(removed_outliers),
    columns=feature_columns,
    index=removed_outliers.index,
)

In [None]:
# Sweep k and record two diagnostics per k to help choose the number of clusters:
#   results     - inertia_ of the fitted model, for the elbow plot
#   silhouettes - mean silhouette score of the resulting labelling
# n_init is set explicitly: the scikit-learn default changed between releases
# (10 restarts -> 'auto'), so relying on it gives different curves (and a
# FutureWarning) depending on the installed version.
results = []
silhouettes = []
k_range = list(range(2, 40))
for k in k_range:
    k_mean_clusterer = KMeans(n_clusters=k, random_state=26, n_init=10)
    cluster_labels = k_mean_clusterer.fit_predict(scaled_data)
    silhouettes.append(silhouette_score(scaled_data, cluster_labels))
    results.append(k_mean_clusterer.inertia_)

In [None]:
# Elbow plot: look for the k after which inertia stops dropping sharply.
# Labelled so it cannot be mistaken for the silhouette plot that follows.
plt.plot(k_range, results)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('KMeans inertia by number of clusters')
plt.grid(True)

In [None]:
# Silhouette plot: higher is better (tighter, better separated clusters).
# Labelled so it cannot be mistaken for the inertia plot.
plt.plot(k_range, silhouettes)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Mean silhouette score')
plt.title('Silhouette score by number of clusters')
plt.grid(True)

# Final Clustering

In [None]:
# Choose a k of 5 given the results and initial cluster plot

In [None]:
# Final model: k=5 with 100 restarts, fixed seed for reproducibility.
k_mean_clusterer = KMeans(n_clusters=5, n_init=100, random_state=26).fit(scaled_data)

# labels_ is in the same row order as scaled_data / removed_outliers, so it can be
# attached to the unscaled frame positionally.
results_data = removed_outliers.assign(cluster=k_mean_clusterer.labels_)

In [None]:
# Clusters in the original (unscaled) feature space, one colour per cluster.
legend_options = {'loc': 'upper left', 'bbox_to_anchor': (1, 1)}  # legend outside the axes, top right
plot = sns.scatterplot(
    x=feature_columns[0],
    y=feature_columns[1],
    hue='cluster',
    data=results_data,
)
plot.legend(**legend_options)


In [None]:
# For curiosity:
# Set feature_columns=['profit_per_order_by_customer', 'quantity_per_order_by_customer', 'orders_by_customer'] and run all above to use this plot

In [None]:
# 3D view of the clusters: quantity per order (x), profit per order (y) and number
# of orders (z), coloured by cluster label.
import matplotlib.colors

sns.set_style("darkgrid")
plot_mean = 3  # not referenced in this cell
min_num = 30   # not referenced in this cell

axis_columns = ['quantity_per_order_by_customer', 'profit_per_order_by_customer', 'orders_by_customer']

# results_data only holds feature_columns plus 'cluster'. With the default two
# features the per-order columns are absent, and indexing them directly raised a
# KeyError on a top-to-bottom run. Pull whichever are missing from
# df_values_per_order (both frames share the customer_id index) and drop customers
# for which a plotted metric is undefined.
missing_columns = [column for column in axis_columns if column not in results_data.columns]
plot_data = results_data.join(df_values_per_order[missing_columns]).dropna(subset=axis_columns)

x = plot_data['quantity_per_order_by_customer']
y = plot_data['profit_per_order_by_customer']
z = plot_data['orders_by_customer']
colours = plot_data['cluster']  # cluster label per customer
# Palette the labels are mapped onto
colors = ['red','green','blue','purple', 'black', 'cornflowerblue', 'lime', 'darkorange', 'sienna', 'teal', 'aqua', 'deeppink', 'gold']
figure = plt.figure(figsize=(10, 10))
seaborn_plot = figure.add_subplot(111, projection='3d')  # a matplotlib 3D axes, despite the name

seaborn_plot.scatter(x, y, z, c=colours, cmap=matplotlib.colors.ListedColormap(colors))
seaborn_plot.set_xlabel('quantity_per_order')
seaborn_plot.set_ylabel('profit_per_order')
seaborn_plot.set_zlabel('number_of_orders')
plt.show()