# Clustering by Mini Batch K-Means
ref: 
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
- https://scikit-learn.org/stable/modules/clustering.html#k-means
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html

# Set up

In [None]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE
from datetime import datetime
import pickle

# scikit-learn==0.24.2

# Data
## Load Data

The cell below assumes that the dataset is registered in the AML Workspace.

In [None]:
# Requirements: azureml-core >= 1.0.72 and azureml-dataprep[pandas] >= 1.1.34
from azureml.core import Dataset, Workspace

# Build the workspace handle via Workspace.from_config().
workspace = Workspace.from_config()

# Echo the workspace coordinates, one per line, as a quick sanity check.
print(
    workspace.name,
    workspace.resource_group,
    workspace.location,
    workspace.subscription_id,
    sep="\n",
)

# Look up the registered dataset by name and materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name="online-retail-frm")
df_orig = dataset.to_pandas_dataframe()

In [None]:
# Work on a deep copy so the loaded dataframe (`df_orig`) is never mutated.
df = df_orig.copy(deep=True)
df  # display the working dataframe

### Calculate Within Cluster Sum of Squared Errors (*WCSS*) aka Inertia
- This metric is the same as `km.inertia_`
- Inertia can be recognized as a measure of how internally coherent clusters are. 
- there are other metrics, e.g. `Silhouette Score` 

In [None]:
def calculate_wcss(min_cluster, max_cluster, batch_size, data):
    """Compute the within-cluster sum of squares (inertia) for a range of k.

    One ``MiniBatchKMeans`` model is fitted for every cluster count in
    ``range(min_cluster, max_cluster)``; ``max_cluster`` itself is excluded.

    Args:
        min_cluster: Smallest number of clusters to try (inclusive).
        max_cluster: Upper bound on the number of clusters (exclusive).
        batch_size: Mini-batch size handed to ``MiniBatchKMeans``.
        data: Samples to cluster; passed to ``fit`` unchanged.

    Returns:
        A list holding the fitted model's ``inertia_`` for each cluster
        count, in increasing order of k. Empty when the range is empty.
    """
    wcss = []
    for n_clusters in range(min_cluster, max_cluster):
        # Fit exactly once. The chained .fit() returns the trained estimator,
        # so a second km.fit(data) would only repeat the same seeded training.
        km = MiniBatchKMeans(n_clusters=n_clusters,
                             random_state=9,
                             batch_size=batch_size,
                             max_iter=100).fit(data)
        wcss.append(km.inertia_)
    return wcss

In [None]:
# Feature matrix as a NumPy array that does not alias `df`.
X = df.to_numpy(copy=True)

# Candidate cluster counts: calculate_wcss iterates range(min_cluster, max_cluster),
# so these bounds try k = 1 .. 10.
min_cluster, max_cluster = 1, 11

# Each mini-batch holds 10% of the samples.
batch_size = int(0.1 * len(X))
batch_size

wcss = calculate_wcss(min_cluster, max_cluster, batch_size, X)

### Elbow-Curve
- The optimal value for *k* is at the 'elbow joint' of the curve; in this case it is 5, after which the value of *wcss* decreases more slowly than before.

In [None]:
# The elbow curve: WCSS against the number of clusters.
# The x positions and ticks are both derived from min_cluster/max_cluster so the
# axis stays correct if the range of candidate k values is changed.
_ = plt.figure(figsize=(12,6))
# A single plot call is enough; drawing the same series twice only stacked an
# identical line underneath this one.
_ = plt.plot(range(min_cluster, max_cluster), wcss, linewidth=2, color="red", marker="8")
_ = plt.xlabel("K Value")
_ = plt.xticks(np.arange(min_cluster, max_cluster, 1))
_ = plt.ylabel("WCSS")
_ = plt.grid(True)
_ = plt.show()

### Set *k* to 4

In [None]:
# Number of clusters used for the final model.
n_clusters = 4

# Fit once: the chained .fit() already returns the trained estimator, so a
# second km.fit(X) would only repeat the same seeded training.
km = MiniBatchKMeans(n_clusters=n_clusters,
                     random_state=9,
                     batch_size=batch_size,
                     max_iter=100).fit(X)

# Save the model (change the condition to False to skip writing the pickle).
if True:
    model_filepath = '../../.aml/models/mini-batch-k-means-customer-segmentation.pkl'
    # The context manager guarantees the file is flushed and closed after dumping.
    with open(model_filepath, "wb") as model_file:
        pickle.dump(km, model_file)

# Predict the cluster label of every row of the input data.
y = km.predict(X)

# Attach the labels to a copy of the data in a column named "label".
df_clusters = df.copy()
df_clusters["label"] = y

df_clusters.head()

### Register the Model

In [None]:
if True:  # change to False to skip registering the model
    from azure.ai.ml import MLClient
    from azure.ai.ml.entities import Model
    from azure.identity import DefaultAzureCredential

    # Handle to the workspace, built from the coordinates of `workspace`.
    ml_client = MLClient(
        credential=DefaultAzureCredential(),
        subscription_id=workspace.subscription_id,
        resource_group_name=workspace.resource_group,
        workspace_name=workspace.name,
    )

    # Pickled model file to register.
    model_filepath = '../../.aml/models/mini-batch-k-means-customer-segmentation.pkl'

    file_model = Model(
        path=model_filepath,
        name="mini-batch-k-means-customer-segmentation",
        description='mini-batch-k-means-customer-segmentation | scikit-learn==0.24.2',
        auto_increment_version=True,
    )

    ml_client.models.create_or_update(file_model)

### Inverse Power Transformed Data for Interpretation Purposes [Optional]

In [None]:
import pickle

ptransformer_filepath = '../../.aml/models/powertransformer.pkl'
# NOTE: unpickling can execute arbitrary code; only load artifacts from a trusted source.
# The context manager closes the file handle once the transformer is loaded.
with open(ptransformer_filepath, "rb") as ptransformer_file:
    ptransformer = pickle.load(ptransformer_file)

# Map the clustered features back through the transformer's inverse_transform
# so the values can be read in their original units.
df_clusters_inversed = pd.DataFrame(
    ptransformer.inverse_transform(X),
    columns=['Recency(Days)', 'Frequency', 'Monetary(£)'],
)

# Assign labels by position: this frame has a fresh default index, so aligning on
# df_clusters' index could misplace labels or yield NaN if that index differs.
df_clusters_inversed['label'] = df_clusters['label'].to_numpy()
df_clusters_inversed

### Visualise

In [None]:
df_clusters_tsne = X.copy()
df_clusters_tsne[:10]  # preview the first rows; an ndarray has no .head()

# Project the features to 2-D. The seed matches the random_state used for the
# clustering models so the embedding is reproducible between runs.
X_embedded = TSNE(n_components=2,
                  init='random',
                  random_state=9).fit_transform(df_clusters_tsne)

# Plot t-SNE, coloured by cluster label.
_ = plt.figure(figsize=(12,8))
_ = plt.title('Flattened Graph of {} Clusters'.format(n_clusters))
_ = sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=df_clusters["label"], palette="Set2")

In [None]:
# Interactive 3-D scatter of the three features, coloured by cluster label.
fig = px.scatter_3d(
    df_clusters,
    x='Monetary(£)',
    y='Frequency',
    z='Recency(Days)',
    color='label',
    width=1200,   # figure size in pixels
    height=800,
)
fig.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure, so a
# preceding plt.figure(figsize=...) has no effect on it and only leaves an empty
# extra figure behind. The size is set per facet through `height` and `aspect`.
_ = sns.pairplot(data=df_clusters, hue='label', palette='Set2', height=3, aspect=1.5)
_ = plt.show()

### Possible Interpretation
Copied from [00-interprete.ipynb](../02-interpretation/00-interprete.ipynb)

Cluster | Recency(Days)                 | Frequency (last 6 months) | Monetary(£) last 6 months | Recommended Action / Thoughts |
---     | ---                           | ---                       | ---                       | --- |
0       | Active since last 2 weeks     | 100 transactions          | 1500                      | Very frequent customers who rely on online shopping. |
1       | Active since last 3 months    | 30 transactions           | 500                       | Inactive for a while, but has been typical in spending. |
2       | Active since last 3 months    | 7 transactions            | 150                       | Inactive for a while, and probably not a regular customer. |
3       | Active since last 2 weeks     | 20 transactions           | 350                       | Active and regular, typical spending. |
