# Clustering by Mini Batch K-Means

Here, we apply Mini Batch K-Means in an attempt to segment this group of customers by their Recency, Frequency and Monetary Value. See [](../00-data/01-analyse-customer-value-by-frequency-recency-monetary-value.ipynb) for how the data is prepared. 


References: 
- [K-Means](https://scikit-learn.org/stable/modules/clustering.html#k-means)
- [Mini Batch K-Means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html)

Notebooks Sequence:
- [/00-data/00-explore-and-prepare-data.ipynb](../00-data/00-explore-and-prepare-data.ipynb)
- [/00-data/01-analyse-customer-value-by-frequency-recency-monetary-value.ipynb](../00-data/01-analyse-customer-value-by-frequency-recency-monetary-value.ipynb)
- [This Notebook](../01-clustering/00-clustering-by-mini-batch-k-means.ipynb)
- [/02-interpretation/00-interprete.ipynb](../02-interpretation/00-interprete.ipynb)

# Set up

In [None]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, normalize
from sklearn.pipeline import Pipeline
import numpy as np

# Data
## Load Data

The cell below assumes that the datasets are registered in the AML Workspace.

In [None]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Get information about the workspace
workspace = Workspace.from_config()
workspace

# Get the datasets registered in AML by name:
# the FRM data and its transformed counterpart
dataset = Dataset.get_by_name(workspace, name='online-retail-frm')
dataset_transformed = Dataset.get_by_name(workspace, name='online-retail-frm-transformed')

# Convert each Dataset to a Pandas DataFrame
df_orig = dataset.to_pandas_dataframe()
df_transformed_orig = dataset_transformed.to_pandas_dataframe()

In [None]:
# Work on a copy; df_orig keeps the data as it was loaded
df = df_orig.copy()
df

In [None]:
# Work on a copy; df_transformed_orig keeps the data as it was loaded
df_transformed = df_transformed_orig.copy()
df_transformed

## Split Data


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the rows of df as a test set; the fixed random_state keeps the split repeatable
df_train, df_test = train_test_split(df, test_size=0.01, random_state=9)
df_train.shape
df_train.head()
df_test.shape
df_test.head()

## Calculate Within Cluster Sum of Squared Errors (WCSS) aka Inertia

In [None]:
def calculate_wcss(min_cluster, max_cluster, batch_size, data):
    """Calculate the Within Cluster Sum of Squared Errors (WCSS) per cluster count.

    A ``MiniBatchKMeans`` model is fitted for every ``k`` in
    ``range(min_cluster, max_cluster)`` and its ``inertia_`` is recorded.

    Parameters
    ----------
    min_cluster : int
        Smallest number of clusters to try (inclusive).
    max_cluster : int
        Upper bound on the number of clusters (exclusive).
    batch_size : int
        Mini-batch size passed to ``MiniBatchKMeans``.
    data : array-like
        Samples to cluster; passed unchanged to ``fit``.

    Returns
    -------
    list
        One ``inertia_`` value per cluster count; element ``i`` belongs to
        ``k = min_cluster + i``.
    """
    wcss = []
    for n_clusters in range(min_cluster, max_cluster):
        # Fit exactly once: with the fixed integer random_state a second fit
        # on the same data would only repeat the same work.
        km = MiniBatchKMeans(n_clusters=n_clusters,
                             random_state=9,
                             batch_size=batch_size,
                             max_iter=100).fit(data)
        wcss.append(km.inertia_)
    return wcss

In [None]:
# Set parameters
min_cluster = 1
max_cluster = 11  # exclusive upper bound in calculate_wcss, so k runs from 1 to 10
batch_size = int(df.shape[0]*0.1)  # mini-batch size: 10% of the number of rows in df

# Run calculate_wcss
wcss = calculate_wcss(min_cluster, max_cluster, batch_size, df_transformed) # note that df_transformed is used here

wcss

### Retrieve the optimal `k` using a pre-defined threshold

In [None]:
def get_optimal_k(wcss, threshold=-0.15, min_cluster=1):
    """Pick the number of clusters at which the WCSS curve flattens out.

    The gradient of ``wcss`` is scaled by its largest absolute value, so the
    steepest drop becomes -1. The first position whose scaled gradient is no
    longer below ``threshold`` is taken as the optimal number of clusters.

    Parameters
    ----------
    wcss : sequence of float
        WCSS (inertia) values for consecutive cluster counts, where element
        ``i`` belongs to ``min_cluster + i`` clusters. At least two values
        are required.
    threshold : float, optional
        Scaled gradient below which the curve still counts as "steep".
        Defaults to -0.15.
    min_cluster : int, optional
        Cluster count that the first element of ``wcss`` belongs to.
        Defaults to 1.

    Returns
    -------
    int
        The selected number of clusters. If the curve never flattens within
        the evaluated range, the largest evaluated cluster count is returned.

    Raises
    ------
    ValueError
        If ``wcss`` is not one-dimensional or has fewer than two values.
    """
    wcss = np.asarray(wcss, dtype=float)
    if wcss.ndim != 1 or wcss.size < 2:
        raise ValueError("wcss must be a 1-D sequence with at least two values")

    # Get gradient
    wcss_grad = np.gradient(wcss)

    # Normalise gradient by its largest magnitude; the gradients are normally
    # negative, so the absolute value is needed to keep their sign
    scale = np.max(np.abs(wcss_grad))
    if scale == 0:
        # Completely flat curve: nothing is gained by adding clusters
        return min_cluster
    wcss_grad_norm = wcss_grad / scale

    # Get optimal_k by pre-defined threshold: first position that is no longer steep
    flat_positions = np.flatnonzero(wcss_grad_norm >= threshold)
    if flat_positions.size == 0:
        # Still steep everywhere: fall back to the largest evaluated cluster count
        return min_cluster + wcss.size - 1

    return min_cluster + int(flat_positions[0])

In [None]:
# Number of clusters selected by the gradient threshold in get_optimal_k
k = get_optimal_k(wcss)
k

## Define `sklearn.pipeline`
References:
- [User Guide](https://scikit-learn.org/stable/modules/compose.html#pipeline)
- [`sklearn.pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [None]:
# Configure PowerTransformer
ptransformer = PowerTransformer(method="yeo-johnson")
ptransformer

# Configure kmeans
# NOTE(review): n_clusters is fixed at 4 rather than taken from `k` computed above - confirm this is intended
n_clusters = 4
batch_size = int(df_train.shape[0]*0.1)  # mini-batch size: 10% of the number of rows in df_train

km = MiniBatchKMeans(n_clusters=n_clusters,
                     random_state=9,
                     batch_size=batch_size,
                     max_iter=100)
km

# Chain the power transform and the clustering into a single pipeline
pipeline = Pipeline(steps=[('ptransformer', ptransformer), ('mini-batch-k-means', km)],
                    verbose=True)
pipeline

# MLFlow

Create a new MLFlow experiment.

In [None]:
import mlflow

experiment_name = 'online-retail-customer-segmentation-mlflow'

# Reuse the experiment when it already exists: mlflow.create_experiment raises
# when the name is already taken, which would break re-running this cell
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Create an experiment
    experiment_id = mlflow.create_experiment(name=experiment_name,
                                             tags={'purpose':'tutorial', 'pipeline':'sklearn.pipeline'})
else:
    experiment_id = experiment.experiment_id

# Set this experiment as the active experiment
experiment = mlflow.set_experiment(experiment_id=experiment_id)

# Display
experiment

## Infer input and output signature

In [None]:
from mlflow.models import infer_signature

# Example input and output
model_output = np.array([0, 2]) # example output, i.e. cluster label
model_input = df.iloc[0:2]  # first two rows of df as example input

# Infer signature, i.e. input and output
# NOTE(review): `signature` is not passed to any call in the cells shown here - confirm where it is meant to be used
signature = infer_signature(model_input=model_input, model_output=model_output)
signature

## Fit the pipeline

In [None]:
# Start autolog
mlflow.sklearn.autolog()

# Metrics to log
# get_optimal_k returns (list position + 1), so the inertia that belongs to
# k sits at wcss[k - 1]; wcss[k] would be the value for the next cluster count
# and is out of range when k is the last candidate.
# NOTE(review): wcss was computed on df_transformed and the pipeline below uses
# its own n_clusters - confirm these metrics describe the fitted model.
metrics = {"wcss": wcss[k - 1],
           "k": k}

with mlflow.start_run() as run:
    # fit pipeline
    pipeline.fit(df) # note that df is used here

    # log custom metrics
    mlflow.log_metrics(metrics=metrics)

## Load the trained model

In [None]:
# Look up the id of the run recorded above and display it
run_id = run.info.run_id
run_id

# Load the model logged under that run back as a scikit-learn object
model_uri = f"runs:/{run_id}/model"
pipeline_model = mlflow.sklearn.load_model(model_uri)
type(pipeline_model)
pipeline_model

## Use model to predict

In [None]:
# Use trained model to predict using df_test
pipeline_model.predict(df_test)

## Retrieve `run` information

### Retrieve `run` information

In [None]:
run.data

In [None]:
run.info

### Retrieve `artifacts`

In [None]:
# Create an MLflow tracking client
client = mlflow.tracking.MlflowClient()
client

# List the artifacts stored for the run created above
client.list_artifacts(run_id=run.info.run_id)

## Data Management

### Upload to Datastore

In [None]:
# Disabled by default; swap in the commented-out `if True:` line to run the upload
if False:
#if True:
    from azureml.core import Workspace, Dataset

    workspace = Workspace.from_config()
    print(workspace.name, workspace.resource_group, workspace.location, workspace.subscription_id, sep = '\n')

    datastore = workspace.get_default_datastore()
    datastore

    # Save the train and test splits as local CSV files (index column not written)
    filename = '../../.aml/data/online-retail-frm-train.csv'
    df_train.to_csv(filename, index=False)

    filename = '../../.aml/data/online-retail-frm-test.csv'
    df_test.to_csv(filename, index=False)

    # Upload to datastore
    # NOTE: the whole '../../.aml/data' directory is uploaded, not only the two files written above
    Dataset.File.upload_directory('../../.aml/data', datastore, overwrite=True)

### Register Dataframe as Dataset

In [None]:
if False:
#if True:
    from azureml.core import Workspace, Dataset

    workspace = Workspace.from_config()
    workspace

    datastore = workspace.get_default_datastore()
    datastore

    # Each split is registered as a new tabular dataset under its own name
    splits = (('online-retail-frm-train', df_train),
              ('online-retail-frm-test', df_test))

    for name, frame in splits:
        Dataset.Tabular.register_pandas_dataframe(dataframe=frame,
                                                  target=datastore,
                                                  name=name,
                                                  show_progress=True,
                                                  tags={'Purpose':'Tutorial'})

##