In [None]:
# Run the `dataset` make target from the parent directory. `!` executes the
# line in a subshell, so the `cd` calls do not change the kernel's working
# directory.
!cd .. && make dataset && cd notebooks

- An order might have multiple items.
  - Each item might be fulfilled by a distinct seller.
  - All text identifying stores and partners was replaced by the names of Game of Thrones great houses.

![](https://i.imgur.com/HRhd2Y0.png)


In [None]:
# System modules
import os
import sys
from time import time
import csv

# Make the project's ``src`` packages importable from this notebook
src_path = os.path.abspath("../src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Helper functions
import data.helpers as data_helpers
import features.helpers as feat_helpers
import visualization.helpers as vis_helpers
import models.helpers as models_helpers

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn tools for model evaluation
from sklearn.metrics import silhouette_score

# matplotlib and plotly for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


# Fraction of the cleaned RFM table kept for modelling; a small value
# accelerates the development cycle
SAMPLE_FRAC: float = .1

# Set to False to skip the figures guarded by this flag and avoid the
# excessive memory usage of plotly
DRAW_PLOTS: bool = True


In [None]:
# Every raw CSV is read with explicit dtypes:
#   - identifiers and labels -> "category" (nominal qualitative data)
#   - counts                 -> int, or nullable Int64 when values may be missing
#   - measures               -> float
#   - dates                  -> read as text, then parsed via ``parse_dates``

# --- Customers ---
customers_dtypes = {
    "customer_id": "category",
    "customer_unique_id": "category",
    "customer_city": "category",
    "customer_state": "category",
    "customer_zip_code_prefix": "category",
}
customers_df = pd.read_csv(
    "../data/raw/olist_customers_dataset.csv", dtype=customers_dtypes
)

# --- Geolocation ---
geolocation_dtypes = {
    "geolocation_zip_code_prefix": "category",
    "geolocation_city": "category",
    "geolocation_state": "category",
    "geolocation_lat": float,
    "geolocation_lng": float,
}
geolocation_df = pd.read_csv(
    "../data/raw/olist_geolocation_dataset.csv", dtype=geolocation_dtypes
)

# --- Order items ---
order_items_date_columns = ["shipping_limit_date"]
order_items_dtypes = {
    "order_id": "category",
    "order_item_id": "category",
    "product_id": "category",
    "seller_id": "category",
    "price": float,
    "freight_value": float,
    **{column: str for column in order_items_date_columns},
}
order_items_df = pd.read_csv(
    "../data/raw/olist_order_items_dataset.csv",
    dtype=order_items_dtypes,
    parse_dates=order_items_date_columns,
)

# --- Order payments ---
order_payments_dtypes = {
    "order_id": "category",
    "payment_type": "category",
    "payment_sequential": int,
    "payment_installments": int,
    "payment_value": float,
}
order_payments_df = pd.read_csv(
    "../data/raw/olist_order_payments_dataset.csv",
    dtype=order_payments_dtypes,
)

# --- Order reviews ---
order_reviews_date_columns = ["review_creation_date", "review_answer_timestamp"]
order_reviews_dtypes = {
    "review_id": "category",
    "order_id": "category",
    "review_score": int,
    # Free text
    "review_comment_title": str,
    "review_comment_message": str,
    **{column: str for column in order_reviews_date_columns},
}
order_reviews_df = pd.read_csv(
    "../data/raw/olist_order_reviews_dataset.csv",
    dtype=order_reviews_dtypes,
    parse_dates=order_reviews_date_columns,
)

# --- Orders ---
orders_date_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
orders_dtypes = {
    "order_id": "category",
    "customer_id": "category",
    "order_status": "category",
    **{column: str for column in orders_date_columns},
}
orders_df = pd.read_csv(
    "../data/raw/olist_orders_dataset.csv",
    dtype=orders_dtypes,
    parse_dates=orders_date_columns,
)

# --- Products ---
# Nullable integers : https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#support-for-integer-na
products_dtypes = {
    "product_id": "category",
    "product_category_name": "category",
    "product_name_lenght": pd.Int64Dtype(),
    "product_description_lenght": pd.Int64Dtype(),
    "product_photos_qty": pd.Int64Dtype(),
    "product_weight_g": float,
    "product_length_cm": float,
    "product_height_cm": float,
    "product_width_cm": float,
}
products_df = pd.read_csv(
    "../data/raw/olist_products_dataset.csv", dtype=products_dtypes
)

# --- Sellers ---
sellers_dtypes = {
    "seller_id": "category",
    "seller_city": "category",
    "seller_state": "category",
    "seller_zip_code_prefix": "category",
}
sellers_df = pd.read_csv(
    "../data/raw/olist_sellers_dataset.csv", dtype=sellers_dtypes
)

# --- Product category translation (dtypes left to pandas) ---
category_translation_df = pd.read_csv(
    "../data/raw/product_category_name_translation.csv"
)


In [None]:
# Summary of the order key, the customer key and the purchase timestamp
# (datetime_is_numeric=True summarises the timestamp like a numeric column)
orders_df[["order_id", "customer_id", "order_purchase_timestamp"]].describe(
    include="all", datetime_is_numeric=True
)


In [None]:
# Summary of the payment rows: order key and amount paid
order_payments_df[["order_id", "payment_value"]].describe(include="all")


In [None]:
order_columns = ["order_id", "customer_id", "order_purchase_timestamp"]
payment_columns = ["order_id", "payment_value"]

# Attach every payment row to its order (one order -> many payments) ...
orders_with_payments_df = orders_df[order_columns].merge(
    order_payments_df[payment_columns],
    how="left",
    on="order_id",
    validate="1:m",
)

# ... then collapse back to one row per order, summing the payments
merged_orders_payments_df = orders_with_payments_df.groupby("order_id").agg(
    customer_id=("customer_id", "first"),
    order_purchase_timestamp=("order_purchase_timestamp", "first"),
    payment_value=("payment_value", "sum"),
)

merged_orders_payments_df.describe(include="all", datetime_is_numeric=True)


In [None]:
# Summary of every customer column (all are read as categories)
customers_df.describe(include="all")


### RFM (Recency, Frequency, Monetary)


In [None]:
# Link each customer_id to its order (1:1), then aggregate per unique customer:
#   recency   -> latest purchase timestamp (converted to days below)
#   frequency -> count of customer_id entries
#   monetary  -> mean payment value
customer_orders_df = customers_df[["customer_id", "customer_unique_id"]].merge(
    merged_orders_payments_df,
    how="left",
    on="customer_id",
    validate="1:1",
)
rfm_df = customer_orders_df.groupby("customer_unique_id").agg(
    recency=("order_purchase_timestamp", "max"),
    frequency=("customer_id", "count"),
    monetary=("payment_value", "mean"),
)

# Express recency in days relative to the most recent purchase of the dataset.
# NOTE(review): values are <= 0 (0 = most recent customer) -- confirm this sign
# convention is intended given the "days since last purchase" plot labels.
latest_purchase = rfm_df["recency"].max()
days_from_latest = (rfm_df["recency"] - latest_purchase) / np.timedelta64(1, "D")
rfm_df["recency"] = days_from_latest.values

rfm_df = data_helpers.reduce_dataframe_memory_usage(rfm_df)

rfm_df.describe(include="all", datetime_is_numeric=True)


In [None]:
if DRAW_PLOTS:
    # One histogram (with a marginal box plot) per RFM feature
    for col in rfm_df.columns:
        feature_values = rfm_df[col]
        fig = px.histogram(feature_values, marginal="box", width=800)
        fig.show()


### Scaling, Cleaning & Sampling


In [None]:
from sklearn.preprocessing import StandardScaler


# Standardise the three RFM features
scaler = StandardScaler()
scaled_values = scaler.fit_transform(rfm_df)
scaled_rfm_df = pd.DataFrame(scaled_values, columns=rfm_df.columns)

# Remove outliers and duplicates: values that are not below 20 (in scaled
# units) become NaN and their rows are dropped, then the table is subsampled
below_threshold = scaled_rfm_df < 20
scaled_rfm_df = (
    scaled_rfm_df.where(below_threshold)
    .dropna()
    .drop_duplicates()
    .sample(frac=SAMPLE_FRAC, random_state=42)
)

# Rebuild the unscaled table so that it matches the sampled rows one to one
unscaled_values = scaler.inverse_transform(scaled_rfm_df)
rfm_df = pd.DataFrame(unscaled_values, columns=scaled_rfm_df.columns)

rfm_df.describe(include="all", datetime_is_numeric=True)


### EDA & Visualization


In [None]:
if DRAW_PLOTS:
    # Pairwise RFM scatter plots; marker colour and size both encode frequency
    matrix_options = dict(
        color="frequency",
        size="frequency",
        width=1200,
        height=800,
    )
    fig = px.scatter_matrix(rfm_df, **matrix_options)
    # Keep only the lower triangle of the matrix
    fig.update_traces(diagonal_visible=False, showupperhalf=False)
    fig.show()


In [None]:
if DRAW_PLOTS:
    # Recency against monetary value, with an OLS trend line and marginal
    # histograms; marker colour and size both encode frequency
    axis_labels = {
        "recency": "Recency : days since last purchase",
        "frequency": "Frequency : number of purchases",
        "monetary": "Monetary : average purchase amount",
    }
    fig = px.scatter(
        rfm_df,
        x="recency",
        y="monetary",
        labels=axis_labels,
        color="frequency",
        size="frequency",
        trendline="ols",
        marginal_x="histogram",
        marginal_y="histogram",
        width=1200,
        height=800,
    )
    fig.show()


#### Distances


In [None]:
from sklearn.neighbors import NearestNeighbors


if DRAW_PLOTS:
    # Query the two nearest neighbours of every customer in scaled RFM space
    knn = NearestNeighbors(n_neighbors=2)
    knn.fit(scaled_rfm_df)
    distances, indices = knn.kneighbors(scaled_rfm_df)

    # Sort each column independently and keep the second one (the first column
    # is presumably the distance of each point to itself)
    distances = np.sort(distances, axis=0)[:, 1]

    distance_labels = {
        "index": "Couple of customers",
        "value": "Euclidian distance",
    }
    fig = px.line(
        distances,
        labels=distance_labels,
        title="Customers distances in scaled RFM space",
        width=800,
    )
    fig.show()


#### PCA (Principal Component Analysis)


In [None]:
from sklearn.decomposition import PCA


if DRAW_PLOTS:
    # Project the scaled RFM features onto their first two principal components
    pca = PCA(n_components=2, random_state=42)
    data_pca = pca.fit_transform(scaled_rfm_df)

    # Plot the data in the PCA space (first 1000 points only)
    fig = px.scatter(
        x=data_pca[:1000, 0],
        y=data_pca[:1000, 1],
        trendline="ols",
        title="PCA 2D",
        opacity=0.5,
        width=1200,
        height=800,
        labels={"x": "PCA 1", "y": "PCA 2"},
    )

    # Plot the feature importances in the PCA space
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    # Name each loading after the columns the PCA was fitted on: `loadings` has
    # exactly one row per column of scaled_rfm_df, so iterating another frame
    # that carries any extra column (e.g. cluster labels) would go out of range.
    for i, feature in enumerate(scaled_rfm_df.columns):
        fig.add_shape(
            type="line",
            x0=0,
            y0=0,
            x1=loadings[i, 0],
            y1=loadings[i, 1],
            line=dict(color="red", width=3),
            name=feature,
        )
        fig.add_annotation(
            x=loadings[i, 0],
            y=loadings[i, 1],
            ax=0,
            ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
            name=feature,
        )

    fig.show()


#### Dendrogram


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage


if DRAW_PLOTS:
    # Ward linkage on a 10% subsample of the scaled data; the tree is
    # truncated to its first levels (p=3)
    dendrogram_sample = scaled_rfm_df.sample(frac=0.1, random_state=42)
    ward_linkage = linkage(dendrogram_sample, method="ward")
    dendrogram(ward_linkage, truncate_mode="level", p=3)


## Models evaluation


In [None]:
from sklearn.base import ClusterMixin
from sklearn.metrics import (
    silhouette_score,  # higher is better : https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient
    davies_bouldin_score,  # lower is better : https://scikit-learn.org/stable/modules/clustering.html#davies-bouldin-index
    calinski_harabasz_score, # higher is better : https://scikit-learn.org/stable/modules/clustering.html#calinski-harabasz-index
)


# Leaderboard filled along the notebook with the best run of each model family
result_column_names = [
    "model",
    "n_clusters",
    "labels",
    "cluster_centers",
    "inertia",
    "time",
    "silhouette_score",
    "davies_bouldin_score",
    "calinski_harabasz_score",
]
models_results = pd.DataFrame(columns=result_column_names)


def process_model(
    model_class: ClusterMixin,
    model_args: dict,
    param_name: str,
    param_range: list,
    fit_df: pd.DataFrame,
    pred_df: pd.DataFrame,
    verbose: bool = False,
    min_clusters: int = 2,
    max_clusters: int = 10,
) -> pd.DataFrame:
    """Fit one clustering model per hyper-parameter value and score each run.

    Parameters
    ----------
    model_class
        Estimator class (not an instance), instantiated once per value.
    model_args
        Keyword arguments shared by every instantiation. Left unmodified.
    param_name
        Name of the constructor argument being swept.
    param_range
        Values tried for ``param_name``.
    fit_df
        Data used to fit estimators exposing both ``fit`` and ``predict``.
        Ignored for estimators that only expose ``fit_predict``.
    pred_df
        Data that gets labelled and on which the scores are computed.
    verbose
        Print the model, its number of clusters and its silhouette score.
    min_clusters, max_clusters
        Inclusive bounds on the number of clusters; runs outside them are
        skipped. The defaults keep the historical ``1 < n_clusters < 11``.

    Returns
    -------
    pd.DataFrame
        One row per retained run with the columns ``model``, ``n_clusters``,
        ``labels``, ``cluster_centers``, ``inertia``, ``time`` (seconds per
        sample), ``silhouette_score``, ``davies_bouldin_score`` and
        ``calinski_harabasz_score``.

    Raises
    ------
    ValueError
        If the estimator exposes neither ``fit``/``predict`` nor
        ``fit_predict``.
    """
    result_columns = [
        "model",
        "n_clusters",
        "labels",
        "cluster_centers",
        "inertia",
        "time",
        "silhouette_score",
        "davies_bouldin_score",
        "calinski_harabasz_score",
    ]
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append is quadratic and was removed in pandas 2.0.
    records = []

    for param_value in param_range:
        # Merge into a fresh dict so the caller's ``model_args`` is not mutated
        model = model_class(**{**model_args, param_name: param_value})

        if verbose:
            print(f">>> Model : { model }")

        # All timings are normalised to seconds per sample so that they stay
        # comparable between the two branches
        if hasattr(model, "fit") and hasattr(model, "predict"):
            start_time = time()
            model.fit(fit_df)
            fit_time = (time() - start_time) / fit_df.shape[0]

            start_time = time()
            predicted_labels = model.predict(pred_df)
            pred_time = (time() - start_time) / pred_df.shape[0]

            elapsed_per_sample = fit_time + pred_time
        elif hasattr(model, "fit_predict"):
            start_time = time()
            predicted_labels = model.fit_predict(pred_df)
            elapsed_per_sample = (time() - start_time) / pred_df.shape[0]
        else:
            raise ValueError(f"{model} is not a clustering model.")

        # Labels are expected to be 0..k-1; a negative label (such as -1)
        # does not count as a cluster
        n_clusters = predicted_labels.max() + 1
        if verbose:
            print(f"Number of clusters : { n_clusters }")

        if not min_clusters <= n_clusters <= max_clusters:
            continue

        # Strip the bookkeeping arguments from the displayed model name
        model_name = (
            str(model)
            .replace(", random_state=42", "")
            .replace(", n_jobs=-1", "")
            .replace("(random_state=42", "(")
            .replace("(n_jobs=-1", "(")
        )

        result = {
            "model": model_name,
            "n_clusters": n_clusters,
            "labels": predicted_labels,
            "cluster_centers": getattr(model, "cluster_centers_", None),
            "inertia": getattr(model, "inertia_", None),
            # Stored under the declared "time" column (it used to be written
            # to an undeclared "fit_pred_time" key, leaving "time" empty)
            "time": elapsed_per_sample,
            "silhouette_score": silhouette_score(
                pred_df, predicted_labels, random_state=42
            ),
            "davies_bouldin_score": davies_bouldin_score(
                pred_df, predicted_labels
            ),
            "calinski_harabasz_score": calinski_harabasz_score(
                pred_df, predicted_labels
            ),
        }

        if verbose:
            print(f"Score : { round(result['silhouette_score'], 3) }")

        records.append(result)

    return pd.DataFrame(records, columns=result_columns)


def plot_scores(results: pd.DataFrame) -> None:
    """Draw the silhouette and Davies-Bouldin scores of every evaluated model.

    ``results`` needs the columns ``model`` (used as x axis),
    ``silhouette_score`` and ``davies_bouldin_score``.
    """
    score_columns = ["silhouette_score", "davies_bouldin_score"]
    px.line(
        results,
        x="model",
        y=score_columns,
        title="Clustering models evaluation",
        markers=True,
        width=800,
    ).show()


def plot_clusters(model_name, pred_df, labels, cluster_centers=None) -> None:
    """Show a 3D scatter of the customers in RFM space, coloured by label.

    ``pred_df`` must expose ``recency``, ``frequency`` and ``monetary``
    columns. When ``cluster_centers`` is given, its first three columns are
    drawn as "x" markers on the recency, frequency and monetary axes
    respectively.
    """
    fig = px.scatter_3d(
        pred_df,
        x="recency",
        y="frequency",
        z="monetary",
        title=f"Clustering : { model_name }",
        color=labels,
        opacity=0.5,
        width=1200,
        height=800,
    )

    # Nothing more to draw without cluster centers
    if cluster_centers is None:
        fig.show()
        return

    centers_trace = go.Scatter3d(
        x=cluster_centers[:, 0],
        y=cluster_centers[:, 1],
        z=cluster_centers[:, 2],
        mode="markers",
        marker_symbol="x",
        hovertemplate="recency: %{x}, frequency: %{y}, monetary: %{z}",
        text="Cluster Center",
        name="Cluster Center",
    )
    fig.add_trace(centers_trace)
    fig.show()


def plot_boxes(model_name, pred_df, labels, cluster_centers=None) -> None:
    """Show one box plot per RFM feature, split by cluster label.

    Parameters
    ----------
    model_name
        Text used as prefix of every figure title.
    pred_df
        Frame with ``recency``, ``frequency`` and ``monetary`` columns.
        It is not modified.
    labels
        Cluster label of every row of ``pred_df``.
    cluster_centers
        Unused; kept so the signature mirrors ``plot_clusters``.
    """
    # Attach the labels to a copy and plot that copy. The previous version
    # wrote the labels into ``pred_df`` but plotted the global ``rfm_df``,
    # which showed the wrong data whenever another frame was passed in.
    labelled_df = pred_df.assign(labels=labels)

    feature_titles = (
        ("recency", "Recency : days since last purchase"),
        ("frequency", "Frequency : total number of purchases"),
        ("monetary", "Monetary : average purchase value"),
    )
    for feature, description in feature_titles:
        fig = px.box(
            labelled_df,
            title=f"{ model_name } - { description }",
            x=feature,
            color="labels",
            width=800,
        )
        # Show mean and standard deviation, and notch the boxes
        fig.update_traces(boxmean="sd", notched=True)
        fig.show()


### KMeans


In [None]:
from sklearn.cluster import KMeans


# Sweep n_clusters from 2 to 10, fitting and labelling the whole scaled sample
n_clusters_grid = list(range(2, 11))
results = process_model(
    model_class=KMeans,
    model_args=dict(random_state=42),
    param_name="n_clusters",
    param_range=n_clusters_grid,
    fit_df=scaled_rfm_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### MiniBatchKMeans


In [None]:
from sklearn.cluster import MiniBatchKMeans


# Sweep n_clusters from 2 to 10, fitting and labelling the whole scaled sample
n_clusters_grid = list(range(2, 11))
results = process_model(
    model_class=MiniBatchKMeans,
    model_args=dict(random_state=42),
    param_name="n_clusters",
    param_range=n_clusters_grid,
    fit_df=scaled_rfm_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### AffinityPropagation


In [None]:
from sklearn.cluster import AffinityPropagation


# Sweep the damping factor; the model is fitted on a 10% subsample and then
# used to label the whole scaled sample
damping_grid = [round(value, 3) for value in np.linspace(0.85, 0.99, 10)]
affinity_fit_df = scaled_rfm_df.sample(frac=0.1, random_state=42)
results = process_model(
    model_class=AffinityPropagation,
    model_args=dict(random_state=42),
    param_name="damping",
    param_range=damping_grid,
    fit_df=affinity_fit_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### AgglomerativeClustering


In [None]:
from sklearn.cluster import AgglomerativeClustering

# The model is run on half of the scaled sample; the same frame is reused by
# the plotting cells below
fit_pred_df = scaled_rfm_df.sample(frac=0.5, random_state=42)

# Sweep n_clusters from 2 to 10
n_clusters_grid = list(range(2, 11))
results = process_model(
    model_class=AgglomerativeClustering,
    model_args=dict(),
    param_name="n_clusters",
    param_range=n_clusters_grid,
    fit_df=fit_pred_df,
    pred_df=fit_pred_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units
plot_clusters(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df.head(1000)),
        columns=fit_pred_df.columns,
    ),
    labels=best_result["labels"][:1000],
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
    ),
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
ranked_results = results.sort_values(by="silhouette_score", ascending=False)
second_best_result = ranked_results.iloc[1]

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units
head_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df.head(1000)),
    columns=fit_pred_df.columns,
)
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=head_unscaled_df,
    labels=second_best_result["labels"][:1000],
)

full_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=full_unscaled_df,
    labels=second_best_result["labels"],
)


### MeanShift


In [None]:
from sklearn.cluster import MeanShift


# Sweep the bandwidth; the model is fitted on a 10% subsample and then used
# to label the whole scaled sample
bandwidth_grid = [round(value, 1) for value in np.linspace(0.5, 5, 10)]
meanshift_fit_df = scaled_rfm_df.sample(frac=0.1, random_state=42)
results = process_model(
    model_class=MeanShift,
    model_args=dict(n_jobs=-1),
    param_name="bandwidth",
    param_range=bandwidth_grid,
    fit_df=meanshift_fit_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### SpectralClustering


In [None]:
from sklearn.cluster import SpectralClustering


# The model is run on 20% of the scaled sample; the same frame is reused by
# the plotting cells below
fit_pred_df = scaled_rfm_df.sample(frac=0.2, random_state=42)

# Sweep n_clusters from 2 to 10
n_clusters_grid = list(range(2, 11))
spectral_args = {
    "random_state": 42,
    "n_jobs": -1,
}
results = process_model(
    model_class=SpectralClustering,
    model_args=spectral_args,
    param_name="n_clusters",
    param_range=n_clusters_grid,
    fit_df=fit_pred_df,
    pred_df=fit_pred_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units.
# Centers (if any) are mapped back from scaled space the same way.
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df.head(1000)),
        columns=fit_pred_df.columns,
    ),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
    ),
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
ranked_results = results.sort_values(by="silhouette_score", ascending=False)
second_best_result = ranked_results.iloc[1]

# Centers (if any) are mapped back from scaled space to RFM units
second_best_centers = None
if second_best_result["cluster_centers"] is not None:
    second_best_centers = scaler.inverse_transform(
        second_best_result["cluster_centers"]
    )

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units
head_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df.head(1000)),
    columns=fit_pred_df.columns,
)
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=head_unscaled_df,
    labels=second_best_result["labels"][:1000],
    cluster_centers=second_best_centers,
)

full_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=full_unscaled_df,
    labels=second_best_result["labels"],
)


### DBSCAN


In [None]:
from sklearn.cluster import DBSCAN


# The model is run on half of the scaled sample; the same frame is reused by
# the plotting cells below
fit_pred_df = scaled_rfm_df.sample(frac=0.5, random_state=42)

# Sweep the neighbourhood radius (eps)
eps_grid = [round(value, 2) for value in np.linspace(0.1, 1.5, 10)]
dbscan_args = {
    "n_jobs": -1,
}
results = process_model(
    model_class=DBSCAN,
    model_args=dbscan_args,
    param_name="eps",
    param_range=eps_grid,
    fit_df=fit_pred_df,
    pred_df=fit_pred_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units.
# Centers (if any) are mapped back from scaled space the same way.
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df.head(1000)),
        columns=fit_pred_df.columns,
    ),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
    ),
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
ranked_results = results.sort_values(by="silhouette_score", ascending=False)
second_best_result = ranked_results.iloc[1]

# Centers (if any) are mapped back from scaled space to RFM units
second_best_centers = None
if second_best_result["cluster_centers"] is not None:
    second_best_centers = scaler.inverse_transform(
        second_best_result["cluster_centers"]
    )

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units
head_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df.head(1000)),
    columns=fit_pred_df.columns,
)
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=head_unscaled_df,
    labels=second_best_result["labels"][:1000],
    cluster_centers=second_best_centers,
)

full_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=full_unscaled_df,
    labels=second_best_result["labels"],
)


### OPTICS


In [None]:
from sklearn.cluster import OPTICS


# The model is run on half of the scaled sample; the same frame is reused by
# the plotting cells below
fit_pred_df = scaled_rfm_df.sample(frac=0.5, random_state=42)

# Sweep min_samples over ten values between 0.001 and 0.004
min_samples_grid = [round(value, 4) for value in np.linspace(0.001, 0.004, 10)]
optics_args = {
    "n_jobs": -1,
}
results = process_model(
    model_class=OPTICS,
    model_args=optics_args,
    param_name="min_samples",
    param_range=min_samples_grid,
    fit_df=fit_pred_df,
    pred_df=fit_pred_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units.
# Centers (if any) are mapped back from scaled space the same way.
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df.head(1000)),
        columns=fit_pred_df.columns,
    ),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=pd.DataFrame(
        scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
    ),
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
ranked_results = results.sort_values(by="silhouette_score", ascending=False)
second_best_result = ranked_results.iloc[1]

# Centers (if any) are mapped back from scaled space to RFM units
second_best_centers = None
if second_best_result["cluster_centers"] is not None:
    second_best_centers = scaler.inverse_transform(
        second_best_result["cluster_centers"]
    )

# The labels refer to fit_pred_df: plot those rows, mapped back to RFM units
head_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df.head(1000)),
    columns=fit_pred_df.columns,
)
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=head_unscaled_df,
    labels=second_best_result["labels"][:1000],
    cluster_centers=second_best_centers,
)

full_unscaled_df = pd.DataFrame(
    scaler.inverse_transform(fit_pred_df), columns=fit_pred_df.columns
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=full_unscaled_df,
    labels=second_best_result["labels"],
)


### Birch


In [None]:
from sklearn.cluster import Birch


# Sweep n_clusters from 2 to 10; the model is fitted on a 20% subsample and
# then used to label the whole scaled sample
n_clusters_grid = list(range(2, 11))
birch_fit_df = scaled_rfm_df.sample(frac=0.2, random_state=42)
results = process_model(
    model_class=Birch,
    model_args=dict(),
    param_name="n_clusters",
    param_range=n_clusters_grid,
    fit_df=birch_fit_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers (if any) are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### GMM (Gaussian Mixture)


In [None]:
from sklearn.mixture import GaussianMixture


# Sweep n_components from 2 to 10, fitting and labelling the whole scaled
# sample
n_components_grid = list(range(2, 11))
gmm_args = {
    "random_state": 42,
}
results = process_model(
    model_class=GaussianMixture,
    model_args=gmm_args,
    param_name="n_components",
    param_range=n_components_grid,
    fit_df=scaled_rfm_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers (if any) are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### Bayesian Gaussian Mixture


In [None]:
from sklearn.mixture import BayesianGaussianMixture


# Sweep n_components from 2 to 10, fitting and labelling the whole scaled
# sample
n_components_grid = list(range(2, 11))
bayesian_gmm_args = {
    "random_state": 42,
}
results = process_model(
    model_class=BayesianGaussianMixture,
    model_args=bayesian_gmm_args,
    param_name="n_components",
    param_range=n_components_grid,
    fit_df=scaled_rfm_df,
    pred_df=scaled_rfm_df,
)
plot_scores(results)


In [None]:
# Keep the run with the highest silhouette score
best_result = results.sort_values(by="silhouette_score", ascending=False).iloc[
    0
]
# DataFrame.append was removed in pandas 2.0: concatenate a one-row frame
models_results = pd.concat(
    [models_results, best_result.to_frame().T], ignore_index=True
)

# Centers (if any) are expressed in scaled space: map them back to RFM units
best_centers = best_result["cluster_centers"]
plot_clusters(
    model_name=best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(best_centers)
    if best_centers is not None
    else None,
)
plot_boxes(
    model_name=best_result["model"],
    pred_df=rfm_df,
    labels=best_result["labels"],
)


In [None]:
# Run with the second highest silhouette score
second_best_result = results.sort_values(
    by="silhouette_score", ascending=False
).iloc[1]

# Guard on this run's own centers (the previous code tested best_result) and
# map them back from scaled space to RFM units
second_best_centers = second_best_result["cluster_centers"]
plot_clusters(
    model_name=second_best_result["model"],
    pred_df=rfm_df.head(1000),
    labels=second_best_result["labels"][:1000],
    cluster_centers=scaler.inverse_transform(second_best_centers)
    if second_best_centers is not None
    else None,
)
plot_boxes(
    model_name=second_best_result["model"],
    pred_df=rfm_df,
    labels=second_best_result["labels"],
)


### Comparison

In [None]:
# Rank the runs collected in models_results, highest silhouette score first
models_results.sort_values(by="silhouette_score", ascending=False)
