In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from services.app_api.features.extractor import FeatureExtractor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from yellowbrick.cluster.elbow import KElbowVisualizer
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from scipy.stats.mstats import winsorize

In [None]:
# Load the cached customer and sales tables.
# Alternative source kept for reference (reads the same tables from Excel):
# customers, sales = pd.read_excel('ucy_eko_data.xlsx', sheet_name='smile_customers'), pd.read_excel('ucy_eko_data.xlsx', sheet_name='smile_sales')
customers = joblib.load('customers.joblib')
sales = joblib.load('sales.joblib')

In [None]:
# Configure the project feature extractor and build the feature matrix X and target y.
# NOTE(review): parameter semantics live in FeatureExtractor (not visible here) - confirm there.
fe = FeatureExtractor(
    target_month=3,
    perform_split=False,
    period=60,
    subperiod=60,
    generation_type='continuous',
    filtering_set='customers',
)
X, y = fe.transform(sales=sales, customers=customers)

In [None]:
# Scratch check: everything before the "_1-60" marker is the bare feature name.
s = 'fuel_qty_1-60'
s.partition('_1-60')[0]

In [None]:
# Drop the "_1-60" period marker (and anything after it) from every column name.
# `str.split(..., 1)[0]` returns the name unchanged when the marker is absent;
# the previous `col[:col.find(...)]` silently chopped the last character in that
# case because `find` returns -1.
X.columns = [col.split('_1-60', 1)[0] for col in X.columns]
X.head()

In [None]:
# X_clust = X[X['monetary']<=1000][['monetary', 'recency', 'average_days_between_visits']]

# Take an explicit copy: X_clust is modified later (winsorisation), and writing
# into a slice of X triggers SettingWithCopyWarning.
X_clust = X[['monetary', 'recency', 'average_days_between_visits']].copy()

fig = plt.figure(figsize=(6, 6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

cmap = ListedColormap(sns.color_palette('husl', 256).as_hex())

# Axis order is (monetary, recency, average_days_between_visits); the labels
# below follow the same order (the y and z labels were previously swapped).
sc = ax.scatter(X_clust['monetary'], X_clust['recency'], X_clust['average_days_between_visits'], s=40, c=X_clust['monetary'], marker='o', cmap=cmap, alpha=1)
ax.set_xlabel('Total Spending\nDuring the First Month ("Monetary")', labelpad=20)
ax.set_ylabel('Difference between\nthe Ending Date of the First Month\nand Latest Purchase ("Recency")', labelpad=20)
ax.set_zlabel('Average Days between Visits\nDuring the First Month ("Frequency")', labelpad=20)

# The points carry no `label=`, so `ax.legend()` only produced a warning.
# A colour bar is the matching key for the `c=monetary` colouring.
fig.colorbar(sc, ax=ax, shrink=0.6, pad=0.15, label='Monetary')
plt.show()

The plot shows extreme outliers for the Monetary variable. Let us examine the 99th percentile of this variable.

In [None]:
# Compare the 99th percentile of `monetary` with its maximum to gauge the upper tail.
print(
    f"99th percentile: {X_clust['monetary'].quantile(0.99)}\n"
    f"Maximum: {X_clust['monetary'].max()}"
)

Since the 99th percentile differs from the maximum value, it makes sense to winsorize these outliers, i.e. cap the top 1% of values at the 99th percentile.

In [None]:
# Cap the top 1% of `monetary` values (no lower-tail clipping).
winsor = winsorize(X['monetary'], limits=(0.0, 0.01))

# Persist the winsorised values; the inference cell reads `winsor.max()` from this
# file as the capping threshold. The path now uses the `services/` package root,
# matching the import at the top of the notebook and the path the file is later
# read from (the previous `service/...` path was never read back).
with open('services/app_api/features/winsorizing_object_for_threshold.pkl', 'wb') as f:
    pickle.dump(winsor, f)

# `assign` returns a new frame, so nothing is written into a slice of X
# (avoids SettingWithCopyWarning) and re-running the cell is safe.
X_clust = X_clust.assign(monetary=winsor)
X_clust['monetary'].max()

Let us now examine the plot again

In [None]:
# Same 3D scatter as before, now on the winsorised `monetary` values.
fig = plt.figure(figsize=(6, 6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())

# Axis order is (monetary, recency, average_days_between_visits); the labels
# below follow the same order (the y and z labels were previously swapped).
sc = ax.scatter(X_clust['monetary'], X_clust['recency'], X_clust['average_days_between_visits'], s=40, c=X_clust['monetary'], marker='o', cmap=cmap, alpha=1)
ax.set_xlabel('Total Spending\nDuring the First Month ("Monetary")', labelpad=20)
ax.set_ylabel('Difference between\nthe Ending Date of the First Month\nand Latest Purchase ("Recency")', labelpad=20)
ax.set_zlabel('Average Days between Visits\nDuring the First Month ("Frequency")', labelpad=20)

# plt.show() renders under the notebook's inline backend; Figure.show() needs a GUI backend.
plt.show()

Examining the plot, one may consider that 3 or 4 clusters could be optimal

In [None]:
# Standardise the three RFM features so that no single scale dominates the
# Euclidean distances used by K-Means. From here on X_clust is a NumPy array.
X_clust = StandardScaler().fit_transform(X_clust)

# Seed K-Means so the elbow curve is reproducible across kernel restarts
# (same seed as the final model).
model = KMeans(random_state=571)
elbow_viz = KElbowVisualizer(model, k=(1, 11))

In [None]:
# Fit K-Means for every candidate k and draw the elbow curve.
elbow_viz.fit(X_clust).show()

The elbow method suggests 4 clusters as the optimal value. Let us also compute the silhouette score:

In [None]:
# Silhouette score for k = 2..10. K-Means is seeded so the scores (and the
# conclusion drawn from them) are reproducible on Restart & Run All.
for k in range(2, 11):
    model = KMeans(n_clusters=k, random_state=571)
    labels = model.fit_predict(X_clust)
    print(f'{k}: {silhouette_score(X_clust, labels)}')

The best silhouette score was also obtained for 4 clusters.

In [None]:
# Final model: K-Means with the chosen number of clusters and a fixed seed.
K = 4
best_model = KMeans(n_clusters=K, random_state=571).fit(X_clust)
# Per-row cluster ids and the centroids (in standardised feature space).
labels = best_model.labels_
centroids = best_model.cluster_centers_

In [None]:
# 2D t-SNE embedding of the standardised RFM features, coloured by cluster.
fig, ax = plt.subplots()

tsne = TSNE(n_components=2, random_state=0)
# Column names are kept for compatibility; they hold t-SNE coordinates, not
# principal components, so the axes are relabelled below.
df_clust = pd.DataFrame(tsne.fit_transform(X_clust), columns=['PC1','PC2'])
df_clust['cluster'] = pd.Categorical(labels)

sns.scatterplot(x='PC1', y='PC2', hue='cluster', data=df_clust, ax=ax)
ax.set(xlabel='t-SNE dimension 1', ylabel='t-SNE dimension 2', title='K-Means clusters in t-SNE space');

In [None]:
# Attach a human-readable segment name to every row; list position == K-Means label id.
X['cluster'] = pd.Categorical(labels).rename_categories(
    dict(enumerate(['Regular drivers', 'Passerbys', 'Frequent drivers', 'At Churn Risk']))
)
# Segment sizes, ordered by category.
X['cluster'].value_counts().sort_index()

In [None]:
# Human-readable names for the binary target values 0 and 1.
X['target'] = pd.Categorical(y).rename_categories(
    dict(zip((0, 1), ('No purchases at month 3', 'At least 2 purchases at month 3')))
)

In [None]:
# Expose `average_days_between_visits` under the RFM name `frequency` for the
# reporting cells. The original column is kept (alias instead of rename) because
# later cells still select `average_days_between_visits`; renaming it away made
# them fail with KeyError on Restart & Run All.
X = X.assign(frequency=X['average_days_between_visits'])

In [None]:
# Median R/F/M per (target, cluster) pair; rows are the three indicators.
# The string alias 'median' replaces the `np.median` callable, which recent
# pandas versions deprecate (FutureWarning) while computing the same values.
X_agg = X.pivot_table(
    values=['monetary', 'recency', 'frequency'],
    columns=['target', 'cluster'],
    aggfunc='median',
).round(2)
# Capitalise the indicator names for display.
X_agg = X_agg.rename({i: i.capitalize() for i in X_agg.index.unique()}, axis=0)
X_agg.columns.names = ['', 'Indicator']

In [None]:
# Flattened, display-only copy of the table: the target name is shown once, on
# the 'Regular drivers' column of each target group.
# X_agg itself keeps its (target, cluster) MultiIndex because later cells call
# `X_agg.columns.get_level_values(...)` and index by target. Overwriting the
# columns in place broke those cells and made this cell fail when re-run.
new_cols = [
    f"{target}\n{segment}" if segment == 'Regular drivers' else segment
    for target, segment in X_agg.columns
]
X_agg_flat = X_agg.copy()
X_agg_flat.columns = new_cols
X_agg_flat

In [None]:
# Display the aggregated median RFM table.
X_agg

In [None]:
# One entry per top-level column value; each holds a single-element list with the
# nested {column: {row: value}} dictionary of that sub-table.
X_agg_json = {
    col_0: [X_agg[col_0].to_dict(orient='dict')]
    for col_0 in X_agg.columns.get_level_values(0).unique()
}

In [None]:
# Inspect the dictionary built from X_agg.
X_agg_json

In [None]:
import json

# Serialise the aggregated table to JSON on disk.
with open('service/app_ui/shap_plots/segments_target_rfm_table.json', 'w') as out_file:
    out_file.write(json.dumps(X_agg_json))

In [None]:
# Round-trip check: read the JSON back and rebuild a table from it.
with open('service/app_ui/shap_plots/segments_target_rfm_table.json') as data_file:
    d = json.load(data_file)

df = (
    pd.concat({k: pd.DataFrame(v) for k, v in d.items()})
    .unstack(0)
    .swaplevel(1, 0, axis=1)
    .sort_index(axis=1)
)
df

In [None]:
# Median recency / frequency / monetary per segment
# (keyword = output column, tuple = (source column, aggregation)).
X.groupby('cluster').agg(
    Recency=('recency', 'median'),
    Frequency=('frequency', 'median'),
    Monetary=('monetary', 'median'),
)

- Regular drivers:
    - medium spending
    - medium recency
    - medium frequency

Customers in this cluster are quite loyal but perhaps do not drive much, so they neither need to visit gas stations often nor spend much.

- Passerbys:
    - lowest spending
    - worst recency
    - zero frequency

This cluster represents users who made one or two visits and most likely left.

- Frequent drivers:
    - highest spending
    - best recency
    - best frequency

This cluster represents users who are frequently visiting gas stations, paying a lot. Perhaps, these are the most loyal customers who are driving long distances

- At churn risk:
    - low spending
    - medium recency
    - low frequency

This cluster represents users who are visiting gas stations from time to time. They are not spending much, not making their visits often, so could be considered to be at risk of churn

In [None]:
# Scratch cell: try to rename the categories of the second column level of X_agg.
# Requires X_agg.columns to be a MultiIndex; the result is only displayed, not assigned.
# NOTE(review): if that level already holds the segment names, the integer keys
# below match no category and the call changes nothing - confirm and consider removing.
# X_agg.columns.get_level_values(0).replace()
X_agg.columns.get_level_values(1).rename_categories(
    {
        0: 'Regular drivers',
        1: 'Passerbys',
        2: 'Frequent drivers',
        3: 'At Churn Risk'
    }
)

In [None]:
import pickle

# Persist the fitted K-Means model at the location the inference cell loads it
# from. The previous target (`./features/clustering_model.pkl`) was never read
# back, so a refit model was not picked up.
with open('services/app_api/features/clustering_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [None]:
# Number of training rows assigned to each K-Means cluster id.
pd.Series(best_model.labels_).value_counts()

In [None]:
# Re-run the segmentation from the persisted artefacts.
# SECURITY: pickle.load executes arbitrary code - only load files produced by this project.
with open('services/app_api/features/clustering_model.pkl', 'rb') as f:
    model = pickle.load(f)
# NOTE(review): this JSON is produced by the W&B artifact download further down
# the notebook and is not used in this cell - confirm it exists on a fresh run.
with open('services/app_api/configs/centroids_table.table.json', 'r') as f:
    centroids_table = json.load(f)
# Load `scipy.stats.mstats.winsorize` output object to define threshold for the `monetary` variable
with open('services/app_api/features/winsorizing_object_for_threshold.pkl', 'rb') as f:
    winsor = pickle.load(f)
monetary_threshold = winsor.max()

# Perform winsorization on an explicit copy (writing through `.loc` into a slice
# of X raised SettingWithCopyWarning); values above the threshold are capped.
X_clust = X[['monetary', 'recency', 'average_days_between_visits']].copy()
X_clust['monetary'] = X_clust['monetary'].clip(upper=monetary_threshold)

# Standardise and predict the cluster id of every row.
scaler = StandardScaler()
labels = pd.Categorical(model.predict(scaler.fit_transform(X_clust)))
X['segments'] = labels
X['segments'] = X['segments'].cat.rename_categories({3: 'at_churn_risk', 2: 'frequent_drivers', 1: 'passerbys', 0: 'regular_drivers'})

In [None]:
import plotly.graph_objects as go

# Segment sizes as a two-column frame. Naming the axis and the counts explicitly
# yields the columns 'index' / 'segments' on every pandas version (pandas >= 2.0
# would otherwise produce 'segments' / 'count' and break the lookups below).
df_chart = X['segments'].value_counts().rename_axis('index').reset_index(name='segments')
print(df_chart)
# 'at_churn_risk' -> 'At Churn Risk'
df_chart['index'] = df_chart['index'].apply(lambda x: ' '.join([s.capitalize() for s in x.split('_')]))

# Dedicated names: the global `labels` holds the per-row cluster ids that later
# cells still rely on, so it must not be overwritten here.
pie_labels = df_chart['index']
pie_values = df_chart['segments']

fig = go.Figure(data=[go.Pie(labels=pie_labels, values=pie_values, textinfo='label+percent', textfont_size=20)])
fig.update_layout(height=800, width=1200, title=dict(text="Eko Customers Distribution by Defined Segments", font=dict(size=30)))
fig.update(layout_showlegend=False)
fig.show()

In [None]:
# Segment counts as a one-column frame; the displayed result moves the index into a column.
# Note: `reset_index()` is only displayed - `df_chart` itself keeps the segment index.
df_chart = X['segments'].value_counts().to_frame()
df_chart.reset_index()

In [None]:
# Export the `ciid` -> segment mapping to Excel, without the index column.
# NOTE(review): assumes X contains a `ciid` column at this point - confirm.
X[['ciid', 'segments']].to_excel('user_segment_mapping.xlsx', index=False)

In [None]:
import random
from scipy.spatial.distance import cdist

# Feature order must match the order the model (and hence `centroids`) was fitted
# on; the previous selection listed the columns in reverse order.
rfm_columns = ['monetary', 'recency', 'average_days_between_visits']

# Pick a row by position (label-based lookup fails on a non-contiguous index).
# Seeded so the example is reproducible; change the seed to inspect another row.
position = random.Random(571).randrange(len(X))
sample = X[rfm_columns].iloc[position]

# The centroids live in winsorised + standardised space, so the sample gets the
# same preprocessing before any distance is computed.
sample_frame = sample.to_frame().T
sample_frame['monetary'] = sample_frame['monetary'].clip(upper=monetary_threshold)
dist = cdist(scaler.transform(sample_frame), centroids, 'cosine')[0]
sim = 1 - dist

# Cluster id assigned to this row when `best_model` (the source of `centroids`) was fitted.
label = best_model.labels_[position]
print(label)
print(dist)
print(sim)

In [None]:
# Linear rescaling: (sim + 1) / 2 maps the interval [-1, 1] onto [0, 1].
(sim+1)/2

In [None]:
# Experiment tracking setup: import wandb and the project's config helpers.
# NOTE(review): `utils.login_wandb` is project code not visible here; presumably
# it authenticates against Weights & Biases - confirm.
import wandb
from services.app_api.configs import utils, settings
utils.login_wandb()

In [None]:
# Log the clustering report (centroids + winsorisation threshold) as a W&B artifact.
# wandb.sklearn.plot_elbow_curve(best_model, X_clust)
with utils.init_wandb_run(
    name='rfm_features_clustering',
    model=KMeans,
    config=best_model.get_params(),
    target_month=None,
    group='clustering',
    job_type='clustering_fit'
) as run:
    metadata = {'experiment': {'name': run.name}}
    artifact = wandb.Artifact(name='clustering_report', type='performance_metric', metadata=metadata)

    # Table 1: cluster centroids, one column per clustering feature.
    centroids_tbl = wandb.Table(
        data=centroids,
        columns=['monetary', 'recency', 'average_days_between_visits'],
    )
    artifact.add(centroids_tbl, name='centroids_table')

    # Table 2: single-cell table holding the monetary capping threshold.
    threshold_tbl = wandb.Table(
        columns=['monetary_winsorization_threshold'],
        data=[[monetary_threshold]],
    )
    artifact.add(threshold_tbl, name='monetary_winsorization_threshold')

    run.log_artifact(artifact)
    run.finish()

In [None]:
# Fetch the logged clustering report and download its files into the configs folder.
# NOTE(review): `utils.get_artifact` is project code; the meaning of its first
# argument ('K-Means') is not visible here - confirm.
cm = utils.get_artifact('K-Means', 'clustering_report')
cm.download('service/app_api/configs/')

In [None]:
import json

# Parse the downloaded centroids table (JSON) into a Python object.
with open('service/app_api/configs/centroids_table.table.json', 'r') as table_file:
    table = json.loads(table_file.read())

In [None]:
# Convert the sample to an array with at least two dimensions (one row per sample).
s = np.atleast_2d(np.array(sample))

In [None]:
# Cap only the `monetary` entry at the winsorisation threshold.
# The previous form indexed `s` with a scalar boolean, which assigns the threshold
# to every element of the array whenever the condition is true.
monetary_idx = table['columns'].index('monetary')
s[:, monetary_idx] = np.minimum(s[:, monetary_idx], monetary_threshold)

In [None]:
# Standardise the sample with the scaler already fitted on the full data set.
# Fitting a fresh StandardScaler on a single row always returns zeros (the row is
# its own mean), which made every sample look identical.
# Run once per sample: the cell rebinds `s`, so re-running it scales twice.
s = scaler.transform(pd.DataFrame(s, columns=table['columns']))

In [None]:
# Euclidean distance from the sample to every row of the centroid table, turned
# into a similarity via exp(-d): distance 0 gives 1, larger distances approach 0.
sim = np.exp(-cdist(s, table['data'], 'euclidean'))

In [None]:
# Position of the most similar centroid for the (single) sample row.
# np.argmax replaces the list conversion + linear search for the maximum.
int(np.argmax(sim[0]))

In [None]:
# Nearest-centroid assignment for every row.
# X_clust holds the winsorised but unscaled features at this point, while the
# centroids are in standardised space, so the rows are scaled before measuring
# distances.
# NOTE(review): assumes `scaler` is the one fitted on X_clust in the inference cell.
euc_dist_matrix = cdist(scaler.transform(X_clust), centroids, 'euclidean')
# Index of the smallest distance per row (first index on ties, as list.index did).
labels_euc = euc_dist_matrix.argmin(axis=1).tolist()

In [None]:
# Share of rows where `labels` matches / differs from the nearest-centroid labels.
pd.Series(labels == np.array(labels_euc)).value_counts(normalize=True)

In [None]:
# Distance matrix as a frame: one row per sample, one column per centroid.
euc_dist_df = pd.DataFrame(euc_dist_matrix)
euc_dist_df.head()

In [None]:
# Element-wise exp(-distance) similarity; show the first rows.
np.exp(-euc_dist_df).head()

In [None]:
# Fetch the clustering report artifact again.
cm = utils.get_artifact('K-Means', 'clustering_report')

In [None]:
# Look up the centroids table by the name it was added to the artifact under
# ('centroids_table'); the artifact has no entry called 'centroids'.
cm.get('centroids_table')

In [None]:
# Replace the `target` column with the raw `y` values (this overwrites the
# human-readable categorical assigned earlier in the notebook).
X['target'] = y

In [None]:
# Per target value: relative frequency of each `labour_cards_catalogue_consumables`
# value, formatted as a percentage string rounded to two decimals.
(X.groupby('target')['labour_cards_catalogue_consumables'].value_counts(normalize=True)*100).apply(lambda x: f'{round(x, 2)}%')

In [None]:
# Summary statistics of `labour_cards_catalogue_consumables_qty` per target value.
X.groupby('target')['labour_cards_catalogue_consumables_qty'].describe()

In [None]:
# Inspect the current clustering input.
X_clust

In [None]:
# Display the standardised features. Note: this refits `scaler` on X_clust as a side effect.
scaler.fit_transform(X_clust)

In [None]:
# Reference only: cluster id -> display name (the value is displayed, not assigned).
{
        0: 'Regular Drivers',
        1: 'Passerbys',
        2: 'Frequent Drivers',
        3: 'At Churn Risk'
    }

In [None]:
import plotly.graph_objects as go
from sklearn.cluster import KMeans


def _axis_ticks(values, bins=5):
    """Return ``(tickvals, ticktext)`` for one axis.

    The value range is cut into ``bins`` equal-width intervals with ``pd.cut``;
    ticks sit at 0.0 and at the right edge of every interval. The edges are read
    from the interval index directly instead of being parsed out of the
    intervals' string form.
    """
    right_edges = [float(edge) for edge in pd.cut(values, bins).categories.right]
    tickvals = [0.0] + right_edges
    return tickvals, [str(val) for val in tickvals]


# Standardise (refits `scaler`), then map back once to get the original-scale values.
data = scaler.fit_transform(X_clust)
raw_values = scaler.inverse_transform(data)
# Centroids converted back to the original feature scale, computed once.
centers = scaler.inverse_transform(model.cluster_centers_)

# Create a 3D scatter plot using Plotly Graph Objects
fig = go.Figure()

# Add scatter plot for each cluster
points = X_clust.to_numpy()
segment_names = ['Regular Drivers', 'Passerbys', 'Frequent Drivers', 'At Churn Risk']
for cluster_id, cluster_label in zip(range(max(labels) + 1), segment_names):
    cluster_points = points[labels == cluster_id]
    fig.add_trace(go.Scatter3d(
        x=cluster_points[:, 0],
        y=cluster_points[:, 1],
        z=cluster_points[:, 2],
        mode='markers',
        marker=dict(size=6),
        name=cluster_label,
        opacity=0.1
    ))

# Update layout: one axis description per feature column (0, 1, 2).
scene = {}
for axis_key, axis_title, column in (('xaxis', 'Monetary', 0), ('yaxis', 'Recency', 1), ('zaxis', 'Frequency', 2)):
    tickvals, ticktext = _axis_ticks(raw_values[:, column])
    scene[axis_key] = dict(title=axis_title, tickmode='array', tickvals=tickvals, ticktext=ticktext)

fig.update_layout(
    scene=scene,
    title='3D Plot of Clusters',
    showlegend=True,
    height=800,
    width=1200
)

# Add scatter plot for cluster centers
fig.add_trace(go.Scatter3d(
    x=centers[:, 0],
    y=centers[:, 1],
    z=centers[:, 2],
    mode='markers',
    marker=dict(size=10, color='yellow'),
    name='Cluster Centers'
))

# Show the plot
fig.show()

In [None]:
# Scratch check: take the part between the first '-' and the first '.' of a host name.
s = 'ip-172-31-95-167.ec2.internal'
s.split('.', 1)[0].split('-', 1)[1]