## <a name="C4">0. Importation et fonctions</a>

### 0.1 Importation des librairies

In [83]:
import warnings
warnings.filterwarnings('ignore')

import sqlite3
import pandas as pd
import numpy as np
from pandasql import sqldf

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

# Feature Engineering
from sklearn import model_selection, preprocessing
from feature_engine import selection, imputation, encoding, discretisation, transformation, outliers, pipeline

# Clustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn import metrics
from scipy.cluster.hierarchy import dendrogram, linkage


### 0.2 Définition des fonctions

In [84]:
def plot_histograms(data, vars, categorical=False, h=False, top=None):
    """Draw one histogram or bar chart per variable on a grid of at most three columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    vars : sequence of str
        Column names to plot, one subplot each. (The name shadows the ``vars``
        builtin; it is kept so existing keyword calls continue to work.)
    categorical : bool, default False
        If True, plot the frequency of each distinct value as a bar chart;
        otherwise plot a 50-bin histogram of the raw values.
    h : bool, default False
        Draw horizontal bars. Only used when ``categorical`` is True.
    top : int, optional
        Keep only the ``top`` most frequent values. Only used when
        ``categorical`` is True.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``. Nothing is drawn when
        ``vars`` is empty.
    """
    num_vars = len(vars)
    if num_vars == 0:
        # Nothing to plot; this also avoids a division by zero columns below.
        return

    num_cols = min(3, num_vars)
    num_rows = (num_vars + num_cols - 1) // num_cols  # ceiling division

    fig = make_subplots(rows=num_rows, cols=num_cols)

    for i, var in enumerate(vars):
        row = (i // num_cols) + 1
        col = (i % num_cols) + 1

        if categorical:
            # Read labels/counts straight from the Series so the code does not
            # depend on the column names produced by reset_index(), which
            # differ between pandas versions.
            counts = data[var].value_counts()
            if top:
                counts = counts.iloc[:top]

            if h:
                trace = go.Bar(x=counts.values, y=counts.index, orientation='h', name=var)
            else:
                trace = go.Bar(x=counts.index, y=counts.values, name=var)
        else:
            trace = go.Histogram(x=data[var], nbinsx=50, name=var)

        fig.add_trace(trace, row=row, col=col)
        fig.update_xaxes(title_text=var, row=row, col=col)

    # The layout is global to the figure, so set it once after the loop.
    fig.update_layout(height=num_vars * 150)

    fig.show()

In [85]:
def inspect_null_values(data):
    """Print, for every column of ``data``, its null count and the share of rows it represents.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        One line per column is written to standard output; the share is a
        percentage rounded to the nearest integer.
    """
    n_rows = data.shape[0]

    for var in data.columns:
        # Convert to a plain int so the formatting never depends on the numpy scalar type.
        null_values = int(data[var].isnull().sum())
        # An empty frame has no rows to divide by: report 0% instead of failing on 0/0.
        null_values_share = round(null_values / n_rows * 100) if n_rows else 0
        print(f"{var} null values: {null_values}, share: {null_values_share}%")

In [86]:
def get_df_from_array(pipe, X_train, X_test=None):
    """Wrap transformed array(s) in DataFrames labelled with the pipeline's output feature names.

    Parameters
    ----------
    pipe : object
        Fitted transformer or pipeline exposing ``get_feature_names_out()``.
    X_train : array-like
        Transformed training data.
    X_test : array-like, optional
        Transformed test data.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``X_train`` alone when ``X_test`` is None, otherwise ``(X_train, X_test)``.
    """
    cols = pipe.get_feature_names_out()
    X_train = pd.DataFrame(X_train, columns=cols)
    # Compare against None explicitly: the truth value of an array or a
    # DataFrame is ambiguous and raises ValueError.
    if X_test is None:
        return X_train
    return X_train, pd.DataFrame(X_test, columns=cols)

### 0.3 Importation des données

In [87]:
# Load the simulation dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/simulation_dataset.csv')

## 1. Préparation des données

1. Je filtre les colonnes
2. Je crée les datasets par temporalité (An 1, An 1 + 1 mois, An 1 + 2 mois, etc.)
3. J'extrais les clients du dataset initial
4. Je filtre les subsets pour ne garder que ces clients et enlever les nouveaux
5. Je recalcule la récence
6. Je group by customer unique id
7. Je fais l'ARI

Pour tester la pertinence du clustering dans le temps, je décide de réaliser une simulation et d'évaluer l'évolution de la qualité des clusters mois après mois grâce à l'ARI (Adjusted Rand Index).

In [88]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,customer_unique_id,customer_id,zip_code,city,state,nb_orders,average_payment_value,average_payment_installments,average_review_score,average_delay,last_order_recency,average_quantity_per_order,average_product_price_per_order,favorite_product_category,order_date
0,0,861eff4711a542e4b93843c6dd7febb0,06b8999e2fba1a1fbc88172c00ba8bc7,14409,franca,SP,1,146.87,2.0,4.0,,519,1.0,124.99,moveis_escritorio,2017-05-16 15:05:35
1,1,290c77bc529b7ac935b93aa66c333dc3,18955e83d337fd6b2def6b18a428ac77,9790,sao bernardo do campo,SP,1,335.48,8.0,5.0,,277,1.0,289.0,utilidades_domesticas,2018-01-12 20:48:24
2,2,060e732b5b29e8181a18229c7b0b2b5e,4e7b3e00288586ebd08712fdd0374a03,1151,sao paulo,SP,1,157.73,7.0,5.0,,151,1.0,139.94,moveis_escritorio,2018-05-19 16:07:45
3,3,259dac757896d24d7702b9acbbff3f3c,b2b6027bc5c5109e529d4dc6358b12c3,8775,mogi das cruzes,SP,1,173.3,1.0,5.0,,218,1.0,149.94,moveis_escritorio,2018-03-13 16:06:38
4,4,345ecd01c38d18a9036ed96c73b8d066,4f2d8ab171c80ec8364f7c12e35b23ad,13056,campinas,SP,1,252.25,8.0,5.0,,80,1.0,230.0,casa_conforto,2018-07-29 09:51:30


In [89]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier export — TODO confirm).
# errors='ignore' keeps the cell re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [90]:
# Columns carried into the simulation: the candidate features plus the customer key and order date.
selected_columns = [
    'average_payment_value',
    'average_payment_installments',
    'average_review_score',
    'last_order_recency',
    'customer_unique_id',
    'order_date',
]
df = df[selected_columns]

In [91]:
# Check the frame after the column selection.
df.head()

Unnamed: 0,average_payment_value,average_payment_installments,average_review_score,last_order_recency,customer_unique_id,order_date
0,146.87,2.0,4.0,519,861eff4711a542e4b93843c6dd7febb0,2017-05-16 15:05:35
1,335.48,8.0,5.0,277,290c77bc529b7ac935b93aa66c333dc3,2018-01-12 20:48:24
2,157.73,7.0,5.0,151,060e732b5b29e8181a18229c7b0b2b5e,2018-05-19 16:07:45
3,173.3,1.0,5.0,218,259dac757896d24d7702b9acbbff3f3c,2018-03-13 16:06:38
4,252.25,8.0,5.0,80,345ecd01c38d18a9036ed96c73b8d066,2018-07-29 09:51:30


Je sépare maintenant le jeu de données en plusieurs périodes:
- Le dataset initial, qui comporte les données des 12 premiers mois
- Les subsets, qui contiendront à chaque fois un mois de plus (13 mois, puis 14 mois, puis 15 mois...)

Cela me permettra de comparer la stabilité des clusters dans le temps.

In [92]:
# Cast the order date to datetime so the .dt accessor can be used in the following cells.
df['order_date'] = df['order_date'].astype('datetime64[ns]')

In [93]:
# Calendar months that contain at least one order, in chronological order.
order_months = df['order_date'].dt.to_period('M')
unique_months = sorted(order_months.unique())

# The initial period is the first 12 months *with data*. The recorded output has no
# 2016-11 entry, so these 12 entries span 13 calendar months.
initial_period = unique_months[:12]
initial_period

[Period('2016-09', 'M'),
 Period('2016-10', 'M'),
 Period('2016-12', 'M'),
 Period('2017-01', 'M'),
 Period('2017-02', 'M'),
 Period('2017-03', 'M'),
 Period('2017-04', 'M'),
 Period('2017-05', 'M'),
 Period('2017-06', 'M'),
 Period('2017-07', 'M'),
 Period('2017-08', 'M'),
 Period('2017-09', 'M')]

Je répartis maintenant les jeux de données.

In [94]:
# Keep only the rows whose order month belongs to the initial period.
in_initial_period = df['order_date'].dt.to_period('M').isin(initial_period)
first_12_months_df = df[in_initial_period]
first_12_months_df.head()

Unnamed: 0,average_payment_value,average_payment_installments,average_review_score,last_order_recency,customer_unique_id,order_date
0,146.87,2.0,4.0,519,861eff4711a542e4b93843c6dd7febb0,2017-05-16 15:05:35
5,282.94,1.0,5.0,367,4c93744516667ad3b8f1fb645a3116a4,2017-09-14 18:14:31
12,117.31,1.0,4.0,403,918dc87cd72cd9f6ed4bd442ed785235,2017-09-09 09:54:57
18,102.03,1.0,,524,7f3a72e8f988c6e735ba118d54f47458,2017-05-11 13:48:47
20,123.0,1.0,5.0,430,e607ede0e63436308660236f5a52da5e,2017-08-13 10:03:36


In [95]:
# Number of rows and columns in the initial-period frame.
first_12_months_df.shape

(27582, 6)

In [96]:
# Growing windows: the first subset covers the first 13 months with data, and each
# following subset adds one more month, up to the full date range.
month_of_order = df['order_date'].dt.to_period('M')
subsets = [
    df[month_of_order.isin(unique_months[:n_months])]
    for n_months in range(13, len(unique_months) + 1)
]

In [97]:
# Number of growing windows that were built.
len(subsets)

13

Je dois maintenant filtrer les subsets pour ne garder que les clients présents dans le dataset initial.

In [98]:
# Customers seen during the initial period; every subset is restricted to them so that
# all datasets describe the same population.
initial_clients = first_12_months_df['customer_unique_id'].unique()
for idx, subset in enumerate(subsets):
    is_initial_client = subset['customer_unique_id'].isin(initial_clients)
    subsets[idx] = subset[is_initial_client]

Il faut maintenant que je groupe les subsets à la granularité client.

In [99]:
# Inspect the initial-period rows before they are aggregated per customer.
first_12_months_df

Unnamed: 0,average_payment_value,average_payment_installments,average_review_score,last_order_recency,customer_unique_id,order_date
0,146.87,2.0,4.0,519,861eff4711a542e4b93843c6dd7febb0,2017-05-16 15:05:35
5,282.94,1.0,5.0,367,4c93744516667ad3b8f1fb645a3116a4,2017-09-14 18:14:31
12,117.31,1.0,4.0,403,918dc87cd72cd9f6ed4bd442ed785235,2017-09-09 09:54:57
18,102.03,1.0,,524,7f3a72e8f988c6e735ba118d54f47458,2017-05-11 13:48:47
20,123.00,1.0,5.0,430,e607ede0e63436308660236f5a52da5e,2017-08-13 10:03:36
...,...,...,...,...,...,...
99420,64.42,1.0,1.0,630,1c137fe37df712015f6488edafe8ece4,2017-01-25 16:51:27
99424,102.03,1.0,1.0,519,206e64e8af2633a2ebe158a7fcb860db,2017-05-15 17:42:38
99428,130.85,1.0,4.0,569,874c93d867b18eb09a5e2f071ee89458,2017-03-27 16:26:18
99430,102.03,1.0,5.0,511,277490f0d435b602fe4475d4b89e9181,2017-05-24 11:54:31


In [100]:
# Collapse every dataset to one row per customer: mean payment value, mean review score
# and the date of the most recent order. average_payment_installments is currently
# left out of the aggregation.
customer_aggregations = {
    'average_payment_value': 'mean',
    'average_review_score': 'mean',
    'order_date': 'max',
}

first_12_months_df = first_12_months_df.groupby('customer_unique_id').agg(customer_aggregations)

for idx, subset in enumerate(subsets):
    subsets[idx] = subset.groupby('customer_unique_id').agg(customer_aggregations)

Il faut maintenant que j'adapte la valeur de *last_order_recency*, qui doit se baser sur le dernier jour du dernier mois de chaque jeu de données.

In [101]:
# Recency = whole days between a customer's latest order and the END of the last month
# covered by the dataset. The reference date is taken from the calendar period rather
# than from the data: new customers were filtered out of the subsets, so the latest
# remaining order date may fall before the end of the window.
for i in range(len(subsets)):
    # subsets[i] covers unique_months[:len(initial_period) + i + 1].
    last_date = unique_months[len(initial_period) + i].end_time
    subsets[i]['last_order_recency'] = (last_date - subsets[i]['order_date']).dt.days
    subsets[i] = subsets[i].drop(columns=['order_date'])

last_date = initial_period[-1].end_time
first_12_months_df['last_order_recency'] = (last_date - first_12_months_df['order_date']).dt.days
first_12_months_df = first_12_months_df.drop(columns=['order_date'])


Le jeu de données est maintenant prêt à passer au feature engineering, puis au clustering.

## 2. Feature Engineering

Je passe maintenant tous les subsets dans la même pipeline de données que lors de la phase de modélisation.

In [102]:
# Preprocessing steps, applied in this order:
#   1. fill missing numerical values with the median,
#   2. standardise the features,
#   3. apply a Yeo-Johnson transformation.
preprocessing_steps = [
    ('numerical_imputation', imputation.MeanMedianImputer(imputation_method='median')),
    ('scaler', preprocessing.StandardScaler()),
    ('distribution_transformation', transformation.YeoJohnsonTransformer()),
]
pipe = pipeline.Pipeline(preprocessing_steps)

In [103]:
# Fit the preprocessing pipeline on the initial period and transform it in one call.
first_12_months_df = pipe.fit_transform(first_12_months_df)


In [104]:
# Label the transformed frame with the feature names reported by the pipeline;
# `cols` is reused when the subsets are transformed.
cols = pipe.get_feature_names_out()
first_12_months_df = first_12_months_df.set_axis(cols, axis=1)

In [105]:
# Run every subset through the pipeline. fit_transform re-fits the pipeline on each
# subset, so after the loop `pipe` is fitted on the last one.
for idx, subset in enumerate(subsets):
    transformed = pipe.fit_transform(subset)
    transformed.columns = cols
    subsets[idx] = transformed

## 3. ARI

In [106]:
def perform_clustering(data, num_clusters):
    """Fit K-Means on ``data`` and return the cluster label of each row.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    num_clusters : int
        Number of clusters to form.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_`` (one label per row of ``data``).
        ``random_state`` is fixed to 0 so repeated runs use the same seed.
    """
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(data)
    return kmeans.labels_

num_clusters = 7

for subset in subsets:
    # Cluster on the features only: without the drop, re-running this cell would feed
    # the previous run's 'cluster_labels' column back into K-Means as a feature.
    features = subset.drop(columns='cluster_labels', errors='ignore')
    subset['cluster_labels'] = perform_clustering(features, num_clusters)

In [107]:
# Cluster the initial period on its features only: without the drop, re-running this
# cell would feed the previous run's 'cluster_labels' column back into K-Means.
initial_features = first_12_months_df.drop(columns='cluster_labels', errors='ignore')
cluster_labels = perform_clustering(initial_features, num_clusters)
first_12_months_df['cluster_labels'] = cluster_labels

In [108]:
def calculate_ari(initial_labels, subsets):
    """Compute the Adjusted Rand Index between the initial labels and each subset's labels.

    Parameters
    ----------
    initial_labels : array-like
        Reference cluster labels.
    subsets : sequence
        Items exposing a ``'cluster_labels'`` entry to compare against the reference.

    Returns
    -------
    list
        One ARI score per subset, in the order of ``subsets``.
    """
    return [
        metrics.adjusted_rand_score(initial_labels, subset['cluster_labels'])
        for subset in subsets
    ]

# Score every subset against the clustering of the initial period.
initial_labels = first_12_months_df['cluster_labels']
ari_scores = calculate_ari(initial_labels, subsets)

In [109]:
# plotly.graph_objects is already imported as `go` in the imports cell at the top.

# One point per subset. The k-th subset adds k months to the initial period, so the
# x values start at 1 instead of plotly's default of 0.
months_added = list(range(1, len(ari_scores) + 1))

fig = go.Figure(data=go.Scatter(x=months_added, y=ari_scores))
fig.update_layout(title='ARI Scores Over Time', xaxis_title='Months', yaxis_title='ARI Score')

fig.show()

On observe une qualité de segmentation qui décroît au fil des mois. Cependant, on voit que le score commence à décroître plus rapidement aux alentours des 6 mois, ce qui peut constituer une bonne recommandation de maintenance.