In [None]:
%load_ext autoreload
%autoreload 

### import library

In [None]:
import online_retail_analysis.utils.paths as path
from online_retail_analysis.features.build_features import Calculate_RFM
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer ,StandardScaler
from sklearn.cluster import MiniBatchKMeans as KMeans

# Optional seaborn grid theme, currently disabled.
# sns.set_style("whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats in pandas output with thousands separators and three decimals.
pd.options.display.float_format = '{:,.3f}'.format

### import data

In [None]:
# Read the cleaned transactions and parse InvoiceDate into datetimes in a
# single chained expression.
df_proces = (
    pd.read_csv(path.data_processed_dir('data_clean.csv'))
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
)


In [None]:
# Missing-value count for every column of the loaded data.
df_proces.isna().sum()

In [None]:
# Display the loaded frame as a visual sanity check.
df_proces

### RFM Segmentation

In [None]:
# check_skew is imported here for the skewness plots further down.
from online_retail_analysis.features.build_features import Calculate_RFM , check_skew

# RFM table computed with labels=True; the result carries the `label`
# column that the next cells count and plot.
df_rfm_labeling = Calculate_RFM(df_proces,labels=True)
df_rfm_labeling

In [None]:
# Number of rows carrying each RFM label.
df_rfm_labeling['label'].value_counts()

In [None]:
# Distinct customers per RFM label, largest segment first.
segmentation1 = (
    df_rfm_labeling.groupby('label')['CustomerID']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(17,8))
# The former `segmentation1.drop([0], inplace=False)` discarded its result and
# therefore never changed the plotted data; the dead statement is removed so
# the code matches what is actually drawn (all segments).
ax = sns.barplot(data=segmentation1, x="label", y="CustomerID", palette = "Set2")
# Label the figure so it stands on its own; the trailing `;` hides the repr.
ax.set(title="Customers per RFM segment", xlabel="RFM segment", ylabel="Number of unique customers");

In [None]:
# RFM table computed with labels=False; used as the clustering input.
df_rfm = Calculate_RFM(df_proces, labels=False)

# Feature-only view: everything except the customer identifier.
new_rfm = df_rfm.drop(columns='CustomerID')

new_rfm.head()

In [None]:
from online_retail_analysis.visualization.visualize import boxplot_vis
# Box plots of the raw RFM features (before the percentile clipping below).
boxplot_vis(new_rfm)


In [None]:
def rm_outliers(df, col):
    """Clip (winsorize) one column of ``df`` to its 5th-95th percentile range.

    Despite the name, no rows are removed: values below the 5th percentile are
    raised to it and values above the 95th percentile are lowered to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place.
    col : str
        Name of the numeric column to clip.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    p_05 = df[col].quantile(0.05) # 5th percentile
    p_95 = df[col].quantile(0.95) # 95th percentile
    # Assign the clipped column back explicitly. The previous
    # `df[col].clip(..., inplace=True)` modified a temporary Series, which does
    # not propagate to `df` under pandas Copy-on-Write semantics.
    df[col] = df[col].clip(lower=p_05, upper=p_95)
    return df

In [None]:
# Work on a copy so the raw features in new_rfm stay untouched, then clip
# every RFM column in turn (rm_outliers modifies its argument in place).
rfm_copy = new_rfm.copy()
for rfm_column in ('Recency', 'Frequency', 'MonetaryValue'):
    rm_outliers(rfm_copy, rfm_column)
boxplot_vis(rfm_copy)

In [None]:
# One stacked panel per RFM feature showing its distribution/skew after
# clipping and before any transformation.
plt.figure(figsize=(9, 9))

for panel, metric in enumerate(('Recency', 'Frequency', 'MonetaryValue'), start=1):
    plt.subplot(3, 1, panel)
    check_skew(rfm_copy, metric)

plt.tight_layout()
plt.savefig(path.reports_figures_dir('before_transform.png'), format='png', dpi=1000)

In [None]:

# Log-transform the clipped features, then standardize them.
# NOTE(review): np.log yields -inf/NaN for values <= 0; this assumes every
# clipped RFM value is strictly positive - confirm against the data.
transformer = FunctionTransformer(np.log)
scaler = StandardScaler()
new_rfm_trans_log = transformer.fit_transform(rfm_copy)
# The scaler returns a bare array, so wrap it back into a frame that reuses
# the column names of the log-transformed data.
new_rfm_trans_log_sc = pd.DataFrame(
    scaler.fit_transform(new_rfm_trans_log),
    columns=new_rfm_trans_log.columns,
)

# Same three-panel skew check as before, now on the transformed features.
plt.figure(figsize=(9, 9))

for panel, metric in enumerate(('Recency', 'Frequency', 'MonetaryValue'), start=1):
    plt.subplot(3, 1, panel)
    check_skew(new_rfm_trans_log_sc, metric)

plt.tight_layout()
plt.savefig(path.reports_figures_dir('after_transform_Scaler.png'), format='png', dpi=1000)

In [None]:
# Box plots of the log-transformed, standardized features.
boxplot_vis(new_rfm_trans_log_sc)

### K-Means cluster

In [None]:
def train_elbow_cluster(train,start=1,finish=10,each=1):
    """Fit K-Means for a range of cluster counts and draw the elbow curve.

    Parameters
    ----------
    train : array-like
        Feature matrix handed to ``KMeans.fit`` (``KMeans`` is whatever the
        notebook imported under that name).
    start, finish, each : int
        Arguments of ``range(start, finish, each)`` that yield the cluster
        counts to try; ``finish`` itself is excluded.

    Returns
    -------
    None
        The function only draws and shows the plot.
    """
    inertia_by_k = {}  # cluster count -> within-cluster sum of squares
    for n_groups in range(start, finish, each):
        model = KMeans(n_clusters=n_groups, init='k-means++', random_state=42)
        # inertia_: sum of squared distances of samples to their nearest centre
        inertia_by_k[n_groups] = model.fit(train).inertia_
    tried_k = list(inertia_by_k)
    wcss_values = list(inertia_by_k.values())
    plt.title('Metodo de Elbow ')
    plt.xlabel('k= n grupos')
    plt.ylabel('WCSS')
    sns.pointplot(x=tried_k, y=wcss_values)
    plt.show()

In [None]:
# Plain NumPy matrix of the scaled features; reused as the clustering input below.
X = np.asarray(new_rfm_trans_log_sc)
# Coarse elbow scan: range(1, 150, 10) evaluates k = 1, 11, ..., 141.
train_elbow_cluster(X,finish=150,each=10)

In [None]:
# Fine elbow scan over k = 1..19 (finish is exclusive).
train_elbow_cluster(X,finish=20,each=1)

In [None]:
def Kmeans(train, clusters_number, original_df_rfm):
    """Cluster ``train`` and attach the assignments to the original RFM frame.

    Parameters
    ----------
    train : array-like
        Feature matrix to cluster.
    clusters_number : int
        Number of clusters to fit.
    original_df_rfm : pandas.DataFrame
        Frame that receives the cluster assignments; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``original_df_rfm`` with an added ``Cluster`` column.
    """
    model = KMeans(n_clusters=clusters_number, random_state=42)
    # fit_predict returns one cluster id per row of `train`; assign() attaches
    # them positionally as a new column on a copy of the frame.
    return original_df_rfm.assign(Cluster=model.fit_predict(train))

In [None]:
# Fit the three candidate cluster counts on the same scaled features and keep
# one labelled copy of df_rfm per candidate.
rfm_k4, rfm_k5, rfm_k6 = (Kmeans(X, n_clusters, df_rfm) for n_clusters in (4, 5, 6))
rfm_k4

In [None]:
def snake_plot(normalised_df_rfm, df_rfm_kmeans, df_rfm_original):
    """Draw a snake plot of the scaled RFM metrics, one line per cluster.

    Parameters
    ----------
    normalised_df_rfm : array-like or pandas.DataFrame
        Scaled RFM values.
    df_rfm_kmeans : pandas.DataFrame
        Frame providing the ``Cluster`` and ``CustomerID`` columns.
    df_rfm_original : pandas.DataFrame
        Frame whose index and columns are applied to the scaled values.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib axes.
    """
    # NOTE(review): when a DataFrame is passed, pandas reindexes it by label to
    # the given index/columns rather than relabelling positionally - confirm
    # both frames share the same index.
    scaled = pd.DataFrame(
        normalised_df_rfm,
        index=df_rfm_original.index,
        columns=df_rfm_original.columns,
    )
    scaled = scaled.assign(
        Cluster=df_rfm_kmeans['Cluster'],
        CustomerID=df_rfm_kmeans['CustomerID'],
    )
    # Long format: one row per (customer, metric) pair
    long_form = scaled.reset_index().melt(
        id_vars=['CustomerID', 'Cluster'],
        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
        var_name='Metric',
        value_name='Value',
    )
    plt.xlabel('Metric')
    plt.ylabel('Value')
    sns.pointplot(data=long_form, x='Metric', y='Value', hue='Cluster')

In [None]:
# One stacked snake plot per candidate cluster count.
plt.figure(figsize=(9, 10))

candidates = ((4, rfm_k4), (5, rfm_k5), (6, rfm_k6))
for panel, (n_clusters, clustered) in enumerate(candidates, start=1):
    plt.subplot(3, 1, panel)
    plt.title('Snake Plot of K-Means = {}'.format(n_clusters))
    snake_plot(new_rfm_trans_log_sc, clustered, new_rfm)

plt.tight_layout()

## Evaluation model
### Davies-Bouldin Score: the lower the Davies-Bouldin score, the better the clustering.

In [None]:
from sklearn.metrics import davies_bouldin_score, silhouette_score

# Davies-Bouldin index of each candidate clustering on the scaled features.
for n_clusters, clustered in ((4, rfm_k4), (5, rfm_k5), (6, rfm_k6)):
    print('davies score to k={}: {}'.format(n_clusters, davies_bouldin_score(X, clustered.Cluster)))

K-Means with 4 clusters has the lowest Davies-Bouldin score among the candidates. Therefore, by this metric, the optimal number of clusters is 4.

### Silhouette Score: the higher the silhouette score, the better the clustering.

In [None]:
# Silhouette score of each candidate clustering on the scaled features.
# The printed label previously read "davies score" (copy-paste from the
# Davies-Bouldin cell) although the value is the silhouette score.
for n_clusters, clustered in ((4, rfm_k4), (5, rfm_k5), (6, rfm_k6)):
    print('silhouette score to k={}: {}'.format(n_clusters, silhouette_score(X, clustered.Cluster)))


K-Means with 4 clusters has the highest silhouette score among the candidates. Therefore, by this metric, the optimal number of clusters is 4.

In [None]:
# Persist the k=4 result (df_rfm plus its Cluster column); the index is not written.
rfm_k4.to_csv(path.data_processed_dir('data_clustering.csv'),index=False)