Parte 3
===

In [1]:
import warnings as ws
# Suppress every warning for the rest of the session; no category filter is given.
ws.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "darkgrid" style and render matplotlib figures inline in the notebook.
sns.set(style="darkgrid")
%matplotlib inline

from sklearn.cluster import (KMeans, AgglomerativeClustering, DBSCAN)
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

## 1. Carga de la base de datos

In [2]:
# Load the semicolon-delimited, UTF-8 encoded results table.
df = pd.read_csv(
    'data/result.csv',
    sep=";",
    encoding='utf-8',
)

# Display (rows, columns) as the cell output.
df.shape

(42659, 67)

In [3]:
# Print column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42659 entries, 0 to 42658
Data columns (total 67 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Creditos                           42659 non-null  int64  
 1   saldo_actual                       42659 non-null  float64
 2   Promedio de limite_credito         42659 non-null  float64
 3   Promedio de credito_maximo         42659 non-null  float64
 4   Promedio de cantidad_cuotas        42659 non-null  float64
 5   Promedio de valor_cuota            42659 non-null  float64
 6   ahorro_impacto                     42659 non-null  int64  
 7   coaching_impacto                   42659 non-null  int64  
 8   Desinstalaciones                   42659 non-null  int64  
 9   Tiene_ahorro                       42659 non-null  int64  
 10  monto_ahorro                       42659 non-null  float64
 11  Activo_saldo_actual                42659 non-null  flo

## 2. Segmentación de usuarios

In [4]:
# Drop the target variable "Y": it contributes nothing to the segmentation.
group_df = df.drop(columns=["Y"])
group_df.shape

(42659, 66)

### Selección mejor modelo de segmentación

In [5]:
# A function to automate the model fitting and prediction
def model_train(estimator, data, a, b, eps_params=(0.1, 0.2, 0.3)):
    """Fit a clustering estimator over a parameter sweep and score every fit.

    For ``DBSCAN`` the sweep runs over ``eps_params``; for any other
    estimator it runs over ``n_clusters`` in ``range(a, b)``. Every fit is
    scored with ``davies_bouldin_score`` (the minimum is taken as best),
    ``calinski_harabasz_score`` and ``silhouette_score`` (the maximum is
    taken as best).

    Parameters
    ----------
    estimator : class
        Clustering class (not an instance). It is built with ``eps=...``
        when it is ``DBSCAN`` and with ``n_clusters=...`` otherwise, and must
        expose ``fit`` and ``labels_``.
    data : array-like
        Feature matrix handed to ``fit`` and to the three metrics.
    a, b : int
        Half-open range ``[a, b)`` of cluster counts to try. Ignored for
        ``DBSCAN``.
    eps_params : iterable of float, optional
        ``eps`` values tried for ``DBSCAN``. Ignored for other estimators.

    Returns
    -------
    db, ca, sc : list of float
        One score per swept value, in sweep order.
    labels : array-like
        Labels of the LAST fit of the sweep (not of the best-scoring one).
    n_clusters : list
        Best setting according to, in order, Davies-Bouldin,
        Calinski-Harabasz and silhouette. Plain ints for ``n_clusters``
        sweeps; ``(clusters_found, eps)`` tuples for ``DBSCAN``, where
        ``clusters_found`` does not count the noise label ``-1``.

    Raises
    ------
    ValueError
        If the sweep is empty (``a >= b``, or no ``eps_params`` for DBSCAN).
    """
    db, ca, sc = [], [], []
    n_clusters = []
    labels = None

    def _score(assigned):
        # Record the three quality indices for one labelling of `data`.
        db.append(davies_bouldin_score(data, assigned))
        ca.append(calinski_harabasz_score(data, assigned))
        sc.append(silhouette_score(data, assigned))

    if estimator == DBSCAN:
        eps_values = list(eps_params)
        if not eps_values:
            raise ValueError("eps_params must contain at least one value")

        eps_metric = []
        for eps in eps_values:
            est = estimator(eps=eps)
            est.fit(data)
            labels = est.labels_
            distinct = np.unique(labels)

            if distinct.shape[0] > 1:
                _score(labels)
            else:
                # A single label cannot be scored. Use infinities so that a
                # degenerate fit can never outrank a real score (a finite
                # placeholder such as 10 could beat a Davies-Bouldin > 10).
                db.append(np.inf)
                ca.append(-np.inf)
                sc.append(-np.inf)

            # DBSCAN labels noise points as -1; noise is not a cluster.
            clusters_found = int((distinct != -1).sum())
            eps_metric.append((clusters_found, eps))

        n_clusters.append(eps_metric[np.argmin(db)])
        n_clusters.append(eps_metric[np.argmax(ca)])
        n_clusters.append(eps_metric[np.argmax(sc)])
    else:
        if a >= b:
            raise ValueError(
                f"empty n_clusters range: a={a!r} must be smaller than b={b!r}"
            )

        for k in range(a, b):
            est = estimator(n_clusters=k)
            est.fit(data)
            labels = est.labels_
            _score(labels)

        # Position in the sweep + first value tried = number of clusters.
        n_clusters.append(int(np.argmin(db)) + a)
        n_clusters.append(int(np.argmax(ca)) + a)
        n_clusters.append(int(np.argmax(sc)) + a)

    return db, ca, sc, labels, n_clusters

In [7]:
# Clustering classes to compare; the first entry is the one used when only a
# single model is requested.
clusterers = [
    KMeans,
    AgglomerativeClustering,
    DBSCAN,
]

def model_scores(df, scaler, first_model=False, lower=2, upper=8):
    """Scale ``df`` and collect the best clustering scores per algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to cluster.
    scaler : object
        Object exposing ``fit_transform``; it is fitted on ``df``.
    first_model : bool, optional
        When true only ``clusterers[0]`` is evaluated, otherwise every entry
        of ``clusterers`` is.
    lower, upper : int, optional
        Forwarded to ``model_train`` as the ``[lower, upper)`` sweep bounds.

    Returns
    -------
    dict of list
        Keys ``'Cluster_Method'``, ``'Davies_Bouldin_Score'`` (minimum over
        the sweep), ``'Calinski_Harabasz_Score'`` and ``'Silhouette_Score'``
        (maximum over the sweep) and ``'n_clusters'`` (string form of the
        best settings returned by ``model_train``). All lists have one entry
        per evaluated algorithm; a score is ``None`` when ``model_train``
        returned no values for it.
    """
    scores = {
        'Cluster_Method': [],
        'Davies_Bouldin_Score': [],
        'Calinski_Harabasz_Score': [],
        'Silhouette_Score': [],
        'n_clusters': []
    }
    normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    candidates = [clusterers[0]] if first_model else clusterers

    for clusterer in candidates:
        db, ca, sc, _labels, n_clusters = model_train(
            clusterer, normalized_df, lower, upper
        )
        # Build the complete row before touching `scores`: the five lists must
        # always have the same length or the result cannot become a DataFrame.
        row = {
            'Cluster_Method': clusterer.__name__,
            'Davies_Bouldin_Score': np.min(db) if db else None,
            'Calinski_Harabasz_Score': np.max(ca) if ca else None,
            'Silhouette_Score': np.max(sc) if sc else None,
            'n_clusters': str(n_clusters),
        }
        for column, value in row.items():
            scores[column].append(value)
    return scores

In [None]:
# Score every clustering algorithm on the min-max scaled features and index
# the resulting table by algorithm name.
data_scores = (
    pd.DataFrame(model_scores(group_df, MinMaxScaler(), lower=3, upper=4))
    .set_index(['Cluster_Method'])
)

data_scores

In [None]:
# Keep the row(s) that win on each metric: lowest Davies-Bouldin score,
# highest Calinski-Harabasz score and highest silhouette score. Identical
# rows (a method winning on several metrics) are shown once.
pd.concat([
    data_scores.loc[data_scores[metric] == pick_best(data_scores[metric])]
    for metric, pick_best in (
        ('Davies_Bouldin_Score', pd.Series.min),
        ('Calinski_Harabasz_Score', pd.Series.max),
        ('Silhouette_Score', pd.Series.max),
    )
]).drop_duplicates()

### Evaluación modelo de segmentación seleccionado

In [None]:
def model_plot(df, scaler, model):
    """Cluster ``df`` and show the clusters on a 3-D t-SNE embedding.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to cluster.
    scaler : object
        Object exposing ``fit_transform``; it is fitted on ``df``.
    model : object
        Unfitted estimator. ``fit_predict`` is used when available, otherwise
        ``fit`` followed by ``labels_``. The estimator is fitted in place.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    # Extract cluster labels
    if hasattr(model, "fit_predict"):
        cluster_labels = model.fit_predict(normalized_df)
    else:
        model.fit(normalized_df)
        cluster_labels = model.labels_

    # Embed the scaled features only: this is the space the model clustered.
    # Embedding the raw frame plus the label column would feed the labels
    # into t-SNE and use a different scale than the clustering did.
    model_tsne = TSNE(n_components=3, random_state=1)
    transformed = model_tsne.fit_transform(normalized_df)

    fig = plt.figure()
    fig.set_size_inches(12, 12)
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(xs=transformed[:, 0], ys=transformed[:, 1], zs=transformed[:, 2],
               c=cluster_labels, cmap='Set1')
    ax.set_title(f'Flattened Graph of {str(model)} model')
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.set_zlabel("Component 3")

    plt.show()

def model_plot_2d(df, scaler, model):
    """Cluster ``df`` and draw the clusters on a 2-D t-SNE embedding.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to cluster.
    scaler : object
        Object exposing ``fit_transform``; it is fitted on ``df``.
    model : object
        Unfitted estimator. ``fit_predict`` is used when available (this
        covers estimators without ``labels_``), otherwise ``fit`` followed by
        ``labels_``. The estimator is fitted in place.

    Returns
    -------
    normalized_df : pandas.DataFrame
        Scaled features with the column names of ``df``.
    df_new : pandas.DataFrame
        Copy of ``df`` with an added ``cluster`` column.
    """
    normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    # Extract cluster labels
    if hasattr(model, "fit_predict"):
        cluster_labels = model.fit_predict(normalized_df)
    else:
        model.fit(normalized_df)
        cluster_labels = model.labels_

    # Create a cluster label column in original dataset
    df_new = df.assign(cluster=cluster_labels)

    # Embed the scaled features only: this is the space the model clustered.
    # Embedding `df_new` would feed the labels themselves into t-SNE and use
    # a different scale than the clustering did.
    model_tsne = TSNE(random_state=1)
    transformed = model_tsne.fit_transform(normalized_df)

    # Plot t-SNE
    plt.title(f'Flattened Graph of {str(model)} model')
    sns.scatterplot(x=transformed[:, 0], y=transformed[:, 1], hue=cluster_labels,
                    style=cluster_labels, palette='deep')

    return normalized_df, df_new

def snake_plot(normalized_df, df_cluster, df_original, columns_vars):
    """Draw a point plot of scaled metric values, one line per cluster.

    Parameters
    ----------
    normalized_df : pandas.DataFrame
        Scaled values; only its raw values are used, re-labelled with the
        index and columns of ``df_original``.
    df_cluster : pandas.DataFrame
        Frame holding a ``cluster`` column, aligned on the index of
        ``df_original``.
    df_original : pandas.DataFrame
        Source of the index and column names; must contain ``saldo_actual``.
    columns_vars : list of str
        Columns unpivoted into the ``metric`` / ``value`` pair that is plotted.

    Returns
    -------
    None
    """
    labelled = pd.DataFrame(
        data=normalized_df.values,
        index=df_original.index,
        columns=df_original.columns,
    ).assign(cluster=df_cluster['cluster'])

    # Long format: one row per (observation, metric) pair.
    long_format = labelled.reset_index().melt(
        id_vars=['saldo_actual', 'cluster'],
        value_vars=columns_vars,
        var_name='metric',
        value_name='value',
    )

    plt.xlabel('metric')
    plt.ylabel('value')
    sns.pointplot(data=long_format, x='metric', y='value', hue='cluster', palette="Set2")

    return None