# 0. Librerías

In [11]:
# Genérica
# -----------------------------------------------------------------------------
from time import time
import session_info


# Tratamientos datos
# -----------------------------------------------------------------------------
import pandas as pd
import numpy as np


# category encoders
# -----------------------------------------------------------------------------
from category_encoders import OrdinalEncoder


# scikit-learn
# -----------------------------------------------------------------------------
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, silhouette_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder


# Gráficos
# -----------------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt


# Print the versions of the imported packages and of the runtime environment
# as plain text (html=False) so the run can be reproduced later.
session_info.show(html=False)

-----
category_encoders   2.6.1
matplotlib          3.7.2
numpy               1.23.5
pandas              1.5.3
seaborn             0.12.2
session_info        1.0.0
sklearn             1.3.0
-----
IPython             8.14.0
jupyter_client      8.3.0
jupyter_core        5.3.1
notebook            6.5.4
-----
Python 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]
Linux-5.15.0-101-generic-x86_64-with-glibc2.31
-----
Session information updated at 2024-04-04 13:48


# 1. Leer datos

In [49]:
# Location of the customers CSV, relative to the notebook's working directory
path = './data/clientes.csv'

# Load the raw table; the file is comma-separated, which is read_csv's default
df = pd.read_csv(path)

# Preview the first rows
df.head()

Unnamed: 0,Anio,Cliente,Nombre_PdV,Tipo_Cliente,Distribuidor_MSM,Area_Dist,Gama,Formato_1,Formato_2,Num_PdV_CI,Num_PdV_RU,Num_PdV_TU,Sell_In
0,2023,1010325,HCOUTCUCI-UCI TARANCON-UCALSA INTERNACIONAL,DHC,0,ZZ,3,2,3,0.0,0.0,0.0,146518.220146
1,2023,1182275,ODYEL LA ELIPA-DIST. O'DONNELL,CON,0,Y5,8,3,10,1211.0,0.0,0.0,122227.650365
2,2023,542516,"VOLDISTRIBUCION MADRID, S.A",CON,1,Y5,10,3,10,1615.0,0.0,0.0,109619.950367
3,2023,552863,"BARCELONA - EMCADI, S.A.",CON,0,Y1,9,3,11,3361.0,15.0,64.0,102350.520899
4,2023,553657,"MARRATXI VOLDISTRIBUCION BALEARES, S.A",CON,1,Y1,9,3,9,0.0,0.0,1360.0,102257.120397


In [50]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8483 entries, 0 to 8482
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Anio              8483 non-null   int64  
 1   Cliente           8483 non-null   int64  
 2   Nombre_PdV        8483 non-null   object 
 3   Tipo_Cliente      8483 non-null   object 
 4   Distribuidor_MSM  8483 non-null   int64  
 5   Area_Dist         8483 non-null   object 
 6   Gama              8483 non-null   int64  
 7   Formato_1         8483 non-null   int64  
 8   Formato_2         8483 non-null   int64  
 9   Num_PdV_CI        8483 non-null   float64
 10  Num_PdV_RU        8483 non-null   float64
 11  Num_PdV_TU        8483 non-null   float64
 12  Sell_In           8483 non-null   float64
dtypes: float64(4), int64(6), object(3)
memory usage: 861.7+ KB


In [51]:
# List the column labels (used to pick the subset kept in df_train below)
df.columns

Index(['Anio', 'Cliente', 'Nombre_PdV', 'Tipo_Cliente', 'Distribuidor_MSM',
       'Area_Dist', 'Gama', 'Formato_1', 'Formato_2', 'Num_PdV_CI',
       'Num_PdV_RU', 'Num_PdV_TU', 'Sell_In'],
      dtype='object')

In [52]:
# Columns carried over into df_train; Anio, Cliente and Nombre_PdV are left out.
feature_columns = [
    'Tipo_Cliente', 'Distribuidor_MSM', 'Area_Dist',
    'Gama', 'Formato_1', 'Formato_2',
    'Num_PdV_CI', 'Num_PdV_RU', 'Num_PdV_TU',
    'Sell_In',
]

# Selecting with a list of labels returns a new DataFrame in the listed order
df_train = df[feature_columns]

# Preview the first rows
df_train.head()

Unnamed: 0,Tipo_Cliente,Distribuidor_MSM,Area_Dist,Gama,Formato_1,Formato_2,Num_PdV_CI,Num_PdV_RU,Num_PdV_TU,Sell_In
0,DHC,0,ZZ,3,2,3,0.0,0.0,0.0,146518.220146
1,CON,0,Y5,8,3,10,1211.0,0.0,0.0,122227.650365
2,CON,1,Y5,10,3,10,1615.0,0.0,0.0,109619.950367
3,CON,0,Y1,9,3,11,3361.0,15.0,64.0,102350.520899
4,CON,1,Y1,9,3,9,0.0,0.0,1360.0,102257.120397


In [53]:
# One-hot encode the categorical columns of df_train.
#
# The result replaces df_train: each categorical column is removed and its
# indicator columns (named '<column>_<value>') are appended after the
# remaining columns, in the order the categorical columns are listed here.
categorical_columns = ['Tipo_Cliente', 'Area_Dist']

# Only encode columns that are still present. The cell overwrites df_train, so
# without this guard running it a second time raises KeyError because the
# categorical columns were already replaced on the first run.
pending_columns = [col for col in categorical_columns if col in df_train.columns]

if pending_columns:
    # get_dummies(columns=...) encodes, joins and drops the originals in one
    # step. dtype is pinned to uint8 (the pandas 1.x default) so the indicators
    # stay 0/1 integers on pandas >= 2.0, where the default became bool.
    df_train = pd.get_dummies(df_train, columns=pending_columns, dtype='uint8')

# Preview the encoded table
df_train.head()

Unnamed: 0,Distribuidor_MSM,Gama,Formato_1,Formato_2,Num_PdV_CI,Num_PdV_RU,Num_PdV_TU,Sell_In,Tipo_Cliente_CON,Tipo_Cliente_DHA,Tipo_Cliente_DHC,Area_Dist_Y0,Area_Dist_Y1,Area_Dist_Y2,Area_Dist_Y3,Area_Dist_Y4,Area_Dist_Y5,Area_Dist_Y6,Area_Dist_ZZ
0,0,3,2,3,0.0,0.0,0.0,146518.220146,0,0,1,0,0,0,0,0,0,0,1
1,0,8,3,10,1211.0,0.0,0.0,122227.650365,1,0,0,0,0,0,0,0,1,0,0
2,1,10,3,10,1615.0,0.0,0.0,109619.950367,1,0,0,0,0,0,0,0,1,0,0
3,0,9,3,11,3361.0,15.0,64.0,102350.520899,1,0,0,0,1,0,0,0,0,0,0
4,1,9,3,9,0.0,0.0,1360.0,102257.120397,1,0,0,0,1,0,0,0,0,0,0


Genérame un código para hacer un KMeans de scikit-learn sobre una tabla de pandas, atendiendo a las mejores prácticas de KMeans, como estandarizar las columnas numéricas, y comprobar con el método del codo y el de la silueta cuál sería el mejor número de clústeres.