<a href="https://colab.research.google.com/github/cristiandarioortegayubro/BA/blob/main/cl_km_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![logo](https://github.com/cristiandarioortegayubro/BA/blob/main/dba.png?raw=true)

![](https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png)

## **Carga de bibliotecas necesarias**

### **Para el tratamiento de los datos**

In [1]:
import pandas as pd
import numpy as np

### **Para gráficos**

In [2]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

### **Para preprocesamiento de datos y modelo**

In [3]:
import sklearn
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
import sklearn.metrics as metrics
from sklearn.metrics import silhouette_score

## **Extracción de Datos - Creación del DataFrame**

In [4]:
# URL of the raw CSV file (hosted on GitHub) that holds the mall customers dataset.
datos = (
    "https://raw.githubusercontent.com/cristiandarioortegayubro/BA/main/"
    "Datasets/clientes_mall.csv"
)

In [5]:
# Read the CSV behind `datos` into a DataFrame.
clientes_mall = pd.read_csv(filepath_or_buffer=datos)
# Bare last expression: the notebook renders the frame as the cell output.
clientes_mall

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


## **Eliminando variables**

In [6]:
clientes_mall.info()  # print each column's dtype and non-null count, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   CustomerID     200 non-null    int64 
 1   Gender         200 non-null    object
 2   Age            200 non-null    int64 
 3   AnnualIncome   200 non-null    int64 
 4   SpendingScore  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [7]:
# Drop the two columns that are not used as clustering features: the customer
# identifier and the non-numeric Gender column.
# errors="ignore" keeps this cell re-runnable: the result is assigned back to
# the same name, so on a second execution the columns are already gone and a
# plain drop() would raise KeyError.
clientes_mall = clientes_mall.drop(columns=["CustomerID", "Gender"], errors="ignore")
clientes_mall  # display the remaining columns

Unnamed: 0,Age,AnnualIncome,SpendingScore
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40
...,...,...,...
195,35,120,79
196,45,126,28
197,32,126,74
198,32,137,18


## **Número de clusters**

In [8]:
# Empty containers for the elbow analysis: a table keyed by the number of
# clusters, and the list that will collect one inertia value per fitted model.
clusters, inertia = pd.DataFrame(), []

In [9]:
# Candidate numbers of clusters to evaluate: 1 through 9 (the upper bound of range is exclusive).
clusters = clusters.assign(cluster_range=range(1, 10))

In [10]:
# Elbow method: fit one K-means model per candidate k and record the
# `inertia_` value of each fitted model.

# Rebuild the list inside this cell. It is created in a different cell, so
# without the reset every re-run of this loop would append nine more values
# and the later `clusters["inertia"] = inertia` would fail on a length mismatch.
inertia = []

# Fit on the three feature columns explicitly: a later cell adds a `cluster`
# column to `clientes_mall`, and re-running this loop afterwards would
# otherwise feed the labels back in as a feature.
elbow_features = clientes_mall[["Age", "AnnualIncome", "SpendingScore"]]

for k in clusters["cluster_range"]:
    # n_init is pinned because scikit-learn changed its default from 10 to
    # "auto" in version 1.4; leaving it implicit makes the inertia values
    # depend on the installed version.
    kmeans = cluster.KMeans(n_clusters=k, n_init=10, random_state=8).fit(elbow_features)
    inertia.append(kmeans.inertia_)

In [11]:
# Attach the collected inertia values as a column next to their cluster counts.
clusters = clusters.assign(inertia=inertia)

In [12]:
# Round the inertia column to four decimals for display.
clusters["inertia"] = clusters["inertia"].round(4)

In [13]:
# Show the inertia table. It holds one row per k in range(1, 10), so head(10) displays all of it.
clusters.head(10)

Unnamed: 0,cluster_range,inertia
0,1,308812.78
1,2,212840.1698
2,3,143342.7516
3,4,104366.1515
4,5,75479.7643
5,6,58300.4433
6,7,51082.543
7,8,44342.3174
8,9,40792.9005


### Graficando el método del codo para elegir el número óptimo de clusters

In [14]:
# Elbow plot: inertia (y) against the number of clusters (x), with a marker on
# every evaluated k so the bend of the curve is easy to spot.
fig = px.line(
    clusters,
    x="cluster_range",
    y="inertia",
    title="Metodo del codo",
    labels={"cluster_range": "clusters"},  # axis label shown instead of the column name
    template="gridon",
    markers=True,
)
fig.show()

# **Evaluando el Algoritmo**

## **Algoritmo K-means**

In [16]:
# Preview the first rows of the frame that is passed to K-means below.
clientes_mall.head()

Unnamed: 0,Age,AnnualIncome,SpendingScore
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40


In [18]:
# Configure the final K-means estimator (it is fitted in a later cell):
# six clusters, 20 initialisations, and a fixed seed for repeatable results.
km = KMeans(
    n_clusters=6,
    n_init=20,
    random_state=123,
)
# Bare last expression: the notebook shows the estimator's repr.
km

KMeans(n_clusters=6, n_init=20, random_state=123)

In [20]:
# Fit the final model on the three feature columns only.
# Selecting them explicitly (instead of passing the whole frame) protects the
# fit from hidden notebook state: a later cell adds a `cluster` column to
# `clientes_mall`, and re-running this cell afterwards would otherwise train
# on the previous run's labels.
km.fit(clientes_mall[["Age", "AnnualIncome", "SpendingScore"]])

KMeans(n_clusters=6, n_init=20, random_state=123)

In [21]:
# Pull the fitted results out of the estimator: the cluster centres and the
# cluster index assigned to every training row.
centroids, labels = km.cluster_centers_, km.labels_

In [22]:
# Display the raw array of cluster centres taken from km.cluster_centers_.
centroids

array([[56.15555556, 53.37777778, 49.08888889],
       [32.69230769, 86.53846154, 82.12820513],
       [27.        , 56.65789474, 49.13157895],
       [41.68571429, 88.22857143, 17.28571429],
       [25.27272727, 25.72727273, 79.36363636],
       [44.14285714, 25.14285714, 19.52380952]])

In [23]:
# Wrap the centre coordinates in a DataFrame so each coordinate carries the
# name of the feature it belongs to (used by the plots further down).
centroids = pd.DataFrame(
    data=centroids,
    columns=["Age", "AnnualIncome", "SpendingScore"],
)
centroids

Unnamed: 0,Age,AnnualIncome,SpendingScore
0,56.155556,53.377778,49.088889
1,32.692308,86.538462,82.128205
2,27.0,56.657895,49.131579
3,41.685714,88.228571,17.285714
4,25.272727,25.727273,79.363636
5,44.142857,25.142857,19.52381


In [24]:
# Display the cluster index (taken from km.labels_) assigned to each fitted row.
labels

array([5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4,
       5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 0, 4, 0, 2,
       5, 4, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2,
       0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2,
       2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1,
       2, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
       3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
       3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
       3, 1], dtype=int32)

## **Métricas**

### **Calinski Harabasz**

In [25]:
# Calinski-Harabasz index of the K-means labelling over the customer data
# (scikit-learn documents higher values as better-defined clusters).
metrics.calinski_harabasz_score(X=clientes_mall, labels=labels)

166.7204931788687

### **Silhouette**

In [26]:
# Mean silhouette coefficient of the K-means labelling over the customer data
# (scikit-learn documents the range as -1 to 1, with higher being better).
silhouette_score(X=clientes_mall, labels=labels)

0.4523443947724053

### **Davies Bouldin**

In [27]:
# Davies-Bouldin index of the K-means labelling over the customer data
# (scikit-learn documents lower values as better separation, with 0 as the minimum).
metrics.davies_bouldin_score(X=clientes_mall, labels=labels)

0.7469740072755284

In [28]:
# Store each customer's cluster index as a new column of the same frame.
# NOTE(review): this mutates the frame that km.fit() and the metric cells
# receive as input; if those cells are re-run after this one without
# restarting, they will see `cluster` as a fourth column.
clientes_mall['cluster'] = labels

In [29]:
# Display the customers together with their assigned cluster column.
clientes_mall

Unnamed: 0,Age,AnnualIncome,SpendingScore,cluster
0,19,15,39,5
1,21,15,81,4
2,20,16,6,5
3,23,16,77,4
4,31,17,40,5
...,...,...,...,...
195,35,120,79,1
196,45,126,28,3
197,32,126,74,1
198,32,137,18,3


## **Gráfico**

### **Plotly**

In [30]:
# Scatter of Age (x) against AnnualIncome (y) with the K-means centroids overlaid.
# The model was fitted on three columns, so this is a 2-D view of the clusters
# and groups that differ mainly in SpendingScore may overlap here.
fig = go.Figure([
    # One point per customer.
    go.Scatter(
        x = clientes_mall.Age,
        y = clientes_mall.AnnualIncome,
        mode = "markers",
        name = "Clusters",
        # Colour by the assigned cluster label rather than repeating the y
        # variable, so the trace named "Clusters" actually shows the groups.
        marker = dict(color = clientes_mall["cluster"],
                      colorscale = "bluered",
                      showscale = False),
    ),
    # One larger orange point per cluster centre.
    go.Scatter(
        x = centroids.Age,
        y = centroids.AnnualIncome,
        mode = "markers",
        name = "Centroide",
        marker = dict(color = "orange", size = 12),
    ),
])

# Axis titles name the plotted variables instead of the generic "X"/"Y".
fig.update_layout(template =    "gridon",
                  title =       "Edad e Ingresos Anuales",
                  yaxis_title = "Ingresos Anuales",
                  xaxis_title = "Edad")

fig.show()

In [32]:
# Scatter of Age (x) against SpendingScore (y) with the K-means centroids overlaid.
# The model was fitted on three columns, so this is a 2-D view of the clusters
# and groups that differ mainly in AnnualIncome may overlap here.
fig = go.Figure([
    # One point per customer.
    go.Scatter(
        x = clientes_mall.Age,
        y = clientes_mall.SpendingScore,
        mode = "markers",
        name = "Clusters",
        # Colour by the assigned cluster label rather than repeating the y
        # variable, so the trace named "Clusters" actually shows the groups.
        marker = dict(color = clientes_mall["cluster"],
                      colorscale = "bluered",
                      showscale = False),
    ),
    # One larger orange point per cluster centre.
    go.Scatter(
        x = centroids.Age,
        y = centroids.SpendingScore,
        mode = "markers",
        name = "Centroide",
        marker = dict(color = "orange", size = 12),
    ),
])

# Axis titles name the plotted variables instead of the generic "X"/"Y".
fig.update_layout(template =    "gridon",
                  title =       "Edad y Score de Gastos",
                  yaxis_title = "Score de Gastos",
                  xaxis_title = "Edad")

fig.show()