<a href="https://colab.research.google.com/github/gabriel-cm-saldanha/Clustering_/blob/main/Clustering_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries

In [None]:
!pip install kmodes -q

In [None]:
# Data wrangling
import numpy as np
import pandas as pd

# Dataviz
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance, kelbow_visualizer, silhouette_visualizer

## -- Machine learning -- ##
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.cluster import k_means, dbscan, mean_shift, estimate_bandwidth
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

from kmodes.kmodes import KModes            # clustering categorical data
from kmodes.kprototypes import KPrototypes  # clustering mixed data

# Didactic purposes
import string
from ipywidgets import interact

In [None]:
# Global plotting defaults. rcParams reference:
# https://matplotlib.org/stable/tutorials/introductory/customizing.html
sns.set_theme(
    style="ticks",
    context="talk",
    palette="tab10",
    font_scale=0.6,
    rc={
        "figure.figsize": (12, 6),
        "axes.titlesize": "x-large",
        "axes.titleweight": "bold",
        "axes.titlepad": 20,
        "axes.grid": True,
        "grid.alpha": 0.2,
    },
)

# Keyword-argument bundle, presumably meant to be splatted into scatter plots.
scatter_kwargs = {"palette": "viridis", "alpha": 0.8, "linewidth": 0}

### Data

In [None]:
# Load only the feature matrix of the wine dataset as a DataFrame.
# The class target is not used for clustering, so it is not unpacked; this also
# avoids assigning to `_`, which IPython uses to hold the last cell output.
df_wines = datasets.load_wine(as_frame=True).data
df_wines

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


### Data Cleaning

In [None]:
# Print column dtypes and non-null counts to check for missing or mistyped columns.
df_wines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

Há dados duplicados?

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
df_wines.duplicated().any()

False

Há dados faltantes?

In [None]:
# True if any cell in any column is missing.
df_wines.isna().any().any()

False

Há outliers?

In [None]:
# Box plot of the standardized features so outliers are comparable on one scale.
df_wines.apply(scale).plot(kind="box")
plt.xticks(rotation=60, horizontalalignment="right");

Identificando e removendo linhas com outliers (|Z-Score| > 3):

In [None]:
# Standardize each column independently with sklearn's `scale` (z-scores).
df_wines_scaled = df_wines.apply(scale)

In [None]:
# Number of rows in which at least one standardized feature has |z| > 3.
(df_wines_scaled.abs() > 3).any(axis="columns").sum()

10

In [None]:
# Flag rows where any standardized feature has |z| > 3 and drop them.
# The mask is always derived from the full, unfiltered frame so that re-running
# this cell keeps `outlier_rows` aligned with `df_wines` (the mask is reused
# later to filter the raw data). Deriving it from an already-filtered frame
# would shrink its index and break that alignment.
# Note: z-scores are not recomputed after the rows are removed.
df_wines_scaled_full = df_wines.apply(scale)
outlier_rows = df_wines_scaled_full.abs().gt(3).any(axis=1)
df_wines_scaled = df_wines_scaled_full.loc[~outlier_rows]

### Verificando melhor K

In [None]:
# Elbow method: fit K-Means for every k in 2..11 (an integer `k` is treated by
# yellowbrick as the upper bound of range(2, k)) and plot the resulting curve.
# random_state pins the centroid initialisation so the curve is reproducible.
kelbow_visualizer(KMeans(n_init="auto", random_state=0), df_wines_scaled, k=12);

### Clusterizando

Criando e treinando o modelo

In [None]:
# Fit a 5-cluster K-Means on the standardized, outlier-filtered features.
model = (
    KMeans(n_clusters=5, n_init="auto", random_state=0)
    .fit(df_wines_scaled)
)

Salvando centroides e labels

In [None]:
# Keep the fitted cluster centres and the per-row cluster assignments.
centroids, labels = model.cluster_centers_, model.labels_

### Interpretando Clusters

In [None]:
# One bar chart per cluster ("Grupo_<id>") showing its centroid value for
# every feature; centroids are in the standardized feature space.
(
    pd.DataFrame(centroids, columns=df_wines_scaled.columns)
    .rename(index="Grupo_{}".format)
    .transpose()
    .plot(kind="bar", subplots=True, figsize=(12, 12), legend=False)
)
plt.xticks(rotation=60, horizontalalignment="right")
plt.tight_layout()

In [None]:
# Centroid coordinates as a table: one row per cluster, one column per feature
# (values are in the standardized feature space the model was fitted on).
centers = pd.DataFrame(centroids, columns=df_wines_scaled.columns)
centers

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,0.948874,-0.284919,0.299388,-0.771847,0.451743,0.891644,0.96824,-0.603017,0.556018,0.232061,0.444536,0.781897,1.19581
1,-0.511714,-0.530424,-0.605353,-0.42434,-0.045685,-0.904041,-0.949384,1.105829,-1.060789,-0.277198,-0.3733,-1.167004,-0.429567
2,-0.766628,-0.733554,-1.275776,-0.423885,-0.604998,0.162056,0.202363,-0.5857,0.113041,-0.748601,0.77349,0.431946,-0.682071
3,0.210916,1.082514,0.299344,0.639624,-0.061643,-1.025672,-1.245098,0.671806,-0.805941,0.983252,-1.187351,-1.28353,-0.389978
4,-1.017927,-0.214405,0.272033,0.748477,-0.604998,-0.166422,0.061912,0.244422,0.05044,-0.97543,0.245639,0.337489,-0.834595


In [None]:
# Per-cluster summary statistics in the original (unscaled) units, computed on
# the same rows that were used to fit the model.
(
    df_wines.loc[~outlier_rows]
    .assign(cluster=labels)
    .groupby(by="cluster")
    .agg(["mean", "min", "median", "max"])
)

Unnamed: 0_level_0,alcohol,alcohol,alcohol,alcohol,malic_acid,malic_acid,malic_acid,malic_acid,ash,ash,...,hue,hue,od280/od315_of_diluted_wines,od280/od315_of_diluted_wines,od280/od315_of_diluted_wines,od280/od315_of_diluted_wines,proline,proline,proline,proline
Unnamed: 0_level_1,mean,min,median,max,mean,min,median,max,mean,min,...,median,max,mean,min,median,max,mean,min,median,max
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,13.768772,12.85,13.76,14.83,2.018947,1.35,1.77,4.04,2.448421,2.04,...,1.06,1.28,3.165263,2.51,3.17,4.0,1122.403509,680.0,1095.0,1680.0
1,12.586364,11.84,12.6,13.34,1.745455,0.94,1.36,3.43,2.200909,1.98,...,0.906,1.25,1.785455,1.29,1.67,2.52,612.0,372.0,640.0,870.0
2,12.38,11.62,12.31,13.67,1.519167,0.9,1.375,3.17,2.0175,1.7,...,1.155,1.42,2.9175,2.14,2.96,3.57,532.708333,278.0,498.5,1020.0
3,13.171364,12.2,13.17,14.16,3.542273,1.67,3.41,5.65,2.448409,2.15,...,0.665,0.96,1.702955,1.27,1.685,2.47,624.431818,415.0,607.5,880.0
4,12.176562,11.41,12.08,13.86,2.0975,0.74,1.71,4.43,2.440937,2.0,...,0.965,1.45,2.850625,2.23,2.81,3.64,484.8125,315.0,469.0,680.0


In [None]:
# Per-cluster feature means (unscaled), features as rows and clusters as
# columns; axis=1 colours each row on its own scale, so every feature is
# compared across clusters.
(
    df_wines.loc[~outlier_rows]
    .assign(cluster=labels)
    .groupby(by="cluster")
    .mean()
    .transpose()
    .style.background_gradient(cmap="YlOrRd", axis=1)
)

cluster,0,1,2,3,4
alcohol,13.768772,12.586364,12.38,13.171364,12.176562
malic_acid,2.018947,1.745455,1.519167,3.542273,2.0975
ash,2.448421,2.200909,2.0175,2.448409,2.440937
alcalinity_of_ash,16.924561,18.081818,18.083333,21.625,21.9875
magnesium,106.175439,99.090909,91.125,98.863636,91.125
total_phenols,2.851579,1.730909,2.39625,1.655,2.19125
flavanoids,2.993684,1.083636,2.230833,0.789091,2.090937
nonflavanoid_phenols,0.287018,0.499091,0.289167,0.445227,0.392188
proanthocyanins,1.908246,0.985455,1.655417,1.130909,1.619687
color_intensity,5.594561,4.417273,3.3275,7.331136,2.803125


In [None]:
# Mean silhouette coefficient of the cluster labels over the scaled data
# (ranges from -1 to 1; higher means better-separated clusters).
silhouette_score(df_wines_scaled, labels)

0.2226812880160987