In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import optuna

pd.set_option("display.float_format", lambda x: "%.4f" % x)
np.set_printoptions(suppress=True, precision=10)

In [2]:
df = pd.read_csv("./data/clientes.csv")
df.head()

Unnamed: 0,atividade_economica,faturamento_mensal,numero_de_funcionarios,localizacao,idade,inovacao
0,Comércio,713109.95,12,Rio de Janeiro,6,1
1,Comércio,790714.38,9,São Paulo,15,0
2,Comércio,1197239.33,17,São Paulo,4,9
3,Indústria,449185.78,15,São Paulo,6,0
4,Agronegócio,1006373.16,15,São Paulo,15,8


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   atividade_economica     500 non-null    object 
 1   faturamento_mensal      500 non-null    float64
 2   numero_de_funcionarios  500 non-null    int64  
 3   localizacao             500 non-null    object 
 4   idade                   500 non-null    int64  
 5   inovacao                500 non-null    int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 23.6+ KB


In [4]:
df.describe()

Unnamed: 0,faturamento_mensal,numero_de_funcionarios,idade,inovacao
count,500.0,500.0,500.0,500.0
mean,1026715.6294,13.69,9.254,4.388
std,420609.4577,3.1224,2.9596,2.9028
min,18421.22,2.0,0.0,0.0
25%,763253.585,12.0,7.0,2.0
50%,1022957.085,14.0,9.0,4.0
75%,1295888.515,16.0,11.0,7.0
max,2390677.22,21.0,16.0,9.0


## Preparing dataset for training


In [5]:
# Select all columns for clustering
X = df.copy()

# Separate features by type
ordinal_features = np.array(["inovacao"])
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_features = X.select_dtypes(include=["number"]).columns.difference(ordinal_features)

print(f"Ordinal: {', '.join(ordinal_features)}")
print(f"Categorical: {', '.join(categorical_features)}")
print(f"Numerical: {', '.join(numerical_features)}")

Ordinal: inovacao
Categorical: atividade_economica, localizacao
Numerical: faturamento_mensal, idade, numero_de_funcionarios


In [6]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
ordinal_transformer = OrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
    ]
)

X_transformed = preprocessor.fit_transform(X)
X_transformed[0:5]

array([[-0.7463449774, -1.1005884861, -0.5417919104,  0.          ,
         1.          ,  0.          ,  0.          ,  0.          ,
         1.          ,  0.          ,  0.          ,  1.          ],
       [-0.5616554761,  1.9434485069, -1.5035526981,  0.          ,
         1.          ,  0.          ,  0.          ,  0.          ,
         0.          ,  1.          ,  0.          ,  0.          ],
       [ 0.4058265391, -1.7770411512,  1.0611427358,  0.          ,
         1.          ,  0.          ,  0.          ,  0.          ,
         0.          ,  1.          ,  0.          ,  9.          ],
       [-1.3744537512, -1.1005884861,  0.4199688773,  0.          ,
         0.          ,  1.          ,  0.          ,  0.          ,
         0.          ,  1.          ,  0.          ,  0.          ],
       [-0.0484127069,  1.9434485069,  0.4199688773,  1.          ,
         0.          ,  0.          ,  0.          ,  0.          ,
         0.          ,  1.          ,  0.   

## Training the GMM model


In [7]:
N_COMPONENTS_RANGE = range(3, 11)
COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"]


def object(trial: optuna.Trial):
    n_components = trial.suggest_categorical("n_components", N_COMPONENTS_RANGE)
    covariance_type = trial.suggest_categorical("covariance_type", COVARIANCE_TYPE)

    model = GaussianMixture(
        n_components=n_components, covariance_type=covariance_type, random_state=51
    )
    model.fit(X_transformed)

    return model.bic(X_transformed)

In [8]:
search_space = {"n_components": N_COMPONENTS_RANGE, "covariance_type": COVARIANCE_TYPE}
sampler = optuna.samplers.GridSampler(search_space)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(object)

[I 2024-10-04 13:15:46,971] A new study created in memory with name: no-name-8fdd17ba-aef7-417c-a727-6618195b3f84
[I 2024-10-04 13:15:47,013] Trial 0 finished with value: -177.47638665739862 and parameters: {'n_components': 6, 'covariance_type': 'tied'}. Best is trial 0 with value: -177.47638665739862.
[I 2024-10-04 13:15:47,021] Trial 1 finished with value: -23479.731809039633 and parameters: {'n_components': 9, 'covariance_type': 'diag'}. Best is trial 0 with value: -177.47638665739862.
[I 2024-10-04 13:15:47,027] Trial 2 finished with value: -239.16103280493996 and parameters: {'n_components': 5, 'covariance_type': 'tied'}. Best is trial 0 with value: -177.47638665739862.
[I 2024-10-04 13:15:47,041] Trial 3 finished with value: -16669.227646920866 and parameters: {'n_components': 5, 'covariance_type': 'full'}. Best is trial 0 with value: -177.47638665739862.
[I 2024-10-04 13:15:47,045] Trial 4 finished with value: 1570.009606919802 and parameters: {'n_components': 3, 'covariance_typ

In [9]:
best_config = study.best_params
best_bic = study.best_value

print(f"Best BIC: {best_bic}")
print(f"Best config: {best_config}")

Best BIC: 13517.954765258868
Best config: {'n_components': 3, 'covariance_type': 'spherical'}


In [10]:
best_model = GaussianMixture(
    n_components=best_config["n_components"],
    covariance_type=best_config["covariance_type"],
    random_state=51,
)
best_model.fit(X_transformed)

## Predicting the clusters


In [13]:
# Predict the clusters
clusters = best_model.predict(X_transformed)
clusters[0:10]

array([0, 0, 1, 0, 1, 2, 0, 1, 0, 2])

In [14]:
# Generate probabilities of belonging to each cluster
probabilities = best_model.predict_proba(X_transformed)
probabilities[0:10]

array([[0.9999798986, 0.          , 0.0000201014],
       [0.9999999957, 0.          , 0.0000000043],
       [0.          , 1.          , 0.          ],
       [0.9999999795, 0.          , 0.0000000205],
       [0.          , 0.9999999963, 0.0000000037],
       [0.0000597702, 0.0000066642, 0.9999335655],
       [0.9999722321, 0.          , 0.0000277679],
       [0.          , 0.8511800001, 0.1488199999],
       [0.9999829541, 0.          , 0.0000170459],
       [0.0474980051, 0.0000000053, 0.9525019896]])

In [15]:
df["cluster"] = clusters.astype(int)
df.head()

Unnamed: 0,atividade_economica,faturamento_mensal,numero_de_funcionarios,localizacao,idade,inovacao,cluster
0,Comércio,713109.95,12,Rio de Janeiro,6,1,0
1,Comércio,790714.38,9,São Paulo,15,0,0
2,Comércio,1197239.33,17,São Paulo,4,9,1
3,Indústria,449185.78,15,São Paulo,6,0,0
4,Agronegócio,1006373.16,15,São Paulo,15,8,1


## Visualizing the clusters


In [16]:
px.scatter(df, x="idade", y="faturamento_mensal", color="cluster")

In [20]:
px.scatter(df, x="inovacao", y="faturamento_mensal", color="cluster")

In [18]:
px.scatter(df, x="inovacao", y="idade", color="cluster")

## Saving the model


In [22]:
import joblib
import os

os.makedirs("models", exist_ok=True)
joblib.dump(best_model, "models/clientes_gmm.pkl")
joblib.dump(preprocessor, "models/clientes_gmm_preprocessor.pkl")

['models/clientes_gmm_preprocessor.pkl']