# Criando agrupamento dos produtos com clusters

In [2]:
import ast
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [3]:
# Read the connection settings from a .env file into the process environment.
load_dotenv()

# Each setting is required: a missing key raises KeyError right here.
user=os.environ['user']
password=os.environ['password']
host=os.environ['host']
port=os.environ['port']
database=os.environ['database']

# Percent-encode the credentials before placing them in the URL. Without this,
# a user name or password containing reserved characters such as "@", ":", "/",
# "#" or "%" would be parsed as URL structure and corrupt the connection string.
# safe="" makes sure "/" is encoded as well.
url = (
    f"postgresql+psycopg2://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)
# Despite the name, `conn` holds the object returned by create_engine.
conn = create_engine(url)

In [4]:
# Load the vectors table (dim_vetores) into a DataFrame

query_vetores = ''' 
    SELECT * FROM dim_vetores
'''

df = pd.read_sql(sql=query_vetores, con=conn)

In [5]:
# Preview the first rows of the table exactly as loaded from the database
df.head()

Unnamed: 0,medicamento_id,vetor
0,1295,"[-0.30060092,-0.17868689,0.84854037,-0.0226843..."
1,1297,"[-0.08556356,-0.15087642,0.8269448,0.002032624..."
2,1,"[0.5235412,0.01954539,0.56722444,-0.23013225,0..."
3,2,"[-0.026552362,-0.2981731,1.2109025,-0.35614938..."
4,3,"[0.19392093,0.16702786,1.432826,0.0046516056,0..."


In [6]:
def _parse_vetor(value):
    """Parse ``value`` with ``ast.literal_eval`` when it is a string; return it unchanged otherwise."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# The 'vetor' column arrives as the string form of a list; turn each value into
# a real Python list. Values that are no longer strings are passed through, so
# running this cell a second time does not raise.
df['vetor'] = df['vetor'].apply(_parse_vetor)

In [7]:
# Spread every vector into one column per component, keeping the original row index
df_expanded = pd.DataFrame(data=df['vetor'].to_list(), index=df.index)
# Replace the list-valued column with the expanded component columns
df = df.drop(columns='vetor').join(df_expanded)

In [8]:
# All 768 vector components (columns 0 to 767) are now the features used for clustering
df.head()

Unnamed: 0,medicamento_id,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,1295,-0.300601,-0.178687,0.84854,-0.022684,0.05211,0.063259,-0.092176,-0.260243,0.574185,...,0.247425,-0.184957,-0.234493,-0.264691,0.203122,-0.009221,0.065672,-0.022478,0.117547,-0.372807
1,1297,-0.085564,-0.150876,0.826945,0.002033,0.35307,0.05319,-0.074402,-0.079209,0.092373,...,0.039097,-0.251527,-0.087902,-0.514921,0.337589,-0.31026,-0.003216,0.080516,0.034797,-0.665637
2,1,0.523541,0.019545,0.567224,-0.230132,0.269285,0.02379,-0.146655,-0.514713,0.150731,...,0.887437,-0.007601,-0.133593,0.456493,0.499499,0.241619,0.573746,0.35793,-0.124403,-0.185171
3,2,-0.026552,-0.298173,1.210902,-0.356149,0.801698,0.01731,0.307211,-0.260344,0.330627,...,0.834441,-0.175378,-0.296927,0.099576,0.524566,0.176383,0.293972,0.709457,0.014983,-0.425568
4,3,0.193921,0.167028,1.432826,0.004652,0.500479,-0.093389,-0.112578,-0.371284,-0.026065,...,0.258389,0.052877,-0.249932,0.336112,0.391251,0.303242,0.4696,0.400498,-0.168094,-0.380091


# Montando a clusterização

In [74]:
from sklearn.decomposition import PCA
from sklearn.cluster       import KMeans
from sklearn.metrics       import silhouette_score

In [15]:
# Seed passed as random_state to the PCA and KMeans steps below
SEED = 1337
# Feature matrix: rows ordered by medicamento_id, which also becomes the index
x = (
    df
    .sort_values(by="medicamento_id")
    .set_index(keys="medicamento_id")
)

In [16]:
# Preview the feature matrix indexed by medicamento_id
x.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
medicamento_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.523541,0.019545,0.567224,-0.230132,0.269285,0.02379,-0.146655,-0.514713,0.150731,0.3698,...,0.887437,-0.007601,-0.133593,0.456493,0.499499,0.241619,0.573746,0.35793,-0.124403,-0.185171
2,-0.026552,-0.298173,1.210902,-0.356149,0.801698,0.01731,0.307211,-0.260344,0.330627,0.023948,...,0.834441,-0.175378,-0.296927,0.099576,0.524566,0.176383,0.293972,0.709457,0.014983,-0.425568
3,0.193921,0.167028,1.432826,0.004652,0.500479,-0.093389,-0.112578,-0.371284,-0.026065,-0.333763,...,0.258389,0.052877,-0.249932,0.336112,0.391251,0.303242,0.4696,0.400498,-0.168094,-0.380091
4,0.330913,0.087221,0.767785,-0.250887,0.697848,-0.186091,0.293285,0.110699,0.054768,0.134924,...,0.897957,0.239198,0.128513,0.021122,0.285721,0.31134,0.162864,0.096283,0.011656,-0.420956
5,0.311636,0.000135,0.578208,-0.251975,0.657058,0.093672,0.208355,-0.292581,-0.114231,0.1751,...,0.969762,0.288961,0.147421,0.030505,0.496645,0.325845,0.265427,0.4402,0.23387,-0.468628


In [49]:
# Fit a 200-component PCA on the feature matrix to reduce its dimensionality
# before clustering

pca = PCA(n_components=200, random_state=SEED).fit(x)

# Total share of variance retained by the kept components
print(f"% de explicação: {pca.explained_variance_ratio_.sum()}")

% de explicação: 0.9592797971306577


In [50]:
# Project a copy of the feature matrix onto the fitted components
pca_data = pca.transform(x.copy())
# Wrap the projection in a DataFrame; it gets a fresh default index, so rows
# correspond to `x` by position only
df_pca = pd.DataFrame(data=pca_data)

df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-8.556503,1.012994,-1.365612,0.994851,-0.544008,0.759498,1.118039,-0.346489,-0.210964,0.406425,...,-0.170294,-0.095541,0.380696,0.295853,0.02309,0.054659,-0.041341,0.231815,0.024083,0.225583
1,-9.280031,1.554039,-1.941022,1.484084,0.823276,0.592701,1.651494,-0.119697,0.564189,-0.27338,...,0.273763,-0.309241,-0.080726,0.234515,-0.027096,-0.235478,-0.703119,0.013708,-0.255336,0.018773
2,-8.737266,0.191652,-0.023542,0.270845,-1.8625,0.305419,0.095542,0.055068,0.727278,-0.060317,...,0.124116,0.034418,-0.262682,-0.023748,-0.113453,0.019382,0.387409,-0.126181,0.076706,0.00314
3,-8.963469,0.360455,-2.241995,1.630768,-0.051841,-0.037664,1.288381,-0.271834,-0.14682,-0.155585,...,-0.118282,-0.280217,0.080986,0.023779,0.024101,0.024812,-0.171245,-0.039325,-0.099983,-0.28203
4,-8.916765,1.537687,-2.739109,1.66139,0.245077,-0.055271,1.655327,-0.161209,-0.2618,0.070388,...,0.054294,-0.280801,-0.260751,0.114945,0.106807,-0.008633,0.049022,0.170718,-0.023203,-0.08818


In [117]:
# NOTE(review): the search that produced this cluster count is not part of the
# visible cells — confirm where 12155 comes from.
best_n_clusters = 12155
# NOTE(review): n_init is left at the library default, which has changed between
# scikit-learn releases — consider pinning it for reproducibility.
cluster = KMeans(n_clusters=best_n_clusters, random_state=SEED)
cluster.fit(df_pca)
# One cluster label per row of df_pca, in the same order
grupos = cluster.labels_

score_cluster = silhouette_score(df_pca, grupos)
print(f"Silhouette score de {score_cluster} para clusterização com {best_n_clusters}")

Silhouette score de 0.8074673664314065 para clusterização com 12155


In [120]:
# Bring medicamento_id back as a column and append the cluster labels; the
# labels are matched to rows by position, in the order of `x`
df_cluster = x.reset_index().assign(cluster=grupos)

In [132]:
# Number of rows assigned to each cluster, largest clusters first
df_cluster.value_counts("cluster")

cluster
3        46
41       43
235      38
1943     38
221      36
         ..
7145      1
7144      1
905       1
906       1
12154     1
Name: count, Length: 12155, dtype: int64

# Salvando os dados no S3

In [134]:
import awswrangler as wr

# Destination object for the medicamento_id -> cluster mapping
s3_path = "s3://catalogo-medicamentos/gold/dim_cluster.parquet"
# Write only the identifier and its cluster label, without the DataFrame index
wr.s3.to_parquet(
    df=df_cluster[["medicamento_id", "cluster"]],
    path=s3_path,
    index=False,
)

{'paths': ['s3://catalogo-medicamentos/gold/dim_cluster.parquet'],
 'partitions_values': {}}