In [4]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from yellowbrick.cluster import KElbowVisualizer

import matplotlib.pyplot as plt

In [5]:
# Load the history CSV; the file uses ';' as the field separator.
# NOTE(review): a later cell swaps ',' for '.' and casts several columns to float;
# `decimal=','` might let read_csv parse them directly — confirm before changing.
caminho_historico = '../dados/dados_totvs/historico.csv'
df = pd.read_csv(caminho_historico, sep=';')

In [6]:
# Display the frame loaded from the CSV.
df

Unnamed: 0,NR_PROPOSTA,ITEM_PROPOSTA,DT_UPLOAD,HOSPEDAGEM,CD_CLI,FAT_FAIXA,CD_PROD,QTD,MESES_BONIF,VL_PCT_DESC_TEMP,VL_PCT_DESCONTO,PRC_UNITARIO,VL_DESCONTO_TEMPORARIO,VL_TOTAL,VL_FULL,VL_DESCONTO
0,AAMQSF,1,2025-03-25,ON PREMISES,TFDPFE,Sem Informações de Faturamento,0113301112,1,0,0,286492879623732,210192868395988,0,210192868395988,659931618873727,449738750477739
1,AAJUVA,7,2024-03-28,ON PREMISES,T03306,Faixa 08 - De 150 M ate 300 M,AUT.04.000450,1,0,0,0,053388988572581,0,053388988572581,053388988572581,0
2,AAKX71,1,2024-08-21,ON PREMISES,T48463,Faixa 03 - De 15 M ate 25 M,1M13301050,1,0,0,0,122289790447049,0,122289790447049,630578605187964,0
3,AAMJNP,1,2025-02-17,ON PREMISES,TFEED1,Sem Informações de Faturamento,71A3301148,1,0,0,0,601067526465167,0,601067526465167,601067526465167,0
4,AAKFC4,1,2024-05-23,ON PREMISES,TDC1GA,Sem Informações de Faturamento,CONSV.502,45,0,0,0,939520083156387,0,422786139349058,00189173581556389,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22735,AAMEYG,1,2025-01-24,ON PREMISES,TFCRX2,Faixa 06 - De 50 M ate 75 M,0131001016-3,1,0,0,0,369469036737469,0,369469036737469,369469036737469,0
22736,AALLSL,1,2024-10-30,ON PREMISES,TFEC85,"Faixa 02 - De 7,5 M ate 15 M",01A9001001,1,0,0,350601704484508,549478209231793,0,549478209231793,569645374182914,475262890872801
22737,AALXKZ,1,2024-12-17,ON PREMISES,TFDJ86,Sem Informações de Faturamento,CONSV.212,19,0,0,0,717682529851261,0,13635968067174,538266101245814,0
22738,AALVEJ,3,2024-12-19,TOTVS CLOUD,TFECJG,Sem Informações de Faturamento,71A3301405,15,999,126115721037593,0,154701951139447,696158780127512,232052926709171,232052926709171,0


In [17]:
# Summarize column dtypes and non-null counts.
# NOTE(review): this cell carries execution count 17, i.e. it was last run after
# the float-conversion cell (16), so the printed dtypes describe the converted frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22740 entries, 0 to 22739
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   NR_PROPOSTA             22740 non-null  object 
 1   ITEM_PROPOSTA           22740 non-null  float64
 2   DT_UPLOAD               22740 non-null  object 
 3   HOSPEDAGEM              22740 non-null  object 
 4   CD_CLI                  22740 non-null  object 
 5   FAT_FAIXA               22740 non-null  object 
 6   CD_PROD                 22740 non-null  object 
 7   QTD                     22740 non-null  float64
 8   MESES_BONIF             22740 non-null  int64  
 9   VL_PCT_DESC_TEMP        22740 non-null  float64
 10  VL_PCT_DESCONTO         22740 non-null  float64
 11  PRC_UNITARIO            22740 non-null  float64
 12  VL_DESCONTO_TEMPORARIO  22740 non-null  float64
 13  VL_TOTAL                22740 non-null  float64
 14  VL_FULL                 22740 non-null

In [8]:
# List the column labels of the frame.
df.columns

Index(['NR_PROPOSTA', 'ITEM_PROPOSTA', 'DT_UPLOAD', 'HOSPEDAGEM', 'CD_CLI',
       'FAT_FAIXA', 'CD_PROD', 'QTD', 'MESES_BONIF', 'VL_PCT_DESC_TEMP',
       'VL_PCT_DESCONTO', 'PRC_UNITARIO', 'VL_DESCONTO_TEMPORARIO', 'VL_TOTAL',
       'VL_FULL', 'VL_DESCONTO'],
      dtype='object')

In [15]:
# Swap commas for dots in every string cell so the numeric columns can later be
# cast to float. `DataFrame.applymap` is deprecated since pandas 2.1 (it emits a
# FutureWarning), so the same element-wise function is applied column by column
# through `Series.map`, which yields the same result on any pandas version.
# NOTE(review): every string cell is rewritten, so text columns change too
# (e.g. "7,5" inside a FAT_FAIXA label becomes "7.5") — confirm this is intended.
df = df.apply(
    lambda coluna: coluna.map(
        lambda valor: valor.replace(",", ".") if isinstance(valor, str) else valor
    )
)

  df = df.applymap(lambda x: str(x).replace(",", ".") if isinstance(x, str) else x)


In [16]:
# Columns to be converted to float
cols_para_float = [
    'ITEM_PROPOSTA',
    'QTD',
    'VL_PCT_DESC_TEMP',
    'VL_PCT_DESCONTO',
    'PRC_UNITARIO',
    'VL_DESCONTO_TEMPORARIO',
    'VL_TOTAL',
    'VL_FULL',
    'VL_DESCONTO',
]

# Cast all listed columns at once and write them back into the same frame
df[cols_para_float] = df.loc[:, cols_para_float].astype(float)

In [18]:
# Rename the client-code column; all other labels are left unchanged.
df = df.rename({"CD_CLI": "CD_CLIENTE"}, axis="columns")

In [19]:
# Aggregation helper: mode (most frequent value) of a group

def agg_moda(x):
    """Return the mode of ``x`` after dropping missing values.

    Parameters
    ----------
    x : object exposing ``dropna().mode()`` (used here with pandas Series)

    Returns
    -------
    The first entry of the mode result, or ``np.nan`` when the mode result
    is empty (for instance when every value is missing).
    """
    modas = x.dropna().mode()
    return np.nan if modas.empty else modas.iloc[0]

In [20]:
# Collapse the rows into one row per CD_CLIENTE.
# NR_PROPOSTA is counted, four columns keep their mode (agg_moda) and the
# remaining columns get both median and sum. The key order of the spec
# determines the order of the resulting columns.
df_agrupado = (
    df.groupby('CD_CLIENTE')
    .agg({
        'NR_PROPOSTA': 'count',
        **dict.fromkeys(
            ['ITEM_PROPOSTA', 'HOSPEDAGEM', 'FAT_FAIXA', 'CD_PROD'],
            agg_moda,
        ),
        **{
            coluna: ['median', 'sum']
            for coluna in [
                'QTD',
                'VL_PCT_DESC_TEMP',
                'VL_PCT_DESCONTO',
                'PRC_UNITARIO',
                'VL_DESCONTO_TEMPORARIO',
                'VL_TOTAL',
                'VL_FULL',
                'VL_DESCONTO',
            ]
        },
    })
    .reset_index()
)

In [23]:
# Replace the column labels of the aggregated frame with flat
# "<column>_<aggregation>" names. The labels are positional: they must follow
# the same order as the aggregation spec used to build df_agrupado.
df_agrupado.columns = [
    'CD_CLIENTE',
    'NR_PROPOSTA_count',
    *(
        f'{coluna}_agg_moda'
        for coluna in ('ITEM_PROPOSTA', 'HOSPEDAGEM', 'FAT_FAIXA', 'CD_PROD')
    ),
    *(
        f'{coluna}_{estatistica}'
        for coluna in (
            'QTD',
            'VL_PCT_DESC_TEMP',
            'VL_PCT_DESCONTO',
            'PRC_UNITARIO',
            'VL_DESCONTO_TEMPORARIO',
            'VL_TOTAL',
            'VL_FULL',
            'VL_DESCONTO',
        )
        for estatistica in ('median', 'sum')
    ),
]


In [24]:
# Display the per-client aggregated frame.
df_agrupado

Unnamed: 0,CD_CLIENTE,NR_PROPOSTA_count,ITEM_PROPOSTA_agg_moda,HOSPEDAGEM_agg_moda,FAT_FAIXA_agg_moda,CD_PROD_agg_moda,QTD_median,QTD_sum,VL_PCT_DESC_TEMP_median,VL_PCT_DESC_TEMP_sum,...,PRC_UNITARIO_median,PRC_UNITARIO_sum,VL_DESCONTO_TEMPORARIO_median,VL_DESCONTO_TEMPORARIO_sum,VL_TOTAL_median,VL_TOTAL_sum,VL_FULL_median,VL_FULL_sum,VL_DESCONTO_median,VL_DESCONTO_sum
0,T00053,2,1.0,ON PREMISES,Faixa 05 - De 35 M ate 50 M,5J16017014,25.5,51.0,0.000000,0.000000,...,686.359589,1372.719177,0.000000,0.000000,2158.975028,4317.950057,0.107198,0.214397,0.000000,0.000000
1,T00082,7,1.0,ON PREMISES,Faixa 04 - De 25 M ate 35 M,01A3374001,2.0,74.5,0.000000,39.948710,...,194.445219,3463.331077,0.000000,849.523905,1323.676977,12933.688471,366.618401,6747.453817,0.000000,0.000000
2,T00145,1,1.0,ON PREMISES,Faixa 03 - De 15 M ate 25 M,7TA1001079,10.0,10.0,0.000000,0.000000,...,2.349956,2.349956,0.000000,0.000000,23.499563,23.499563,23.499563,23.499563,0.000000,0.000000
3,T00336,2,1.0,ON PREMISES,Faixa 08 - De 150 M ate 300 M,1131001030-6,1.0,2.0,0.000000,0.000000,...,8611.751055,17223.502110,0.000000,0.000000,8611.751055,17223.502110,8611.751055,17223.502110,0.000000,0.000000
4,T00673,4,1.0,ON PREMISES,Faixa 03 - De 15 M ate 25 M,0111053007,1.0,4.0,0.000000,0.000000,...,240.288283,6782.373158,0.000000,0.000000,240.288283,6782.373158,338.866637,13031.352485,98.578353,6248.979327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4048,TFEEXW,28,2.0,ON PREMISES,Faixa 08 - De 150 M ate 300 M,2010067-2,40.0,6966.0,14.713501,321.595089,...,44.695412,21425.505577,130.424675,4181.093480,486.174003,79593.926853,511.397147,87149.275877,0.000000,9236.374896
4049,TFEEYC,1,1.0,ON PREMISES,Sem Informações de Faturamento,71A3301148,1.0,1.0,6.305786,6.305786,...,60.106753,60.106753,9.017274,9.017274,60.106753,60.106753,60.106753,60.106753,0.000000,0.000000
4050,TFEEYH,7,1.0,ON PREMISES,Sem Informações de Faturamento,CONSV.207,1.0,58.0,0.000000,0.000000,...,19.337744,411.616490,0.000000,0.000000,32.630341,3880.568125,96.318780,913.262790,11.636277,122.832508
4051,TFEEYP,13,2.0,ON PREMISES,Sem Informações de Faturamento,CMV.001518,105.0,850.0,0.000000,0.000000,...,1.345234,288.384615,0.000000,0.000000,88.281005,1705.925320,79.452904,1899.302759,0.000000,279.031033
