In [None]:
#!pip install mlxtend
#!pip install efficient-apriori
!pip install openai -U
!pip install tiktoken -U

In [127]:
import warnings, random

import datetime, time
import openai
import tiktoken
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

import efficient_apriori as efa
from mlxtend.frequent_patterns import apriori, association_rules, fpmax, hmine, fpcommon, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, silhouette_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from tqdm import tqdm  # Para visualizar o progresso

warnings.filterwarnings('ignore')

In [None]:
api_key = 'api-key'

client = openai.OpenAI(api_key=api_key)

In [2]:
path = 'D:/Disco/Data/PUCRS/proj CDIA III/'

tables = ['PUC_VENDAS', 'PUC_CAMPANHAS']

In [3]:
vendas_dataset = pq.ParquetDataset(f'{path}/dataset/{tables[0]}')
#campanhas_dataset = pq.ParquetDataset(f'{path}/dataset/{tables[1]}')

In [95]:
vendas = vendas_dataset.read_pandas()
#campanhas = campanhas_dataset.read_pandas()

vendas = vendas.to_pandas()

In [96]:
vendas['COD_CUPOM_LOJA'] = vendas['COD_CUPOM'].astype(str) + '-' + vendas['COD_LOJA'].astype(str)

max_cod = vendas['COD_CLIENTE'].max()
mask_zero = (vendas['COD_CLIENTE'] == 0) | (pd.isna(vendas['COD_CLIENTE']))
loc_cupom = vendas.loc[mask_zero, 'COD_CUPOM_LOJA']
unique_cod_cupom = loc_cupom.unique()
cod_clientes = np.arange(max_cod + 1, max_cod + len(unique_cod_cupom) + 1, dtype=int)

map_codes = dict(zip(unique_cod_cupom, cod_clientes))
vendas.loc[mask_zero, 'COD_CLIENTE'] = vendas.loc[mask_zero, 'COD_CUPOM_LOJA'].map(map_codes)

vendas['COD_CLIENTE'] = vendas['COD_CLIENTE'].astype(int)

In [97]:
cod_clientes = vendas.groupby('COD_CLIENTE')['COD_CUPOM'].nunique().reset_index()
cod_clientes.rename(columns={'COD_CUPOM':'n_cupom'}, inplace=True)

n_skus = vendas.groupby(['COD_CLIENTE', 'COD_CUPOM'])['COD_SKU'].count().reset_index()
n_skus.rename(columns={'COD_SKU': 'n_skus'}, inplace=True)

avg_skus = n_skus.groupby('COD_CLIENTE')['n_skus'].mean().reset_index()
avg_skus.rename(columns={'n_skus': 'avg_skus'}, inplace=True)

n_skus = n_skus.groupby('COD_CLIENTE')['n_skus'].sum().reset_index()

n_skus_u = vendas.groupby('COD_CLIENTE')['COD_SKU'].nunique().reset_index()
n_skus_u.rename(columns={'COD_SKU':'n_skus_u'}, inplace=True)

total_sum = vendas.groupby('COD_CLIENTE')['TOTAL_LIQUIDO'].sum().reset_index()
total_sum.rename(columns={'TOTAL_LIQUIDO':'total_sum'}, inplace=True)

avg_cupom = total_sum.merge(cod_clientes, on='COD_CLIENTE')
avg_cupom['avg_cupom'] = avg_cupom['total_sum'] / avg_cupom['n_cupom']

#clientes_features = cod_clientes.merge(n_skus, on='COD_CLIENTE')
clientes_features = cod_clientes.merge(avg_skus, on='COD_CLIENTE')
clientes_features = clientes_features.merge(n_skus, on='COD_CLIENTE')
clientes_features = clientes_features.merge(n_skus_u, on='COD_CLIENTE')
clientes_features = clientes_features.merge(avg_cupom[['COD_CLIENTE', 'avg_cupom', 'total_sum']], on='COD_CLIENTE')
clientes_features


Unnamed: 0,COD_CLIENTE,n_cupom,avg_skus,n_skus,n_skus_u,avg_cupom,total_sum
0,120,1,2.0,2,2,25.990,25.99
1,250,10,2.2,22,19,80.183,801.83
2,253,2,2.5,5,5,56.755,113.51
3,408,1,7.0,7,6,171.080,171.08
4,460,2,1.5,3,3,25.095,50.19
...,...,...,...,...,...,...,...
1474962,94980993,1,3.0,3,2,10.510,10.51
1474963,94980994,1,2.0,2,2,12.190,12.19
1474964,94980995,1,1.0,1,1,9.490,9.49
1474965,94980996,1,2.0,2,2,25.650,25.65


In [119]:
products = vendas.groupby(['COD_SKU', 'SKU', 'CATEGORIA_SKU', 'SUBCATEGORIA_SKU'], as_index=False).agg({'PRECO_REGULAR': 'mean'})
products.rename(columns={'PRECO_REGULAR': 'PRECO_REGULAR_AVG'}, inplace=True)
products['description'] = products['SKU'] + ' ' + products['CATEGORIA_SKU'] + ' ' + products['SUBCATEGORIA_SKU']
products

Unnamed: 0,COD_SKU,SKU,CATEGORIA_SKU,SUBCATEGORIA_SKU,PRECO_REGULAR_AVG,description
0,7,SERVICO EM DOMICILIO,DIVERSOS,SERVICOS TELE ENTREGA,5.000000,SERVICO EM DOMICILIO DIVERSOS SERVICOS TELE EN...
1,10,MANIPULADOS,MANIPULADOS,MANIPULADOS MEDICAMENTOS,0.010000,MANIPULADOS MANIPULADOS MANIPULADOS MEDICAMENTOS
2,11,SERVICO DE APLICACAO DE INJETAVEIS,SERVIÇOS,SERVICOS FARMACEUTICOS,5.000000,SERVICO DE APLICACAO DE INJETAVEIS SERVIÇOS SE...
3,12,SERVICO DE GESTO VACINAL GRIPE CONVENIOS,SERVIÇOS,SERVICOS FARMACEUTICOS,7.983931,SERVICO DE GESTO VACINAL GRIPE CONVENIOS SERVI...
4,13,SERVICO DE VERIFICACAO DE GLICEMIA CAPIL,SERVIÇOS,SERVICOS FARMACEUTICOS,3.000000,SERVICO DE VERIFICACAO DE GLICEMIA CAPIL SERVI...
...,...,...,...,...,...,...
20653,100027853,ESC CAB MARCO BONI 7,PERFUMARIA,PERFUMARIA,14.990000,ESC CAB MARCO BONI 7 PERFUMARIA PERFUMARIA
20654,100027855,ESC CAB MARCO BONI 8,PERFUMARIA,PERFUMARIA,29.990000,ESC CAB MARCO BONI 8 PERFUMARIA PERFUMARIA
20655,100027878,NEBULIZADOR PULMOMAI,MEDICAMENTOS,LIBERADOS,219.900000,NEBULIZADOR PULMOMAI MEDICAMENTOS LIBERADOS
20656,100027882,AMPOLA PANTENE 3UN NUTRE,PERFUMARIA,PERFUMARIA,27.956292,AMPOLA PANTENE 3UN NUTRE PERFUMARIA PERFUMARIA


In [151]:

batch = []
for description in products['description']:
    batch.append({"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-3-small", "input": description, "encoding_format": "float"}})

batch


[{'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'model': 'text-embedding-3-small',
   'input': 'SERVICO EM DOMICILIO DIVERSOS SERVICOS TELE ENTREGA',
   'encoding_format': 'float'}},
 {'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'model': 'text-embedding-3-small',
   'input': 'MANIPULADOS MANIPULADOS MANIPULADOS MEDICAMENTOS',
   'encoding_format': 'float'}},
 {'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'model': 'text-embedding-3-small',
   'input': 'SERVICO DE APLICACAO DE INJETAVEIS SERVIÇOS SERVICOS FARMACEUTICOS',
   'encoding_format': 'float'}},
 {'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'model': 'text-embedding-3-small',
   'input': 'SERVICO DE GESTO VACINAL GRIPE CONVENIOS SERVIÇOS SERVICOS FARMACEUTICOS',
   'encoding_format': 'float'}},
 {'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body'

In [129]:
def num_tokens(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [147]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding




In [121]:
descriptions = products['description'].tolist()


In [132]:
num_tokens(descriptions[:1][0], 'cl100k_base')

18

In [149]:
# Gerar embeddings
embeddings = get_embedding(descriptions[:1][0])

# Converter lista de embeddings em array numpy
embeddings_array = np.array(embeddings)

In [150]:
embeddings_array

array([ 0.01674971, -0.00475598,  0.02654027, ...,  0.02567606,
       -0.02190084, -0.01184874])

In [None]:
# Normalizar o preço
scaler = MinMaxScaler()
precos_normalizados = scaler.fit_transform(products[['PRECO_REGULAR_AVG']])

# Concatenar embeddings com o preço normalizado
embeddings_com_preco = np.concatenate((embeddings_array, precos_normalizados), axis=1)

# Criar um DataFrame com os embeddings finais
df_embeddings = pd.DataFrame(embeddings_com_preco)

# Combinar com o DataFrame original
df_final = pd.concat([products.reset_index(drop=True), df_embeddings.reset_index(drop=True)], axis=1)


#### Clustering customers

In [100]:
X = clientes_features[['n_cupom', 'avg_skus', 'n_skus', 'n_skus_u', 'avg_cupom', 'total_sum']]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test = train_test_split(X, test_size=.2, random_state=21)


In [104]:
def silhouette_scorer(estimator, X):
    cluster_labels = estimator.fit_predict(X)
    return silhouette_score(X, cluster_labels)

In [106]:
cls = KMeans(n_clusters=3)
cls.fit(X_train)

In [110]:
kmeans_params = {'n_clusters':[2,3,4]}

cls = KMeans()

grid_kms = GridSearchCV(cls, kmeans_params, scoring=silhouette_scorer, cv=5)
grid_kms.fit(X_train)
best_cls = grid_kms.best_estimator_
print("Melhor Configuração k-NN:", grid_kms.best_params_)
print("Acurácia:", grid_kms.best_score_)

Melhor Configuração k-NN: {'n_clusters': 2}
Acurácia: 0.8573033241140807


In [109]:
grid_kms.best_score_

0.8695083860053316

In [63]:
cupons_list = vendas['COD_CUPOM_LOJA'].unique()
vendas_train, vendas_test =  train_test_split(cupons_list, test_size=.2, random_state=36)

vendas_train = vendas[vendas['COD_CUPOM_LOJA'].isin(vendas_train)]
vendas_test = vendas[vendas['COD_CUPOM_LOJA'].isin(vendas_test)]

In [66]:
len(vendas['COD_CUPOM'].unique()), len(vendas['COD_CUPOM_LOJA'].unique())

(951832, 4099925)

In [62]:
vendas[vendas['COD_CUPOM'] == 497951]

Unnamed: 0,COD_CUPOM,COD_CLIENTE,CLIENTE_FISICO_JURIDICO,SEXO_CLIENTE,DTNASCIMENTO_CLIENTE,COD_SKU,SKU,CATEGORIA_SKU,SUBCATEGORIA_SKU,UF_CIDADE,...,NOME_CIDADE,DATA_CUPOM,UNIDADES,IDENTIFICADOR_PROMOCIONAL,PRECO_REGULAR,TOTAL_DESCONTO,TOTAL_BRUTO,TOTAL_LIQUIDO,COD_LOJA,COD_CUPOM_LOJA
387524,497951,34537301.0,Pessoa Física,M,1966-09-19,100002903,SERINGA BD INS ULTRA-FINE 30UI 6MM 10UN,MEDICAMENTOS,HOSPITALARES,RS,...,PORTO ALEGRE,2024-03-28 15:04:34.000,1.0,100296834.0,39.9,7.98,39.9,31.92,1036,497951-1036
5888340,497951,251552.0,Pessoa Física,M,1970-08-06,100027005,BOMBOM LACTA OURO BRANCO,CONVENIENCIA,CONVENIENCIA PERECIVEIS,RS,...,PORTO ALEGRE,2024-02-22 16:41:20.000,1.0,,1.99,0.0,1.99,1.99,598,497951-598
5888341,497951,251552.0,Pessoa Física,M,1970-08-06,100027005,BOMBOM LACTA OURO BRANCO,CONVENIENCIA,CONVENIENCIA PERECIVEIS,RS,...,PORTO ALEGRE,2024-02-22 16:41:20.000,1.0,,1.99,0.0,1.99,1.99,598,497951-598
5888342,497951,251552.0,Pessoa Física,M,1970-08-06,11903,LEXOTAN 3MG 30CP MOKSHA8 (B1),MEDICAMENTOS,REFERENCIA ONEROSO CONTROLADO,RS,...,PORTO ALEGRE,2024-02-22 16:41:20.000,2.0,,51.07,12.26,102.14,89.88,598,497951-598
7549866,497951,41905354.0,Pessoa Física,F,1955-12-13,10034985,FR HUGGIES ROUPINHA SUPREME CARE M 80UN,PERFUMARIA,FRALDAS,RS,...,PORTO ALEGRE,2024-01-19 20:17:06.000,1.0,100270636.0,84.99,5.09,84.99,79.9,774,497951-774
7549867,497951,41905354.0,Pessoa Física,F,1955-12-13,10022473,TOALHAS UMED SNOW BA,PERFUMARIA,LENCOS E TOALHAS UMEDECIDAS,RS,...,PORTO ALEGRE,2024-01-19 20:17:06.000,1.0,,13.99,0.0,13.99,13.99,774,497951-774
8337746,497951,91026519.0,Pessoa Física,M,1957-03-18,10004993,GLIFAGE XR 500MG 30C,MEDICAMENTOS,REFERENCIA FARMACIA POPULAR,RS,...,PORTO ALEGRE,2024-03-06 10:20:28.000,2.0,,5.4,10.8,10.8,10.8,829,497951-829
9219216,497951,94980913.0,Pessoa Física,M,,10030882,BOLINHO BAUDUCCO DUPLO CHOCOLATE 40G,CONVENIENCIA,CONVENIENCIA PERECIVEIS,RS,...,PORTO ALEGRE,2024-05-10 14:21:41.933,1.0,,2.29,0.0,2.29,2.29,988,497951-988
9219217,497951,94980913.0,Pessoa Física,M,,10035581,HAMBURGUER SEARA HOT HIT PICANHA 145G,CONVENIENCIA,CONGELADOS,RS,...,PORTO ALEGRE,2024-05-10 14:21:41.933,1.0,,9.9,0.0,9.9,9.9,988,497951-988


In [59]:
vendas[vendas['COD_CLIENTE'] == 0]['COD_CUPOM_LOJA']

2764       162203-1001
2765       162203-1001
2766       160629-1001
2767       168754-1001
2768       168754-1001
              ...     
9219217     497951-988
9219218     515815-988
9219219     527434-988
9219220     527434-988
9219221     511506-988
Name: COD_CUPOM_LOJA, Length: 1042842, dtype: object

In [14]:
vendas.groupby(by=['COD_CLIENTE'], as_index=False).agg({'COD_CUPOM':'count', 'TOTAL_LIQUIDO':'sum', 'TOTAL_LIQUIDO':'mean', 'COD_SKU':'count'})

Unnamed: 0,COD_CLIENTE,COD_CUPOM,TOTAL_LIQUIDO,COD_SKU
0,0.0,1042842,13.939908,1042842
1,120.0,2,12.995000,2
2,250.0,22,36.446818,22
3,253.0,5,22.702000,5
4,408.0,7,24.440000,7
...,...,...,...,...
782981,94288844.0,3,20.623333,3
782982,94288852.0,2,15.590000,2
782983,94288893.0,2,27.695000,2
782984,94288950.0,2,17.815000,2


In [None]:
for cod_cliente in vendas.groupby(by=['COD_CLIENTE'], as_index=False)['COD_CLIENTE']:
    cod_cliente

In [7]:
vendas.groupby(by=['COD_CLIENTE'], as_index=False).agg({'':'count'})

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F5BFB385E0>

In [126]:
def extract_rules_eff(transactions_df, min_support, min_threshold):
    transactions = [tuple(set(transaction)) for transaction in transactions_df.tolist()]
    itemsets, rules = efa.apriori(transactions, min_support=min_support, min_confidence=min_threshold)

    temp_df = []
    for rule in rules:
        rl = {
            'lhs':rule.lhs,
            'rhs':rule.rhs,
            'confidence':rule.confidence,
            'conviction':rule.conviction,
            'lift':rule.lift,
            'rpf':rule.rpf,
            'support':rule.support
        }

        temp_df.append(rl)

    temp_df = pd.DataFrame(temp_df)

    del transactions, itemsets, rules

    return temp_df    

def extract_rules(transactions_df, min_support, min_threshold, metric):
    
    # Assuming 'transaction_data' is a list of lists where each sublist contains SKUs bought in a transaction.
    te = TransactionEncoder()
    te_ary = te.fit(transactions_df).transform(transactions_df)
    df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

    # Apply the apriori algorithm with a minimum support of 0.01 (adjustable)
    #frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)
    frequent_itemsets = fpgrowth(df_encoded, min_support=min_support)

    # Generate association rules with a minimum confidence of 0.5 (adjustable)
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)

    del te, te_ary, df_encoded, frequent_itemsets

    return rules

'''
def get_suggestions(sku_list, rules_df):
    
    if not isinstance(rules_df['antecedents'].iloc[0], (list, tuple, set)):
        rules_df['antecedents'] = rules_df['antecedents'].apply(lambda x: eval(x) if isinstance(x, str) else x)
    
    filtered_rules = rules_df[rules_df['antecedents'].apply(lambda x: any(sku in x for sku in sku_list))]
    
    sorted_rules = filtered_rules.sort_values(by=['consequent support', 'confidence'], ascending=False)

    # Obter todos os SKUs sugeridos da coluna 'consequents'
    suggested_skus = sorted_rules['consequents'].explode().unique()
    
    # Selecionar no máximo 6 SKUs
    suggested_skus = suggested_skus[:6]
    
    return suggested_skus
'''
    
def get_suggestions(sku_list, rules_df, top_n=6):

    suggestions = pd.DataFrame(columns=['rhs', 'confidence', 'conviction', 'lift'])

    filtered_rules = rules_df[rules_df['lhs'].apply(lambda x: any(sku in x for sku in sku_list))]
    filtered_rules = filtered_rules.sort_values(by=['confidence', 'lift', 'conviction'], ascending=False)

    if filtered_rules.empty == False:
        suggestions = filtered_rules[['rhs', 'confidence', 'conviction', 'lift']].copy()
        
        # Remover SKUs já presentes na sku_list
        suggestions = suggestions[~suggestions['rhs'].isin(sku_list)]

        # Expandir o 'rhs' caso contenha múltiplos itens
        suggestions = suggestions.explode('rhs')
        
        # Remover duplicatas baseadas no 'rhs'
        suggestions = suggestions.drop_duplicates(subset='rhs')
        
        # Selecionar apenas os top_n resultados
        suggestions = suggestions.head(top_n)
        
        # Resetar o índice para melhor apresentação
        suggestions = suggestions.reset_index(drop=True)
    
    return suggestions

def get_recommendations(suggestions_df, products_df, discount):
    # Verificar se o desconto está entre 0 e 1
    if not 0 <= discount <= 1:
        raise ValueError("O desconto deve ser um valor decimal entre 0 e 1 (por exemplo, 0.10 para 10%)")
    
    # Mesclar suggestions_df com products_df com base na correspondência de 'rhs' e 'COD_SKU'
    merged_df = suggestions_df.merge(products_df, left_on='rhs', right_on='COD_SKU', how='left')
    
    # Verificar se a mesclagem resultou em produtos correspondentes
    if merged_df.empty:
        #print("Nenhum produto correspondente encontrado para as sugestões fornecidas.")
        return merged_df
    
    # Aplicar o desconto ao preço regular e adicionar nova coluna
    merged_df['PRECO_COM_DESCONTO'] = merged_df['PRECO_REGULAR_AVG'] * (1 - discount)
    
    # Selecionar as colunas desejadas
    columns_to_return = ['COD_SKU', 'SKU', 'CATEGORIA_SKU', 'PRECO_REGULAR_AVG', 'PRECO_COM_DESCONTO']
    recommended_products = merged_df[columns_to_return]
    
    return recommended_products


def evalSuggestions(sales_test, products, rules):
    """
    Avalia o desempenho das recomendações geradas pelas regras de associação.
    
    Parâmetros:
    - sales_test: DataFrame contendo as vendas reais no conjunto de teste.
    - rules: DataFrame contendo as regras de associação.

    Retorna:
    - metrics_df: DataFrame com as métricas de avaliação calculadas.
    """
    # Lista para armazenar as métricas
    metrics_list = []

    # Agrupar as compras por cliente (ou por cupom, dependendo da granularidade desejada)
    grouped_sales = sales_test.groupby('COD_CUPOM')

    # Iterar sobre cada cupom
    for cod_cupom, group in tqdm(grouped_sales, desc='Avaliando Recomendações'):
        #for cod_cupom in sales_test.groupby('COD_CUPOM'):

        # Produtos comprados pelo cliente neste período
        purchased_products = group['COD_SKU'].unique().tolist()

        # Simular o carrinho atual (pode ser os produtos comprados anteriormente)
        # Aqui, assumimos que o cliente já comprou alguns produtos e queremos recomendar adicionais
        # Para simplificar, vamos assumir que o carrinho está vazio ou usar os produtos do cliente no conjunto de treinamento
        cart = purchased_products  # Aqui você pode customizar como obter o carrinho do cliente

        # Gerar sugestões com base no carrinho e nas regras
        suggestions = get_suggestions(cart, rules)

        # Obter as recomendações (DataFrame)
        recommendations_df = get_recommendations(suggestions, products, discount=.05)
        recommended_products = recommendations_df['COD_SKU'].tolist()

        # Criar vetores binários para cálculo das métricas
        # Produtos relevantes (1 se o produto foi comprado, 0 caso contrário)
        y_true = [1 if sku in purchased_products else 0 for sku in recommended_products]

        # Produtos recomendados (1 para todos os recomendados)
        y_pred = [1]*len(recommended_products)

        # Se não houver recomendações, pular para evitar divisões por zero
        if not recommended_products:
            continue

        # Calcular métricas
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        accuracy = accuracy_score(y_true, y_pred)

        # Armazenar as métricas
        metrics_list.append({
            'COD_CUPOM': cod_cupom,
            'COD_CLIENTE': group['COD_CLIENTE'].unique()[0],
            'Items':len(purchased_products),
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'Accuracy': accuracy,
            'Num_Recommendations': len(recommended_products),
            'Num_Relevant': sum(y_true),
            'purchased_products':purchased_products,
            'recommended_products':recommended_products
        })

    # Converter a lista de métricas em DataFrame
    metrics_df = pd.DataFrame(metrics_list)

    # Calcular as métricas médias
    avg_metrics = metrics_df[['Precision', 'Recall', 'F1-Score', 'Accuracy']].mean().to_dict()
    print("Métricas Médias:")
    for metric, value in avg_metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics_df


In [117]:
#transaction_data = vendas[vendas['COD_CLIENTE'] > 0].groupby(by=['COD_CLIENTE'])['COD_SKU'].apply(list)
transaction_data = vendas_train.groupby(by=['COD_CUPOM'])['COD_SKU'].apply(list)
#transaction_data

COD_CUPOM
1                                          [10099287, 6278]
2                  [10033544, 37, 5692, 10004182, 10085918]
4                                  [10006305, 37, 10031975]
6         [100003920, 37, 10034458, 10038466, 10000787, ...
7                                  [10034125, 37, 10004993]
                                ...                        
990196                                           [10002213]
990197                                           [10037790]
990198                                           [10004206]
990199                                      [1001381, 2059]
990200                                 [10005480, 10005480]
Name: COD_SKU, Length: 761465, dtype: object

In [97]:
metric = "confidence"
min_support=0.001
min_threshold = 0.03

# Convert the data into a transactional format where each transaction (COD_CUPOM) lists the products (COD_SKU) purchased
# Grouping the data by 'COD_CUPOM' and aggregating the SKUs purchased in each transaction
# sales_tmp.groupby('COD_CUPOM')['COD_SKU'].apply(list)
sample_data = transaction_data.sample(frac=1)
rules = extract_rules(sample_data, min_support=min_support, min_threshold=min_threshold, metric=metric)

In [145]:
metric = "confidence"
min_support=0.001
min_threshold = 0.001

#transactions = [tuple(set(transaction)) for transaction in transaction_data.tolist()]
#_, rules = efa.apriori(transactions, min_support=min_support, min_confidence=min_threshold)
rules = extract_rules_eff(transaction_data, min_support, min_threshold)
print(f'extracted {len(rules)} rules')

extracted 1240 rules


In [146]:
evalSuggestions(vendas_test[:20000], products, rules)

Avaliando Recomendações: 100%|██████████| 8409/8409 [03:27<00:00, 40.47it/s]

Métricas Médias:
Precision: 0.0261
Recall: 0.0800
F1-Score: 0.0380
Accuracy: 0.0261





Unnamed: 0,COD_CUPOM,COD_CLIENTE,Items,Precision,Recall,F1-Score,Accuracy,Num_Recommendations,Num_Relevant,purchased_products,recommended_products
0,140202,58638508.0,6,0.0,0.0,0.0,0.0,6,0,"[2705, 100026478, 100003920, 10086657, 1009118...","[6857, 10102451, 10185, 10032109, 10027058, 10..."
1,140234,22371961.0,4,0.0,0.0,0.0,0.0,1,0,"[100019526, 4264, 100009844, 100027743]",[10032109]
2,140238,26932981.0,1,0.0,0.0,0.0,0.0,1,0,[10106052],[10032109]
3,140252,68472667.0,1,0.0,0.0,0.0,0.0,6,0,[100011848],"[10032109, 10027271, 100011752, 10027058, 1000..."
4,140254,26590677.0,2,0.0,0.0,0.0,0.0,6,0,"[10103182, 10090254]","[10094058, 10004993, 10102141, 10104602, 10001..."
...,...,...,...,...,...,...,...,...,...,...,...
2057,225117,0.0,3,0.0,0.0,0.0,0.0,6,0,"[973, 1004872, 10185]","[10032109, 2523, 10027058, 11864, 10004405, 6833]"
2058,225152,89757854.0,1,0.0,0.0,0.0,0.0,6,0,[10021213],"[10032109, 10021214, 10027058, 10004405, 6833,..."
2059,225158,46731939.0,9,0.0,0.0,0.0,0.0,1,0,"[10098918, 94442, 10011211, 10100052, 10029140...",[10032109]
2060,225179,81953498.0,2,0.0,0.0,0.0,0.0,6,0,"[6833, 1002472]","[10032109, 10027058, 10004405, 11864, 10036714..."


In [None]:
rules.sort_values(by=['confidence', 'lift', 'conviction'], ascending=False).head(40)

In [110]:
vendas_test.head(10)

Unnamed: 0,COD_CUPOM,COD_CLIENTE,CLIENTE_FISICO_JURIDICO,SEXO_CLIENTE,DTNASCIMENTO_CLIENTE,COD_SKU,SKU,CATEGORIA_SKU,SUBCATEGORIA_SKU,UF_CIDADE,COD_CIDADE,NOME_CIDADE,DATA_CUPOM,UNIDADES,IDENTIFICADOR_PROMOCIONAL,PRECO_REGULAR,TOTAL_DESCONTO,TOTAL_BRUTO,TOTAL_LIQUIDO,COD_LOJA
7556783,501913,38966436.0,Pessoa Física,F,1991-03-22,10096759,OLEO CORP NUPILL 100,PERFUMARIA,PERFUMARIA,RS,80400,PORTO ALEGRE,2024-02-03 18:25:22.000,1.0,,14.99,0.0,14.99,14.99,774
9070657,477284,0.0,Pessoa Física,M,,10101965,IBUPROFENO 100MG 20M,MEDICAMENTOS,GENERICOS ONEROSOS,RS,80400,PORTO ALEGRE,2024-04-01 21:17:10.000,1.0,100282155.0,17.31,7.32,17.31,9.99,988
2930628,85008,0.0,Pessoa Física,M,,100002458,BARRA CEREAL NATURALE 25G AVEIA/BANANA/,CONVENIENCIA,CONVENIENCIA PERECIVEIS,RS,80400,PORTO ALEGRE,2024-04-15 14:25:20.000,1.0,,1.5,0.0,1.5,1.5,1263
6542812,615068,25730921.0,Pessoa Física,M,1957-11-23,100027385,TAMARINE GELEIA 250G ZERO ACUCAR HYPERA,MEDICAMENTOS,REFERENCIA ONEROSOS,RS,80400,PORTO ALEGRE,2024-05-18 12:39:25.474,1.0,,119.86,11.99,119.86,107.87,684
7801998,481504,51768036.0,Pessoa Física,M,1962-12-05,100001424,ESCITALOPRAM 10MG 60,MEDICAMENTOS,GENERICO CONTROLADO,RS,80400,PORTO ALEGRE,2024-06-28 18:43:47.272,1.0,100324417.0,127.18,67.28,127.18,59.9,815
2789404,46307,32014829.0,Pessoa Física,F,1961-08-24,1002472,BENEGRIP 6CP REV HYPERA PHARMA AV*,MEDICAMENTOS,REFERENCIA AVULSO,RS,80400,PORTO ALEGRE,2024-06-06 19:22:13.183,4.0,,14.01,0.0,56.04,56.04,1259
34818,141605,20881474.0,Pessoa Física,F,1955-07-13,5400,DONAREN 50MG 60CP RE,MEDICAMENTOS,REFERENCIA CONTROLADO,RS,80400,PORTO ALEGRE,2024-01-11 12:38:51.000,1.0,100275596.0,114.49,59.59,114.49,54.9,1001
6422971,299595,25697083.0,Pessoa Física,F,1983-01-23,10000570,SALGADINHO DEUTSCHIPS ONDUL TRADICIONAL,CONVENIENCIA,CONVENIENCIA PERECIVEIS,RS,80400,PORTO ALEGRE,2024-06-21 09:32:20.238,1.0,,18.29,9.15,18.29,9.14,682
7055001,671991,42061227.0,Pessoa Física,M,1987-02-12,10018440,ABCLER 10ML AIRELA AV*,MEDICAMENTOS,SIMILAR AVULSO,RS,80400,PORTO ALEGRE,2024-05-24 16:45:03.474,2.0,,1.99,0.0,3.98,3.98,714
3376582,84231,0.0,Pessoa Física,M,,100020204,CR DENT CLOSE UP TR,PERFUMARIA,PERFUMARIA,RS,80400,PORTO ALEGRE,2024-01-24 23:13:38.000,1.0,,3.29,0.0,3.29,3.29,1326


In [156]:
cart = [100027743]#, 4264, 100009844, 100027743 
products[products['COD_SKU'].isin(cart)][['COD_SKU', 'SKU', 'PRECO_REGULAR_AVG']]

Unnamed: 0,COD_SKU,SKU,PRECO_REGULAR_AVG
20615,100027743,SIMETICONA 125MG 10C,9.487579


In [157]:
suggestions = get_suggestions(cart, rules)
suggestions

Unnamed: 0,rhs,confidence,conviction,lift
0,10032109,0.091507,1.0165,1.195905


In [158]:
get_recommendations(suggestions, products, discount=.05)[['COD_SKU', 'SKU', 'PRECO_REGULAR_AVG', 'PRECO_COM_DESCONTO']]

Unnamed: 0,COD_SKU,SKU,PRECO_REGULAR_AVG,PRECO_COM_DESCONTO
0,10032109,PAPEL HIG LOUVRE FOLHA DUPLA 20M 12UN NE,13.057047,12.404195


In [66]:
rules.to_csv('rules.csv')

In [None]:
rules.sort_values(by=['confidence'], ascending=False).head(50)

10035299, 10035289, 10035291

In [None]:
get_suggestions([12722], rules)

In [None]:
produtos[produtos['COD_SKU'].isin([12722])]

In [None]:
#frequent_itemsets 
# Sort frequent itemsets by support in descending order
sorted_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

# Select the top 20 most frequent items
top_20_items = sorted_itemsets.head(20)

print("Top 20 most frequent items:")
print(top_20_items)

# If you want to visualize these items
plt.figure(figsize=(12, 6))
plt.bar(range(len(top_20_items)), top_20_items['support'])
plt.xticks(range(len(top_20_items)), top_20_items['itemsets'].apply(lambda x: ', '.join(list(x))), rotation=90)
plt.xlabel('Items')
plt.ylabel('Support')
plt.title('Top 20 Most Frequent Items')
plt.tight_layout()
plt.show()

# Select items with support greater than 1%
relevant_items = frequent_itemsets[frequent_itemsets['support'] > 0.01]

print("\nItems with support greater than 1%:")
print(relevant_items)


In [None]:
df = vendas

# Creating a pivot table where rows are products (SKU), columns are products, and values are the count of co-occurrences
product_matrix = df.groupby(['COD_CUPOM', 'CATEGORIA_SKU'])['UNIDADES'].sum().unstack().fillna(0)

# Creating a correlation matrix to find relationships between products
product_correlation = product_matrix.corr()

# Plotting the heatmap to show relationships between products
plt.figure(figsize=(10, 8))
sns.heatmap(product_correlation, cmap='coolwarm', annot=False)
plt.title('Correlação dos produtos')
plt.show()