In [None]:
!pip install mlxtend
!pip install efficient-apriori

In [1]:
import warnings, random

import datetime
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import matplotlib.pyplot as plt
import seaborn as sns

import efficient_apriori as efa
from mlxtend.frequent_patterns import apriori, association_rules, fpmax, hmine, fpcommon, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm  # Para visualizar o progresso

warnings.filterwarnings('ignore')

In [220]:
def buildTransactionList(sales_df, transaction_type='cupom', min_size=1):
    transaction_data = []
    
    if transaction_type == 'cupom':
        transaction_data = sales_df.groupby(by=['COD_CUPOM_LOJA'])['COD_SKU'].apply(list)
    if transaction_type == 'customer':
        transaction_data = sales_df.groupby(by=['COD_CLIENTE'])['COD_SKU'].apply(list)

    transaction_data = transaction_data[transaction_data.apply(lambda x: len(x) >= min_size)]

    return transaction_data

def extract_rules_eff(transactions_df, min_support, min_threshold):
    transactions = [tuple(set(transaction)) for transaction in transactions_df.tolist()]
    itemsets, rules = efa.apriori(transactions, min_support=min_support, min_confidence=min_threshold)

    temp_df = []
    for rule in rules:
        rl = {
            'lhs':rule.lhs,
            'rhs':rule.rhs,
            'confidence':rule.confidence,
            'conviction':rule.conviction,
            'lift':rule.lift,
            'rpf':rule.rpf,
            'support':rule.support
        }

        temp_df.append(rl)

    temp_df = pd.DataFrame(temp_df)

    del transactions, itemsets, rules

    return temp_df    

def extract_rules(transactions_df, min_support, min_threshold, metric, algo='apriori'):
    
    # Assuming 'transaction_data' is a list of lists where each sublist contains SKUs bought in a transaction.
    te = TransactionEncoder()
    te_ary = te.fit(transactions_df).transform(transactions_df)
    df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

    # Apply the apriori/fpgrowth algorithm
    if algo == 'apriori':
        frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)
    else:
        frequent_itemsets = fpgrowth(df_encoded, min_support=min_support, use_colnames=True)

    # Generate association rules
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)

    del te, te_ary, df_encoded, frequent_itemsets

    return rules

'''
def get_suggestions(sku_list, rules_df):
    
    if not isinstance(rules_df['antecedents'].iloc[0], (list, tuple, set)):
        rules_df['antecedents'] = rules_df['antecedents'].apply(lambda x: eval(x) if isinstance(x, str) else x)
    
    filtered_rules = rules_df[rules_df['antecedents'].apply(lambda x: any(sku in x for sku in sku_list))]
    
    sorted_rules = filtered_rules.sort_values(by=['consequent support', 'confidence'], ascending=False)

    # Obter todos os SKUs sugeridos da coluna 'consequents'
    suggested_skus = sorted_rules['consequents'].explode().unique()
    
    # Selecionar no máximo 6 SKUs
    suggested_skus = suggested_skus[:6]
    
    return suggested_skus
'''
    
def get_suggestions(sku_list, rules_df, top_n=7):

    suggestions = pd.DataFrame(columns=['rhs', 'confidence', 'conviction', 'lift'])

    filtered_rules = rules_df[rules_df['lhs'].apply(lambda x: any(sku in x for sku in sku_list))]
    filtered_rules = filtered_rules.sort_values(by=['confidence', 'lift', 'conviction'], ascending=False)

    if filtered_rules.empty == False:
        suggestions = filtered_rules[['rhs', 'confidence', 'conviction', 'lift']].copy()
        
        # Remover SKUs já presentes na sku_list
        # suggestions = suggestions[~suggestions['rhs'].isin(sku_list)]

        # Expandir o 'rhs' caso contenha múltiplos itens
        suggestions = suggestions.explode('rhs')
        
        # Remover duplicatas baseadas no 'rhs'
        suggestions = suggestions.drop_duplicates(subset='rhs')
        
        # Selecionar apenas os top_n resultados
        suggestions = suggestions.head(top_n)
        
        # Resetar o índice para melhor apresentação
        suggestions = suggestions.reset_index(drop=True)
    
    return suggestions

def get_recommendations(suggestions_df, products_df, discount):
    # Verificar se o desconto está entre 0 e 1
    if not 0 <= discount <= 1:
        raise ValueError("O desconto deve ser um valor decimal entre 0 e 1 (por exemplo, 0.10 para 10%)")
    
    # Mesclar suggestions_df com products_df com base na correspondência de 'rhs' e 'COD_SKU'
    merged_df = suggestions_df.merge(products_df, left_on='rhs', right_on='COD_SKU', how='left')
    
    # Verificar se a mesclagem resultou em produtos correspondentes
    if merged_df.empty:
        #print("Nenhum produto correspondente encontrado para as sugestões fornecidas.")
        return merged_df
    
    # Aplicar o desconto ao preço regular e adicionar nova coluna
    merged_df['PRECO_COM_DESCONTO'] = merged_df['PRECO_REGULAR_AVG'] * (1 - discount)
    
    # Selecionar as colunas desejadas
    columns_to_return = ['COD_SKU', 'SKU', 'CATEGORIA_SKU', 'PRECO_REGULAR_AVG', 'PRECO_COM_DESCONTO']
    recommended_products = merged_df[columns_to_return]
    
    return recommended_products

def evalSuggestions(sales_test, products, rules):
    """
    Avalia o desempenho das recomendações geradas pelas regras de associação.
    
    Parâmetros:
    - sales_test: DataFrame contendo as vendas reais no conjunto de teste.
    - rules: DataFrame contendo as regras de associação.

    Retorna:
    - metrics_df: DataFrame com as métricas de avaliação calculadas.
    """
    # Lista para armazenar as métricas
    metrics_list = []

    # Agrupar as compras por cupom
    grouped_sales = sales_test.groupby('COD_CUPOM_LOJA')

    # Iterar sobre cada cupom
    for cod_cupom, group in tqdm(grouped_sales, desc='Avaliando Recomendações'):
        #for cod_cupom in sales_test.groupby('COD_CUPOM'):

        # Produtos comprados pelo cliente neste período
        purchased_products = group['COD_SKU'].unique().tolist()

        # Simular o carrinho atual (pode ser os produtos comprados anteriormente)
        # Aqui, assumimos que o cliente já comprou alguns produtos e queremos recomendar adicionais
        # Para simplificar, vamos assumir que o carrinho está vazio ou usar os produtos do cliente no conjunto de treinamento
        #cart = purchased_products  # Aqui você pode customizar como obter o carrinho do cliente

        cart = random.sample(purchased_products, int(len(purchased_products)*1))
        target_products = cart#[p for p in purchased_products if p not in cart ]

        # Gerar sugestões com base no carrinho e nas regras
        suggestions = get_suggestions(cart, rules)

        # Obter as recomendações (DataFrame)
        recommendations_df = get_recommendations(suggestions, products, discount=.05)
        recommended_products = recommendations_df['COD_SKU'].unique().tolist()

        # Se não houver recomendações, pular para evitar divisões por zero
        if not recommended_products:
            continue

        
        target_set = set(target_products)
        recommended_set = set(recommended_products)
        all_products = list(target_set.union(recommended_set))

        y_true = [1 if sku in target_set else 0 for sku in all_products]
        y_pred = [1 if sku in recommended_set else 0 for sku in all_products]


        # Calcular métricas
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        accuracy = accuracy_score(y_true, y_pred)

        # Armazenar as métricas
        metrics_list.append({
            'COD_CUPOM': cod_cupom,
            'COD_CLIENTE': group['COD_CLIENTE'].unique()[0],
            'items':len(purchased_products),
            'precision': precision,
            'recall': recall,
            'f1-score': f1,
            'accuracy': accuracy,
            'recommendations': len(recommended_products),
            'relevant': sum(y_true),
            'purchased_products':purchased_products,
            'recommended_products':recommended_products
        })

    # Converter a lista de métricas em DataFrame
    metrics_df = pd.DataFrame(metrics_list, columns=['COD_CUPOM','COD_CLIENTE','items','precision','recall','f1-score','accuracy','recommendations','relevant','purchased_products','recommended_products'])

    # Calcular as métricas médias
    avg_metrics = metrics_df[['precision', 'recall', 'f1-score', 'accuracy']].mean().to_dict()
    print("Métricas Médias:")
    for metric, value in avg_metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics_df, avg_metrics



def splitTrainTestData(sales_df, column, test_size=.2):

    cupons_list = sales_df[column].unique()
    sales_train, sales_test =  train_test_split(cupons_list, test_size=test_size, random_state=36)

    sales_train = sales_df[sales_df['COD_CUPOM_LOJA'].isin(sales_train)]
    sales_test = sales_df[sales_df['COD_CUPOM_LOJA'].isin(sales_test)]

    return sales_train, sales_test

In [156]:
#recommended_products_eval = [rp for rp in purchased_products if rp in recommended_products]

# Criar vetores binários para cálculo das métricas
# Produtos relevantes (1 se o produto foi comprado, 0 caso contrário)
#y_true = [1 if sku in purchased_products else 0 for sku in recommended_products]


cart = [1,2,3,4]
sug = [1,2,4,5,6]

rp = [e for e in cart if e in sug]
print(rp)

y_true = [1 if e in sug else 0 for e in cart]
print(y_true)

y_pred = [1]*len(y_true)
print(y_pred)

print(accuracy_score(y_true, y_pred), recall_score(y_true, y_pred, zero_division=0))

[1, 2, 4]
[1, 1, 0, 1]
[1, 1, 1, 1]
0.75 1.0


In [2]:
path = 'D:/Disco/Data/PUCRS/proj CDIA III/'

tables = ['VW_PUC_VENDAS', 'PUC_CAMPANHAS']

In [3]:
vendas_dataset = pq.ParquetDataset(f'{path}/dataset-ii/{tables[0]}')
#campanhas_dataset = pq.ParquetDataset(f'{path}/dataset/{tables[1]}')

In [85]:
vendas = vendas_dataset.read()
#campanhas = campanhas_dataset.read_pandas()

In [86]:
cod_cupom = pc.cast(pc.cast(vendas['COD_CUPOM'], pa.int32()), pa.string())
cod_loja = pc.cast(pc.cast(vendas['COD_LOJA'], pa.int32()), pa.string())
sep = pa.array(['-'] * len(vendas))

vendas = vendas.drop_columns(['COD_CUPOM', 'COD_LOJA', 'CLIENTE_FISICO_JURIDICO', 'SEXO_CLIENTE', 'DTNASCIMENTO_CLIENTE', 'UF_CIDADE', 'COD_CIDADE', 'NOME_CIDADE'])

vendas = vendas.append_column('COD_CUPOM', cod_cupom)
vendas = vendas.append_column('COD_LOJA', cod_loja)
vendas = vendas.append_column('COD_CUPOM_LOJA', pc.binary_join_element_wise(cod_cupom, cod_loja, sep))


In [87]:
vendas = vendas.to_pandas()

max_cod = vendas['COD_CLIENTE'].max()
mask_zero = (vendas['COD_CLIENTE'] == 0) | (pd.isna(vendas['COD_CLIENTE']))
loc_cupom = vendas.loc[mask_zero, 'COD_CUPOM_LOJA']
unique_cod_cupom = loc_cupom.unique()
cod_clientes = np.arange(max_cod + 1, max_cod + len(unique_cod_cupom) + 1, dtype=int)

map_codes = dict(zip(unique_cod_cupom, cod_clientes))
vendas.loc[mask_zero, 'COD_CLIENTE'] = vendas.loc[mask_zero, 'COD_CUPOM_LOJA'].map(map_codes)

vendas['COD_CLIENTE'] = vendas['COD_CLIENTE'].astype(int)

del max_cod, mask_zero, loc_cupom, unique_cod_cupom, cod_clientes, map_codes

In [16]:
vendas.columns

Index(['COD_CUPOM', 'COD_CLIENTE', 'COD_SKU', 'SKU', 'CATEGORIA_SKU',
       'SUBCATEGORIA_SKU', 'DATA_CUPOM', 'UNIDADES',
       'IDENTIFICADOR_PROMOCIONAL', 'PRECO_REGULAR', 'TOTAL_DESCONTO',
       'TOTAL_BRUTO', 'TOTAL_LIQUIDO', 'COD_LOJA', 'COD_CUPOM_LOJA'],
      dtype='object')

In [206]:
units = vendas.groupby(['COD_CLIENTE', 'COD_CUPOM_LOJA'])['UNIDADES'].sum().reset_index()
skus = vendas.groupby(['COD_CLIENTE', 'COD_CUPOM_LOJA'])['COD_SKU'].nunique().reset_index()
cupom_df = units.merge(skus, on=['COD_CLIENTE', 'COD_CUPOM_LOJA'])

del units, skus

In [207]:
cupom_df

Unnamed: 0,COD_CLIENTE,COD_CUPOM_LOJA,UNIDADES,COD_SKU
0,120,513894-693,2.0,2
1,250,797045-77,1.0,1
2,250,800862-77,3.0,3
3,250,802754-77,1.0,1
4,250,809563-77,1.0,1
...,...,...,...,...
4099919,94980992,455250-988,1.0,1
4099920,94980993,455584-988,1.0,1
4099921,94980994,434588-988,1.0,1
4099922,94980995,434583-988,1.0,1


In [93]:
cod_clientes = vendas.groupby('COD_CLIENTE')['COD_CUPOM'].nunique().reset_index()
cod_clientes.rename(columns={'COD_CUPOM':'n_cupom'}, inplace=True)

n_items = vendas.groupby(['COD_CLIENTE', 'COD_CUPOM'])['UNIDADES'].sum().reset_index()
n_items.rename(columns={'UNIDADES': 'n_items'}, inplace=True)

avg_items = n_items.groupby('COD_CLIENTE')['n_items'].mean().reset_index()
avg_items.rename(columns={'n_items': 'avg_items'}, inplace=True)

n_items = n_items.groupby('COD_CLIENTE')['n_items'].sum().reset_index()

n_skus = vendas.groupby('COD_CLIENTE')['COD_SKU'].nunique().reset_index()
n_skus.rename(columns={'COD_SKU':'n_skus'}, inplace=True)

total_sum = vendas.groupby('COD_CLIENTE')['TOTAL_LIQUIDO'].sum().reset_index()
total_sum.rename(columns={'TOTAL_LIQUIDO':'total_sum'}, inplace=True)

avg_cupom = total_sum.merge(cod_clientes, on='COD_CLIENTE')
avg_cupom['avg_cupom'] = avg_cupom['total_sum'] / avg_cupom['n_cupom']

#clientes_features = cod_clientes.merge(n_skus, on='COD_CLIENTE')
clientes_features = cod_clientes.merge(avg_items, on='COD_CLIENTE')
clientes_features = clientes_features.merge(n_items, on='COD_CLIENTE')
clientes_features = clientes_features.merge(n_skus, on='COD_CLIENTE')
clientes_features = clientes_features.merge(avg_cupom[['COD_CLIENTE', 'avg_cupom', 'total_sum']], on='COD_CLIENTE')

del cod_clientes, n_items, avg_items, n_skus, total_sum, avg_cupom

clientes_features

Unnamed: 0,COD_CLIENTE,n_cupom,avg_items,n_items,n_skus,avg_cupom,total_sum
0,120,1,2.0,2.0,2,25.990,25.99
1,250,10,2.3,23.0,19,80.183,801.83
2,253,2,2.5,5.0,5,56.755,113.51
3,408,1,19.0,19.0,6,171.080,171.08
4,460,2,1.5,3.0,3,25.095,50.19
...,...,...,...,...,...,...,...
1474961,94980992,1,1.0,1.0,1,11.990,11.99
1474962,94980993,1,1.0,1.0,1,19.900,19.90
1474963,94980994,1,1.0,1.0,1,10.000,10.00
1474964,94980995,1,1.0,1.0,1,20.990,20.99


In [94]:
sales_train, sales_test = splitTrainTestData(vendas, 'COD_CUPOM_LOJA', test_size=.2)

In [95]:
transaction_data = buildTransactionList(sales_train, transaction_type='customer', min_size=2)

In [96]:
products = produtos = vendas.groupby(['COD_SKU', 'SKU', 'CATEGORIA_SKU', 'SUBCATEGORIA_SKU'], as_index=False).agg({'PRECO_REGULAR': 'mean'})
products.rename(columns={'PRECO_REGULAR': 'PRECO_REGULAR_AVG'}, inplace=True)
products

Unnamed: 0,COD_SKU,SKU,CATEGORIA_SKU,SUBCATEGORIA_SKU,PRECO_REGULAR_AVG
0,7,SERVICO EM DOMICILIO,DIVERSOS,SERVICOS TELE ENTREGA,5.000000
1,10,MANIPULADOS,MANIPULADOS,MANIPULADOS MEDICAMENTOS,0.010000
2,11,SERVICO DE APLICACAO DE INJETAVEIS,SERVICOS,SERVICOS FARMACEUTICOS,5.000000
3,12,SERVICO DE GESTO VACINAL GRIPE CONVENIOS,SERVICOS,SERVICOS FARMACEUTICOS,7.921640
4,13,SERVICO DE VERIFICACAO DE GLICEMIA CAPIL,SERVICOS,SERVICOS FARMACEUTICOS,3.000000
...,...,...,...,...,...
20653,100027853,ESC CAB MARCO BONI 7,PERFUMARIA,PERFUMARIA,14.990000
20654,100027855,ESC CAB MARCO BONI 8,PERFUMARIA,PERFUMARIA,29.990000
20655,100027878,NEBULIZADOR PULMOMAI,MEDICAMENTOS,LIBERADOS,219.900000
20656,100027882,AMPOLA PANTENE 3UN NUTRE,PERFUMARIA,PERFUMARIA,27.887918


In [110]:
metric = "confidence"
min_support=0.001
min_threshold = 0.03

# Convert the data into a transactional format where each transaction (COD_CUPOM) lists the products (COD_SKU) purchased
# Grouping the data by 'COD_CUPOM' and aggregating the SKUs purchased in each transaction
# sales_tmp.groupby('COD_CUPOM')['COD_SKU'].apply(list)
#transaction_data = buildTransactionList(vendas, transaction_type='customer')
sample_data = transaction_data#.sample(frac=.5)
rules = extract_rules(sample_data, min_support=min_support, min_threshold=min_threshold, metric=metric, algo='fpgrowth')
rules.rename(columns={'antecedents':'rhs', 'consequents':'lhs'}, inplace=True)
print(f'extracted {len(rules)} rules')

extracted 3506 rules


In [None]:
metric = "confidence"
min_support=0.0001
min_threshold = 0.1


rules = extract_rules_eff(transaction_data, min_support, min_threshold)
print(f'extracted {len(rules)} rules')

In [49]:
cupom_size = vendas.groupby(by=['COD_CUPOM_LOJA']).count()['COD_SKU']
vendas['ITENS_PURCHASED'] = vendas['COD_CUPOM_LOJA'].map(cupom_size)

In [None]:
vendas[(vendas['ITENS_PURCHASED']>50) &(vendas['COD_CUPOM_LOJA'] == '213545-1030')]

In [63]:
rules[rules['confidence'] > .95]
r_filter = (rules['confidence'] > .1)# (rules['rhs'].apply(len) > 1) & 
rules[r_filter]

Unnamed: 0,rhs,lhs,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(10104602),(10032109),0.012084,0.043522,0.001271,0.105192,2.416991,0.000745,1.068920,0.593433
4,(10104602),(10090254),0.012084,0.013389,0.001916,0.158570,11.843621,0.001754,1.172541,0.926765
5,(10090254),(10104602),0.013389,0.012084,0.001916,0.143113,11.843621,0.001754,1.152913,0.927991
7,(10104602),(10004993),0.012084,0.017202,0.001571,0.130004,7.557560,0.001363,1.129658,0.878295
9,(10021373),(10027058),0.008957,0.033844,0.002428,0.271027,8.008020,0.002124,1.325365,0.883034
...,...,...,...,...,...,...,...,...,...,...
3501,(10086657),(6857),0.004768,0.008780,0.001125,0.235931,26.870071,0.001083,1.297291,0.967397
3502,(10039529),(10024602),0.002200,0.011396,0.002087,0.949026,83.279422,0.002062,19.394417,0.990170
3503,(10024602),(10039529),0.011396,0.002200,0.002087,0.183175,83.279422,0.002062,1.221559,0.999381
3504,(10038480),(12),0.005380,0.001235,0.001203,0.223601,181.118888,0.001196,1.286407,0.999858


In [None]:
results = {}
for n in np.arange(start=2, stop=10):
    print('Quantidade de itens diferentes no cupom: ', n)
    cupom_lst = cupom_df[cupom_df['COD_SKU'] == n].sample(n=20000, replace=True)['COD_CUPOM_LOJA'].unique()

    r_filter = (rules['confidence'] > .1)
    filtered_rules = rules[r_filter]
    eval_df, avg_df = evalSuggestions(vendas[vendas['COD_CUPOM_LOJA'].isin(cupom_lst)], products, filtered_rules)

    results[n] = {'eval_df':eval_df, 'avg_df':avg_df}


Quantidade de itens diferentes no cupom:  2


Avaliando Recomendações: 100%|██████████| 20000/20000 [02:51<00:00, 116.39it/s]


Métricas Médias:
precision: 0.0552
recall: 0.1129
f1-score: 0.0710
accuracy: 0.0524
Quantidade de itens diferentes no cupom:  3


Avaliando Recomendações: 100%|██████████| 20000/20000 [03:12<00:00, 103.72it/s]


Métricas Médias:
precision: 0.0653
recall: 0.1068
f1-score: 0.0768
accuracy: 0.0525
Quantidade de itens diferentes no cupom:  4


Avaliando Recomendações: 100%|██████████| 20000/20000 [03:34<00:00, 93.37it/s] 


Métricas Médias:
precision: 0.0802
recall: 0.1055
f1-score: 0.0873
accuracy: 0.0581
Quantidade de itens diferentes no cupom:  5


Avaliando Recomendações: 100%|██████████| 20000/20000 [03:54<00:00, 85.21it/s] 


Métricas Médias:
precision: 0.0918
recall: 0.1005
f1-score: 0.0920
accuracy: 0.0604
Quantidade de itens diferentes no cupom:  6


Avaliando Recomendações: 100%|██████████| 20000/20000 [04:08<00:00, 80.45it/s] 


Métricas Médias:
precision: 0.1000
recall: 0.0942
f1-score: 0.0937
accuracy: 0.0608
Quantidade de itens diferentes no cupom:  7


Avaliando Recomendações: 100%|██████████| 20000/20000 [04:25<00:00, 75.27it/s] 


Métricas Médias:
precision: 0.1108
recall: 0.0924
f1-score: 0.0978
accuracy: 0.0629
Quantidade de itens diferentes no cupom:  8


Avaliando Recomendações: 100%|██████████| 20000/20000 [04:41<00:00, 70.96it/s]


Métricas Médias:
precision: 0.1189
recall: 0.0880
f1-score: 0.0983
accuracy: 0.0624
Quantidade de itens diferentes no cupom:  9


ValueError: Cannot take a larger sample than population when 'replace=False'

In [222]:
avg_df

{'precision': 0.06054828305824321,
 'recall': 0.12151394422310757,
 'f1-score': 0.0779738190096756,
 'accuracy': 0.058200531208499336}

In [190]:

lst = [1,2,3,4,5,6,7,8,9,10] 
#len(lst)//
si = int(len(lst)*.5)
print(si)

cart = random.sample(lst, si)
target = [e for e in lst if e not in cart]
print(cart, target)

5
[8, 9, 3, 2, 4] [1, 5, 6, 7, 10]


In [191]:
len(lst)//2

5

In [None]:
transaction_data[transaction_data.index == 120]

In [None]:
rules_temp = extract_rules_eff(transaction_data[transaction_data.index == 120], min_support, min_threshold)
rules_temp

In [None]:
filtered_rules = rules[rules['lhs'].apply(lambda x: any(sku in x for sku in [91659, 100018365]))]
filtered_rules = filtered_rules.sort_values(by=['support', 'lift', 'confidence', 'conviction'], ascending=False)

filtered_rules

In [None]:
filtered_rules.lhs.unique()

In [None]:
cart = [10032109,91659, 100018365]#, 4264, 100009844, 100027743 
products[products['COD_SKU'].isin(cart)][['COD_SKU', 'SKU', 'PRECO_REGULAR_AVG']]

In [None]:
suggestions = get_suggestions(cart, rules)
suggestions

In [None]:
get_recommendations(suggestions, products, discount=.05)[['COD_SKU', 'SKU', 'PRECO_REGULAR_AVG', 'PRECO_COM_DESCONTO']]

In [None]:
vendas.columns

In [None]:
['COD_CLIENTE', 'COD_SKU', 'SKU', 'CATEGORIA_SKU','SUBCATEGORIA_SKU','DATA_CUPOM', 'UNIDADES', 'PRECO_REGULAR','TOTAL_DESCONTO', 'TOTAL_BRUTO', 'TOTAL_LIQUIDO', 'COD_CUPOM_LOJA']

In [None]:
rules.to_csv('rules.csv')

In [None]:
rules.sort_values(by=['confidence'], ascending=False).head(50)

10035299, 10035289, 10035291

In [None]:
get_suggestions([12722], rules)

In [None]:
produtos[produtos['COD_SKU'].isin([12722])]

In [None]:
#frequent_itemsets 
# Sort frequent itemsets by support in descending order
sorted_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

# Select the top 20 most frequent items
top_20_items = sorted_itemsets.head(20)

print("Top 20 most frequent items:")
print(top_20_items)

# If you want to visualize these items
plt.figure(figsize=(12, 6))
plt.bar(range(len(top_20_items)), top_20_items['support'])
plt.xticks(range(len(top_20_items)), top_20_items['itemsets'].apply(lambda x: ', '.join(list(x))), rotation=90)
plt.xlabel('Items')
plt.ylabel('Support')
plt.title('Top 20 Most Frequent Items')
plt.tight_layout()
plt.show()

# Select items with support greater than 1%
relevant_items = frequent_itemsets[frequent_itemsets['support'] > 0.01]

print("\nItems with support greater than 1%:")
print(relevant_items)


In [None]:
df = vendas

# Creating a pivot table where rows are products (SKU), columns are products, and values are the count of co-occurrences
product_matrix = df.groupby(['COD_CUPOM', 'CATEGORIA_SKU'])['UNIDADES'].sum().unstack().fillna(0)

# Creating a correlation matrix to find relationships between products
product_correlation = product_matrix.corr()

# Plotting the heatmap to show relationships between products
plt.figure(figsize=(10, 8))
sns.heatmap(product_correlation, cmap='coolwarm', annot=False)
plt.title('Correlação dos produtos')
plt.show()