# Regras de Associação

***


In [1]:
import warnings
warnings.filterwarnings('ignore')

!pip install ucimlrepo



In [2]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# !pip install mlxtend -U
from mlxtend.frequent_patterns.fpgrowth import fpgrowth

## Exemplo Clássico

Inicialmente utilizado na análise de cesta de compras
em supermercados (Market Basket Analysis) para
determinar como os itens comprados por clientes
estão relacionados

In [3]:
# Mini binary dataset: one row per transaction, one column per item (1 = item bought)
items = ['Milk', 'Bread', 'Butter', 'Eggs']
transactions = [
    [1, 0, 0, 0],
    [0, 1, 1, 1],
    [1, 1, 1, 1],
    [1, 1, 1, 0],
    [0, 0, 1, 1],
]
# Transpose the rows so that each item maps to its column of 0/1 flags
data = dict(zip(items, map(list, zip(*transactions))))
df = pd.DataFrame(data)
df

Unnamed: 0,Milk,Bread,Butter,Eggs
0,1,0,0,0
1,0,1,1,1
2,1,1,1,1
3,1,1,1,0
4,0,0,1,1


In [4]:
# Item counts over the 5 transactions:
# Milk: 3 occurrences -> support 3/5 = 0.6
# Bread: 3 occurrences -> support 3/5 = 0.6
# Butter: 4 occurrences -> support 4/5 = 0.8
# Eggs: 3 occurrences -> support 3/5 = 0.6

# Apriori with min_support=0.5
# Keeps the itemsets (not rules yet) that appear in at least 5 transactions * 0.5 = 2.5 transactions, i.e. 3 or more in practice.
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.6,(Milk)
1,0.6,(Bread)
2,0.8,(Butter)
3,0.6,(Eggs)
4,0.6,"(Butter, Bread)"
5,0.6,"(Butter, Eggs)"


In [5]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print("\nAssociation Rules (min_confidence=0.7):")
rules


Association Rules (min_confidence=0.7):


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Butter),(Bread),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
1,(Bread),(Butter),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
2,(Butter),(Eggs),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
3,(Eggs),(Butter),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875


Principais métricas: Support, Confidence e Lift.

Support ({x} => {y}) = Transações que contenham x e y / total de transações

Confidence ({x} => {y}) = Transações que contenham x e y / transacoes que contenham x

Lift ({x} => {y}) = Support ({x} => {y}) / (Support(x) * Support(y)), isto é, (fração das transações que contêm x e y) / ((fração das transações que contêm x) * (fração das transações que contêm y))

Outra forma de ver: Lift ({x} => {y}) = Confidence ({x} => {y}) / Support (y)

## Análise do conjunto de dados (Online Retail)

Regras de associação são extraídas por algoritmos que encontram conjuntos de itens frequentes em datasets em que cada instância é um conjunto de itens. Vamos visualizar o que isso significa.

Vamos observar o dataset de compras de uma loja de varejo online (Online Retail). Segue o link para mais informações [dataset](https://archive.ics.uci.edu/dataset/352/online+retail)

In [6]:
# Carregando o dataset
df = pd.read_csv("https://raw.githubusercontent.com/gabrielh10/ml_practice_tutoring_2/master/datasets/online_retail.csv")

In [7]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

Vamos visualizar uma pequena parte desse dataset.

In [8]:
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom


Aqui temos diversas informações sobre compras em uma loja de varejo online:

- (_InvoiceNo_) o número da fatura, identificador de uma compra
- (_StockCode_) o código de certo produto no estoque
- (_Description_) a descrição do produto
- (_Quantity_) a quantidade de produtos que foi comprada
- (_InvoiceDate_) o dia da compra
- (_UnitPrice_) o preço por unidade
- (_CustomerID_) o id do consumidor
- (_Country_) o país de venda

In [9]:
# vamos olhar todos os países existentes
print(df["Country"].unique())
# print(df["Country"].value_counts())

# dos países, vamos escolher apenas as vendas feitas no Reino Unido, apenas pelo fato de existirem mais vendas no dataset
country = "United Kingdom"
sales = df[df["Country"] == country]

['United Kingdom' 'France' 'Australia' 'Netherlands' 'Germany' 'Norway'
 'EIRE' 'Switzerland' 'Spain' 'Poland' 'Portugal' 'Italy' 'Belgium'
 'Lithuania' 'Japan' 'Iceland' 'Channel Islands' 'Denmark' 'Cyprus'
 'Sweden' 'Austria' 'Israel' 'Finland' 'Bahrain' 'Greece' 'Hong Kong'
 'Singapore' 'Lebanon' 'United Arab Emirates' 'Saudi Arabia'
 'Czech Republic' 'Canada' 'Unspecified' 'Brazil' 'USA'
 'European Community' 'Malta' 'RSA']


In [10]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 495478 entries, 0 to 541893
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    495478 non-null  object 
 1   StockCode    495478 non-null  object 
 2   Description  494024 non-null  object 
 3   Quantity     495478 non-null  int64  
 4   InvoiceDate  495478 non-null  object 
 5   UnitPrice    495478 non-null  float64
 6   CustomerID   361878 non-null  float64
 7   Country      495478 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 34.0+ MB


In [11]:
"C - indica cancelamento"
sales[sales["InvoiceNo"].str.contains("C")].head(10)

# sales[sales["Description"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
238,C536391,21980,PACK OF 12 RED RETROSPOT TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
239,C536391,21484,CHICK GREY HOT WATER BOTTLE,-12,2010-12-01 10:24:00,3.45,17548.0,United Kingdom
240,C536391,22557,PLASTERS IN TIN VINTAGE PAISLEY,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
241,C536391,22553,PLASTERS IN TIN SKULLS,-24,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
939,C536506,22960,JAM MAKING SET WITH JARS,-6,2010-12-01 12:38:00,4.25,17897.0,United Kingdom


In [12]:
# Pre-processing required before the dataset can be mined:
# drop every row that has no Description; .copy() makes the result an independent
# frame, so the column assignments below never write into a view of `sales`
filtered_sales = sales.dropna(axis=0, subset=["Description"]).copy()
print(filtered_sales.shape)
# cast invoice numbers to strings so the .str accessor works on every row
filtered_sales["InvoiceNo"] = filtered_sales["InvoiceNo"].astype("str")
# drop invoices whose number contains "C": those are cancellations, not purchases
filtered_sales = filtered_sales[~filtered_sales["InvoiceNo"].str.contains("C")].copy()

# cast descriptions to strings first and only then trim leading/trailing whitespace;
# stripping first would turn any non-string value into NaN (and later into "nan")
filtered_sales["Description"] = filtered_sales["Description"].astype("str").str.strip()


(494024, 8)


In [13]:
filtered_sales.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


Como o dataset a ser avaliado deverá consistir em conjuntos de itens, vamos transformar cada compra em um conjunto de itens. Note que um conjunto não possui informação sobre quantidade de elementos repetidos. Portanto, a coluna com a contagem de cada item deve ser removida, indicando apenas a presença daquele item em uma compra.

In [14]:
# Total quantity of each product (Description) within each invoice
quantity_per_item = filtered_sales.groupby(['InvoiceNo', 'Description'])["Quantity"].sum()
# Pivot products into columns; products absent from an invoice become 0
basket_counts = quantity_per_item.unstack().reset_index().fillna(0)
sales_set = basket_counts.set_index('InvoiceNo')
sales_set.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
sales_set[["POSTAGE", "DOTCOM POSTAGE"]]

Description,POSTAGE,DOTCOM POSTAGE
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1
536365,0.0,0.0
536366,0.0,0.0
536367,0.0,0.0
536368,0.0,0.0
536369,0.0,0.0
...,...,...
581585,0.0,0.0
581586,0.0,0.0
A563185,0.0,0.0
A563186,0.0,0.0


In [16]:
# Keep only the first 1500 item columns
MAX_ITEM_COLUMNS = 1500
sales_set = sales_set.iloc[:, :MAX_ITEM_COLUMNS]
print(sales_set.shape)

# These columns describe the kind of postage rather than a purchased item,
# so remove them whenever they are present
for postage_column in ("POSTAGE", "DOTCOM POSTAGE"):
    sales_set = sales_set.drop(columns=postage_column, errors="ignore")

# Final dataset
sales_set.head()

(18668, 1500)


Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,GLASS CHALICE GREEN SMALL,GLASS CLOCHE LARGE,GLASS CLOCHE SMALL,GLASS HEART T-LIGHT HOLDER,GLASS JAR DAISY FRESH COTTON WOOL,GLASS JAR DIGESTIVE BISCUITS,GLASS JAR ENGLISH CONFECTIONERY,GLASS JAR KINGS CHOICE,GLASS JAR MARMALADE,GLASS JAR PEACOCK BATH SALTS
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Turn the per-invoice item counts into plain sets:
# every count becomes a presence (True) / absence (False) flag for that item.

# True when at least one unit of the item was sold in the invoice, False otherwise
count_to_set = lambda x: x > 0
# Apply the comparison to the whole frame at once: it is vectorized, and it avoids
# DataFrame.applymap, which is deprecated in recent pandas and runs element by element
sales_set = count_to_set(sales_set)

print(sales_set.shape)
sales_set.head()

(18668, 1499)


Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,GLASS CHALICE GREEN SMALL,GLASS CLOCHE LARGE,GLASS CLOCHE SMALL,GLASS HEART T-LIGHT HOLDER,GLASS JAR DAISY FRESH COTTON WOOL,GLASS JAR DIGESTIVE BISCUITS,GLASS JAR ENGLISH CONFECTIONERY,GLASS JAR KINGS CHOICE,GLASS JAR MARMALADE,GLASS JAR PEACOCK BATH SALTS
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536366,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536367,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536368,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536369,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Após esse pré-processamento, conseguimos extrair quais itens são comprados em conjunto com maior frequência para todas as vendas.

## Apriori

Inicialmente, vamos utilizar o algoritmo Apriori. Nele precisamos indicar qual a frequência mínima (suporte mínimo) dos conjuntos que buscamos. Por exemplo, se quisermos apenas os conjuntos que apareçam em pelo menos $n\%$ das vendas.

O algoritmo funciona por criar todas as combinações possíveis de conjuntos e então checar suas frequências.

In [18]:


# use_colnames returns each itemset with column names instead of column indices
frequency_set = apriori(sales_set, min_support=0.01, use_colnames=True)
print(frequency_set)

# NOTE(review): error message kept from another run, presumably on a larger matrix — TODO confirm:
#Unable to allocate 9.89 GiB for an array with shape (263901, 2, 20122) and data type bool

      support                                           itemsets
0    0.014624                           (10 COLOUR SPACEBOY PEN)
1    0.012910                  (12 MESSAGE CARDS WITH ENVELOPES)
2    0.017517                    (12 PENCIL SMALL TUBE WOODLAND)
3    0.018159              (12 PENCILS SMALL TUBE RED RETROSPOT)
4    0.018052                      (12 PENCILS SMALL TUBE SKULL)
..        ...                                                ...
283  0.012374  (FELTCRAFT PRINCESS CHARLOTTE DOLL, FELTCRAFT ...
284  0.028980  (GARDENERS KNEELING PAD KEEP CALM, GARDENERS K...
285  0.010660  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...
286  0.012696  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...
287  0.013713  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...

[288 rows x 2 columns]


In [19]:
# perceba que temos muitas conjuntos com apenas um elemento
# vamos filtrar esses conjuntos que tem apenas um elemento

def filter_set_lenght(input_set, lenght=2):
    """Keep only the itemsets that contain at least `lenght` items.

    Parameters
    ----------
    input_set : pandas.DataFrame
        Frame with an 'itemsets' column whose values support ``len()``.
    lenght : int, default 2
        Minimum number of items an itemset must have to be kept.

    Returns
    -------
    pandas.DataFrame
        New frame with an extra 'set_lenght' column (itemset size), restricted
        to the rows with ``set_lenght >= lenght`` and re-indexed from 0.

    Notes
    -----
    `input_set` is left untouched: the size column is added to a copy instead
    of being written into the caller's frame.
    """
    # assign() returns a new frame, so the caller's frame gains no column
    sized = input_set.assign(set_lenght=input_set['itemsets'].apply(len))
    return sized[sized['set_lenght'] >= lenght].reset_index(drop=True)

combo_set = filter_set_lenght(frequency_set)
print(combo_set)

     support                                           itemsets  set_lenght
0   0.013338  (72 SWEETHEART FAIRY CAKE CASES, 60 TEATIME FA...           2
1   0.013338  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...           2
2   0.013606  (ALARM CLOCK BAKELIKE CHOCOLATE, ALARM CLOCK B...           2
3   0.016392  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...           2
4   0.012963  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...           2
5   0.018749  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...           2
6   0.030159  (ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...           2
7   0.013285  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...           2
8   0.018320  (ALARM CLOCK BAKELIKE RED, ALARM CLOCK BAKELIK...           2
9   0.014195  (ALARM CLOCK BAKELIKE ORANGE, ALARM CLOCK BAKE...           2
10  0.021052  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...           2
11  0.015106  (BAKING SET 9 PIECE RETROSPOT, BAKING SET SPAC...           2
12  0.010392

In [20]:
# escolhendo a combinação de itens com maior repetição
def get_highest_support(input_set):
    """Return the row of `input_set` with the largest 'support' value.

    Parameters
    ----------
    input_set : pandas.DataFrame
        Frame with a numeric 'support' column.

    Returns
    -------
    pandas.Series
        The row holding the maximum support (the first one on ties).

    Raises
    ------
    ValueError
        If `input_set` has no rows.
    """
    if len(input_set) == 0:
        raise ValueError("input_set is empty: there is no itemset to select")
    # idxmax() returns an index *label*, so it must be looked up with .loc;
    # .iloc is positional and only agrees with it on a default 0..n-1 index
    instance = input_set.loc[input_set["support"].idxmax()]
    return instance

# e criando uma função auxiliar para mostrar as estatísticas do nosso combo a partir do nosso conjunto de vendas
def print_combo_stats(combo, sales):
    """Print the items, the support and the absolute sales count of an itemset.

    Parameters
    ----------
    combo : mapping or pandas.Series
        Must provide 'itemsets' (an iterable of items) and 'support'
        (fraction of transactions that contain the itemset).
    sales : object with a ``shape`` attribute
        Transaction matrix; ``sales.shape[0]`` is the number of transactions.
    """
    support = float(combo["support"])
    # sort the items so the printed tuple does not depend on set iteration order
    items = tuple(sorted(combo["itemsets"], key=str))
    print("Itemset: %s"%(str(items)))
    print("Frequência (Support): %.2f%%" %(support*100))
    # support * n is a float that can land just below the true integer count
    # (e.g. 28.999999...); round it instead of letting %d truncate it
    print("Vendas totais do itemset: %d"%(round(support * sales.shape[0])))

combo = get_highest_support(combo_set)
print_combo_stats(combo, sales_set)

Itemset: ('ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE RED')
Frequência (Support): 3.02%
Vendas totais do itemset: 563


<table>
<tr>
<td><img src="https://www.mzube.co.uk/cdn/shop/products/retro-alarm-clock-bakelite-style-various-coloursmzube-945122.jpg?v=1616870562" width="250" /></td>
<td><img src="https://withinreason.co.uk/cdn/shop/products/22726_11x9x6cm_0.jpg?v=1542893682" width="250" /></td>
</tr>
</table>

## FP-Growth

Ao contrário do algoritmo Apriori, o FP-Growth não precisa criar todos os conjuntos candidatos de combinações. Para datasets em que a quantidade de combinações é muito grande, ele tende a ter vantagem no tempo de execução.

In [21]:
# use_colnames retorna o itemset como nomes ao invés de indices das colunas
frequency_set = fpgrowth(sales_set, min_support=0.01, use_colnames=True)
print(frequency_set)

      support                                           itemsets
0    0.014356                   (CREAM CUPID HEARTS COAT HANGER)
1    0.073441                    (ASSORTED COLOUR BIRD ORNAMENT)
2    0.030105                              (DOORMAT NEW ENGLAND)
3    0.022123                (FELTCRAFT PRINCESS CHARLOTTE DOLL)
4    0.012696                   (BOX OF VINTAGE ALPHABET BLOCKS)
..        ...                                                ...
283  0.012213  (CHARLOTTE BAG PINK POLKADOT, CHARLOTTE BAG AP...
284  0.011571  (CHARLOTTE BAG APPLES DESIGN, CHARLOTTE BAG VI...
285  0.012428  (CHARLOTTE BAG VINTAGE ALPHABET, CHARLOTTE BAG...
286  0.010446  (CHARLOTTE BAG PINK POLKADOT, CHARLOTTE BAG VI...
287  0.028980  (GARDENERS KNEELING PAD KEEP CALM, GARDENERS K...

[288 rows x 2 columns]


In [22]:
combo_set = filter_set_lenght(frequency_set)
combo = get_highest_support(combo_set)
print_combo_stats(combo, sales_set)

Itemset: ('ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE RED')
Frequência (Support): 3.02%
Vendas totais do itemset: 563


Temos o mesmo resultado, mas vamos avaliar o tempo de execução entre cada algoritmo...

In [23]:
from time import perf_counter

# Time one run of each algorithm on the same input and the same minimum support.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time() follows the wall clock, which the system may adjust mid-run.
t0 = perf_counter()
frequency_set = apriori(sales_set, min_support=0.01, use_colnames=True)
t1 = perf_counter()
frequency_set = fpgrowth(sales_set, min_support=0.01, use_colnames=True)
t2 = perf_counter()

print("Tempos:")
print("APriori: %f" %(t1-t0))
print("FP-Growth: %f" %(t2-t1))

Tempos:
APriori: 1.369365
FP-Growth: 3.030914


## Outro Dataset (CDC Diabetes)

In [24]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes)
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

dataset = pd.concat([X,y], axis=1)
dataset

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,3,0,5,0,1,5,6,7,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1
253677,0,0,1,28,0,0,0,1,1,0,...,0,1,0,0,0,0,2,5,2,0
253678,1,0,1,23,0,0,0,0,1,1,...,0,3,0,0,0,1,7,5,1,0


In [25]:
df = dataset
# Select a subset of relevant columns for the example.
# This keeps processing from getting too slow with all 22 columns.
selected_features = ['Diabetes_binary', 'HighBP', 'HighChol', 'BMI', 'Smoker', 'Age', 'Education', 'Income']
df = df[selected_features]

# Shuffle the rows reproducibly. The previous call requested n=253680 rows, which is
# the size of the full dataset, so it never produced a smaller sample; frac=1.0 draws
# the same rows without hard-coding the dataset size.
# Use a smaller `frac` (e.g. 0.1) if Apriori becomes too slow.
df_sample = df.sample(frac=1.0, random_state=42)


# Discretize the continuous columns and the columns with many categories.

# Discretize 'BMI'
# WHO BMI ranges: <18.5 (underweight), 18.5-24.9 (normal), 25.0-29.9 (overweight), >=30 (obese)
bmi_bins = [0, 18.5, 25.0, 30.0, 100]
bmi_labels = ['BMI_AbaixoPeso', 'BMI_PesoNormal', 'BMI_Sobrepeso', 'BMI_Obeso']
# right=False makes every bin closed on the left: [0, 18.5), [18.5, 25), [25, 30), [30, 100)
df_sample['BMI_Categoria'] = pd.cut(df_sample['BMI'], bins=bmi_bins, labels=bmi_labels, right=False)

# 'Age' is already coded as categories 1-13 in the dataset;
# translate each code into a more descriptive label
age_labels = ['Idade_18-24', 'Idade_25-29', 'Idade_30-34', 'Idade_35-39', 'Idade_40-44',
              'Idade_45-49', 'Idade_50-54', 'Idade_55-59', 'Idade_60-64', 'Idade_65-69',
              'Idade_70-74', 'Idade_75-79', 'Idade_80+']
df_sample['Age_Group'] = df_sample['Age'].apply(lambda x: age_labels[int(x) - 1])

# 'Education' is an education level coded 1-6;
# translate each code into a more descriptive label
education_labels = ['Educ_SemEscola', 'Educ_Fundamental', 'Educ_Medio_Incompleto',
                    'Educ_Medio_Completo', 'Educ_Superior_Incompleto', 'Educ_Superior_Completo']
df_sample['Education_Level'] = df_sample['Education'].apply(lambda x: education_labels[int(x) - 1])

# 'Income' is an income bracket coded 1-8;
# translate each code into a more descriptive label
income_labels = ['Renda_Ate_10k', 'Renda_10-15k', 'Renda_15-20k', 'Renda_20-25k',
                 'Renda_25-35k', 'Renda_35-50k', 'Renda_50-75k', 'Renda_Mais_75k']
df_sample['Income_Level'] = df_sample['Income'].apply(lambda x: income_labels[int(x) - 1])

# 'HighBP', 'HighChol' and 'Smoker' are already binary (0=no, 1=yes).
# Map the positive case to a readable item name; the negative case becomes None,
# which turns into a 'None' dummy column below and is dropped there.
df_sample['HighBP_Sim'] = df_sample['HighBP'].apply(lambda x: 'PressaoAlta' if x == 1 else None)
df_sample['HighChol_Sim'] = df_sample['HighChol'].apply(lambda x: 'ColesterolAlto' if x == 1 else None)
df_sample['Smoker_Sim'] = df_sample['Smoker'].apply(lambda x: 'Fumante' if x == 1 else None)
df_sample['Diabetes_Sim'] = df_sample['Diabetes_binary'].apply(lambda x: 'Diabetes' if x == 1 or x == 2 else 'Nao_Diabetes')

# Original columns, replaced by the labelled versions created above
columns_to_drop = ['HighBP', 'HighChol', 'BMI', 'Smoker', 'Age', 'Education', 'Income', 'Diabetes_binary']
df_processed = df_sample.drop(columns=columns_to_drop)

# The next step needs a one-hot encoded frame (one boolean column per item).
# Every value is cast to str first, then `get_dummies` creates one column per
# distinct label; the empty prefix keeps the label itself as the column name.
df_encoded = pd.get_dummies(df_processed.astype(str), prefix='', prefix_sep='')

# Drop the columns that stand for "no" (such as 'Nao_Diabetes' and the 'None' columns),
# because rules about the absence of something are not wanted; keep only the items of interest.
columns_to_exclude = ['Nao_Diabetes', 'None']
df_encoded = df_encoded.drop(columns=columns_to_exclude, errors='ignore')
df_encoded

Unnamed: 0,BMI_AbaixoPeso,BMI_Obeso,BMI_PesoNormal,BMI_Sobrepeso,Idade_18-24,Idade_25-29,Idade_30-34,Idade_35-39,Idade_40-44,Idade_45-49,...,Renda_20-25k,Renda_25-35k,Renda_35-50k,Renda_50-75k,Renda_Ate_10k,Renda_Mais_75k,PressaoAlta,ColesterolAlto,Fumante,Diabetes
219620,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
132821,False,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,True,True,False,False
151862,False,False,True,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
139717,False,False,False,True,False,True,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
239235,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,True,False,True,False,True,False
103694,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,True,True,True,False
131932,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
146867,False,False,True,False,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False


In [26]:

# Step 3: mine the frequent itemsets with Apriori.
# A low minimum support (e.g. 0.05) is useful for large datasets.
min_support = 0.05
frequent_itemsets = apriori(df_encoded, use_colnames=True, min_support=min_support)

print("Conjuntos de Itens Frequentes (suporte >= 5%):")
# Rank the itemsets from most to least frequent and show the top 20
ranked_itemsets = frequent_itemsets.sort_values('support', ascending=False)
ranked_itemsets.head(20)

Conjuntos de Itens Frequentes (suporte >= 5%):


Unnamed: 0,support,itemsets
24,0.443169,(Fumante)
22,0.429001,(PressaoAlta)
23,0.424121,(ColesterolAlto)
14,0.423072,(Educ_Superior_Completo)
2,0.369556,(BMI_Sobrepeso)
21,0.356295,(Renda_Mais_75k)
0,0.346306,(BMI_Obeso)
15,0.275583,(Educ_Superior_Incompleto)
1,0.271811,(BMI_PesoNormal)
91,0.254888,"(PressaoAlta, ColesterolAlto)"


In [27]:

# 4. Gerar Regras de Associação
# Definir a confiança mínima
min_confidence = 0.1
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Ordenar as regras por 'lift' para ver as mais fortes
rules = rules.sort_values(by='lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
208,"(BMI_Obeso, PressaoAlta)",(Diabetes),0.195822,0.139333,0.064286,0.328287,2.356134,1.0,0.037001,1.281302,0.715732,0.237332,0.219544,0.394835
213,(Diabetes),"(BMI_Obeso, PressaoAlta)",0.139333,0.195822,0.064286,0.461382,2.356134,1.0,0.037001,1.493040,0.668756,0.237332,0.330226,0.394835
221,"(BMI_Obeso, ColesterolAlto)",(Diabetes),0.169686,0.139333,0.055211,0.325373,2.335217,1.0,0.031568,1.275767,0.688624,0.217532,0.216158,0.360814
224,(Diabetes),"(BMI_Obeso, ColesterolAlto)",0.139333,0.169686,0.055211,0.396254,2.335217,1.0,0.031568,1.375270,0.664339,0.217532,0.272870,0.360814
383,"(PressaoAlta, ColesterolAlto)",(Diabetes),0.254888,0.139333,0.075737,0.297139,2.132581,1.0,0.040223,1.224519,0.712758,0.237805,0.183353,0.420354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,"(PressaoAlta, Fumante)",(Educ_Superior_Completo),0.213966,0.423072,0.062153,0.290481,0.686598,1.0,-0.028370,0.813125,-0.367372,0.108114,-0.229824,0.218695
366,"(PressaoAlta, Fumante)",(Renda_Mais_75k),0.213966,0.356295,0.050146,0.234363,0.657778,1.0,-0.026089,0.840744,-0.398276,0.096413,-0.189422,0.187553
367,(Renda_Mais_75k),"(PressaoAlta, Fumante)",0.356295,0.213966,0.050146,0.140742,0.657778,1.0,-0.026089,0.914782,-0.446977,0.096413,-0.093156,0.187553
29,(PressaoAlta),(BMI_PesoNormal),0.429001,0.271811,0.075812,0.176718,0.650149,1.0,-0.040795,0.884495,-0.485173,0.121299,-0.130589,0.227816


In [28]:
frequency_set = fpgrowth(df_encoded, min_support=0.05, use_colnames=True)
frequency_set.sort_values(by='support', ascending=False, inplace=False).head(20)

Unnamed: 0,support,itemsets
10,0.443169,(Fumante)
3,0.429001,(PressaoAlta)
4,0.424121,(ColesterolAlto)
5,0.423072,(Educ_Superior_Completo)
6,0.369556,(BMI_Sobrepeso)
13,0.356295,(Renda_Mais_75k)
11,0.346306,(BMI_Obeso)
15,0.275583,(Educ_Superior_Incompleto)
0,0.271811,(BMI_PesoNormal)
46,0.254888,"(PressaoAlta, ColesterolAlto)"


In [29]:
min_confidence = 0.1
rules = association_rules(frequency_set, metric="confidence", min_threshold=min_confidence)
rules = rules.sort_values(by='lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
395,(Diabetes),"(BMI_Obeso, PressaoAlta)",0.139333,0.195822,0.064286,0.461382,2.356134,1.0,0.037001,1.493040,0.668756,0.237332,0.330226,0.394835
390,"(BMI_Obeso, PressaoAlta)",(Diabetes),0.195822,0.139333,0.064286,0.328287,2.356134,1.0,0.037001,1.281302,0.715732,0.237332,0.219544,0.394835
397,"(BMI_Obeso, ColesterolAlto)",(Diabetes),0.169686,0.139333,0.055211,0.325373,2.335217,1.0,0.031568,1.275767,0.688624,0.217532,0.216158,0.360814
400,(Diabetes),"(BMI_Obeso, ColesterolAlto)",0.139333,0.169686,0.055211,0.396254,2.335217,1.0,0.031568,1.375270,0.664339,0.217532,0.272870,0.360814
373,"(PressaoAlta, ColesterolAlto)",(Diabetes),0.254888,0.139333,0.075737,0.297139,2.132581,1.0,0.040223,1.224519,0.712758,0.237805,0.183353,0.420354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,"(PressaoAlta, Fumante)",(Educ_Superior_Completo),0.213966,0.423072,0.062153,0.290481,0.686598,1.0,-0.028370,0.813125,-0.367372,0.108114,-0.229824,0.218695
298,"(PressaoAlta, Fumante)",(Renda_Mais_75k),0.213966,0.356295,0.050146,0.234363,0.657778,1.0,-0.026089,0.840744,-0.398276,0.096413,-0.189422,0.187553
299,(Renda_Mais_75k),"(PressaoAlta, Fumante)",0.356295,0.213966,0.050146,0.140742,0.657778,1.0,-0.026089,0.914782,-0.446977,0.096413,-0.093156,0.187553
9,(PressaoAlta),(BMI_PesoNormal),0.429001,0.271811,0.075812,0.176718,0.650149,1.0,-0.040795,0.884495,-0.485173,0.121299,-0.130589,0.227816


In [30]:
from time import perf_counter

# Time one run of each algorithm on the same input and the same minimum support.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time() follows the wall clock, which the system may adjust mid-run.
t0 = perf_counter()
frequency_set = apriori(df_encoded, min_support=0.01, use_colnames=True)
t1 = perf_counter()
frequency_set = fpgrowth(df_encoded, min_support=0.01, use_colnames=True)
t2 = perf_counter()

print("Tempos:")
print("APriori: %f" %(t1-t0))
print("FP-Growth: %f" %(t2-t1))

Tempos:
APriori: 3.498783
FP-Growth: 837.302742
