<a href="https://colab.research.google.com/github/fabiornunes/python/blob/main/Exemplo_Gera%C3%A7%C3%A3o_de_Regras_apriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from pandas import read_csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load a reduced sample of the Titanic dataset from a semicolon-separated CSV
# and preview its first rows.
dataset = pd.read_csv(
    'https://telescopeinstorage.blob.core.windows.net/datasets/titanic-apriori.csv',
    sep=';',
    engine='python',
)
dataset.head(n=5)

Unnamed: 0,Class,Sex,Age,Survived
0,3rd,Male,Child,No
1,3rd,Male,Child,No
2,3rd,Male,Child,No
3,3rd,Male,Child,No
4,3rd,Male,Child,No


In [None]:
# Get the number of rows and columns of the dataset and print one per line.
qtdlinhas, qtdcols = dataset.shape

print(qtdlinhas, qtdcols, sep='\n')

2201
4


In [None]:
# Convert the dataset into a list of transactions: one inner list per row,
# holding the string form of every cell of that row.
# dataset.values is read once and iterated directly, instead of being
# re-evaluated for every cell through row/column counters computed in another
# cell, so this step does not depend on qtdlinhas/qtdcols being up to date.
valores = dataset.values
transacoes = [[str(celula) for celula in linha] for linha in valores]
print(transacoes)

[['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd', 'Male', 'Child', 'No'], ['3rd',

In [None]:
# Encoder used to turn the list of transactions into a one-hot (boolean) table.
te = TransactionEncoder()

# Load the transactions and work out which columns will be generated when
# they are encoded.
te.fit(transacoes)

TransactionEncoder()

In [None]:
# The TransactionEncoder converts the transactions into a boolean matrix in
# which each row represents one transaction.
matriz_transacoes = te.transform(transacoes)
matriz_transacoes

array([[False, False,  True, ...,  True,  True, False],
       [False, False,  True, ...,  True,  True, False],
       [False, False,  True, ...,  True,  True, False],
       ...,
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True]])

In [None]:
# Show the item labels held by the fitted encoder; they are used below as the
# column names of the encoded DataFrame.
print(te.columns_)

['1st', '2nd', '3rd', 'Adult', 'Child', 'Crew', 'Female', 'Male', 'No', 'Yes']


In [None]:
# Wrap the boolean transaction matrix (result of te.transform(transacoes)) in
# an auxiliary DataFrame whose columns are the labels obtained by
# te.fit(transacoes), then preview the first rows.
dfAuxiliar = pd.DataFrame(data=matriz_transacoes, columns=te.columns_)

dfAuxiliar.head(n=5)

Unnamed: 0,1st,2nd,3rd,Adult,Child,Crew,Female,Male,No,Yes
0,False,False,True,False,True,False,False,True,True,False
1,False,False,True,False,True,False,False,True,True,False
2,False,False,True,False,True,False,False,True,True,False
3,False,False,True,False,True,False,False,True,True,False
4,False,False,True,False,True,False,False,True,True,False


In [None]:
# Mine the frequent itemsets with a minimum support of 0.01. use_colnames=True
# means the column names of dfAuxiliar are used in the result, and therefore
# in the association rules built from it.
# NOTE(review): the output recorded right after this cell lists itemsets with
# support below 0.01 (e.g. 0.005906), so it looks stale -- presumably produced
# with a lower threshold; re-run to confirm.
itemsets_freq = apriori(dfAuxiliar, min_support=0.01, use_colnames=True)
itemsets_freq

Unnamed: 0,support,itemsets
0,0.147660,(1st)
1,0.129487,(2nd)
2,0.320763,(3rd)
3,0.950477,(Adult)
4,0.049523,(Child)
...,...,...
104,0.015902,"(No, Child, Male, 3rd)"
105,0.005906,"(Child, Male, 3rd, Yes)"
106,0.009087,"(Adult, Crew, Female, Yes)"
107,0.304407,"(No, Crew, Male, Adult)"


In [None]:
# Display the frequent itemsets again.
itemsets_freq

Unnamed: 0,support,itemsets
0,0.147660,(1st)
1,0.129487,(2nd)
2,0.320763,(3rd)
3,0.950477,(Adult)
4,0.049523,(Child)
...,...,...
91,0.175829,"(No, Male, 3rd, Adult)"
92,0.034075,"(Adult, Male, 3rd, Yes)"
93,0.015902,"(No, Child, Male, 3rd)"
94,0.304407,"(No, Crew, Male, Adult)"


In [None]:
# Some of the metrics reported for a rule A -> C:
# - support(A->C)    = support(A+C) [aka 'support'], range: [0, 1]
# - confidence(A->C) = support(A+C) / support(A), range: [0, 1]
# - lift(A->C)       = confidence(A->C) / support(C), range: [0, inf]
# - leverage(A->C)   = support(A->C) - support(A)*support(C), range: [-1, 1]
# - conviction(A->C) = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]

# Derive the association rules from the frequent itemsets, using confidence
# as the metric with a minimum threshold of 0.4.
regras = association_rules(itemsets_freq, metric="confidence", min_threshold=0.4)

# Order the rules by descending confidence and keep only the columns used in
# the rest of the analysis.
regrasOrdenadas = (
    regras
    .sort_values(by='confidence', ascending=False)
    .loc[:, ['antecedents', 'consequents', 'support', 'confidence']]
)
regrasOrdenadas.head(100)

In [None]:
# Keep the rules whose consequent contains 'Yes' (Survived == Yes).
# The subset test also matches compound consequents such as {'Yes', 'Adult'}.
# A stricter alternative keeps only the rules whose consequent is exactly
# {'Yes'}:
#     regrasOrdenadas[regrasOrdenadas['consequents'] == {'Yes'}]
# The two filters are NOT equivalent, so only the subset version is applied
# here (it is the one whose result was kept by the original cell).
subset_sobrevivou = {'Yes'}
regras_sobreviventes = regrasOrdenadas[
    regrasOrdenadas['consequents'].apply(lambda x: subset_sobrevivou.issubset(x))
]

regras_sobreviventes

In [None]:
# Rules whose consequent is exactly {'No'} (Survived == No).
consequente_eh_nao = regrasOrdenadas['consequents'] == {'No'}
regras_naoSobreviventes = regrasOrdenadas.loc[consequente_eh_nao]
regras_naoSobreviventes

Unnamed: 0,antecedents,consequents,support,confidence
152,"(2nd, Male, Adult)",(No),0.069968,0.916667
63,"(2nd, Male)",(No),0.069968,0.860335
168,"(3rd, Male, Adult)",(No),0.175829,0.837662
86,"(3rd, Male)",(No),0.191731,0.827451
110,"(Male, Adult)",(No),0.603816,0.797241
32,(Male),(No),0.619718,0.787984
125,"(Crew, Male)",(No),0.304407,0.777262
185,"(Crew, Male, Adult)",(No),0.304407,0.777262
98,"(Crew, Adult)",(No),0.30577,0.760452
28,(Crew),(No),0.30577,0.760452


In [None]:
# Rules whose antecedent includes 'Female'.
subset_Mulheres = {'Female'}
antecedente_tem_mulher = regrasOrdenadas['antecedents'].apply(subset_Mulheres.issubset)
regras_mulheres = regrasOrdenadas[antecedente_tem_mulher]
regras_mulheres

In [None]:
# Rules whose consequent mentions the survival outcome, either 'Yes' or 'No'.
subset_YesNo = {'Yes', 'No'}
consequente_tem_desfecho = regrasOrdenadas['consequents'].apply(
    lambda itens: not subset_YesNo.isdisjoint(itens)
)
regras_YesNo = regrasOrdenadas[consequente_tem_desfecho]
regras_YesNo

Unnamed: 0,antecedents,consequents,support,confidence
58,"(2nd, Child)",(Yes),0.010904,1.000000
44,"(1st, Female)",(Yes),0.064062,0.972414
130,"(1st, Female, Adult)",(Yes),0.063607,0.972222
135,"(1st, Female)","(Yes, Adult)",0.063607,0.965517
152,"(2nd, Male, Adult)",(No),0.069968,0.916667
...,...,...,...,...
134,"(1st, Adult)","(Female, Yes)",0.063607,0.438871
47,(1st),"(Female, Yes)",0.064062,0.433846
138,(1st),"(Yes, Female, Adult)",0.063607,0.430769
7,(2nd),(Yes),0.053612,0.414035


In [None]:
# Concatenate the survivor and non-survivor rules into a single table, ordered
# from the highest to the lowest confidence, so they can be analysed together.
regrasGeral = pd.concat([regras_sobreviventes, regras_naoSobreviventes]).sort_values(
    by='confidence', ascending=False
)

regrasGeral
