In [None]:
pip install mlxtend

In [3]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

In [4]:
# Load the grocery transactions; with a single column name supplied, every CSV
# row is read into the 'products' column.
df = pd.read_csv(
    "./GroceryStoreDataSet.csv",
    sep=',',
    names=['products'],
)

In [5]:
# Correct two misspelled product names (COCK -> COKE, SUGER -> SUGAR). The keys
# are applied as regular expressions, so they also match inside the
# comma-separated basket strings. Rebinding `df` instead of using inplace=True
# keeps the cell a plain expression that returns a new frame.
df = df.replace(regex={'COCK': 'COKE', 'SUGER': 'SUGAR'})

In [6]:
# Preview the first ten baskets.
df.head(n=10)

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


In [7]:
# Split every comma-separated basket string into a list of item names.
data = [basket.split(',') for basket in df['products']]
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COKE', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COKE', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGAR', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COKE'],
 ['BREAD', 'SUGAR', 'BISCUIT'],
 ['COFFEE', 'SUGAR', 'CORNFLAKES'],
 ['BREAD', 'SUGAR', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGAR'],
 ['BREAD', 'COFFEE', 'SUGAR'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

In [8]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the baskets: one boolean column per distinct item and one row
# per transaction.
te = TransactionEncoder()
te_data = te.fit_transform(data)
df2 = pd.DataFrame(data=te_data, columns=te.columns_)
df2

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COFFEE,COKE,CORNFLAKES,JAM,MAGGI,MILK,SUGAR,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True
5,False,True,True,False,False,False,False,False,False,False,True
6,False,False,False,False,False,True,False,True,False,False,True
7,True,False,True,False,False,False,False,True,False,False,True
8,False,False,True,False,False,False,True,True,False,False,True
9,False,False,True,False,False,False,False,False,True,False,False


In [9]:
# Mine the itemsets whose support is at least 0.20, then list them from most to
# least frequent.
df3 = apriori(
    df2,
    min_support=0.20,
    use_colnames=True,  # label itemsets with item names rather than column indices
    verbose=1,
)
df3.sort_values("support", ascending=False)

Processing 42 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
2,0.65,(BREAD)
3,0.4,(COFFEE)
0,0.35,(BISCUIT)
8,0.35,(TEA)
4,0.3,(CORNFLAKES)
7,0.3,(SUGAR)
5,0.25,(MAGGI)
6,0.25,(MILK)
1,0.2,(BOURNVITA)
9,0.2,"(BISCUIT, BREAD)"


In [10]:
# Derive association rules from the frequent itemsets, keeping those whose
# confidence is at least 0.6, and show them ordered by support.
AR = association_rules(
    df3,
    metric="confidence",
    min_threshold=0.6,
)
AR.sort_values("support", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
1,(SUGAR),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(SUGAR),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75


In [11]:
# Rank the rules from highest to lowest confidence.
AR.sort_values("confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75
1,(SUGAR),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(SUGAR),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429


In [12]:
# Rank the rules from highest to lowest lift.
AR.sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(SUGAR),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
1,(SUGAR),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714


In [13]:
# Keep only the rules with support >= 0.15 and confidence >= 0.75.
df_filter = AR.loc[AR["support"].ge(0.15) & AR["confidence"].ge(0.75)]
df_filter

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75


In [None]:
"""
The Apriori algorithm is a popular algorithm for association rule mining in data mining and market basket analysis.
While it has been widely used and influential, it also has certain limitations and drawbacks:
1. Exponential Growth of Itemsets: Apriori generates a large number of candidate itemsets, especially as the number of
items in the dataset increases. The number of itemsets grows exponentially with the number of items, which can lead to
memory and computational efficiency issues.
2. High Memory Usage: Storing and processing a large number of itemsets and their corresponding support counts can
require a significant amount of memory. This can become problematic when working with large datasets.
3. Inefficient for Sparse Data: Apriori performs inefficiently on datasets with low-density itemsets, where most item
combinations do not occur frequently. It generates many candidates with low support, leading to unnecessary
computations.
4. Limited to Small Itemsets: In practice, Apriori is typically used for finding associations among items with relatively
small itemsets. It may not perform well with datasets containing very long itemsets or complex relationships.
5. Difficulty Handling Continuous Data: Apriori is primarily designed for categorical data with discrete items. It may
not work well with continuous or numerical data without appropriate discretization.
6. Fixed Thresholds: Apriori requires users to set minimum support and confidence thresholds before mining. Selecting
appropriate threshold values can be challenging and may affect the quality of discovered rules.
7. Doesn't Discover All Association Rules: Apriori is exhaustive only with respect to the chosen thresholds. Its
level-wise search and pruning never discard an itemset that meets the minimum support, but rules that fall below the
specified minimum support or confidence (for example, rare yet strong associations) are never reported.
8. Doesn't Consider Item Order: Apriori treats each transaction as an unordered set of items, ignoring their order of
occurrence. This limitation makes it less suitable for capturing sequential patterns or time-dependent associations.
9. Limited Scalability: While Apriori works well for small to medium-sized datasets, it may not scale efficiently to very
large datasets. More scalable algorithms like FP-growth have been developed to address this issue.

10. Apriori Pruning Overhead: The pruning techniques used in Apriori to reduce the number of candidate itemsets
introduce computational overhead. These overheads can slow down the mining process.

Despite these limitations, Apriori remains a valuable algorithm for association rule mining in many applications,
especially when the dataset size and itemset cardinality are moderate. Researchers and practitioners often explore
alternative algorithms to overcome some of the limitations, such as FP-growth, Eclat, or more specialized approaches
for sequential pattern mining.
"""