### EXERCISE: APPLY BOTH APRIORI AND ECLAT ON GROCERIES_DATASET.CSV. THEN DO A COMPARATIVE ANALYSIS ON ECLAT, APRIORI, AND FP-GROWTH.

In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import matplotlib.pyplot as plt
from graphviz import Digraph

df = pd.read_csv('Groceries_dataset.csv').sample(n=3000, random_state=42)
df

Unnamed: 0,Member_number,Date,itemDescription
22400,1449,03-10-2015,tropical fruit
38462,3010,13-06-2014,hygiene articles
36986,4789,09-04-2015,specialty cheese
2182,1884,11-12-2015,frankfurter
32319,1997,28-12-2015,pastry
...,...,...,...
9633,2312,25-02-2014,other vegetables
10657,3294,26-02-2014,pastry
18959,1594,23-01-2015,yogurt
13793,2755,25-11-2014,whole milk


In [3]:
# Converting the df to transactions
grouped = df.groupby(['Member_number', 'Date'])
transactions = grouped['itemDescription'].apply(list).values.tolist()

transactions

[['sausage'],
 ['rolls/buns'],
 ['dental care'],
 ['dish cleaner'],
 ['whole milk'],
 ['margarine'],
 ['chicken'],
 ['yogurt', 'domestic eggs'],
 ['herbs'],
 ['bottled water'],
 ['tropical fruit'],
 ['frozen vegetables'],
 ['whipped/sour cream'],
 ['canned beer'],
 ['butter milk'],
 ['rolls/buns'],
 ['root vegetables'],
 ['curd'],
 ['butter'],
 ['curd'],
 ['rolls/buns'],
 ['butter'],
 ['cream cheese '],
 ['whole milk'],
 ['snack products'],
 ['cake bar'],
 ['rolls/buns'],
 ['butter', 'whipped/sour cream'],
 ['whole milk'],
 ['fruit/vegetable juice'],
 ['specialty bar'],
 ['meat'],
 ['beverages', 'waffles'],
 ['yogurt'],
 ['shopping bags'],
 ['yogurt'],
 ['pork', 'other vegetables', 'detergent'],
 ['pastry'],
 ['waffles'],
 ['sugar'],
 ['soda'],
 ['soda'],
 ['other vegetables'],
 ['whole milk'],
 ['tropical fruit'],
 ['whole milk'],
 ['whole milk'],
 ['whole milk'],
 ['pork'],
 ['soda'],
 ['bottled beer'],
 ['brown bread'],
 ['semi-finished bread'],
 ['shopping bags'],
 ['canned fish'],

In [4]:
data = pd.DataFrame(transactions)
data

Unnamed: 0,0,1,2,3
0,sausage,,,
1,rolls/buns,,,
2,dental care,,,
3,dish cleaner,,,
4,whole milk,,,
...,...,...,...,...
2775,salty snack,,,
2776,soda,,,
2777,white wine,,,
2778,detergent,,,


In [5]:

# Looking for itemSETS
# we do not want to have any individual products returned
min_n_products = 2

# we want to set min support to 7
# but we have to express it as a percentage
min_support = 700/len(transactions)

# we have no limit on the size of association rules
# so we set it to the longest transaction
max_length = max([len(x) for x in transactions])

In [6]:
print(max_length)

4


In [7]:
from pyECLAT import ECLAT

# Creating an instance of eclat
my_eclat = ECLAT(data=data, verbose=True)

# Fitting the algorithm
rule_indices, rule_supports = my_eclat.fit(min_support=min_support,
                                           min_combination=min_n_products,
                                           max_combination=max_length)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 1746.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 16515.19it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 9124.31it/s]


Combination 2 by 2


0it [00:00, ?it/s]


Combination 3 by 3


0it [00:00, ?it/s]


Combination 4 by 4


0it [00:00, ?it/s]


In [8]:
rule_indices

{}

In [9]:
print(rule_supports)

{}


In [10]:
# result via FP growth

# Transforming data using TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [11]:
# Generating frequent itemsets using FP-Growth
frequent_itemsets = fpgrowth(df, min_support=0.005, use_colnames=True)
print("Frequent Itemsets:")
print(frequent_itemsets)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
print("\nAssociation Rules:")
print(rules)

Frequent Itemsets:
     support                    itemsets
0   0.026619                   (sausage)
1   0.049281                (rolls/buns)
2   0.076259                (whole milk)
3   0.014388                 (margarine)
4   0.013309                   (chicken)
..       ...                         ...
56  0.009712                   (dessert)
57  0.007914  (long life bakery product)
58  0.011151                 (chocolate)
59  0.007914                       (ham)
60  0.006475          (hygiene articles)

[61 rows x 2 columns]

Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []
