In [47]:
!pip --quiet install mlxtend

import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [48]:
# Load the Kaggle groceries dataset and preview the first rows.
# Download the CSV from https://www.kaggle.com/heeraldedhia/groceries-dataset
# beforehand; the relative path below expects it in the current working directory.
df = pd.read_csv('Groceries_dataset.csv')
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [49]:
# Summary statistics. describe() only covers numeric columns by default, so
# only Member_number is summarised (Date and itemDescription are text).
df.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


# Apriori algorithm
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://www.youtube.com/watch?v=h_l3b2CIQ_o
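- Time- and memory-consuming: candidate itemsets are generated and counted level by level
- Suitable for small datasets and large support values
- Level-wise candidate-generation algorithm (it does not build a tree)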

In [50]:
# Build the transactions in the format required by the frequent-pattern
# algorithms: one list of item descriptions per (Member_number, Date) pair.
# reset_index(drop=True) replaces the group keys with a plain 0..n-1 index.
# The former `name='items'` argument was removed because pandas ignores `name`
# whenever drop=True; the Series is still named 'itemDescription', as before.
dataset = (
    df.groupby(['Member_number', 'Date'])['itemDescription']
    .apply(list)
    .reset_index(drop=True)
)
dataset.head()

0    [sausage, whole milk, semi-finished bread, yog...
1                    [whole milk, pastry, salty snack]
2                       [canned beer, misc. beverages]
3                          [sausage, hygiene articles]
4                           [soda, pickled vegetables]
Name: itemDescription, dtype: object

In [51]:
# One-hot encode the transactions: each row is a basket and each column an
# item, with True where the basket contains that item.
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)
df_encoded.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [52]:
# Mine the frequent itemsets with Apriori (minimum support of 0.001), record
# how many items each itemset holds, and show them from most to least frequent.
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.001,
    use_colnames=True,
    low_memory=True,
)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets,length
146,0.157923,(whole milk),1
90,0.122101,(other vegetables),1
109,0.110005,(rolls/buns),1
123,0.097106,(soda),1
147,0.085879,(yogurt),1
...,...,...,...
344,0.001002,"(margarine, chicken)",2
201,0.001002,"(chicken, bottled beer)",2
202,0.001002,"(chocolate, bottled beer)",2
516,0.001002,"(hamburger meat, pastry)",2


In [53]:
# Derive association rules from the Apriori itemsets using a lift threshold
# of 0.8 (a ratio, not a percentage), then keep only the rules whose
# confidence is above 0.01.
apriori_rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=0.8,
)
apriori_rules_filtered = apriori_rules.loc[apriori_rules['confidence'].gt(0.01)]
apriori_rules_filtered.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(UHT-milk),(bottled water),0.021386,0.060683,0.001069,0.05,0.823954,-0.000228,0.988755
1,(bottled water),(UHT-milk),0.060683,0.021386,0.001069,0.017621,0.823954,-0.000228,0.996168
2,(other vegetables),(UHT-milk),0.122101,0.021386,0.002139,0.017515,0.818993,-0.000473,0.99606
3,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.1,0.818993,-0.000473,0.975443
4,(sausage),(UHT-milk),0.060349,0.021386,0.001136,0.018826,0.880298,-0.000154,0.997391


## Recommendations

In [54]:
def make_recommendations(my_dataset, product):
    """Return the consequents of every rule whose antecedent equals ``product``.

    Parameters
    ----------
    my_dataset : pandas.DataFrame
        Rule table that is read positionally: column 0 holds the antecedents
        and column 1 the consequents (the column order of the
        ``association_rules`` output used in this notebook).
    product : set, frozenset or str
        Antecedent to look up. The comparison is an exact equality, so
        ``{'coffee'}`` does not match a rule whose antecedent is
        ``{'coffee', 'milk'}``. A bare string is treated as a one-item
        antecedent.

    Returns
    -------
    list
        Consequents of the matching rules, in row order. Empty when no rule
        matches or the table has no rows.
    """
    if my_dataset.shape[0] == 0:
        return []

    if isinstance(product, str):
        # A plain string can never equal a set-like antecedent; wrap it so
        # that 'coffee' behaves like {'coffee'}.
        product = frozenset([product])

    # Scan the antecedent column once instead of doing a scalar .iloc lookup
    # per row, remembering the positions of the matching rules.
    matching_rows = [
        position
        for position, antecedent in enumerate(my_dataset.iloc[:, 0])
        if product == antecedent
    ]
    if not matching_rows:
        return []
    return list(my_dataset.iloc[matching_rows, 1])

In [55]:
# Recommend products for a basket containing coffee, based on the Apriori rules.
product_name = {'coffee'}
recommendations = make_recommendations(apriori_rules_filtered, product_name)
for item in recommendations:
    # A consequent may hold several items; print all of them (sorted for a
    # stable order) rather than a single arbitrary element of the set.
    print(', '.join(sorted(item)))

bottled water
domestic eggs
frankfurter
pastry
root vegetables
shopping bags
soda


# FP-Growth algorithm

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/
https://www.youtube.com/watch?v=VB8KWm8MXss
- Suited to large datasets
- Uses less memory
- Faster for small support values
- Tree-based algorithm: builds an FP-tree instead of generating candidate itemsets

In [56]:
# Mine the frequent itemsets with FP-Growth (minimum support of 0.001), record
# how many items each itemset holds, and show them from most to least frequent.
freq_items = fpgrowth(
    df_encoded,
    min_support=0.001,
    use_colnames=True,
)
freq_items['length'] = freq_items['itemsets'].apply(len)
freq_items.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets,length
0,0.157923,(whole milk),1
17,0.122101,(other vegetables),1
13,0.110005,(rolls/buns),1
9,0.097106,(soda),1
1,0.085879,(yogurt),1
...,...,...,...
552,0.001002,"(canned beer, coffee)",2
209,0.001002,"(root vegetables, hygiene articles)",2
215,0.001002,"(whole milk, soda, rolls/buns)",3
217,0.001002,"(whole milk, pickled vegetables)",2


In [57]:
# Derive association rules from the FP-Growth itemsets using a lift threshold
# of 0.8 (a ratio, not a percentage), then keep only the rules whose
# confidence is above 0.01.
fpgrowth_rules = association_rules(
    freq_items,
    metric='lift',
    min_threshold=0.8,
)
fpgrowth_rules_filtered = fpgrowth_rules.loc[fpgrowth_rules['confidence'].gt(0.01)]
fpgrowth_rules_filtered.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(whole milk),(yogurt),0.157923,0.085879,0.011161,0.070673,0.82294,-0.002401,0.983638
1,(yogurt),(whole milk),0.085879,0.157923,0.011161,0.129961,0.82294,-0.002401,0.967861
2,(rolls/buns),(yogurt),0.110005,0.085879,0.007819,0.071081,0.827697,-0.001628,0.984071
3,(yogurt),(rolls/buns),0.085879,0.110005,0.007819,0.091051,0.827697,-0.001628,0.979147
4,"(whole milk, other vegetables)",(yogurt),0.014837,0.085879,0.001136,0.076577,0.891685,-0.000138,0.989927


In [58]:
# Recommend products for a basket containing coffee, based on the FP-Growth rules.
product_name = {'coffee'}
recommendations = make_recommendations(fpgrowth_rules_filtered, product_name)
for item in recommendations:
    # A consequent may hold several items; print all of them (sorted for a
    # stable order) rather than a single arbitrary element of the set.
    print(', '.join(sorted(item)))


shopping bags
frankfurter
root vegetables
domestic eggs
soda
pastry
bottled water
