### Sample program for Association Analysis (Market Basket Analysis) using FP-Growth  

#### Import libraries  

In [116]:
import pandas as pd
import numpy as np
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [117]:
# Path of the input CSV. It is read below without a header row, and every
# row is later converted into one receipt (list of purchased items).
csv_in = 'groceries-col.csv'

#### Read CSV file  

In [118]:
# Load the receipts table. header=None tells pandas the file has no header
# row, so the columns get integer labels starting at 0.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=None)
print(df.shape)
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
display(df.head())

(9835, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       9835 non-null   object
 1   1       7676 non-null   object
 2   2       6033 non-null   object
 3   3       4734 non-null   object
 4   4       3729 non-null   object
 5   5       2874 non-null   object
 6   6       2229 non-null   object
 7   7       1684 non-null   object
 8   8       1246 non-null   object
 9   9       896 non-null    object
 10  10      650 non-null    object
 11  11      468 non-null    object
 12  12      351 non-null    object
 13  13      273 non-null    object
 14  14      196 non-null    object
 15  15      141 non-null    object
 16  16      95 non-null     object
 17  17      66 non-null     object
 18  18      52 non-null     object
 19  19      38 non-null     object
 20  20      29 non-null     object
 21  21      18 non-null     object
 22  22      14 no

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [119]:
# Every distinct cell value in the table, including the missing-value marker
# that pads receipts shorter than the widest row.
values_set = set( df.values.flatten() )
print(values_set)
# Drop missing values by testing each element. The previous
# values_set.remove(np.nan) raised KeyError when the data had no missing
# cell, and it matched only the np.nan singleton (set lookup of NaN works by
# object identity, since NaN != NaN).
values_set = {value for value in values_set if pd.notna(value)}
# Sorted item names: the position of a name in this list is its integer id.
id2item = sorted(values_set)
print(id2item)

{nan, 'bags', 'zwieback', 'decalcifier', 'white bread', 'soft cheese', 'artif. sweetener', 'dog food', 'potato products', 'liqueur', 'rolls/buns', 'hygiene articles', 'shopping bags', 'fish', 'detergent', 'cake bar', 'meat spreads', 'candles', 'ready soups', 'canned vegetables', 'female sanitary products', 'misc. beverages', 'long life bakery product', 'bottled beer', 'organic sausage', 'newspapers', 'liquor (appetizer)', 'softener', 'yogurt', 'meat', 'butter milk', 'rice', 'grapes', 'cream', 'specialty chocolate', 'dish cleaner', 'salty snack', 'chocolate marshmallow', 'organic products', 'fruit/vegetable juice', 'spread cheese', 'flower (seeds)', 'brown bread', 'beverages', 'dishes', 'finished products', 'chewing gum', 'baking powder', 'mayonnaise', 'tropical fruit', 'sugar', 'hair spray', 'white wine', 'soda', 'canned beer', 'frozen fruits', 'chocolate', 'specialty cheese', 'cookware', 'cream cheese', 'kitchen towels', 'turkey', 'male cosmetics', 'packaged fruit/vegetables', 'canned

In [120]:
# Reverse lookup of id2item: item name -> integer id (its position in the
# sorted id2item list). enumerate() replaces the hand-maintained counter.
item2id = {item: item_id for item_id, item in enumerate(id2item)}
print( item2id )

{'Instant food products': 0, 'UHT-milk': 1, 'abrasive cleaner': 2, 'artif. sweetener': 3, 'baby cosmetics': 4, 'baby food': 5, 'bags': 6, 'baking powder': 7, 'bathroom cleaner': 8, 'beef': 9, 'berries': 10, 'beverages': 11, 'bottled beer': 12, 'bottled water': 13, 'brandy': 14, 'brown bread': 15, 'butter': 16, 'butter milk': 17, 'cake bar': 18, 'candles': 19, 'candy': 20, 'canned beer': 21, 'canned fish': 22, 'canned fruit': 23, 'canned vegetables': 24, 'cat food': 25, 'cereals': 26, 'chewing gum': 27, 'chicken': 28, 'chocolate': 29, 'chocolate marshmallow': 30, 'citrus fruit': 31, 'cleaner': 32, 'cling film/bags': 33, 'cocoa drinks': 34, 'coffee': 35, 'condensed milk': 36, 'cooking chocolate': 37, 'cookware': 38, 'cream': 39, 'cream cheese': 40, 'curd': 41, 'curd cheese': 42, 'decalcifier': 43, 'dental care': 44, 'dessert': 45, 'detergent': 46, 'dish cleaner': 47, 'dishes': 48, 'dog food': 49, 'domestic eggs': 50, 'female sanitary products': 51, 'finished products': 52, 'fish': 53, 'f

In [121]:
# Convert every table row into one receipt: the list of item ids found in
# that row, in column order. Missing cells (padding of short rows) are
# skipped.
#
# pd.notna() replaces the former `is not np.nan` identity test, which only
# recognises the np.nan singleton object. Iterating over the underlying
# array also avoids one chained df[j][i] lookup per cell.
receipts = [
    [item2id[item] for item in row if pd.notna(item)]
    for row in df.values
]

print(len(receipts))
print(receipts[:5])

9835
[[31, 133, 89, 119], [158, 167, 35], [166], [110, 167, 40, 92], [103, 166, 36, 86]]


In [122]:
%time patterns = pyfpgrowth.find_frequent_patterns(receipts, 15)
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.9)

CPU times: user 1.33 s, sys: 3.65 ms, total: 1.33 s
Wall time: 1.33 s
CPU times: user 48.2 ms, sys: 0 ns, total: 48.2 ms
Wall time: 48.3 ms


In [123]:
# Disabled debugging aid: prints the full result of find_frequent_patterns.
# print(patterns)
# `rules` is a dict keyed by a tuple of item ids (left-hand side); each value
# is a pair (tuple of item ids for the right-hand side, confidence).
print(rules)

{(83, 120): ((12,), 0.9047619047619048), (54, 124, 162): ((166,), 1.0), (40, 103, 152): ((166,), 0.9375), (50, 158, 162): ((166,), 0.9), (65, 158, 162): ((103,), 0.9047619047619048), (124, 131, 158, 167): ((166,), 0.9375)}


In [124]:
# Flatten the rules dict into one table row per rule:
# left-hand side (LHS), right-hand side (RHS) and confidence (Conf).
results = [[lhs, rule[0], rule[1]] for lhs, rule in rules.items()]
# The column names are passed to the constructor so the cell also works when
# no rule was generated: assigning three names to df_res.columns afterwards
# raises a length-mismatch ValueError on the resulting 0-column frame.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [125]:
# Show the rules table ordered from highest to lowest confidence.
rules_by_conf = df_res.sort_values(by='Conf', ascending=False)
display(rules_by_conf)

Unnamed: 0,LHS,RHS,Conf
1,"(54, 124, 162)","(166,)",1.0
2,"(40, 103, 152)","(166,)",0.9375
5,"(124, 131, 158, 167)","(166,)",0.9375
0,"(83, 120)","(12,)",0.904762
4,"(65, 158, 162)","(103,)",0.904762
3,"(50, 158, 162)","(166,)",0.9


In [126]:
# Lift of a rule = confidence / support(RHS), where support(RHS) is the
# fraction of all receipts that contain every item of the right-hand side.
n_all = len(receipts)

# Build each receipt's item set once instead of once per (rule, receipt) pair.
receipt_sets = [set(items) for items in receipts]

# Several rules can share a right-hand side, so count each distinct one once.
rhs_counts = {}
lift = []
# Iterate the columns positionally: the resulting list is assigned
# positionally below, so this stays aligned whatever the index labels are.
for rhs, conf in zip(df_res['RHS'], df_res['Conf']):
    if rhs not in rhs_counts:
        rhs_items = set(rhs)
        rhs_counts[rhs] = sum(1 for items in receipt_sets if items >= rhs_items)
    lift.append(conf / (rhs_counts[rhs] / n_all))

df_res['Lift'] = lift

In [127]:
# Show the rules table (now including the Lift column) ordered from highest
# to lowest confidence.
rules_by_conf = df_res.sort_values(by='Conf', ascending=False)
display(rules_by_conf)

Unnamed: 0,LHS,RHS,Conf,Lift
1,"(54, 124, 162)","(166,)",1.0,3.913649
2,"(40, 103, 152)","(166,)",0.9375,3.669046
5,"(124, 131, 158, 167)","(166,)",0.9375,3.669046
0,"(83, 120)","(12,)",0.904762,11.235269
4,"(65, 158, 162)","(103,)",0.904762,4.67595
3,"(50, 158, 162)","(166,)",0.9,3.522284


In [128]:
# Translate the item ids of the top rule, LHS (54, 124, 162) -> RHS (166,),
# back into item names, one per line.
for item_id in (54, 124, 162, 166):
    print( id2item[item_id] )

flour
root vegetables
whipped/sour cream
whole milk
