## I. Importing datasets

In [1]:
# importing the libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

In [2]:
# importing dataset transactions_strings.csv
df_string = pd.read_csv('transactions_strings.csv')
df_string

Unnamed: 0,"{citrus fruit,semi-finished bread,margarine,ready soups}"
0,"{tropical fruit,yogurt,coffee}"
1,{whole milk}
2,"{pip fruit,yogurt,cream cheese ,meat spreads}"
3,"{other vegetables,whole milk,condensed milk,lo..."
4,"{whole milk,butter,yogurt,rice,abrasive cleaner}"
...,...
9829,"{sausage,chicken,beef,hamburger meat,citrus fr..."
9830,{cooking chocolate}
9831,"{chicken,citrus fruit,other vegetables,butter,..."
9832,"{semi-finished bread,bottled water,soda,bottle..."


In [3]:
# calculating main statisitcs for transactions_strings.csv
df_string.describe()

Unnamed: 0,"{citrus fruit,semi-finished bread,margarine,ready soups}"
count,9834
unique,7010
top,{canned beer}
freq,260


In [4]:
# importing dataset transactions_binary.csv
df_trans = pd.read_csv('transactions_binary.csv')
df_trans

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9832,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## II. Task 1:

### Top-30 most frequently bought products during the time period of the dataset

In [5]:
# top items by purchase frequency
freq_items = apriori(df_trans, min_support=0.005, use_colnames=True)
freq_item_30 = freq_items.sort_values(by = "support", ascending = False).head(30)
freq_item_30['Row Number'] = [i+1 for i, _ in enumerate(freq_item_30.index)]
print(freq_item_30)
### top 13th is bottled beer



      support                        itemsets  Row Number
21   0.255516                    (whole milk)           1
19   0.193493              (other vegetables)           2
47   0.183935                    (rolls/buns)           3
80   0.174377                          (soda)           4
26   0.139502                        (yogurt)           5
79   0.110524                 (bottled water)           6
16   0.108998               (root vegetables)           7
12   0.104931                (tropical fruit)           8
119  0.098526                 (shopping bags)           9
1    0.093950                       (sausage)          10
50   0.088968                        (pastry)          11
11   0.082766                  (citrus fruit)          12
83   0.080529                  (bottled beer)          13
115  0.079817                    (newspapers)          14
84   0.077682                   (canned beer)          15
13   0.075648                     (pip fruit)          16
394  0.074835 

## III. Task 2:

### Top 5 most promising product association rules that involve the 13th most frequently bought product

In [6]:
## From the initial dataset subset the transaction that contains bottled beer
product_13 = df_trans.loc[df_trans['bottled beer']==1]
product_13

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
110,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9744,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9754,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# creating a frequent itemset with the apriori function, the minimum threshold
# for the support metric is 0.01
frequent_items = apriori(product_13, min_support=0.01,use_colnames=True)
frequent_items



Unnamed: 0,support,itemsets
0,0.066919,(frankfurter)
1,0.097222,(sausage)
2,0.027778,(ham)
3,0.026515,(meat)
4,0.030303,(chicken)
...,...,...
1144,0.010101,"(root vegetables, bottled water, whole milk, o..."
1145,0.010101,"(whipped/sour cream, whole milk, yogurt, other..."
1146,0.011364,"(whole milk, yogurt, other vegetables, frozen ..."
1147,0.010101,"(whole milk, yogurt, rolls/buns, other vegetab..."


In [8]:
# genereting rules using the function assciation_rules and confidence as evaluation 
# metric, setting the minimun threshold
rules = association_rules(frequent_items, metric='confidence', min_threshold=0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(frankfurter),(whole milk),0.066919,0.253788,0.034091,0.509434,2.007322,0.017108,1.521125,0.537814
1,(frankfurter),(bottled beer),0.066919,1.000000,0.066919,1.000000,1.000000,0.000000,inf,0.000000
2,(sausage),(bottled beer),0.097222,1.000000,0.097222,1.000000,1.000000,0.000000,inf,0.000000
3,(ham),(other vegetables),0.027778,0.200758,0.013889,0.500000,2.490566,0.008312,1.598485,0.615584
4,(ham),(whole milk),0.027778,0.253788,0.017677,0.636364,2.507463,0.010627,2.052083,0.618367
...,...,...,...,...,...,...,...,...,...,...
1232,"(other vegetables, whole milk, bottled water, ...",(bottled beer),0.010101,1.000000,0.010101,1.000000,1.000000,0.000000,inf,0.000000
1233,"(domestic eggs, whole milk, bottled water, bot...",(other vegetables),0.013889,0.200758,0.010101,0.727273,3.622642,0.007313,2.930556,0.734155
1234,"(other vegetables, bottled water, bottled beer...",(whole milk),0.015152,0.253788,0.010101,0.666667,2.626866,0.006256,2.238636,0.628846
1235,"(domestic eggs, whole milk, bottled water)","(other vegetables, bottled beer)",0.013889,0.200758,0.010101,0.727273,3.622642,0.007313,2.930556,0.734155


In [11]:
# creating an additional column called "antecedent_len" which will 
# contain the number of items in the antecedent
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
0,(frankfurter),(whole milk),0.066919,0.253788,0.034091,0.509434,2.007322,0.017108,1.521125,0.537814,1
1,(frankfurter),(bottled beer),0.066919,1.000000,0.066919,1.000000,1.000000,0.000000,inf,0.000000,1
2,(sausage),(bottled beer),0.097222,1.000000,0.097222,1.000000,1.000000,0.000000,inf,0.000000,1
3,(ham),(other vegetables),0.027778,0.200758,0.013889,0.500000,2.490566,0.008312,1.598485,0.615584,1
4,(ham),(whole milk),0.027778,0.253788,0.017677,0.636364,2.507463,0.010627,2.052083,0.618367,1
...,...,...,...,...,...,...,...,...,...,...,...
1232,"(other vegetables, whole milk, bottled water, ...",(bottled beer),0.010101,1.000000,0.010101,1.000000,1.000000,0.000000,inf,0.000000,4
1233,"(domestic eggs, whole milk, bottled water, bot...",(other vegetables),0.013889,0.200758,0.010101,0.727273,3.622642,0.007313,2.930556,0.734155,4
1234,"(other vegetables, bottled water, bottled beer...",(whole milk),0.015152,0.253788,0.010101,0.666667,2.626866,0.006256,2.238636,0.628846,4
1235,"(domestic eggs, whole milk, bottled water)","(other vegetables, bottled beer)",0.013889,0.200758,0.010101,0.727273,3.622642,0.007313,2.930556,0.734155,3


In [12]:
# to export dataset to a csv file
rules.to_csv('interesting_rules_final.csv')

In [15]:
## filtering the new subset dataset that includes bottle beer with
## the following conditions:
## 1. number of item in the antecedent equal to 2
## 2. support metric greater than 0.001
## 3. confidence metric greater than 0.4
## 4. lift metric greater than 2
## ordering the dataset in descendent order according to the highest 
## values for confidence and lift
interesting_rules = rules[ (rules['antecedent_len'] == 2) & # to consider 3 products
(rules['support'] >= 0.001) &
(rules['confidence'] > 0.5) &
(rules['lift'] > 2) ]
interesting_rules 
interesting_rules = interesting_rules[ interesting_rules["antecedents"].apply(lambda x: "bottled beer" in x) ]
interesting_rules
#interesting_rules.sort_values(by=['antecedent_len'], ascending=False).head(5)
interesting_rules.sort_values(by=['confidence','lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
540,"(bottled beer, soups)",(whole milk),0.015152,0.253788,0.013889,0.916667,3.61194,0.010044,8.954545,0.734266,2
240,"(hamburger meat, bottled beer)",(whole milk),0.026515,0.253788,0.021465,0.809524,3.189765,0.014735,3.917614,0.705196,2
564,"(detergent, bottled beer)",(whole milk),0.016414,0.253788,0.011364,0.692308,2.727899,0.007198,2.425189,0.643988,2
424,"(soft cheese, bottled beer)",(other vegetables),0.015152,0.200758,0.010101,0.666667,3.320755,0.007059,2.397727,0.709615,2
430,"(bottled beer, cream cheese )",(other vegetables),0.022727,0.200758,0.015152,0.666667,3.320755,0.010589,2.397727,0.715116,2
494,"(sliced cheese, bottled beer)",(whole milk),0.021465,0.253788,0.013889,0.647059,2.549605,0.008441,2.114268,0.621114,2
185,"(ham, bottled beer)",(whole milk),0.027778,0.253788,0.017677,0.636364,2.507463,0.010627,2.052083,0.618367,2
378,"(herbs, bottled beer)",(bottled water),0.02399,0.195707,0.015152,0.631579,3.227165,0.010457,2.183081,0.707094,2
473,"(bottled beer, dessert)",(whole milk),0.02399,0.253788,0.015152,0.631579,2.48861,0.009063,2.025433,0.612872,2
511,"(domestic eggs, bottled beer)",(whole milk),0.058081,0.253788,0.036616,0.630435,2.484101,0.021876,2.019162,0.634279,2


In [16]:
# to export dataset to a csv file
interesting_rules.to_csv('interesting_rules_final.csv')