## Project: Creating Association Rules from a Retail Store

### Installing and importing libraries 

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Import apriori

In [5]:
# mlxtend should already be installed via the command terminal. If not, please use pip to install it as shown below.

In [6]:
# pip install mlxtend on the command terminal (if not already installed)
# 1) go to anaconda prompt (you can also go here through Anaconda Navigator --> Environments --> base --> Open Terminal)
# 2) pip install mlxtend on the terminal

In [7]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Step 1. Load the store dataset and prepare the dataset

### E.1 Load the store_ready csv file and assign it to the variable 'store'

In [8]:
# Read the prepared store data from a CSV in the current working directory
# (relative path, so the notebook must be run from the folder holding the file).
store = pd.read_csv('store_ready.csv')

**Shape**

In [9]:
# (rows, columns) of the frame exactly as loaded, before any column is dropped.
store.shape

(7501, 121)

In [6]:
# Sample output

(7501, 121)

**First 2 rows**

In [10]:
# Preview the first two rows of the freshly loaded frame.
store.head(2)

Unnamed: 0.1,Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,True,True,False,True,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Sample output

Unnamed: 0.1,Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,True,True,False,True,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


**First 5 Column names**

In [13]:
# Slice the column Index first, then convert just those five labels to a list.
store.columns[:5].tolist()

['Unnamed: 0', ' asparagus', 'almonds', 'antioxydant juice', 'asparagus']

In [10]:
# Sample output

['Unnamed: 0', ' asparagus', 'almonds', 'antioxydant juice', 'asparagus']

### E.2 Data Preparation

**Drop first column**

In [17]:
# Drop the leftover 'Unnamed: 0' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
# NOTE(review): the column listing shows both ' asparagus' (leading space) and
# 'asparagus' - looks like a header typo in the CSV; confirm whether these two
# columns should be merged before mining.
store = store.drop(columns='Unnamed: 0', errors='ignore')

In [18]:
# Preview two rows to confirm the 'Unnamed: 0' column is no longer present.
store.head(2)

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Sample output

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


**Convert boolean values in store dataframe to int and assign it back to store**


In [19]:
# Cast every True/False flag to an integer and rebind the name,
# then preview two rows to check the 0/1 encoding.
store = store.astype(dtype=int)
store.iloc[:2]

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Sample output

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Give row names, R0, R1 ...etc, to store dataframe**

In [21]:
# Number of rows in `store`; the next cell generates this many row labels.
store.shape[0]

7501

In [24]:
# Build one label per row ('R0', 'R1', ...) and install them as the index,
# then preview three rows to check the labels.
n_rows = len(store)
rnames = [f'R{pos}' for pos in range(n_rows)]
store.index = rnames
store.iloc[:3]

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
R0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
R1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
R2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Sample output

(7501, 120)


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
R0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
R1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
R2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# We have a total of 7501 transactions and 120 columns (food items).

### E.3 Preliminary exploration and visualization

**Find top 5 popular items and print them as a dataframe with item, freq, and support as columns**

In [25]:
# Re-display two rows as a reminder of the row labels and 0/1 layout before aggregating.
store.head(2)

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
R0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
R1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Column sums of the 0/1 frame give the number of rows containing each item;
# sort so the most popular items come first.
pop_items = store.sum(axis=0).sort_values(ascending=False)

# Turn the Series into an (item, freq) table. The task asks for the columns
# item / freq / support, so the count column is named 'freq' (it was 'count').
# rename() returns a new frame, so no inplace mutation is needed.
pop_items2 = pd.DataFrame(pop_items).reset_index()
pop_items2 = pop_items2.rename(columns={'index': 'item', 0: 'freq'})

# support = share of all rows (transactions) that contain the item.
pop_items2['support'] = pop_items2['freq'] / store.shape[0]
pop_items2.head(5)

Unnamed: 0,item,count,support
0,mineral water,1788,0.238368
1,eggs,1348,0.179709
2,spaghetti,1306,0.17411
3,french fries,1282,0.170911
4,chocolate,1229,0.163845


In [19]:
# Sample output

Unnamed: 0,item,freq,support
0,mineral water,1788,0.238368
1,eggs,1348,0.179709
2,spaghetti,1306,0.17411
3,french fries,1282,0.170911
4,chocolate,1229,0.163845


## Step 2. Short-list frequently occurring items and item sets by choosing a support level

In [38]:
# Preview three rows of the frame that is passed to apriori below.
store.head(3)

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
R0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
R1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
R2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### E.4 Obtain frequently occurring items/item sets which occur at least 0.5% of the time. Store them in 'freq_items'

**Obtain the frequent items and print the total number of item sets generated**

In [39]:
# The apriori algorithm returns the frequently occurring item sets as a dataframe

In [43]:
# Mine every item set whose support is at least 0.5% of the transactions.
# mlxtend deprecates non-boolean input (it warns and runs slower on 0/1
# integers), so pass a boolean view of the frame; the supports are unchanged
# and `store` itself stays as integers.
# use_colnames=True reports item names instead of column positions.
freq_items = apriori(store.astype(bool), min_support=0.005, use_colnames=True)

# Total number of frequent item sets found.
len(freq_items)

725

In [44]:
# Sample output

**show the first 5 rows**

In [45]:
# First five frequent item sets, in the order apriori returned them.
freq_items.head(5)

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.008932,(antioxydant juice)
2,0.033329,(avocado)
3,0.008666,(bacon)
4,0.010799,(barbecue sauce)


In [24]:
# Sample output

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.008932,(antioxydant juice)
2,0.033329,(avocado)
3,0.008666,(bacon)
4,0.010799,(barbecue sauce)


**Sort the 'freq_items' data frame to show only the top 10 most frequently occurring items**

In [46]:
# Rank all frequent item sets by support, highest first, and show the top ten.
ranked_itemsets = freq_items.sort_values(by='support', ascending=False)
ranked_itemsets.iloc[:10]

Unnamed: 0,support,itemsets
60,0.238368,(mineral water)
27,0.179709,(eggs)
83,0.17411,(spaghetti)
33,0.170911,(french fries)
20,0.163845,(chocolate)
44,0.132116,(green tea)
59,0.129583,(milk)
45,0.098254,(ground beef)
39,0.095321,(frozen vegetables)
68,0.095054,(pancakes)


In [26]:
# Sample output

Unnamed: 0,support,itemsets
60,0.238368,(mineral water)
27,0.179709,(eggs)
83,0.17411,(spaghetti)
33,0.170911,(french fries)
20,0.163845,(chocolate)
44,0.132116,(green tea)
59,0.129583,(milk)
45,0.098254,(ground beef)
39,0.095321,(frozen vegetables)
68,0.095054,(pancakes)


## Step 3: From the frequently occurring item sets, generate association rules by choosing a metric

### E.5 Generate association rules which have a lift value of at least 2

**Obtain the rules from 'association_rules' library and print the number of rules generated**

In [48]:
# Derive association rules from the frequent item sets, using lift as the
# filtering metric with a threshold of 2, then report how many rules resulted.
arules = association_rules(freq_items, min_threshold=2, metric='lift')
len(arules)

482

In [28]:
# Sample output

482

### E.6 Sort Rules based on confidence and see top 10 rules

In [50]:
# Peek at three rules to see which metric columns are available for sorting.
arules.head(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
1,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
2,(frozen smoothie),(avocado),0.063325,0.033329,0.005066,0.08,2.40032,0.002955,1.050729


In [51]:
# Order the rules from most to least confident and show the first ten.
rules_by_confidence = arules.sort_values(by='confidence', ascending=False)
rules_by_confidence.iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
296,"(soup, frozen vegetables)",(mineral water),0.007999,0.238368,0.005066,0.633333,2.656954,0.003159,2.077178
449,"(olive oil, soup)",(mineral water),0.008932,0.238368,0.005199,0.58209,2.441976,0.00307,1.822476
285,"(olive oil, frozen vegetables)",(mineral water),0.011332,0.238368,0.006532,0.576471,2.418404,0.003831,1.798297
421,"(milk, soup)",(mineral water),0.015198,0.238368,0.008532,0.561404,2.355194,0.004909,1.73652
187,"(chocolate, soup)",(mineral water),0.010132,0.238368,0.005599,0.552632,2.318395,0.003184,1.702471
199,"(eggs, cooking oil)",(mineral water),0.011732,0.238368,0.006399,0.545455,2.288286,0.003603,1.67559
263,"(frozen vegetables, ground beef)",(mineral water),0.016931,0.238368,0.009199,0.543307,2.279277,0.005163,1.667711
430,"(milk, turkey)",(mineral water),0.011332,0.238368,0.006133,0.541176,2.270338,0.003431,1.659967
469,"(spaghetti, soup)",(mineral water),0.014265,0.238368,0.007466,0.523364,2.195614,0.004065,1.597933
395,"(shrimp, ground beef)",(spaghetti),0.011465,0.17411,0.005999,0.523256,3.005315,0.004003,1.732354


In [30]:
# Sample output

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
294,"(soup, frozen vegetables)",(mineral water),0.007999,0.238368,0.005066,0.633333,2.656954
448,"(olive oil, soup)",(mineral water),0.008932,0.238368,0.005199,0.58209,2.441976
284,"(olive oil, frozen vegetables)",(mineral water),0.011332,0.238368,0.006532,0.576471,2.418404
421,"(soup, milk)",(mineral water),0.015198,0.238368,0.008532,0.561404,2.355194
186,"(soup, chocolate)",(mineral water),0.010132,0.238368,0.005599,0.552632,2.318395
199,"(eggs, cooking oil)",(mineral water),0.011732,0.238368,0.006399,0.545455,2.288286
262,"(ground beef, frozen vegetables)",(mineral water),0.016931,0.238368,0.009199,0.543307,2.279277
431,"(milk, turkey)",(mineral water),0.011332,0.238368,0.006133,0.541176,2.270338
468,"(soup, spaghetti)",(mineral water),0.014265,0.238368,0.007466,0.523364,2.195614
394,"(ground beef, shrimp)",(spaghetti),0.011465,0.17411,0.005999,0.523256,3.005315


### E.7 Identify any rare strong relationships

In [55]:
# Ten rules with the highest lift, shown without the leverage/conviction columns.
# Pick the displayed columns by name instead of slicing off "the last two":
# newer mlxtend releases append further metric columns to the rule table, so a
# positional slice would silently hide the wrong columns.
core_cols = [
    'antecedents', 'consequents',
    'antecedent support', 'consequent support',
    'support', 'confidence', 'lift',
]
arules.sort_values('lift', ascending=False).head(10)[core_cols]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
22,(pasta),(escalope),0.015731,0.079323,0.005866,0.372881,4.700812
23,(escalope),(pasta),0.079323,0.015731,0.005866,0.07395,4.700812
68,(pasta),(shrimp),0.015731,0.071457,0.005066,0.322034,4.506672
69,(shrimp),(pasta),0.071457,0.015731,0.005066,0.070896,4.506672
66,(whole wheat pasta),(olive oil),0.029463,0.065858,0.007999,0.271493,4.12241
67,(olive oil),(whole wheat pasta),0.065858,0.029463,0.007999,0.121457,4.12241
346,"(spaghetti, herb & pepper)",(ground beef),0.016264,0.098254,0.006399,0.393443,4.00436
351,(ground beef),"(spaghetti, herb & pepper)",0.098254,0.016264,0.006399,0.065129,4.00436
345,(ground beef),"(mineral water, herb & pepper)",0.098254,0.017064,0.006666,0.067843,3.975683
342,"(mineral water, herb & pepper)",(ground beef),0.017064,0.098254,0.006666,0.390625,3.975683


In [32]:
# Sample output

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
22,(escalope),(pasta),0.079323,0.015731,0.005866,0.07395,4.700812
23,(pasta),(escalope),0.015731,0.079323,0.005866,0.372881,4.700812
68,(shrimp),(pasta),0.071457,0.015731,0.005066,0.070896,4.506672
69,(pasta),(shrimp),0.015731,0.071457,0.005066,0.322034,4.506672
67,(whole wheat pasta),(olive oil),0.029463,0.065858,0.007999,0.271493,4.12241
66,(olive oil),(whole wheat pasta),0.065858,0.029463,0.007999,0.121457,4.12241
347,"(herb & pepper, spaghetti)",(ground beef),0.016264,0.098254,0.006399,0.393443,4.00436
350,(ground beef),"(herb & pepper, spaghetti)",0.098254,0.016264,0.006399,0.065129,4.00436
345,(ground beef),"(herb & pepper, mineral water)",0.098254,0.017064,0.006666,0.067843,3.975683
342,"(herb & pepper, mineral water)",(ground beef),0.017064,0.098254,0.006666,0.390625,3.975683


In [54]:
# Re-display three rules as a reminder of the column names used in the filter below.
arules.head(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
1,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
2,(frozen smoothie),(avocado),0.063325,0.033329,0.005066,0.08,2.40032,0.002955,1.050729


In [57]:
# "Rare but strong" rules: seen in under 1% of transactions, yet with
# confidence above 0.3 and lift above 3.
is_rare = arules['support'].lt(0.01)
is_confident = arules['confidence'].gt(0.3)
is_high_lift = arules['lift'].gt(3)
arules.loc[is_rare & is_confident & is_high_lift]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,(mushroom cream sauce),(escalope),0.019064,0.079323,0.005733,0.300699,3.790833,0.00422,1.316568
22,(pasta),(escalope),0.015731,0.079323,0.005866,0.372881,4.700812,0.004618,1.468107
50,(tomato sauce),(ground beef),0.014131,0.098254,0.005333,0.377358,3.840659,0.003944,1.448259
68,(pasta),(shrimp),0.015731,0.071457,0.005066,0.322034,4.506672,0.003942,1.369601
270,"(spaghetti, frozen vegetables)",(ground beef),0.027863,0.098254,0.008666,0.311005,3.165328,0.005928,1.308785
290,"(shrimp, mineral water)",(frozen vegetables),0.023597,0.095321,0.007199,0.305085,3.200616,0.00495,1.301856
318,"(spaghetti, tomatoes)",(frozen vegetables),0.020931,0.095321,0.006666,0.318471,3.341054,0.004671,1.327427
328,"(grated cheese, spaghetti)",(ground beef),0.016531,0.098254,0.005333,0.322581,3.283144,0.003708,1.331149
342,"(mineral water, herb & pepper)",(ground beef),0.017064,0.098254,0.006666,0.390625,3.975683,0.004989,1.479789
346,"(spaghetti, herb & pepper)",(ground beef),0.016264,0.098254,0.006399,0.393443,4.00436,0.004801,1.486663
