In this lab exercise you will learn how to do market basket analysis using the apriori algorithm.

Install the __mlxtend__ library from the Anaconda prompt (for example, `conda install -c conda-forge mlxtend`).


In [2]:
#import all required classes and functions for market basket analysis from the mlxtend library
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#import pandas library 
import pandas as pd

# Example 3

In this example we will read the data from a CSV file.

In [5]:
#location of the csv file that holds the vegetable transactions
#NOTE: this is an absolute, machine-specific path - adjust it to where vegdata.csv is stored
vegetable_csv_path = "d:/vegdata.csv"
#load the csv file into a pandas DataFrame
vegetable_data = pd.read_csv(vegetable_csv_path)

In [6]:
#display the loaded data (the last expression of a cell is rendered as a table)
vegetable_data

Unnamed: 0,Potato,Beans,Brocoli,Corn,Green_Peppers,Squash,Tomatoes
0,0,0,1,1,1,0,0
1,1,0,0,1,0,1,0
2,0,1,0,1,0,1,1
3,0,1,0,1,1,0,1
4,1,1,1,0,0,0,0
5,1,1,0,0,0,1,1
6,0,0,0,1,0,0,1
7,0,0,1,0,1,0,1
8,1,1,0,0,0,1,0
9,0,1,0,1,0,0,0


To use the apriori algorithm on a transaction dataset, the data must be in tabular
format where each row represents a transaction and each column represents the presence of
an item in that transaction. So let us first transform the transaction data into the format required by the algorithm.

In [8]:
#replace all 1 with True and 0 with False (any value other than 1 also becomes False)
#the vectorized comparison gives the same result as applying
#"True if x == 1 else False" to every cell, and it avoids DataFrame.applymap,
#which is deprecated since pandas 2.1 and emits a FutureWarning
vegetable_data = vegetable_data == 1

In [9]:
#check the dataset once again: every cell should now be True or False
vegetable_data

Unnamed: 0,Potato,Beans,Brocoli,Corn,Green_Peppers,Squash,Tomatoes
0,False,False,True,True,True,False,False
1,True,False,False,True,False,True,False
2,False,True,False,True,False,True,True
3,False,True,False,True,True,False,True
4,True,True,True,False,False,False,False
5,True,True,False,False,False,True,True
6,False,False,False,True,False,False,True
7,False,False,True,False,True,False,True
8,True,True,False,False,False,True,False
9,False,True,False,True,False,False,False


In [10]:
#find dimensions of data as a (number of rows, number of columns) tuple
vegetable_data.shape

(14, 7)

In [11]:
#find frequent itemsets in the data using the apriori algorithm
#min_support is the minimum support an itemset needs to be kept; it is a
#fraction of all transactions (0.1 = 10%), not an absolute count
#use_colnames=True reports item names; without it the itemsets contain
#column indices instead of column names
frequent_itemsets = apriori(
    vegetable_data,
    use_colnames=True,
    min_support=0.1,
)

In [12]:
#display frequent itemsets together with their support
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.428571,(Potato)
1,0.714286,(Beans)
2,0.357143,(Brocoli)
3,0.571429,(Corn)
4,0.357143,(Green_Peppers)
5,0.5,(Squash)
6,0.428571,(Tomatoes)
7,0.357143,"(Beans, Potato)"
8,0.142857,"(Potato, Corn)"
9,0.357143,"(Potato, Squash)"


In [13]:
#add a column holding the number of items in each frequent itemset
frequent_itemsets["itemsets_length"] = frequent_itemsets["itemsets"].apply(len)

In [14]:
#display frequent itemsets again, now including the itemsets_length column
frequent_itemsets

Unnamed: 0,support,itemsets,itemsets_length
0,0.428571,(Potato),1
1,0.714286,(Beans),1
2,0.357143,(Brocoli),1
3,0.571429,(Corn),1
4,0.357143,(Green_Peppers),1
5,0.5,(Squash),1
6,0.428571,(Tomatoes),1
7,0.357143,"(Beans, Potato)",2
8,0.142857,"(Potato, Corn)",2
9,0.357143,"(Potato, Squash)",2


In [15]:
#generate association rules from the frequent itemsets
#metric selects the measure used for filtering (confidence here; lift is another option)
#min_threshold is the minimum value of that measure a rule needs to be kept
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.9,
)

In [16]:
#display the generated rules with their metrics
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Potato, Corn)",(Squash),0.142857,0.5,0.142857,1.0,2.0,0.071429,inf
1,"(Squash, Tomatoes)",(Beans),0.142857,0.714286,0.142857,1.0,1.4,0.040816,inf
2,"(Brocoli, Corn)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf
3,"(Tomatoes, Brocoli)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf
4,"(Beans, Green_Peppers, Tomatoes)",(Corn),0.142857,0.571429,0.142857,1.0,1.75,0.061224,inf
5,"(Beans, Green_Peppers, Corn)",(Tomatoes),0.142857,0.428571,0.142857,1.0,2.333333,0.081633,inf
6,"(Green_Peppers, Tomatoes, Corn)",(Beans),0.142857,0.714286,0.142857,1.0,1.4,0.040816,inf


In [17]:
#add three columns: number of items in the antecedent, number of items in the
#consequent, and the total length of the rule (sum of the two)
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules["consequents_len"] = rules["consequents"].apply(len)
rules["total_len"] = rules["antecedent_len"] + rules["consequents_len"]

In [18]:
#now check the rules again, including the three new length columns
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequents_len,total_len
0,"(Potato, Corn)",(Squash),0.142857,0.5,0.142857,1.0,2.0,0.071429,inf,2,1,3
1,"(Squash, Tomatoes)",(Beans),0.142857,0.714286,0.142857,1.0,1.4,0.040816,inf,2,1,3
2,"(Brocoli, Corn)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf,2,1,3
3,"(Tomatoes, Brocoli)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf,2,1,3
4,"(Beans, Green_Peppers, Tomatoes)",(Corn),0.142857,0.571429,0.142857,1.0,1.75,0.061224,inf,3,1,4
5,"(Beans, Green_Peppers, Corn)",(Tomatoes),0.142857,0.428571,0.142857,1.0,2.333333,0.081633,inf,3,1,4
6,"(Green_Peppers, Tomatoes, Corn)",(Beans),0.142857,0.714286,0.142857,1.0,1.4,0.040816,inf,3,1,4


In [19]:
#display only the rules that satisfy all of the following conditions:
#exactly two items in the antecedent
two_item_antecedent = rules['antecedent_len'] == 2
#confidence above 0.75
confident_enough = rules['confidence'] > 0.75
#exactly one item in the consequent
single_item_consequent = rules['consequents_len'] == 1
#combine the conditions; the filtered table is the cell's displayed result
rules[two_item_antecedent & confident_enough & single_item_consequent]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequents_len,total_len
0,"(Potato, Corn)",(Squash),0.142857,0.5,0.142857,1.0,2.0,0.071429,inf,2,1,3
1,"(Squash, Tomatoes)",(Beans),0.142857,0.714286,0.142857,1.0,1.4,0.040816,inf,2,1,3
2,"(Brocoli, Corn)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf,2,1,3
3,"(Tomatoes, Brocoli)",(Green_Peppers),0.142857,0.357143,0.142857,1.0,2.8,0.091837,inf,2,1,3
