In this lab exercise you will learn how to do market basket analysis using the Apriori algorithm.

Install the __mlxtend__ library from the Anaconda prompt (for example, `conda install -c conda-forge mlxtend` or `pip install mlxtend`).


In [2]:
#import the classes and functions required for market basket analysis from the mlxtend library
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Example 1

In this example, instead of reading data from an external source, we are going to
create a small transaction dataset to understand how to generate rules from
a sample transaction dataset using the Apriori algorithm. The dataset contains only three transactions, as follows:

In [3]:
# Three sample baskets; each inner list holds the items bought in one transaction.
sample_transactions_data = [
    ['milk', 'bread', 'water'],
    ['coffe', 'sugar', 'water'],
    ['burgers', 'eggs', 'bread'],
]

In [4]:
# Display the raw transactions (a list of item lists, one inner list per basket).
sample_transactions_data

[['milk', 'bread', 'water'],
 ['coffe', 'sugar', 'water'],
 ['burgers', 'eggs', 'bread']]

To use the Apriori algorithm on a transaction dataset, the data must be in tabular
format, where each row represents a transaction and each column represents the existence of
an item in that transaction. So let us first transform the transaction data into the format required by the algorithm.

- No. of rows = no. of transactions (baskets to be analyzed)
- No. of columns = no. of unique items present across all transactions

In [5]:
# Instantiate TransactionEncoder.
# It is needed to convert the transaction data into the required format, where each row
# represents a transaction and each column represents the presence of an item in that transaction.
transaction_encoder = TransactionEncoder()

In [7]:
# Learn the unique items and encode the transactions in a single step.
transaction_array = transaction_encoder.fit_transform(sample_transactions_data)

In [8]:
# Display the encoded boolean array: one row per transaction, one column per unique item.
transaction_array

array([[ True, False, False, False,  True, False,  True],
       [False, False,  True, False, False,  True,  True],
       [ True,  True, False,  True, False, False, False]])

In [9]:
#import pandas library
import pandas as pd

In [10]:
# Build a DataFrame from the encoded array.
# transaction_encoder.columns_ returns the column names, i.e. the names of all
# unique items found across all transactions.
transaction_data = pd.DataFrame(
    transaction_array,
    columns=transaction_encoder.columns_,
)

In [11]:
# Display the encoded data: one row per transaction and one boolean column per item,
# which is the format the algorithm requires.
transaction_data

Unnamed: 0,bread,burgers,coffe,eggs,milk,sugar,water
0,True,False,False,False,True,False,True
1,False,False,True,False,False,True,True
2,True,True,False,True,False,False,False


In [12]:
# Find the dimensions of the data: (number of transactions, number of unique items).
transaction_data.shape

(3, 7)

In [13]:
# Find the frequent itemsets with the Apriori algorithm.
# min_support is a relative support (fraction of transactions containing the itemset), not a count.
# use_colnames=True reports itemsets by item name; without it, column indices are returned instead.
frequent_itemsets = apriori(
    transaction_data,
    min_support=0.3,
    use_colnames=True,
)

In [14]:
# Display the frequent itemsets together with their support values.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(bread)
1,0.333333,(burgers)
2,0.333333,(coffe)
3,0.333333,(eggs)
4,0.333333,(milk)
5,0.333333,(sugar)
6,0.666667,(water)
7,0.333333,"(burgers, bread)"
8,0.333333,"(bread, eggs)"
9,0.333333,"(milk, bread)"


In [127]:
# Generate association rules from the frequent itemsets.
# A rule is kept when the chosen metric (confidence here; lift is another option)
# reaches the given minimum threshold.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.6,
)

In [128]:
# Display the generated rules with all of their metrics (support, confidence, lift, ...).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
1,(eggs),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
2,(milk),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
3,(eggs),(burgers),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
4,(burgers),(eggs),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
5,(sugar),(coffe),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
6,(coffe),(sugar),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
7,(coffe),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
8,(milk),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
9,(sugar),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf


In [129]:
# Show only the rules whose lift is greater than 1.
rules.loc[rules['lift'].gt(1)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
1,(eggs),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
2,(milk),(bread),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
3,(eggs),(burgers),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
4,(burgers),(eggs),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
5,(sugar),(coffe),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
6,(coffe),(sugar),0.333333,0.333333,0.333333,1.0,3.0,0.222222,inf
7,(coffe),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
8,(milk),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf
9,(sugar),(water),0.333333,0.666667,0.333333,1.0,1.5,0.111111,inf


In [130]:
# Show only the LHS (antecedents), RHS (consequents), support, confidence and lift of each rule.
rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(burgers),(bread),0.333333,1.0,1.5
1,(eggs),(bread),0.333333,1.0,1.5
2,(milk),(bread),0.333333,1.0,1.5
3,(eggs),(burgers),0.333333,1.0,3.0
4,(burgers),(eggs),0.333333,1.0,3.0
5,(sugar),(coffe),0.333333,1.0,3.0
6,(coffe),(sugar),0.333333,1.0,3.0
7,(coffe),(water),0.333333,1.0,1.5
8,(milk),(water),0.333333,1.0,1.5
9,(sugar),(water),0.333333,1.0,1.5


In [131]:
# Print every rule as "antecedents -> consequents" followed by its support, confidence and lift.
# Iterate over the rows actually present in `rules` rather than a hard-coded count: a fixed
# range raises IndexError when there are fewer rules and silently omits rules when there are more.
for _, rule in rules.iterrows():
    print(rule['antecedents'], "->", rule['consequents'], "support=", rule['support'], "confidence=", rule['confidence'], "lift=", rule['lift'])


frozenset({'burgers'}) -> frozenset({'bread'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset({'eggs'}) -> frozenset({'bread'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset({'milk'}) -> frozenset({'bread'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset({'eggs'}) -> frozenset({'burgers'}) support= 0.3333333333333333 confidence= 1.0 lift= 3.0
frozenset({'burgers'}) -> frozenset({'eggs'}) support= 0.3333333333333333 confidence= 1.0 lift= 3.0
frozenset({'sugar'}) -> frozenset({'coffe'}) support= 0.3333333333333333 confidence= 1.0 lift= 3.0
frozenset({'coffe'}) -> frozenset({'sugar'}) support= 0.3333333333333333 confidence= 1.0 lift= 3.0
frozenset({'coffe'}) -> frozenset({'water'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset({'milk'}) -> frozenset({'water'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset({'sugar'}) -> frozenset({'water'}) support= 0.3333333333333333 confidence= 1.0 lift= 1.5
frozenset