### Market Basket Analysis
Investigating Lift vs. Confidence

In [2]:
#install mlxtended for data encoding, apriori, and association_rule
#pip install mlxtend

In [6]:
#import needed libariries
from mlxtend.preprocessing 
import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [12]:
#create dataframe from sample file - 200 random market baskets
df = pd.read_excel('market_baskets.xlsx')

In [22]:
#check column name
df.columns

Index([' Transaction ID ', ' Items                                  '], dtype='object')

In [25]:
#clean columen names and drop transaction ID
df.rename(columns=lambda x: x.strip(), inplace=True)
df.drop(['Transaction ID'], axis=1, inplace=True)

In [31]:
#check results
df.head()

Unnamed: 0,Items
0,"Milk, Bread, Eggs, Apples"
1,"Bread, Butter, Cheese, Yogurt"
2,"Eggs, Bacon, Sausage"
3,"Apples, Oranges, Bananas, Grapes"
4,"Milk, Bread, Eggs, Yogurt"


In [34]:
#create makret baskets in list format
market_basket = [x.split() for x in df.Items]

In [36]:
#convert the market basket data into a transaction-encoded format
te = TransactionEncoder()
te_ary = te.fit_transform(market_basket)

In [39]:
#check output array
te_ary

array([[ True, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False,  True, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False,  True, False],
       [ True, False, False, ..., False, False, False]])

In [46]:
#check output columns
str(te.columns_)

"['Apples', 'Apples,', 'Bacon,', 'Bananas,', 'Beans,', 'Bread', 'Bread,', 'Butter', 'Butter,', 'Carrots,', 'Cereal,', 'Cheese,', 'Chicken,', 'Chips,', 'Coffee,', 'Cream', 'Eggs,', 'Garlic,', 'Grapes', 'Grapes,', 'Ham,', 'Kiwi', 'Lettuce', 'Milk,', 'Oranges,', 'Parmesan', 'Pasta,', 'Peas', 'Potatoes,', 'Rice,', 'Salsa', 'Sauce,', 'Sausage', 'Soda,', 'Sugar,', 'Tortillas', 'Yogurt', 'Yogurt,']"

In [None]:
#create df with resuls
df = pd.DataFrame(te_ary, columns=te.columns_)

In [55]:
#frequent itemset mining using apriori algorithm, min suppot = 0.1, show result sorted by suppot
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets.sort_values(['support'], ascending=False).head(10)

Unnamed: 0,support,itemsets
2,0.33,"(Bread,)"
5,0.27,"(Eggs,)"
6,0.27,"(Milk,)"
8,0.26,(Yogurt)
15,0.205,"(Milk,, Eggs,)"
17,0.2,"(Milk,, Bread,, Eggs,)"
13,0.2,"(Milk,, Bread,)"
12,0.2,"(Bread,, Eggs,)"
0,0.195,"(Apples,)"
4,0.135,"(Chicken,)"


In [63]:
#create association rules with a minimum lift threshold of 1
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.sort_values(['confidence'], ascending=False).head(50)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(Bananas,)","(Apples,)",0.13,0.195,0.13,1.0,5.128205,0.10465,inf,0.925287
3,"(Oranges,)","(Apples,)",0.13,0.195,0.13,1.0,5.128205,0.10465,inf,0.925287
18,"(Bread,, Eggs,)","(Milk,)",0.2,0.27,0.2,1.0,3.703704,0.146,inf,0.9125
5,"(Cheese,)","(Bread,)",0.13,0.33,0.13,1.0,3.030303,0.0871,inf,0.770115
16,"(Milk,, Bread,)","(Eggs,)",0.2,0.27,0.2,1.0,3.703704,0.146,inf,0.9125
17,"(Milk,, Eggs,)","(Bread,)",0.205,0.33,0.2,0.97561,2.956393,0.13235,27.47,0.83239
12,"(Milk,)","(Eggs,)",0.27,0.27,0.205,0.759259,2.812071,0.1321,3.032308,0.882726
13,"(Eggs,)","(Milk,)",0.27,0.27,0.205,0.759259,2.812071,0.1321,3.032308,0.882726
19,"(Milk,)","(Bread,, Eggs,)",0.27,0.2,0.2,0.740741,3.703704,0.146,3.085714,1.0
21,"(Eggs,)","(Milk,, Bread,)",0.27,0.2,0.2,0.740741,3.703704,0.146,3.085714,1.0


The choice between lift and confidence as a metric in market basket analysis depends on your specific goals and the context of your analysis. Both metrics provide different insights and can be useful depending on the situation.  However, lift should be confirmed when high confidence is observed to take into account the expected support under independence.