# Association Rule Mining

Here, we will use the Chipotle transaction dataset (`chipotle.tsv`).

# Load data

In [1]:
import pandas as pd

# Read the tab-separated order file into a DataFrame and display it.
df = pd.read_csv('chipotle.tsv', sep='\t')
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


In [2]:
# Rows of the first basket (order_id == 1), selected with a boolean mask
df.loc[df['order_id'] == 1]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39


In [3]:
# How many baskets are in the data? Each distinct order_id is one basket;
# nunique() counts the distinct non-null ids directly.
df['order_id'].nunique()

1834

# Data Transformation for Association Rule Mining
We need to transform the original data table into an appropriate format for ARM.

Here, we build an ordered list of baskets, one per order, where each basket is the list of distinct items that appear in that order.

For example, the first basket becomes like this:
```
['Chips and Fresh Tomato Salsa',
  'Izze',
  'Nantucket Nectar',
  'Chips and Tomatillo-Green Chili Salsa']
```

Then, we will use `TransactionEncoder()` to transform the ordered list into an appropriate data format.

In [4]:
# Build one basket per order: the distinct item names of that order, in the
# order they first appear in the table.
#
# Grouping by order_id (instead of indexing a pre-sized list with
# `order_id - 1`) does not rely on the ids being exactly 1..N with no gaps,
# and avoids a slow row-by-row iterrows() loop.
# Baskets come out sorted by order_id.
items_per_order = df.groupby('order_id', sort=True)['item_name'].unique()
basket_list = [list(items) for items in items_per_order]

n_transaction = len(basket_list) # number of baskets (1834 for this dataset)

In [5]:
# Display the baskets: one list of distinct item names per order
basket_list

[['Chips and Fresh Tomato Salsa',
  'Izze',
  'Nantucket Nectar',
  'Chips and Tomatillo-Green Chili Salsa'],
 ['Chicken Bowl'],
 ['Chicken Bowl', 'Side of Chips'],
 ['Steak Burrito', 'Steak Soft Tacos'],
 ['Steak Burrito', 'Chips and Guacamole'],
 ['Chicken Crispy Tacos', 'Chicken Soft Tacos'],
 ['Chicken Bowl', 'Chips and Guacamole'],
 ['Chips and Tomatillo-Green Chili Salsa', 'Chicken Burrito'],
 ['Chicken Burrito', 'Canned Soda'],
 ['Chicken Bowl', 'Chips and Guacamole'],
 ['Barbacoa Burrito', 'Nantucket Nectar'],
 ['Chicken Burrito', 'Izze'],
 ['Chips and Fresh Tomato Salsa', 'Chicken Bowl'],
 ['Carnitas Burrito', 'Canned Soda'],
 ['Chicken Burrito', 'Chips and Tomatillo-Green Chili Salsa'],
 ['Steak Burrito', 'Side of Chips'],
 ['Carnitas Bowl', 'Bottled Water'],
 ['Chicken Soft Tacos',
  'Chips and Guacamole',
  'Chips and Tomatillo Green Chili Salsa'],
 ['Barbacoa Bowl', 'Chips'],
 ['Chips and Guacamole',
  'Chicken Bowl',
  'Steak Burrito',
  'Chicken Salad Bowl'],
 ['Chicken 

We will transform the basket list into a DataFrame suitable for the Apriori algorithm using `TransactionEncoder`.

In [6]:
from mlxtend.preprocessing import TransactionEncoder
 
# Create the encoder and let it scan every basket to learn the item names.
# fit() returns the encoder itself, so construction and fitting are chained.
transaction_encoder = TransactionEncoder().fit(basket_list)
transaction_encoder.columns_ # item names found by the scan (total 50 items)

['6 Pack Soft Drink',
 'Barbacoa Bowl',
 'Barbacoa Burrito',
 'Barbacoa Crispy Tacos',
 'Barbacoa Salad Bowl',
 'Barbacoa Soft Tacos',
 'Bottled Water',
 'Bowl',
 'Burrito',
 'Canned Soda',
 'Canned Soft Drink',
 'Carnitas Bowl',
 'Carnitas Burrito',
 'Carnitas Crispy Tacos',
 'Carnitas Salad',
 'Carnitas Salad Bowl',
 'Carnitas Soft Tacos',
 'Chicken Bowl',
 'Chicken Burrito',
 'Chicken Crispy Tacos',
 'Chicken Salad',
 'Chicken Salad Bowl',
 'Chicken Soft Tacos',
 'Chips',
 'Chips and Fresh Tomato Salsa',
 'Chips and Guacamole',
 'Chips and Mild Fresh Tomato Salsa',
 'Chips and Roasted Chili Corn Salsa',
 'Chips and Roasted Chili-Corn Salsa',
 'Chips and Tomatillo Green Chili Salsa',
 'Chips and Tomatillo Red Chili Salsa',
 'Chips and Tomatillo-Green Chili Salsa',
 'Chips and Tomatillo-Red Chili Salsa',
 'Crispy Tacos',
 'Izze',
 'Nantucket Nectar',
 'Salad',
 'Side of Chips',
 'Steak Bowl',
 'Steak Burrito',
 'Steak Crispy Tacos',
 'Steak Salad',
 'Steak Salad Bowl',
 'Steak Soft Ta

In [7]:
# Encode the baskets with the fitted encoder: the result is a boolean
# numpy ndarray with one row per basket and one column per scanned item,
# True where the basket contains that item.
encoded_array = transaction_encoder.transform(basket_list)
encoded_array

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [8]:
# Wrap the boolean array in a pandas DataFrame, labelling columns with item names.
# NOTE: this rebinds `df`, which previously held the raw order table; re-run the
# loading cell before re-running any earlier cell that expects the raw data.
df = pd.DataFrame(data=encoded_array, columns=transaction_encoder.columns_)
df.head(5) # preview only the first 5 baskets

Unnamed: 0,6 Pack Soft Drink,Barbacoa Bowl,Barbacoa Burrito,Barbacoa Crispy Tacos,Barbacoa Salad Bowl,Barbacoa Soft Tacos,Bottled Water,Bowl,Burrito,Canned Soda,...,Steak Crispy Tacos,Steak Salad,Steak Salad Bowl,Steak Soft Tacos,Veggie Bowl,Veggie Burrito,Veggie Crispy Tacos,Veggie Salad,Veggie Salad Bowl,Veggie Soft Tacos
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Association Rule Mining

In [9]:
from mlxtend.frequent_patterns import apriori, association_rules

# Find frequent itemsets with Apriori.
# min_support=0.05 keeps itemsets whose support is at least 0.05;
# use_colnames=True reports itemsets by item name (the DataFrame column names).
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.083969,(Bottled Water)
1,0.051254,(Canned Soda)
2,0.150491,(Canned Soft Drink)
3,0.335333,(Chicken Bowl)
4,0.26663,(Chicken Burrito)
5,0.053435,(Chicken Salad Bowl)
6,0.058342,(Chicken Soft Tacos)
7,0.113413,(Chips)
8,0.059978,(Chips and Fresh Tomato Salsa)
9,0.258451,(Chips and Guacamole)


In [10]:
# frequent itemsets sorted by support in descending order (highest support first)
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
3,0.335333,(Chicken Bowl)
4,0.26663,(Chicken Burrito)
9,0.258451,(Chips and Guacamole)
12,0.186478,(Steak Burrito)
2,0.150491,(Canned Soft Drink)
7,0.113413,(Chips)
11,0.102508,(Steak Bowl)
0,0.083969,(Bottled Water)
15,0.081243,"(Chicken Bowl, Chips and Guacamole)"
14,0.066521,"(Chicken Bowl, Chips)"


In [11]:
# Association rules derived from the frequent itemsets,
# keeping only rules with confidence >= 0.5
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Chips),(Chicken Bowl),0.113413,0.335333,0.066521,0.586538,1.749124,0.02849,1.607568,0.483072


In [12]:
# Association rules derived from the frequent itemsets,
# keeping only rules with lift >= 1.5; stored in `rules` and displayed
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Chicken Bowl),(Chips),0.335333,0.113413,0.066521,0.198374,1.749124,0.02849,1.105985,0.64436
1,(Chips),(Chicken Bowl),0.113413,0.335333,0.066521,0.586538,1.749124,0.02849,1.607568,0.483072
