In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the pre-built one-hot matrix from onehot.csv; the first CSV column is used as the index.
# NOTE(review): presumably one row per transaction and one column per item — confirm against the file.
onehot=pd.read_csv('onehot.csv',index_col=0)

In [2]:
# Gather every column header that mentions "sign" (header is lower-cased first)
sign_headers = [header for header in onehot.columns if 'sign' in header.lower()]

# Narrow the one-hot matrix down to those sign columns
sign_columns = onehot[sign_headers]

# A row counts as a "sign" row when at least one of its sign columns is set
signs = sign_columns.sum(axis=1).ge(1.0)

# Report the support of the aggregated category (mean of the boolean flags)
print('Share of Signs: %.2f' % signs.mean())

Share of Signs: 0.10


In [3]:
def aggregate(item, data=None):
    """Aggregate all one-hot columns whose header contains ``item``.

    Parameters
    ----------
    item : str
        Substring to look for in the column headers. The comparison is
        case-insensitive on both sides, so ``'Bag'`` and ``'bag'`` select
        the same columns.
    data : pandas.DataFrame, optional
        One-hot encoded frame to aggregate. When omitted, the module-level
        ``onehot`` frame is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``data``: True for rows in which at
        least one matching column is set. If no header matches, every
        entry is False.
    """
    if data is None:
        data = onehot

    # Lower-case the search term too; otherwise an upper-case term could
    # never match the lower-cased headers.
    needle = str(item).lower()

    # str() guards against non-string column labels (e.g. integers)
    item_headers = [col for col in data.columns if needle in str(col).lower()]

    # Select the matching columns and flag rows containing at least one of them
    item_columns = data[item_headers]
    return item_columns.sum(axis=1) >= 1.0

# Build the bags, boxes, and candles categories in one pass over the keywords
bags, boxes, candles = (
    aggregate(keyword) for keyword in ('bag', 'box', 'candle')
)

In [4]:
# Bring the Apriori implementation into scope
from mlxtend.frequent_patterns import apriori

# Mine frequent itemsets: minimum support 0.006, max_len 3, itemsets labelled by column name
frequent_itemsets = apriori(onehot, use_colnames=True, max_len=3, min_support=0.006)

# Show the first five frequent itemsets
print(frequent_itemsets.head(5))

    support                              itemsets
0  0.006767          (HOT WATER BOTTLE KEEP CALM)
1  0.007519             (JUMBO BAG RED RETROSPOT)
2  0.006015     (PAPER CHAIN KIT 50'S CHRISTMAS )
3  0.006015                      (POPCORN HOLDER)
4  0.006767  (WHITE HANGING HEART T-LIGHT HOLDER)


In [5]:
# Bring the Apriori implementation into scope
from mlxtend.frequent_patterns import apriori

# Frequent itemsets at a support threshold of 0.003 (max_len 3)
frequent_itemsets_1 = apriori(onehot, use_colnames=True, max_len=3, min_support=0.003)

# Frequent itemsets at a support threshold of 0.001 (max_len 3)
frequent_itemsets_2 = apriori(onehot, use_colnames=True, max_len=3, min_support=0.001)

# Compare how many frequent itemsets each threshold produced
print(frequent_itemsets_1.shape[0], frequent_itemsets_2.shape[0])

91 429


In [6]:
# Bring the rule-generation function into scope
from mlxtend.frequent_patterns import association_rules

# Derive rules from both itemset tables with the same settings:
# metric "support" and a threshold of 0.0015
rules_1, rules_2 = (
    association_rules(itemsets, metric="support", min_threshold=0.0015)
    for itemsets in (frequent_itemsets_1, frequent_itemsets_2)
)

# Compare how many rules each itemset table yields
print(len(rules_1), len(rules_2))

0 2


In [8]:
# Bring the rule-generation function into scope
# (apriori itself is expected to be imported by an earlier cell)
from mlxtend.frequent_patterns import association_rules

# Mine frequent itemsets: minimum support 0.001, max_len 2
frequent_itemsets = apriori(onehot, use_colnames=True, max_len=2, min_support=0.001)

# Derive rules using lift as the metric with a threshold of 1.0
rules = association_rules(frequent_itemsets, min_threshold=1.0, metric="lift")

# Display the rules as the cell output
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(BIRTHDAY CARD, RETRO SPOT)",(JUMBO BAG RED RETROSPOT),0.002256,0.007519,0.001504,0.666667,88.666667,0.001487,2.977444,0.990957
1,(JUMBO BAG RED RETROSPOT),"(BIRTHDAY CARD, RETRO SPOT)",0.007519,0.002256,0.001504,0.2,88.666667,0.001487,1.24718,0.996212


In [10]:
# Bring both the Apriori and the rule-generation functions into scope
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent itemsets: minimum support 0.0015, max_len 2
frequent_itemsets = apriori(onehot, use_colnames=True, max_len=2, min_support=0.0015)

# Derive rules using confidence as the metric with a threshold of 0.5
rules = association_rules(frequent_itemsets, min_threshold=0.5, metric="confidence")

# Display the rules as the cell output
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(BIRTHDAY CARD, RETRO SPOT)",(JUMBO BAG RED RETROSPOT),0.002256,0.007519,0.001504,0.666667,88.666667,0.001487,2.977444,0.990957


In [14]:
# Load aggregated.csv; the first CSV column is used as the index
aggregated = pd.read_csv('aggregated.csv', index_col=0)

# Run Apriori on the aggregated data with a minimum support of 0.0001
frequent_itemsets = apriori(aggregated, use_colnames=True, min_support=0.0001)

# Build the initial rule set using a minimum support of 0.0001
rules = association_rules(frequent_itemsets, min_threshold=0.0001, metric="support")

# Keep rules whose antecedent support is above 0.35
# and whose consequent support is below 0.35
rules = rules.loc[
    (rules['antecedent support'] > 0.35) & (rules['consequent support'] < 0.35)
]

# Display the remaining rules as the cell output
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(bag),(box),0.466307,0.256065,0.021563,0.046243,0.18059,-0.097841,0.780005,-0.894758
2,(bag),(candle),0.466307,0.088949,0.010782,0.023121,0.25994,-0.030696,0.932615,-0.842137
8,(sign),(box),0.355795,0.256065,0.018868,0.05303,0.207097,-0.072239,0.785596,-0.855975
10,(sign),(candle),0.355795,0.088949,0.008086,0.022727,0.25551,-0.023561,0.932238,-0.818939
15,(sign),"(bag, candle)",0.355795,0.010782,0.005391,0.015152,1.405303,0.001555,1.004437,0.447699
16,(bag),"(sign, candle)",0.466307,0.008086,0.005391,0.011561,1.429672,0.00162,1.003515,0.563131


In [18]:
# Display whatever table is currently bound to `frequent_itemsets`.
# NOTE(review): several cells reassign this name, so the table shown depends on which of them ran last.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.466307,(bag)
1,0.256065,(box)
2,0.088949,(candle)
3,0.355795,(sign)
4,0.021563,"(bag, box)"
5,0.010782,"(bag, candle)"
6,0.097035,"(sign, bag)"
7,0.016173,"(box, candle)"
8,0.018868,"(sign, box)"
9,0.008086,"(sign, candle)"


In [21]:
# Build the initial rule set with lift as the metric and a threshold of 1.00
rules = association_rules(frequent_itemsets, min_threshold=1.00, metric="lift")

# Keep only rules whose antecedent support and consequent support
# both exceed 0.005
rules = rules[
    (rules['antecedent support'] > 0.005)
    & (rules['consequent support'] > 0.005)
]
def zhangs_rule(rules):
    """Compute Zhang's metric for every rule in ``rules``.

    Parameters
    ----------
    rules : pandas.DataFrame
        Must provide the columns ``'support'``, ``'antecedent support'``
        and ``'consequent support'``.

    Returns
    -------
    pandas.Series
        One value per rule, carrying the index of ``rules``. No guard is
        applied for a zero denominator.
    """
    joint = rules['support']
    antecedent = rules['antecedent support']
    consequent = rules['consequent support']

    # Numerator: joint support minus the product of the two marginal supports
    deviation = joint - antecedent * consequent

    # Denominator: element-wise larger of the two normalising terms,
    # taken on plain arrays so no index alignment is involved
    term_a = (joint * (1 - antecedent).values).to_numpy()
    term_b = (antecedent * (consequent - joint).values).to_numpy()
    scale = np.maximum(term_a, term_b)

    return deviation / scale
# Compute Zhang's rule for every remaining rule.
# `rules` was produced by boolean filtering, so assigning a column into it
# in place triggers pandas' SettingWithCopyWarning; assign() returns a new
# frame with the extra column instead.
rules = rules.assign(zhang=zhangs_rule(rules))

# Keep only rules whose Zhang's metric exceeds the lower bound of 0.98
rules = rules[rules['zhang'] > 0.98]

# Display the antecedents and consequents of the surviving rules
rules[['antecedents', 'consequents']]

Unnamed: 0,antecedents,consequents


In [26]:
# Display the frame bound to `df`.
# NOTE(review): in this file `df` is first assigned by the later `pd.read_csv('./online_retail.csv')` cell,
# so on a fresh top-to-bottom run this cell would presumably raise NameError — confirm the intended cell order.
df

Unnamed: 0,InvoiceNo,StockCode,Description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)
...,...,...,...
227755,C581229,23158,SET OF 5 LUCKY CAT MAGNETS
227756,C581229,22712,CARD DOLLY GIRL
227757,C581229,22027,TEA PARTY BIRTHDAY CARD
227758,C581229,21508,VINTAGE KID DOLLY CARD


In [27]:
# Read in the dataset.
# NOTE(review): assumes the CSV provides `InvoiceNo` and `Description` columns — confirm.
df = pd.read_csv('./online_retail.csv')

# Drop line items that cannot be placed in a basket (no invoice or no
# description) and make every description a string. assign() returns a new
# frame, so no filtered slice is mutated in place.
df = (
    df.dropna(subset=['InvoiceNo', 'Description'])
      .assign(Description=lambda frame: frame['Description'].astype(str))
)

# Step 1: create the list of unique items.
# This is done after the NaN rows are removed so NaN never becomes an "item".
# factorize() also returns, for each line item, the integer position of its
# invoice / description, which is what the encoding step needs.
invoice_codes, invoice_ids = pd.factorize(df['InvoiceNo'])
item_codes, item_labels = pd.factorize(df['Description'], sort=True)
unique_items = item_labels.tolist()

# Step 2: create the one-hot encoded matrix, one row per invoice (transaction)
# and one column per item. The previous nested loop tested each item against
# the whole Description column, so every entry came out as 1, and it produced
# one row per line item as a bare list of lists. A labelled boolean DataFrame
# carries the item names as column headers.
basket_matrix = np.zeros((len(invoice_ids), len(item_labels)), dtype=bool)
basket_matrix[invoice_codes, item_codes] = True
onehot = pd.DataFrame(
    basket_matrix,
    index=pd.Index(invoice_ids, name='InvoiceNo'),
    columns=item_labels,
)

In [22]:
# Run the Apriori algorithm with a minimum support threshold of 0.001
frequent_itemsets = apriori(onehot, use_colnames=True, min_support=0.001)

# Recover association rules using a minimum support threshold of 0.001
rules = association_rules(frequent_itemsets, min_threshold=0.001, metric='support')

# Keep rules that pass all four thresholds:
# antecedent support > 0.002, consequent support > 0.01,
# confidence > 0.60, and lift > 2.50
filtered_rules = rules[
    rules['antecedent support'].gt(0.002)
    & rules['consequent support'].gt(0.01)
    & rules['confidence'].gt(0.60)
    & rules['lift'].gt(2.50)
]

# Print the antecedents and consequents of the remaining rules
print(filtered_rules[['antecedents', 'consequents']])

Empty DataFrame
Columns: [antecedents, consequents]
Index: []
