In [1]:
# Import necessary libraries.
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the groceries sample workbook into a pandas DataFrame.
df = pd.read_excel(io="./data/groceries_dataset_sample_v1.xlsx")

In [3]:
# Preview the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0,InvoiceNo,CustomerID,InvoiceDate,ItemDescription,Quantity
0,9246351480504524,1000,2014-06-24,whole milk,2
1,9246351480504524,1000,2014-06-24,pastry,3
2,9246351480504524,1000,2014-06-24,salty snack,2
3,2066741216007356,1000,2015-03-15,sausage,4
4,2066741216007356,1000,2015-03-15,whole milk,3


In [4]:
# Count the missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

InvoiceNo          0
CustomerID         0
InvoiceDate        0
ItemDescription    0
Quantity           0
dtype: int64

In [5]:
# Obtain the descriptive statistics of the numeric columns in the dataframe.
# NOTE(review): InvoiceNo and CustomerID look like identifiers, so their
# mean/std/quartiles are presumably not meaningful; Quantity appears to be the
# only true measure here — confirm before interpreting the table.
df.describe()

Unnamed: 0,InvoiceNo,CustomerID,Quantity
count,1920.0,1920.0,1920.0
mean,5422356000000000.0,1099.888021,2.948958
std,2683089000000000.0,57.965179,1.428329
min,1002270000000000.0,1000.0,1.0
25%,3103041000000000.0,1051.0,2.0
50%,5335092000000000.0,1102.0,3.0
75%,7825146000000000.0,1148.0,4.0
max,9999248000000000.0,1200.0,5.0


In [6]:
# Report the first and last InvoiceDate covered by the dataframe.
df['InvoiceDate'].aggregate(['min', 'max'])

min   2014-01-02
max   2015-12-29
Name: InvoiceDate, dtype: datetime64[ns]

In [7]:
# Pivot the transactions into an invoice-by-item matrix: one row per InvoiceNo,
# one column per ItemDescription, each cell holding the summed Quantity.
# Invoice/item pairs that never occur are filled with 0.
basket = (
    df.groupby(['InvoiceNo', 'ItemDescription'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [8]:
# Preview the first ten invoices of the matrix.
basket.head(n=10)

ItemDescription,Instant food products,UHT-milk,artif. sweetener,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,bottled water,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002269524162774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1006393877771546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1021688255628550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1040896218536178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1045062390655717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
1064161673432954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1066929395332836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1080180046391208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1080532524089707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1094405245901214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Binarise a summed purchase quantity: anything below 1 counts as "not bought".
def encode(x):
  """Return 0 when ``x`` is below 1, otherwise 1."""
  return 0 if x < 1 else 1

# Apply the function to encode the sum of the item purchase quantity to a binary number.
# The function is mapped column by column with Series.map because
# DataFrame.applymap is deprecated since pandas 2.1, while its replacement
# (DataFrame.map) does not exist in older pandas releases; Series.map works in both.
basket = basket.apply(lambda column: column.map(encode))
basket.head(5)

ItemDescription,Instant food products,UHT-milk,artif. sweetener,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,bottled water,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002269524162774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1006393877771546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1021688255628550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1040896218536178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1045062390655717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# Keep only the invoices that contain the sample item.
item_in_basket = basket[basket['berries'].eq(1)]
# Mine the frequent itemsets within those invoices using the Apriori algorithm.
frequent_itemsets = apriori(item_in_basket, min_support=0.15, use_colnames=True)
# Derive the association rules from the frequent itemsets, using lift as the metric.
frequent_itemsets_association_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Show the rules ordered by lift, then support, both descending.
(frequent_itemsets_association_rules
 .sort_values(by=['lift', 'support'], ascending=False)
 .reset_index(drop=True))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(berries),(other vegetables),1.0,0.266667,0.266667,0.266667,1.0,0.0,1.0
1,(other vegetables),(berries),0.266667,1.0,0.266667,1.0,1.0,0.0,inf
2,(berries),(white bread),1.0,0.266667,0.266667,0.266667,1.0,0.0,1.0
3,(white bread),(berries),0.266667,1.0,0.266667,1.0,1.0,0.0,inf
4,(berries),(yogurt),1.0,0.2,0.2,0.2,1.0,0.0,1.0
5,(yogurt),(berries),0.2,1.0,0.2,1.0,1.0,0.0,inf


In [11]:
# Define the function to obtain the frequently bought together items.
def frequently_bought_together(item, min_support=0.15, max_items=10):
    """Return the itemsets most strongly associated with ``item``.

    Restricts the global ``basket`` matrix to the invoices that contain
    ``item``, mines frequent itemsets with Apriori, derives association rules
    from them, and returns the distinct rule consequents ordered by lift and
    then support (both descending).

    Parameters
    ----------
    item : hashable
        Column label in ``basket`` of the item that is already in the basket.
    min_support : float, default 0.15
        Minimum support passed to ``apriori``.
    max_items : int, default 10
        Maximum number of distinct consequents to return.

    Returns
    -------
    numpy.ndarray
        Object array of the distinct ``consequents`` values (frozensets of
        item names), best-ranked first, at most ``max_items`` long.

    Raises
    ------
    KeyError
        If ``item`` is not a column of ``basket``.
    """
    # Obtain the invoices in which the item is already in the basket.
    item_in_basket = basket.loc[basket[item] == 1]
    # Applying the Apriori algorithm on that subset of invoices.
    frequent_itemsets = apriori(item_in_basket, min_support=min_support, use_colnames=True)
    # Obtaining the association rules from the frequent itemsets.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    # Every invoice in the subset contains `item`, so any rule that has `item`
    # as its consequent has confidence exactly 1; keeping confidence < 1 drops
    # those trivial rules.
    # NOTE(review): this also drops an item that appears in *every* invoice
    # containing `item` (its rule has confidence 1 as well).
    filtered_rules = rules[rules['confidence'] < 1]
    # Sorting the result by Lift and Support metric. The sorted frame must be
    # kept: sort_values returns a new frame and does not reorder in place.
    ranked_rules = (filtered_rules
                    .sort_values(['lift', 'support'], ascending=False)
                    .reset_index(drop=True))
    print('Items frequently bought together with {0}'.format(item))
    return ranked_rules['consequents'].unique()[:max_items]

In [12]:
# Look up the items frequently bought together with berries.
frequently_bought_together(item='berries')

Items frequently bought together with berries


array([frozenset({'other vegetables'}), frozenset({'white bread'}),
       frozenset({'yogurt'})], dtype=object)