In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# print all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

- The data for Market Basket Analysis comes in many form which has to be transformed to use with the `apyori` package.
- We will take two examples of such data and do the analyses.
    1. **Basic Example** - Comma separated dataset having each record as products sold in a single transaction.
    2. **Advanced Example** - Dataset with columns haivng different features and one column for products and each row representing single product.

# 1. Basic Example

Comma separated dataset having each record as products sold in a single transaction.

In [2]:
# import data
df1 = pd.read_csv('online_retail_basic.csv', header = None)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [3]:
df1.shape

(7501, 20)

- We have a dataset with **7,501 transactions** and each transaction has **at most 20 products**.
- We will now convert this data into the structure required for `apyori` package i.e. `list` of transactions and each transaction represnted as a `tuple`.

In [4]:
%%time

records = []
for i in range(len(df1)):
    records.append(tuple(df1.loc[i, pd.notna(df1.iloc[i, :])]))

Wall time: 5.29 s


In [5]:
for e in range(5):
    print(e+1, records[e], sep = " - ")

1 - ('shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil')
2 - ('burgers', 'meatballs', 'eggs')
3 - ('chutney',)
4 - ('turkey', 'avocado')
5 - ('mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea')


### `apyori.apriori`

In [6]:
# Apriori Algorithm
from apyori import apriori

association_rules = apriori(transactions = records,
                            min_support = .0045,     # consider items(sets) appearing 0.45% of the time
                            min_confidence = 0.2,
                            min_lift = 3,
                            min_length = 2     # minimum items in an itemset
                           )

In [7]:
association_results = list(association_rules)

In [8]:
len(association_results)

24

In [9]:
for e in range(5):
    print(e+1, association_results[e], sep = " - ")

1 - RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)])
2 - RelationRecord(items=frozenset({'escalope', 'mushroom cream sauce'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mushroom cream sauce'}), items_add=frozenset({'escalope'}), confidence=0.3006993006993007, lift=3.790832696715049)])
3 - RelationRecord(items=frozenset({'escalope', 'pasta'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pasta'}), items_add=frozenset({'escalope'}), confidence=0.3728813559322034, lift=4.700811850163794)])
4 - RelationRecord(items=frozenset({'ground beef', 'herb & pepper'}), support=0.015997866951073192, ordered_statistics=[OrderedStatistic(items_base=frozenset({'herb & pepper'}), items_add=frozenset({'ground 

In [10]:
# print rules in proper format
for item in association_results[:5]:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: light cream -> chicken
Support: 0.004532728969470737
Confidence: 0.29059829059829057
Lift: 4.84395061728395
Rule: escalope -> mushroom cream sauce
Support: 0.005732568990801226
Confidence: 0.3006993006993007
Lift: 3.790832696715049
Rule: escalope -> pasta
Support: 0.005865884548726837
Confidence: 0.3728813559322034
Lift: 4.700811850163794
Rule: ground beef -> herb & pepper
Support: 0.015997866951073192
Confidence: 0.3234501347708895
Lift: 3.2919938411349285
Rule: tomato sauce -> ground beef
Support: 0.005332622317024397
Confidence: 0.3773584905660377
Lift: 3.840659481324083


### `efficient-apriori`

- The `apyori` library is very limited in that the proper interpretation of results, exploration of itemsets etc. needs added coding. 
- We will now use the `efficient-apriori` library to overcome these limitations.

In [11]:
from efficient_apriori import apriori

itemsets, rules = apriori(transactions = records,
                          min_support = 0.0045,
                          min_confidence = 0.2
                         )

- We can now print a rule and check it's stats

In [12]:
# print only one rule
print(rules[0])

{almonds} -> {burgers} (conf: 0.255, supp: 0.005, lift: 2.924, conv: 1.225)


- Or, print rules based on the format we need them in or conditionally

In [13]:
# print in format - 2 items -> 1 item
## filter such rules 
rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules)

## print the rules sorted by lift ; top 10
for rule in sorted(rules_rhs, key = lambda rule: rule.lift, reverse=True)[:10]:
    print(rule)

{herb & pepper, spaghetti} -> {ground beef} (conf: 0.393, supp: 0.006, lift: 4.004, conv: 1.487)
{herb & pepper, mineral water} -> {ground beef} (conf: 0.391, supp: 0.007, lift: 3.976, conv: 1.480)
{frozen vegetables, spaghetti} -> {tomatoes} (conf: 0.239, supp: 0.007, lift: 3.498, conv: 1.225)
{mineral water, soup} -> {olive oil} (conf: 0.225, supp: 0.005, lift: 3.423, conv: 1.206)
{ground beef, milk} -> {olive oil} (conf: 0.224, supp: 0.005, lift: 3.405, conv: 1.204)
{spaghetti, tomatoes} -> {frozen vegetables} (conf: 0.318, supp: 0.007, lift: 3.341, conv: 1.327)
{grated cheese, spaghetti} -> {ground beef} (conf: 0.323, supp: 0.005, lift: 3.283, conv: 1.331)
{cooking oil, ground beef} -> {spaghetti} (conf: 0.571, supp: 0.005, lift: 3.282, conv: 1.927)
{frozen vegetables, olive oil} -> {milk} (conf: 0.424, supp: 0.005, lift: 3.268, conv: 1.510)
{chocolate, frozen vegetables} -> {shrimp} (conf: 0.233, supp: 0.005, lift: 3.255, conv: 1.210)


In [14]:
# print in above format but only rules with confidence >= 0.55
## print the rules sorted by confidence ; top 10
rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1 and rule.confidence >= 0.55, rules)

for rule in sorted(rules_rhs, key = lambda rule: rule.confidence, reverse=True)[:10]:
    print(rule)

{frozen vegetables, soup} -> {mineral water} (conf: 0.633, supp: 0.005, lift: 2.657, conv: 2.077)
{cooking oil, pancakes} -> {mineral water} (conf: 0.593, supp: 0.005, lift: 2.489, conv: 1.872)
{olive oil, soup} -> {mineral water} (conf: 0.582, supp: 0.005, lift: 2.442, conv: 1.822)
{frozen vegetables, olive oil} -> {mineral water} (conf: 0.576, supp: 0.007, lift: 2.418, conv: 1.798)
{cooking oil, ground beef} -> {spaghetti} (conf: 0.571, supp: 0.005, lift: 3.282, conv: 1.927)
{milk, soup} -> {mineral water} (conf: 0.561, supp: 0.009, lift: 2.355, conv: 1.737)
{olive oil, shrimp} -> {mineral water} (conf: 0.557, supp: 0.005, lift: 2.338, conv: 1.721)
{chocolate, soup} -> {mineral water} (conf: 0.553, supp: 0.006, lift: 2.318, conv: 1.702)


In [15]:
# print in format - 3 items -> 1 item but only rules with lift >= 2
## print the rules sorted by support ; top 10
rules_rhs = filter(lambda rule: len(rule.lhs) == 3 and len(rule.rhs) == 1 and rule.lift >= 2, rules)

for rule in sorted(rules_rhs, key = lambda rule: rule.support, reverse=True)[:10]:
    print(rule)

{chocolate, mineral water, spaghetti} -> {milk} (conf: 0.311, supp: 0.005, lift: 2.399, conv: 1.263)
{chocolate, milk, mineral water} -> {spaghetti} (conf: 0.352, supp: 0.005, lift: 2.024, conv: 1.275)
{milk, mineral water, spaghetti} -> {frozen vegetables} (conf: 0.288, supp: 0.005, lift: 3.023, conv: 1.271)
{frozen vegetables, mineral water, spaghetti} -> {milk} (conf: 0.378, supp: 0.005, lift: 2.915, conv: 1.399)
{frozen vegetables, milk, spaghetti} -> {mineral water} (conf: 0.548, supp: 0.005, lift: 2.301, conv: 1.686)
{frozen vegetables, milk, mineral water} -> {spaghetti} (conf: 0.410, supp: 0.005, lift: 2.353, conv: 1.399)


- `itemsets` has all the itemsets in a nested dictionary with each dictionary having sub-dictionaries of same length of itemset and their frequencies as well.
- e.g., we have dictionaries with itemsets having 1, 2, 3 and 4 items each.

In [16]:
itemsets.keys()

dict_keys([1, 2, 3, 4])

In [17]:
# check frequency of each itemset (freq >= 500)

# create a dictionary of items with value >= 500
d = dict((keys, values) for keys, values in itemsets[1].items() if values >= 500)

# print the new dict
for k,v in d.items():
    print(k, v)

('shrimp',) 536
('low fat yogurt',) 574
('green tea',) 991
('mineral water',) 1788
('burgers',) 654
('eggs',) 1348
('milk',) 972
('french fries',) 1282
('frozen vegetables',) 715
('spaghetti',) 1306
('cookies',) 603
('chocolate',) 1229
('tomatoes',) 513
('pancakes',) 713
('ground beef',) 737
('escalope',) 595
('cake',) 608


# 2. Advanced Example

Dataset with columns haivng different features and one column for products and each row representing single product.

In [18]:
df2 = pd.read_csv('online_retail_advanced.csv')
df2.dropna(inplace = True)
df2.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [19]:
df2.shape

(406829, 8)

In [20]:
df2.InvoiceNo.nunique()

22190

- There are 22,190 transactions in the dataset and each record represents a product in each transaction.
- Now, we have to arrange the data in transactional form and to do so we will have to do so in three steps - 
    1. Select the column that separates each transaction - `InvoiceNo` and `date` from `InvoiceDate` (create `date` to remove time from `InvoiceDate`)
    2. Drop remaining features
    2. `pivot` the data and then convert `Description` into a single row of products per transaction

In [21]:
# date column
date = pd.to_datetime(df2['InvoiceDate']).dt.date
date[:5]

0    2010-12-01
1    2010-12-01
2    2010-12-01
3    2010-12-01
4    2010-12-01
Name: InvoiceDate, dtype: object

In [22]:
df2['date'] = date
df2.drop(['StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], axis = 1, inplace = True)
df2.head()

Unnamed: 0,InvoiceNo,Description,date
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01
1,536365,WHITE METAL LANTERN,2010-12-01
2,536365,CREAM CUPID HEARTS COAT HANGER,2010-12-01
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01
4,536365,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01


In [23]:
df2.drop_duplicates(inplace=True)

In [24]:
records = df2.groupby(['InvoiceNo', 'date'])['Description'].apply(tuple).values.tolist()
len(records)

22190

In [25]:
for e in range(5):
    print(e+1, records[e], sep = " - ")

1 - ('WHITE HANGING HEART T-LIGHT HOLDER', 'WHITE METAL LANTERN', 'CREAM CUPID HEARTS COAT HANGER', 'KNITTED UNION FLAG HOT WATER BOTTLE', 'RED WOOLLY HOTTIE WHITE HEART.', 'SET 7 BABUSHKA NESTING BOXES', 'GLASS STAR FROSTED T-LIGHT HOLDER')
2 - ('HAND WARMER UNION JACK', 'HAND WARMER RED POLKA DOT')
3 - ('ASSORTED COLOUR BIRD ORNAMENT', "POPPY'S PLAYHOUSE BEDROOM ", "POPPY'S PLAYHOUSE KITCHEN", 'FELTCRAFT PRINCESS CHARLOTTE DOLL', 'IVORY KNITTED MUG COSY ', 'BOX OF 6 ASSORTED COLOUR TEASPOONS', 'BOX OF VINTAGE JIGSAW BLOCKS ', 'BOX OF VINTAGE ALPHABET BLOCKS', 'HOME BUILDING BLOCK WORD', 'LOVE BUILDING BLOCK WORD', 'RECIPE BOX WITH METAL HEART', 'DOORMAT NEW ENGLAND')
4 - ('JAM MAKING SET WITH JARS', 'RED COAT RACK PARIS FASHION', 'YELLOW COAT RACK PARIS FASHION', 'BLUE COAT RACK PARIS FASHION')
5 - ('BATH BUILDING BLOCK WORD',)


- We can now apply Association Rules just like in the first example on this data.

# THE END