#### Analyze the data “BreadBasket_DMS.csv” and identify the best sellers, the most frequent itemsets, and the highly correlated itemsets.  

#### Given any item, suggest other items that should be bought together with it. 

In [1]:
import pandas as pd
import numpy as np
import apyori
from mlxtend import frequent_patterns
from mlxtend.preprocessing import TransactionEncoder
import warnings
warnings.filterwarnings("ignore")

Data preprocessing for apyori

In [2]:
# Load the raw sales data; the preview below shows Date, Time, Transaction and Item columns,
# where several rows can share the same Transaction id (one row per item line).
df = pd.read_csv('BreadBasket_DMS.csv')
df.head()

Unnamed: 0,Date,Time,Transaction,Item
0,30-10-16,9:58:11,1,Bread
1,30-10-16,10:05:34,2,Scandinavian
2,30-10-16,10:05:34,2,Scandinavian
3,30-10-16,10:07:57,3,Hot chocolate
4,30-10-16,10:07:57,3,Jam


In [3]:
# Pair every row's transaction id with its item name: [Transaction, Item].
# Columns are taken by position (2 and 3), exactly as before, but the frame is
# converted to an array once. Calling `df.values` for every element rebuilds the
# whole mixed-dtype array each time, which makes the loop quadratic in the row count.
records = df.iloc[:, 2:4].values.tolist()
records

[[1, 'Bread'],
 [2, 'Scandinavian'],
 [2, 'Scandinavian'],
 [3, 'Hot chocolate'],
 [3, 'Jam'],
 [3, 'Cookies'],
 [4, 'Muffin'],
 [5, 'Coffee'],
 [5, 'Pastry'],
 [5, 'Bread'],
 [6, 'Medialuna'],
 [6, 'Pastry'],
 [6, 'Muffin'],
 [7, 'Medialuna'],
 [7, 'Pastry'],
 [7, 'Coffee'],
 [7, 'Tea'],
 [8, 'Pastry'],
 [8, 'Bread'],
 [9, 'Bread'],
 [9, 'Muffin'],
 [10, 'Scandinavian'],
 [10, 'Medialuna'],
 [11, 'Bread'],
 [11, 'Medialuna'],
 [11, 'Bread'],
 [11, 'NONE'],
 [12, 'Jam'],
 [12, 'Coffee'],
 [12, 'Tartine'],
 [12, 'Pastry'],
 [12, 'Tea'],
 [13, 'Basket'],
 [13, 'Bread'],
 [13, 'Coffee'],
 [14, 'Bread'],
 [14, 'Medialuna'],
 [14, 'Pastry'],
 [15, 'NONE'],
 [15, 'NONE'],
 [15, 'Mineral water'],
 [15, 'Scandinavian'],
 [16, 'Bread'],
 [16, 'Medialuna'],
 [16, 'Coffee'],
 [17, 'Hot chocolate'],
 [18, 'Farm House'],
 [19, 'Farm House'],
 [19, 'Bread'],
 [20, 'Bread'],
 [20, 'Medialuna'],
 [21, 'Coffee'],
 [21, 'Coffee'],
 [21, 'Medialuna'],
 [21, 'Bread'],
 [22, 'Jam'],
 [23, 'Scandinavian'],


In [4]:
# Build one list of items per transaction.
# Order the rows by transaction id so that rows of the same transaction are adjacent.
records.sort(key=lambda record: record[0])

transactions = []
for position, record in enumerate(records):
    starts_new_transaction = position == 0 or record[0] != records[position - 1][0]
    if starts_new_transaction:
        # first row of a transaction opens a new item list
        transactions.append([record[1]])
    else:
        # same id as the previous row: add the item to the current transaction
        transactions[-1].append(record[1])

transactions

[['Bread'],
 ['Scandinavian', 'Scandinavian'],
 ['Hot chocolate', 'Jam', 'Cookies'],
 ['Muffin'],
 ['Coffee', 'Pastry', 'Bread'],
 ['Medialuna', 'Pastry', 'Muffin'],
 ['Medialuna', 'Pastry', 'Coffee', 'Tea'],
 ['Pastry', 'Bread'],
 ['Bread', 'Muffin'],
 ['Scandinavian', 'Medialuna'],
 ['Bread', 'Medialuna', 'Bread', 'NONE'],
 ['Jam', 'Coffee', 'Tartine', 'Pastry', 'Tea'],
 ['Basket', 'Bread', 'Coffee'],
 ['Bread', 'Medialuna', 'Pastry'],
 ['NONE', 'NONE', 'Mineral water', 'Scandinavian'],
 ['Bread', 'Medialuna', 'Coffee'],
 ['Hot chocolate'],
 ['Farm House'],
 ['Farm House', 'Bread'],
 ['Bread', 'Medialuna'],
 ['Coffee', 'Coffee', 'Medialuna', 'Bread'],
 ['Jam'],
 ['Scandinavian', 'Muffin'],
 ['Bread'],
 ['Scandinavian'],
 ['Fudge'],
 ['Scandinavian'],
 ['Coffee', 'Bread'],
 ['Bread', 'Jam', 'NONE'],
 ['Bread'],
 ['Basket'],
 ['Scandinavian', 'Muffin'],
 ['Coffee'],
 ['Coffee', 'Muffin'],
 ['Muffin', 'Scandinavian'],
 ['Tea', 'Bread'],
 ['Coffee', 'Bread', 'NONE'],
 ['Bread', 'Tea'],
 

In [5]:
# Clean every transaction:
# - delete items repeated in a transaction, for example: ['Scandinavian', 'Scandinavian'] -> ['Scandinavian']
# - delete 'NONE', for example: ['Coffee', 'Bread', 'NONE'] -> ['Coffee', 'Bread']
# - also delete 'Adjustment' and 'Keeping It Local' (spam data)
#
# dict.fromkeys() de-duplicates while keeping the first-seen order, so the lists look the
# same on every run; list(set(...)) orders strings differently after each kernel restart.
# A transaction that contains only excluded entries is kept as an empty list, so the
# number of transactions does not change.
excluded_items = {'NONE', 'Adjustment', 'Keeping It Local'}
apyori_transactions = [
    [item for item in dict.fromkeys(x) if item not in excluded_items]
    for x in transactions
]
apyori_transactions # 9531 transactions

[['Bread'],
 ['Scandinavian'],
 ['Hot chocolate', 'Jam', 'Cookies'],
 ['Muffin'],
 ['Bread', 'Coffee', 'Pastry'],
 ['Medialuna', 'Muffin', 'Pastry'],
 ['Medialuna', 'Tea', 'Coffee', 'Pastry'],
 ['Bread', 'Pastry'],
 ['Bread', 'Muffin'],
 ['Medialuna', 'Scandinavian'],
 ['Bread', 'Medialuna'],
 ['Coffee', 'Tartine', 'Pastry', 'Tea', 'Jam'],
 ['Bread', 'Basket', 'Coffee'],
 ['Bread', 'Pastry', 'Medialuna'],
 ['Scandinavian', 'Mineral water'],
 ['Bread', 'Coffee', 'Medialuna'],
 ['Hot chocolate'],
 ['Farm House'],
 ['Farm House', 'Bread'],
 ['Bread', 'Medialuna'],
 ['Medialuna', 'Bread', 'Coffee'],
 ['Jam'],
 ['Scandinavian', 'Muffin'],
 ['Bread'],
 ['Scandinavian'],
 ['Fudge'],
 ['Scandinavian'],
 ['Bread', 'Coffee'],
 ['Bread', 'Jam'],
 ['Bread'],
 ['Basket'],
 ['Scandinavian', 'Muffin'],
 ['Coffee'],
 ['Coffee', 'Muffin'],
 ['Scandinavian', 'Muffin'],
 ['Bread', 'Tea'],
 ['Bread', 'Coffee'],
 ['Bread', 'Tea'],
 ['Scandinavian'],
 ['Coffee', 'Tartine', 'Juice', 'Muffin'],
 ['Scandinavia

Note: the apyori_transactions list (a list of item lists) is an acceptable input type for apyori.apriori.  

 **Find the best sellers and high frequent itemsets**

For problems related to frequency, we should use mlxtend.frequent_patterns instead of apyori. Unlike apyori, the mlxtend.frequent_patterns.apriori function expects the data as a one-hot encoded pandas DataFrame.

Data preprocessing for mlxtend.frequent_patterns

In [6]:
# One-hot encode the transactions: one row per transaction, one boolean column per item.
te = TransactionEncoder()
te.fit(apyori_transactions)
te_ary = te.transform(apyori_transactions)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head()

Unnamed: 0,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,Bowl Nic Pitt,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Get the highly frequent itemsets (including single products) with support >= 4%, i.e. roughly 381 of the 9531 transactions

In [7]:
# Frequent itemsets with at least 4% support, most frequent first.
high_frequency_df = (
    frequent_patterns.apriori(df, min_support=0.04, use_colnames=True)
    .sort_values(by='support', ascending=False)
    .reset_index(drop=True)
)
high_frequency_df

Unnamed: 0,support,itemsets
0,0.475081,(Coffee)
1,0.32494,(Bread)
2,0.141643,(Tea)
3,0.103137,(Cake)
4,0.089393,"(Bread, Coffee)"
5,0.08551,(Pastry)
6,0.071346,(Sandwich)
7,0.061379,(Medialuna)
8,0.057916,(Hot chocolate)
9,0.054349,"(Cake, Coffee)"


- The best seller is coffee. This product is in 4528/9531 transactions (support 0.475081).
- Some highly frequent itemsets: (bread, coffee), (cake, coffee), (coffee, tea), (pastry, coffee)

**Find high correlation itemsets**

In [8]:
# we choose min_support=0.001 (roughly 10/9531 transactions)
# NOTE(review): apyori.apriori appears to read only min_support, min_confidence, min_lift
# and max_length; `min_length` is probably ignored - confirm against the installed version.
association_rule = apyori.apriori(apyori_transactions, min_length=2, min_support=0.001, min_lift=1.5, min_confidence=0.2)

# apriori() is consumed once here and kept as a list so it can be reused below
association_result = list(association_rule)
print("Number of rules: ", len(association_result))

Number of rules:  59


In [9]:
# Raw apyori output: a list of RelationRecord tuples (items, support, ordered_statistics)
association_result

[RelationRecord(items=frozenset({'Art Tray', 'Tea'}), support=0.001154128632882174, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Art Tray'}), items_add=frozenset({'Tea'}), confidence=0.28947368421052627, lift=2.0436842105263153)]),
 RelationRecord(items=frozenset({'Bakewell', 'Tea'}), support=0.0012590494176896443, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Bakewell'}), items_add=frozenset({'Tea'}), confidence=0.25, lift=1.765)]),
 RelationRecord(items=frozenset({'Bread', 'Eggs'}), support=0.0014688909873045851, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Eggs'}), items_add=frozenset({'Bread'}), confidence=0.5, lift=1.5387471746851793)]),
 RelationRecord(items=frozenset({'Jammie Dodgers', 'Cake'}), support=0.0030427027594166402, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Jammie Dodgers'}), items_add=frozenset({'Cake'}), confidence=0.23199999999999998, lift=2.249432349949135)]),
 RelationRecord(items=frozenset({'Cake', 'Te

association_result is deeply nested and very hard to read, so we convert it into a dataframe.

In [10]:
# function to convert apriori result records into a dataframe
def to_dataframe(apriori_obj, n_transactions=9531):
    """Flatten apyori relation records into a DataFrame with one row per rule.

    Parameters
    ----------
    apriori_obj : iterable
        Records of the form (items, support, ordered_statistics), where
        ordered_statistics is a sequence of
        (items_base, items_add, confidence, lift) tuples.
    n_transactions : int, default 9531
        Total number of transactions, used to turn the relative support into
        an absolute ``support_count``.

    Returns
    -------
    pandas.DataFrame
        Columns: items, support, support_count, items_base, items_add,
        confidence, lift; sorted by confidence in descending order with a
        fresh 0..n-1 index. An empty input gives an empty frame with the
        same columns.
    """
    columns = ['items', 'support', 'support_count',
               'items_base', 'items_add', 'confidence', 'lift']

    rows = []
    for items, support, ordered_statistics in apriori_obj:
        # A record can carry several rules (e.g. A -> B and B -> A). Emit all of
        # them; reading only ordered_statistics[0] would silently drop the rest.
        for items_base, items_add, confidence, lift in ordered_statistics:
            rows.append((items, support, support * n_transactions,
                         items_base, items_add, confidence, lift))

    high_corr_df = pd.DataFrame(rows, columns=columns)
    high_corr_df = high_corr_df.sort_values(by='confidence', ascending=False).reset_index(drop=True)

    return high_corr_df

In [11]:
# Tabulate the mined rules; to_dataframe sorts them by confidence, highest first
high_corr_df = to_dataframe(association_result)
high_corr_df

Unnamed: 0,items,support,support_count,items_base,items_add,confidence,lift
0,"(Toast, Coffee, Pastry)",0.001364,13.0,"(Toast, Pastry)",(Coffee),0.866667,1.824249
1,"(Vegan mincepie, Cake, Coffee)",0.001049,10.0,"(Vegan mincepie, Cake)",(Coffee),0.833333,1.754086
2,"(Extra Salami or Feta, Coffee)",0.003253,31.0,(Extra Salami or Feta),(Coffee),0.815789,1.717158
3,"(Scone, Coffee, Cookies)",0.001574,15.0,"(Scone, Cookies)",(Coffee),0.789474,1.661765
4,"(Juice, Coffee, Pastry)",0.001784,17.0,"(Juice, Pastry)",(Coffee),0.772727,1.626516
5,"(Cake, Salad, Coffee)",0.001049,10.0,"(Cake, Salad)",(Coffee),0.769231,1.619156
6,"(Spanish Brunch, Juice, Coffee)",0.001993,19.0,"(Spanish Brunch, Juice)",(Coffee),0.730769,1.538198
7,"(Toast, Cake, Coffee)",0.001574,15.0,"(Toast, Cake)",(Coffee),0.714286,1.503502
8,"(Bread, Eggs)",0.001469,14.0,(Eggs),(Bread),0.5,1.538747
9,"(Coke, Sandwich, Juice)",0.001049,10.0,"(Coke, Juice)",(Sandwich),0.47619,6.67437


The dataframe shows the itemsets that go together very often, ordered by confidence.

**Given any item, suggest other items that should be bought together with it**

Generating association rules with very small min_confidence.

In [73]:
# Same mining as before but with much looser thresholds (min_lift=1.01, min_confidence=0.05)
# so that many more rules are available for recommendations.
# NOTE(review): `min_length` is probably not an argument apyori.apriori reads - confirm.
full_association_rule = apyori.apriori(apyori_transactions, min_length=2, min_support=0.001, min_lift=1.01, min_confidence=0.05)
full_association_result = list(full_association_rule)

Save all rules in csv file.

In [75]:
all_rules_df = to_dataframe(full_association_result)

# Turn the three itemset columns into plain sets before writing them to disk.
for column in ('items', 'items_base', 'items_add'):
    all_rules_df[column] = [set(value) for value in all_rules_df[column]]

all_rules_df.to_csv('rules.csv', index=False)

In [76]:
# build a function for recommendations
def recommended_items(item, rules_path='rules.csv'):
    """Return the saved association rules whose itemset contains ``item``.

    Parameters
    ----------
    item : str
        Exact item name to look for.
    rules_path : str, default 'rules.csv'
        CSV file with the rules. Its ``items`` column must hold the text form
        of a Python set/list/tuple literal, e.g. "{'Bread', 'Eggs'}".

    Returns
    -------
    pandas.DataFrame
        The matching rows of the CSV, in file order, with a fresh 0..n-1 index.
        Columns are returned exactly as read from the file.
    """
    import ast  # local import: only needed to parse the stored set literals

    _df = pd.read_csv(rules_path)

    # After the CSV round trip every itemset is a string. Parse it back before
    # testing membership: `item in text` is a substring test, so 'Bread' would
    # also match a rule about an item named 'Bread Pudding'.
    matches = [item in ast.literal_eval(items) for items in _df['items']]

    filtered = pd.Series(matches, index=_df.index, dtype=bool)
    _df = _df[filtered].reset_index(drop=True)
    return _df

If we want to recommend items that go with a specific item, we just need to pass the name of that item to the recommended_items function. The function returns a dataframe of the association rules that include our target item. These rules are sorted by confidence.

In [79]:
# rules.csv is written sorted by confidence (highest first), so head() shows the strongest rules
recommended_items('Bread').head() # just select the top (high confidence)

Unnamed: 0,items,support,support_count,items_base,items_add,confidence,lift
0,"{'Bread', 'Eggs'}",0.001469,14.0,{'Eggs'},{'Bread'},0.5,1.538747
1,"{'Jammie Dodgers', 'Bread'}",0.004617,44.0,{'Jammie Dodgers'},{'Bread'},0.352,1.083278
2,"{'Bread', 'Focaccia'}",0.001993,19.0,{'Focaccia'},{'Bread'},0.351852,1.082822
3,"{'Bread', 'Jam'}",0.005036,48.0,{'Jam'},{'Bread'},0.338028,1.04028
4,"{'Soup', 'Bread', 'Tea'}",0.001574,15.0,"{'Soup', 'Bread'}",{'Tea'},0.241935,1.708065


For example, we can recommend eggs, Jammie Dodgers, Focaccia, Jam, soup, tea to go with bread.