In [1]:
# load relevant packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# load the customer purchasing data
# NOTE(review): file_path is an absolute, user-specific directory, so this
# cell only runs on the original author's machine — consider a path relative
# to the notebook (the basket CSV is already loaded that way further down).
file_path = '/Users/ericjiang/Desktop/msci719_assignment/ass5/'
file_name = 'data.xlsx'
# the directory string already ends with a separator, so plain concatenation
# yields the full location of the workbook
workbook_location = file_path + file_name
df = pd.read_excel(workbook_location)

In [3]:
# preview the first rows of the raw purchasing data
df.head()

Unnamed: 0,Member,Order,SKU,Created On,Description
0,M09736,6468572,34993740,22-09-2014 22:45,Other Sauces
1,M09736,6468572,15669800,22-09-2014 22:45,Cashews
2,M09736,6468572,34989501,22-09-2014 22:45,Other Dals
3,M09736,6468572,7572303,22-09-2014 22:45,Namkeen
4,M09736,6468572,15669856,22-09-2014 22:45,Sugar


In [4]:
# count the number of items each customer purchased
# Named aggregation (output_column=function) is used instead of passing a
# renaming dict to .agg(): the dict form on a grouped Series is deprecated and
# is rejected by later pandas releases.
df_count = (
    df.groupby('Member')['Member']
      .agg(item_count='count')
      .reset_index()
      # heaviest buyers first, so the top customers are the leading rows
      .sort_values(by='item_count', ascending=False)
)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


In [5]:
# extract the top 20 customers
# take the first 20 rows of the count table and renumber them 0..n-1,
# discarding the previous row labels instead of keeping them as a column
df_top_20 = df_count.head(20).reset_index(drop=True)

In [6]:
# append the rank to the top 20 customers
# Ranks run 1..n in row order. The length is taken from the frame itself so
# the assignment cannot fail with a length mismatch when fewer than 20
# customers are present.
rank = list(range(1, len(df_top_20) + 1))
df_top_20['rank'] = rank

In [7]:
# display the top-customer table with its item counts and ranks
df_top_20

Unnamed: 0,Member,item_count,rank
0,M38622,1438,1
1,M33064,1318,2
2,M41747,1131,3
3,M32409,1106,4
4,M31966,1102,5
5,M56368,1021,6
6,M36432,1001,7
7,M41781,920,8
8,M35538,912,9
9,M33491,874,10


In [8]:
# merge the two datasets, and clean it for further analysis
# One chain: keep only purchases made by the ranked customers, narrow to the
# columns needed for basket analysis, collapse repeated
# (Member, rank, Order, Description) rows to a single row, then order by rank
# and order id with a fresh 0..n-1 index.
df_combine = (
    df.merge(df_top_20, how='inner', on='Member')
      .loc[:, ['Member', 'rank', 'Order', 'Description']]
      .drop_duplicates(keep='first')
      .sort_values(by=['rank', 'Order'])
      .reset_index(drop=True)
)

In [9]:
# display the de-duplicated purchases of the ranked customers
df_combine

Unnamed: 0,Member,rank,Order,Description
0,M38622,1,6431665,Organic Flours
1,M38622,1,6431665,Exotic Vegetables
2,M38622,1,6431665,"Glucose, Marie & Milk Biscuits"
3,M38622,1,6431665,Organic Masalas & Spices
4,M38622,1,6431665,Ghee
...,...,...,...,...
14238,M78720,20,8381719,Root Vegetables
14239,M78720,20,8381719,Other Vegetables
14240,M78720,20,8381719,Chips
14241,M78720,20,8381719,Beans


In [10]:
# load the current basket for the top 20 customers
basket_raw = pd.read_csv('Current_Basket.csv')
# the position column is not an item, so it is removed and only the
# remaining columns are kept
current_basket = basket_raw.drop(labels=["Members' position in the list"], axis='columns')

In [11]:
# preview the first rows of the current-basket table
current_basket.head()

Unnamed: 0,Item1,Item2,Item3,Item4,Item5
0,Raw Rice,Organic F&V,,,
1,Beans,Root Vegetables,Namkeen,Other Vegetables,
2,Organic Rice & Rice Products,Avalakki / Poha,Organic F&V,,
3,Namkeen,Moong Dal,Cream Biscuits,Whole Spices,
4,Beans,Ground Coffee,Sooji & Rava,,


In [12]:
# define a function for further use
def encode_units(x):
    """Binarize an item count into a presence flag.

    Parameters
    ----------
    x : int or float
        Number of times an item appears in one order.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.

    Notes
    -----
    Every input maps to 0 or 1. Values strictly between 0 and 1, and NaN,
    yield 0 rather than falling through to an implicit ``None``, so an
    encoded table never contains missing flags.
    """
    return 1 if x >= 1 else 0
    

# the main part for iteration
# For each ranked customer: build an order-by-item presence table from the
# purchase history, mine frequent itemsets, and for every item in the
# customer's current basket print the 5 highest-support rules whose
# antecedent is exactly that single item.
# NOTE(review): row i of current_basket is assumed to belong to the customer
# ranked i+1 (the position column was dropped on load) — confirm the ordering.
for i in range(20):
    customer_no = i + 1
    # extract the purchasing data for a specific customer
    df_sub = df_combine[df_combine['rank'] == customer_no].drop(['Member', 'rank'], axis=1)
    # reshape the dataset: one row per order, one column per item description;
    # fill_value=0 marks items that do not occur in an order
    basket = df_sub.groupby(['Order', 'Description'])['Description'].count().unstack(fill_value=0)
    # presence flags computed in one vectorised comparison (element-wise
    # applymap is deprecated in newer pandas); apriori accepts True/False input
    basket_sets = basket >= 1
    # apply association rule mining functions
    frequent_itemsets = apriori(basket_sets, min_support=0.1, use_colnames=True)
    if frequent_itemsets.empty:
        # association_rules raises ValueError on an empty itemset table, which
        # would abort the whole loop; use an empty rule table so this customer
        # simply reports no matches
        rules = pd.DataFrame(columns=['antecedents', 'consequents', 'support'])
    else:
        rules = association_rules(frequent_itemsets, support_only=True, min_threshold=0.1)
    # read the current basket of a specific customer and drop null values
    # (pd.notna tests for missing cells directly instead of comparing text to 'nan')
    antecedent = [x for x in current_basket.iloc[i] if pd.notna(x)]
    print('The result for customer' + ' ' + str(customer_no))
    # sub iteration part: for each item in the current basket, find the top 5 consequents regarding support value
    for item in antecedent:
        # keep only rules whose antecedent set is exactly {item}
        result = rules[rules['antecedents'] == {item}].sort_values(['support'], ascending=False).head(5)
        print(result[['antecedents', 'consequents', 'support']])
        print('******')
    print('The end for customer' + ' ' + str(customer_no))
    print()
    print()
    print()

The result for customer 1
Empty DataFrame
Columns: [antecedents, consequents, support]
Index: []
******
       antecedents                               consequents   support
36   (Organic F&V)                   (Organic Dals & Pulses)  0.338129
63   (Organic F&V)                         (Root Vegetables)  0.309353
57   (Organic F&V)                (Organic Masalas & Spices)  0.230216
172  (Organic F&V)  (Root Vegetables, Organic Dals & Pulses)  0.223022
17   (Organic F&V)                       (Exotic Vegetables)  0.215827
******
The end for customer 1



The result for customer 2
    antecedents                          consequents   support
11      (Beans)                    (Root Vegetables)  0.387640
5       (Beans)                   (Gourd & Cucumber)  0.337079
109     (Beans)  (Gourd & Cucumber, Root Vegetables)  0.320225
9       (Beans)                   (Other Vegetables)  0.292135
120     (Beans)  (Root Vegetables, Other Vegetables)  0.275281
******
           antecedents    

The result for customer 10
Empty DataFrame
Columns: [antecedents, consequents, support]
Index: []
******
            antecedents         consequents   support
36  (Exotic Vegetables)          (Brinjals)  0.275641
7   (Exotic Vegetables)            (Banana)  0.243590
47  (Exotic Vegetables)  (Gourd & Cucumber)  0.237179
50  (Exotic Vegetables)  (Other Vegetables)  0.217949
52  (Exotic Vegetables)   (Root Vegetables)  0.217949
******
The end for customer 10



The result for customer 11
Empty DataFrame
Columns: [antecedents, consequents, support]
Index: []
******
       antecedents      consequents   support
19  (Other Juices)  (Ground Coffee)  0.104839
******
Empty DataFrame
Columns: [antecedents, consequents, support]
Index: []
******
The end for customer 11



The result for customer 12
    antecedents               consequents   support
36     (Banana)         (Root Vegetables)  0.555556
12     (Banana)                   (Beans)  0.388889
197    (Banana)  (Root Vegetables, Beans)  0.