In [1]:
import pandas as pd
import numpy as np

import itertools
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

import warnings
warnings.filterwarnings("ignore")

In [2]:
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

# Sample Submission

In [5]:
order_list = list(set(test_data["Order"]))
len(order_list)

638

In [6]:
df_list = []
for order in order_list:
    top5 = test_data[test_data["Order"] == order].head()
    df_list.append(top5)
sample_submission = pd.concat(df_list, ignore_index = True)

In [7]:
sample_submission.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,7817216,15669994,SWNHNRN,03/09/12,Other Dals
1,7817216,15668465,SWNHNRN,03/09/12,Root Vegetables
2,7817216,15668687,SWNHNRN,03/09/12,Root Vegetables
3,7817216,15668468,SWNHNRN,03/09/12,Beans
4,7817216,15668520,SWNHNRN,03/09/12,Bread


In [8]:
sample_submission = sample_submission[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
sample_submission["ID"] = sample_submission["ID"].apply(lambda x: x+1)
sample_submission.head()

Unnamed: 0,ID,Order,SKU,Member
0,1,7817216,15669994,SWNHNRN
1,2,7817216,15668465,SWNHNRN
2,3,7817216,15668687,SWNHNRN
3,4,7817216,15668468,SWNHNRN
4,5,7817216,15668520,SWNHNRN


In [9]:
# sample_submission.to_csv("sample_submission.csv")

# Baseline Recommendation

Based on the frequency of the items in the past orders 

In [10]:
sku_freq_df = df[["Member", "SKU"]].groupby(["Member", "SKU"]) \
                .size() \
                .reset_index(name='frequency') \
                .sort_values(by = ["Member", "frequency"], ascending = [True, False])

sku_freq_df.head()

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,7
34,SSCEHNS,15669865,7
20,SSCEHNS,15669772,6
40,SSCEHNS,15669970,6


In [11]:
test_data.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,7409204,15669778,SWLCNOE,05/09/13,Other Dals
1,8076206,15669977,SWOEZES,01/04/14,Almonds
2,7560723,7593949,SSWWRHW,30/06/13,Cream Biscuits
3,8362837,15669764,SWLSCOZ,06/11/13,Besan
4,8202458,15670196,SSRCRSO,03/02/14,Organic F&V


In [12]:
order_member_sku_df = test_data[["Order", "Member", "SKU"]]
order_member_sku_df.head()

Unnamed: 0,Order,Member,SKU
0,7409204,SWLCNOE,15669778
1,8076206,SWOEZES,15669977
2,7560723,SSWWRHW,7593949
3,8362837,SWLSCOZ,15669764
4,8202458,SSRCRSO,15670196


In [13]:
sku_freq_with_curr_order = pd.merge(sku_freq_df, order_member_sku_df, how = "left", on = ["Member", "SKU"])
sku_freq_with_curr_order.head()

Unnamed: 0,Member,SKU,frequency,Order
0,SSCEHNS,7580823,7,
1,SSCEHNS,15668377,7,
2,SSCEHNS,15669865,7,8069966.0
3,SSCEHNS,15669772,6,
4,SSCEHNS,15669970,6,8069966.0


In [14]:
SSCEHNS_past = sku_freq_df[sku_freq_df["Member"] == "SWOEZES"]

In [15]:
SSCEHNS_order = order_member_sku_df[(order_member_sku_df["Order"] == 8076206)]

In [16]:
recc_sku = list()
for past_sku in list(SSCEHNS_past["SKU"]):
    if past_sku in list(SSCEHNS_order["SKU"]):
        pass
    else:
        recc_sku.append(past_sku)

In [17]:
recc_sku[:5]

[15669869, 15669878, 15669866, 7587667, 7642810]

In [18]:
sku_freq_with_curr_order[(sku_freq_with_curr_order["Order"].isna()) & (sku_freq_with_curr_order["Member"] == "SWLCNOE")].head()

Unnamed: 0,Member,SKU,frequency,Order
10642,SWLCNOE,15668465,4,
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,


In [19]:
sku_freq_with_curr_order[sku_freq_with_curr_order["Member"] == "SWLCNOE"]

Unnamed: 0,Member,SKU,frequency,Order
10640,SWLCNOE,15668478,8,7409204.0
10641,SWLCNOE,15668379,6,7409204.0
10642,SWLCNOE,15668465,4,
10643,SWLCNOE,15668457,3,7409204.0
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,
10648,SWLCNOE,15668594,3,
10649,SWLCNOE,15668688,3,7409204.0


In [20]:
recc_df_list = list()

for _, row in test_data[["Order", "Member"]].drop_duplicates().iterrows():
    df_temp = sku_freq_with_curr_order[sku_freq_with_curr_order["Member"] == row["Member"]].fillna("1000000000").sort_values(["Order", "frequency"], ascending = [False, False]).head()
    df_temp["Order"] = row["Order"]
    recc_df_list.append(df_temp.drop("frequency", axis = 1))

recc_df = pd.concat(recc_df_list, ignore_index = True)

In [21]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df.head()

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15668459,SWLCNOE
2,3,7409204,15668460,SWLCNOE
3,4,7409204,15668467,SWLCNOE
4,5,7409204,15668494,SWLCNOE


In [22]:
# recc_df.to_csv("past_frequency_based_recc.csv", index = False)

# Train Test Split

The data is split into last orders (for members who have more than 5 orders) and all but last order.
Previous orders are used to train the recommendation system and the last orders are used for testing.

## Last Order per Member

In [23]:
df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [24]:
member_order_count = df[["Member", "Order"]].drop_duplicates()["Member"].value_counts().reset_index()
members_with_more_than_5_orders = member_order_count[member_order_count["count"] > 5]["Member"]
len(members_with_more_than_5_orders)

166

In [25]:
last_orders = df[df["Member"].isin(members_with_more_than_5_orders)][["Member", "Order", "Delivery Date"]] \
                .drop_duplicates() \
                .sort_values(["Order", "Delivery Date"], ascending = [True, False]) \
                .drop_duplicates(subset = "Member") \
                .reset_index(drop = True)["Order"]

In [26]:
len(last_orders)

166

In [27]:
df_train = df[~df["Order"].isin(last_orders)]
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [28]:
df_test = df[df["Order"].isin(last_orders)]
df_test.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
94,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
95,8101324,15668684,SSCEHNS,2014-03-15,Beans
96,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
97,8101324,15668521,SSCEHNS,2014-03-15,Bread
98,8101324,15669865,SSCEHNS,2014-03-15,Other Dals


In [29]:
# df_train.to_csv("df_train.csv", index = False)
# df_test.to_csv("df_test.csv", index = False)

## Simulating Forgotten Items

In [30]:
basket_list = list()
forgotten_list = list()

for order in df_test["Order"].unique():
    y_dummy = ['']*df_test[df_test["Order"] == order].shape[0]
    basket, forgotten, _, _ = train_test_split(df_test[df_test["Order"] == order], y_dummy, test_size = 5, random_state = 101)
    basket_list.append(basket)
    forgotten_list.append(forgotten)

test_basket_df = pd.concat(basket_list, ignore_index = True)
forgotten_items_df = pd.concat(forgotten_list, ignore_index = True)

In [31]:
test_basket_df["Order"].value_counts()

Order
7362753    26
7392553    20
7566535    19
7391877    16
7428900    16
           ..
7453262     3
7460103     3
7513528     3
7352666     3
8101324     3
Name: count, Length: 166, dtype: int64

In [32]:
forgotten_items_df["Order"].value_counts()

Order
8101324    5
7734044    5
7585543    5
7644998    5
7770723    5
          ..
7370908    5
7391877    5
7360479    5
7370657    5
7737427    5
Name: count, Length: 166, dtype: int64

In [33]:
# test_basket_df.to_csv("test_basket_df.csv", index = False)
# forgotten_items_df.to_csv("forgotten_items_df.csv", index = False)

# Recommendation Engine 1: item based collaborative filtering

## SKU Co-occurrence Matrix

In [34]:
df_train = pd.read_csv("df_train.csv")
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [35]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [36]:
pairs = list(itertools.combinations_with_replacement(sku_list, 2))
print(len(pairs))

200028


In [37]:
%%time

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df_train[df_train["SKU"] == s1]["Order"]) \
               .intersection(set(df_train[df_train["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})
pairwise_cooccurrence.head()

CPU times: user 26.9 s, sys: 35.7 ms, total: 27 s
Wall time: 27 s


Unnamed: 0,SKU1,SKU2,cooccurrence_frequency
0,15668375,15668375,108
1,15668375,15668467,33
2,15668375,15669863,13
3,15668375,15669778,11
4,15668375,15669767,4


In [38]:
%%time

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

co_matrix.head()

CPU times: user 7.41 s, sys: 8.92 ms, total: 7.42 s
Wall time: 7.42 s


Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,108.0,33.0,13.0,11.0,4.0,4.0,11.0,3.0,27.0,12.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15668467,33.0,322.0,31.0,25.0,19.0,18.0,27.0,4.0,62.0,21.0,...,1.0,4.0,1.0,8.0,0.0,0.0,5.0,0.0,1.0,1.0
15669863,13.0,31.0,299.0,45.0,43.0,42.0,47.0,4.0,24.0,43.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15669778,11.0,25.0,45.0,243.0,19.0,23.0,69.0,4.0,21.0,40.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
15669767,4.0,19.0,43.0,19.0,184.0,38.0,29.0,4.0,7.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Similarity Matrix

In [39]:
%%time

sim_matrix = pd.DataFrame(cosine_similarity(co_matrix), index=sku_list, columns=sku_list)
sim_matrix.head()

CPU times: user 2.88 ms, sys: 1.12 ms, total: 4 ms
Wall time: 3.12 ms


Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,1.0,0.613503,0.409685,0.41358,0.310546,0.312635,0.414734,0.327831,0.548322,0.397129,...,0.208793,0.371777,0.342878,0.247329,0.175568,0.106815,0.259903,0.179621,0.269245,0.287959
15668467,0.613503,1.0,0.448954,0.448271,0.363029,0.36438,0.45922,0.348781,0.609245,0.414204,...,0.347401,0.591794,0.320763,0.464254,0.190179,0.114182,0.42758,0.199862,0.406556,0.389189
15669863,0.409685,0.448954,1.0,0.613309,0.632344,0.593434,0.623119,0.458031,0.410087,0.606846,...,0.325648,0.299858,0.408643,0.235254,0.120424,0.10586,0.234415,0.258891,0.269451,0.185499
15669778,0.41358,0.448271,0.613309,1.0,0.512776,0.523431,0.721877,0.443519,0.417328,0.605837,...,0.338942,0.307762,0.322747,0.241307,0.126258,0.1108,0.296229,0.365097,0.24348,0.20026
15669767,0.310546,0.363029,0.632344,0.512776,1.0,0.631637,0.571371,0.486398,0.301178,0.587039,...,0.272512,0.213498,0.29345,0.163419,0.099506,0.107323,0.176269,0.226608,0.242507,0.137267


## Test on df_test

In [40]:
test_basket_df = pd.read_csv("test_basket_df.csv")

In [41]:
# recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member", "Similarity Score"])

# for order in test_basket_df["Order"].unique():
#     test_order = test_basket_df[test_basket_df["Order"] == order]
#     test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
#     for sku in test_order["SKU"]:
#         df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()  
#         df_.columns = ["SKU", "Similarity Score"]
#         df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True).iloc[1:]
#         test_recc = pd.concat([test_recc, df_]) 
#     test_recc["Order"] = order
#     test_recc["Member"] = test_order["Member"].unique()[0]
#     test_recc = test_recc[["Order", "SKU", "Member", "Similarity Score"]] \
#                     .sort_values("Similarity Score", ascending = False) \
#                     .iloc[:5]
#                     # .sort_values(["SKU", "Similarity Score"], ascending = [True, False]) \
#                     # .drop_duplicates(subset = "SKU") \
                    

#     recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [42]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
        df_.columns = ["SKU", "Similarity Score"]
        df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True)
        recc_count = 0
        for i in range(1, 632):
            if df_.iloc[i]["SKU"] in list(test_recc["SKU"]):
                pass
            else:
                test_recc = pd.concat([test_recc, df_.iloc[i:i+1]]) 
                recc_count += 1
            
            if recc_count >= 3:
                break    
    test_recc = test_recc.sort_values("Similarity Score", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [43]:
recc_df.head()

Unnamed: 0,Order,SKU,Member
0,8101324,15668460,SSCEHNS
1,8101324,15668381,SSCEHNS
2,8101324,15668467,SSCEHNS
3,8101324,21409124,SSCEHNS
4,8101324,34990774,SSCEHNS


## Evaluation using Recall@5 Metric

In [44]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [45]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [46]:
np.mean(recall_list)

0.10963855421686744

## Prepare Kaggle Submission    

In [47]:
%%time

sku_list = list(df["SKU"].unique())
pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df[df["SKU"] == s1]["Order"]) \
               .intersection(set(df[df["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

sim_matrix = pd.DataFrame(cosine_similarity(co_matrix), index=sku_list, columns=sku_list)

recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_data["Order"].unique():
    test_order = test_data[test_data["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        try:
            df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
            df_.columns = ["SKU", "Similarity Score"]
            df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True)
            recc_count = 0
            for i in range(1, 632):
                if df_.iloc[i]["SKU"] in list(test_recc["SKU"]):
                    pass
                else:
                    test_recc = pd.concat([test_recc, df_.iloc[i:i+1]]) 
                    recc_count += 1
                
                if recc_count >= 3:
                    break
        except:
            pass
    test_recc = test_recc.sort_values("Similarity Score", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

recc_df.head()

CPU times: user 45.1 s, sys: 53 ms, total: 45.1 s
Wall time: 45.1 s


Unnamed: 0,Order,SKU,Member
0,7409204,15668465,SWLCNOE
1,7409204,15669772,SWLCNOE
2,7409204,15668467,SWLCNOE
3,7409204,15668379,SWLCNOE
4,7409204,15668688,SWLCNOE


In [48]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df.head()

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15669772,SWLCNOE
2,3,7409204,15668467,SWLCNOE
3,4,7409204,15668379,SWLCNOE
4,5,7409204,15668688,SWLCNOE


In [49]:
# recc_df.to_csv("item_based_collab_filtering_recc.csv", index = False)

# Recommendation Engine 2: probability based

## Co-occurence Matrix for Calculating Probabilites

In [50]:
df_train = pd.read_csv("df_train.csv")

In [51]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [52]:
pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df_train[df_train["SKU"] == s1]["Order"]) \
               .intersection(set(df_train[df_train["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

co_matrix.head()

Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,108.0,33.0,13.0,11.0,4.0,4.0,11.0,3.0,27.0,12.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15668467,33.0,322.0,31.0,25.0,19.0,18.0,27.0,4.0,62.0,21.0,...,1.0,4.0,1.0,8.0,0.0,0.0,5.0,0.0,1.0,1.0
15669863,13.0,31.0,299.0,45.0,43.0,42.0,47.0,4.0,24.0,43.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15669778,11.0,25.0,45.0,243.0,19.0,23.0,69.0,4.0,21.0,40.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
15669767,4.0,19.0,43.0,19.0,184.0,38.0,29.0,4.0,7.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test on df_test

In [53]:
test_basket_df = pd.read_csv("test_basket_df.csv")
test_basket_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668684,SSCEHNS,2014-03-15,Beans
1,8101324,15668521,SSCEHNS,2014-03-15,Bread
2,8101324,15669811,SSCEHNS,2014-03-15,Almonds
3,8203855,15669873,SSCHNCE,2014-02-03,Whole Spices
4,8203855,15669767,SSCHNCE,2014-02-03,Urad Dal


In [54]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        cond_prob_list = list()
        for basket_item in test_order["SKU"]:
            p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
            cond_prob_list.append(p)
            # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
        prob_list.append(math.prod(cond_prob_list))
    
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [55]:
recc_df.head()

Unnamed: 0,Order,SKU,Member,Probability
0,8101324,15668381,SSCEHNS,0.010054
1,8101324,15668460,SSCEHNS,0.009763
2,8101324,15668379,SSCEHNS,0.009157
3,8101324,15669780,SSCEHNS,0.007631
4,8101324,15668688,SSCEHNS,0.005117


## Evaluation using Recall@5 Metric

In [56]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [57]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [58]:
np.mean(recall_list)

0.11927710843373492

## Prepare Kaggle Submission    

In [None]:
%%time

sku_list = list(df["SKU"].unique())

pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df[df["SKU"] == s1]["Order"]) \
               .intersection(set(df[df["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_data["Order"].unique():
    test_order = test_data[test_data["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        try:
            cond_prob_list = list()
            for basket_item in test_order["SKU"]:
                p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
                cond_prob_list.append(p)
                # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
            prob_list.append(math.prod(cond_prob_list))
        except:
            prob_list.append(0)
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)


In [None]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df.head()

In [None]:
# recc_df.to_csv("probability_based_recc.csv", index = False)

# Recommendation Engine 3: