In [30]:
import pandas as pd
import numpy as np

import itertools
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

# Sample Submission

In [3]:
order_list = list(set(test_data["Order"]))
len(order_list)

638

In [4]:
df_list = []
for order in order_list:
    top5 = test_data[test_data["Order"] == order].head()
    df_list.append(top5)
sample_submission = pd.concat(df_list, ignore_index = True)

In [5]:
sample_submission

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,7817216,15669994,SWNHNRN,03/09/12,Other Dals
1,7817216,15668465,SWNHNRN,03/09/12,Root Vegetables
2,7817216,15668687,SWNHNRN,03/09/12,Root Vegetables
3,7817216,15668468,SWNHNRN,03/09/12,Beans
4,7817216,15668520,SWNHNRN,03/09/12,Bread
...,...,...,...,...,...
3185,7559166,15668685,SWOOOLL,29/06/13,Beans
3186,7559166,15668469,SWOOOLL,29/06/13,Beans
3187,7559166,15668465,SWOOOLL,29/06/13,Root Vegetables
3188,7559166,15668462,SWOOOLL,29/06/13,Gourd & Cucumber


In [6]:
sample_submission = sample_submission[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
sample_submission["ID"] = sample_submission["ID"].apply(lambda x: x+1)
sample_submission

Unnamed: 0,ID,Order,SKU,Member
0,1,7817216,15669994,SWNHNRN
1,2,7817216,15668465,SWNHNRN
2,3,7817216,15668687,SWNHNRN
3,4,7817216,15668468,SWNHNRN
4,5,7817216,15668520,SWNHNRN
...,...,...,...,...
3185,3186,7559166,15668685,SWOOOLL
3186,3187,7559166,15668469,SWOOOLL
3187,3188,7559166,15668465,SWOOOLL
3188,3189,7559166,15668462,SWOOOLL


In [7]:
sample_submission.to_csv("sample_submission.csv")

# Baseline Recommendation

Based on the frequency of the items in the past orders 

In [9]:
sku_freq_df = df[["Member", "SKU"]].groupby(["Member", "SKU"]) \
                .size() \
                .reset_index(name='frequency') \
                .sort_values(by = ["Member", "frequency"], ascending = [True, False])

sku_freq_df

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,7
34,SSCEHNS,15669865,7
20,SSCEHNS,15669772,6
40,SSCEHNS,15669970,6
...,...,...,...
18387,SWRNHCS,15669874,1
18388,SWRNHCS,15669886,1
18389,SWRNHCS,15670196,1
18390,SWRNHCS,15670260,1


In [10]:
test_data

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,7409204,15669778,SWLCNOE,05/09/13,Other Dals
1,8076206,15669977,SWOEZES,01/04/14,Almonds
2,7560723,7593949,SSWWRHW,30/06/13,Cream Biscuits
3,8362837,15669764,SWLSCOZ,06/11/13,Besan
4,8202458,15670196,SSRCRSO,03/02/14,Organic F&V
...,...,...,...,...,...
5482,8269882,15668469,SWNHZNW,05/01/14,Beans
5483,8384422,15669875,SSWNRHC,18/11/13,Toor Dal
5484,7493590,15668465,SWRELHW,07/08/13,Root Vegetables
5485,8080319,15670267,SSNSECH,03/04/14,Toor Dal


In [11]:
order_member_sku_df = test_data[["Order", "Member", "SKU"]]
order_member_sku_df

Unnamed: 0,Order,Member,SKU
0,7409204,SWLCNOE,15669778
1,8076206,SWOEZES,15669977
2,7560723,SSWWRHW,7593949
3,8362837,SWLSCOZ,15669764
4,8202458,SSRCRSO,15670196
...,...,...,...
5482,8269882,SWNHZNW,15668469
5483,8384422,SSWNRHC,15669875
5484,7493590,SWRELHW,15668465
5485,8080319,SSNSECH,15670267


In [12]:
sku_freq_with_curr_order = pd.merge(sku_freq_df, order_member_sku_df, how = "left", on = ["Member", "SKU"])
sku_freq_with_curr_order

Unnamed: 0,Member,SKU,frequency,Order
0,SSCEHNS,7580823,7,
1,SSCEHNS,15668377,7,
2,SSCEHNS,15669865,7,8069966.0
3,SSCEHNS,15669772,6,
4,SSCEHNS,15669970,6,8069966.0
...,...,...,...,...
18387,SWRNHCS,15669874,1,
18388,SWRNHCS,15669886,1,
18389,SWRNHCS,15670196,1,
18390,SWRNHCS,15670260,1,


In [13]:
SSCEHNS_past = sku_freq_df[sku_freq_df["Member"] == "SWOEZES"]

In [14]:
SSCEHNS_order = order_member_sku_df[(order_member_sku_df["Order"] == 8076206)]

In [15]:
recc_sku = list()
for past_sku in list(SSCEHNS_past["SKU"]):
    if past_sku in list(SSCEHNS_order["SKU"]):
        pass
    else:
        recc_sku.append(past_sku)

In [16]:
recc_sku[:5]

[15669869, 15669878, 15669866, 7587667, 7642810]

In [17]:
sku_freq_with_curr_order[(sku_freq_with_curr_order["Order"].isna()) & (sku_freq_with_curr_order["Member"] == "SWLCNOE")].head()

Unnamed: 0,Member,SKU,frequency,Order
10642,SWLCNOE,15668465,4,
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,


In [18]:
sku_freq_with_curr_order[sku_freq_with_curr_order["Member"] == "SWLCNOE"]

Unnamed: 0,Member,SKU,frequency,Order
10640,SWLCNOE,15668478,8,7409204.0
10641,SWLCNOE,15668379,6,7409204.0
10642,SWLCNOE,15668465,4,
10643,SWLCNOE,15668457,3,7409204.0
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,
10648,SWLCNOE,15668594,3,
10649,SWLCNOE,15668688,3,7409204.0


In [19]:
recc_df_list = list()

for _, row in test_data[["Order", "Member"]].drop_duplicates().iterrows():
    df_temp = sku_freq_with_curr_order[sku_freq_with_curr_order["Member"] == row["Member"]].fillna("1000000000").sort_values(["Order", "frequency"], ascending = [False, False]).head()
    df_temp["Order"] = row["Order"]
    recc_df_list.append(df_temp.drop("frequency", axis = 1))

recc_df = pd.concat(recc_df_list, ignore_index = True)

In [20]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15668459,SWLCNOE
2,3,7409204,15668460,SWLCNOE
3,4,7409204,15668467,SWLCNOE
4,5,7409204,15668494,SWLCNOE
...,...,...,...,...
3185,3186,7682167,15668542,SWOSHSO
3186,3187,7682167,15669789,SWOSHSO
3187,3188,7682167,7586073,SWOSHSO
3188,3189,7682167,15669777,SWOSHSO


In [21]:
recc_df.to_csv("past_frequency_based_recc.csv", index = False)

# Train Test Split

The data is split into last orders (for members who have more than 5 orders) and all but last order.
Previous orders are used to train the recommendation system and the last orders are used for testing.

## Last Order per Member

In [5]:
df

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal
...,...,...,...,...,...
28979,7466404,15669886,SWRNHCS,2013-09-01,Sooji & Rava
28980,7466404,15669874,SWRNHCS,2013-09-01,Avalakki / Poha
28981,7466404,15670260,SWRNHCS,2013-09-01,Organic F&V
28982,7466404,15670196,SWRNHCS,2013-09-01,Organic F&V


In [6]:
member_order_count = df[["Member", "Order"]].drop_duplicates()["Member"].value_counts().reset_index()
members_with_more_than_5_orders = member_order_count[member_order_count["count"] > 5]["Member"]
len(members_with_more_than_5_orders)

166

In [7]:
last_orders = df[df["Member"].isin(members_with_more_than_5_orders)][["Member", "Order", "Delivery Date"]] \
                .drop_duplicates() \
                .sort_values(["Order", "Delivery Date"], ascending = [True, False]) \
                .drop_duplicates(subset = "Member") \
                .reset_index(drop = True)["Order"]

In [8]:
len(last_orders)

166

In [9]:
df_train = df[~df["Order"].isin(last_orders)]
df_train

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal
...,...,...,...,...,...
28979,7466404,15669886,SWRNHCS,2013-09-01,Sooji & Rava
28980,7466404,15669874,SWRNHCS,2013-09-01,Avalakki / Poha
28981,7466404,15670260,SWRNHCS,2013-09-01,Organic F&V
28982,7466404,15670196,SWRNHCS,2013-09-01,Organic F&V


In [10]:
df_test = df[df["Order"].isin(last_orders)]
df_test

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
94,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
95,8101324,15668684,SSCEHNS,2014-03-15,Beans
96,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
97,8101324,15668521,SSCEHNS,2014-03-15,Bread
98,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
...,...,...,...,...,...
28959,7737427,7573376,SWRNESE,2012-12-08,Mango Juices
28960,7737427,7590864,SWRNESE,2012-12-08,Yogurt & Lassi
28961,7737427,93289487,SWRNESE,2012-12-08,Phenyles & Acids
28962,7737427,7675055,SWRNESE,2012-12-08,Mosquito Repellent


In [40]:
# df_train.to_csv("df_train.csv", index = False)
# df_test.to_csv("df_test.csv", index = False)

## Simulating Forgotten Items

In [49]:
basket_list = list()
forgotten_list = list()

for order in df_test["Order"].unique():
    y_dummy = ['']*df_test[df_test["Order"] == order].shape[0]
    basket, forgotten, _, _ = train_test_split(df_test[df_test["Order"] == order], y_dummy, test_size = 5, random_state = 101)
    basket_list.append(basket)
    forgotten_list.append(forgotten)

test_basket_df = pd.concat(basket_list, ignore_index = True)
forgotten_items_df = pd.concat(forgotten_list, ignore_index = True)

In [60]:
test_basket_df["Order"].value_counts()

Order
7362753    26
7392553    20
7566535    19
7391877    16
7428900    16
           ..
7453262     3
7460103     3
7513528     3
7352666     3
8101324     3
Name: count, Length: 166, dtype: int64

In [52]:
forgotten_items_df["Order"].value_counts()

Order
8101324    5
7734044    5
7585543    5
7644998    5
7770723    5
          ..
7370908    5
7391877    5
7360479    5
7370657    5
7737427    5
Name: count, Length: 166, dtype: int64

In [54]:
# test_basket_df.to_csv("test_basket_df.csv", index = False)
# forgotten_items_df.to_csv("forgotten_items_df.csv", index = False)

# Recommendation Engine 1: item based collaborative filtering

## SKU Co-occurrence Matrix

In [60]:
df_train = pd.read_csv("df_train.csv")
df_train

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal
...,...,...,...,...,...
27026,7466404,15669886,SWRNHCS,2013-09-01,Sooji & Rava
27027,7466404,15669874,SWRNHCS,2013-09-01,Avalakki / Poha
27028,7466404,15670260,SWRNHCS,2013-09-01,Organic F&V
27029,7466404,15670196,SWRNHCS,2013-09-01,Organic F&V


In [76]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [77]:
pairs = list(itertools.combinations_with_replacement(sku_list, 2))
print(len(pairs))

200028


In [78]:
%%time

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df_train[df_train["SKU"] == s1]["Order"]) \
               .intersection(set(df_train[df_train["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})
pairwise_cooccurrence

CPU times: user 25.8 s, sys: 39.1 ms, total: 25.8 s
Wall time: 25.8 s


Unnamed: 0,SKU1,SKU2,cooccurrence_frequency
0,15668375,15668375,108
1,15668375,15668467,33
2,15668375,15669863,13
3,15668375,15669778,11
4,15668375,15669767,4
...,...,...,...
200023,92433757,92286348,0
200024,92433757,7610713,0
200025,92286348,92286348,2
200026,92286348,7610713,0


In [79]:
%%time

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

co_matrix

CPU times: user 7.29 s, sys: 18.6 ms, total: 7.31 s
Wall time: 7.38 s


Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,108.0,33.0,13.0,11.0,4.0,4.0,11.0,3.0,27.0,12.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15668467,33.0,322.0,31.0,25.0,19.0,18.0,27.0,4.0,62.0,21.0,...,1.0,4.0,1.0,8.0,0.0,0.0,5.0,0.0,1.0,1.0
15669863,13.0,31.0,299.0,45.0,43.0,42.0,47.0,4.0,24.0,43.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15669778,11.0,25.0,45.0,243.0,19.0,23.0,69.0,4.0,21.0,40.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
15669767,4.0,19.0,43.0,19.0,184.0,38.0,29.0,4.0,7.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15670172,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
7590864,0.0,5.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,2.0,...,0.0,0.0,0.0,11.0,0.0,0.0,14.0,0.0,0.0,0.0
92433757,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
92286348,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


## Similarity Matrix

In [80]:
%%time

sim_matrix = pd.DataFrame(cosine_similarity(co_matrix), index=sku_list, columns=sku_list)
sim_matrix

Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,1.000000,0.613503,0.409685,0.413580,0.310546,0.312635,0.414734,0.327831,0.548322,0.397129,...,0.208793,0.371777,0.342878,0.247329,0.175568,0.106815,0.259903,0.179621,0.269245,0.287959
15668467,0.613503,1.000000,0.448954,0.448271,0.363029,0.364380,0.459220,0.348781,0.609245,0.414204,...,0.347401,0.591794,0.320763,0.464254,0.190179,0.114182,0.427580,0.199862,0.406556,0.389189
15669863,0.409685,0.448954,1.000000,0.613309,0.632344,0.593434,0.623119,0.458031,0.410087,0.606846,...,0.325648,0.299858,0.408643,0.235254,0.120424,0.105860,0.234415,0.258891,0.269451,0.185499
15669778,0.413580,0.448271,0.613309,1.000000,0.512776,0.523431,0.721877,0.443519,0.417328,0.605837,...,0.338942,0.307762,0.322747,0.241307,0.126258,0.110800,0.296229,0.365097,0.243480,0.200260
15669767,0.310546,0.363029,0.632344,0.512776,1.000000,0.631637,0.571371,0.486398,0.301178,0.587039,...,0.272512,0.213498,0.293450,0.163419,0.099506,0.107323,0.176269,0.226608,0.242507,0.137267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15670172,0.106815,0.114182,0.105860,0.110800,0.107323,0.195462,0.138661,0.098822,0.172587,0.113124,...,0.142695,0.153513,0.086427,0.140733,0.138123,1.000000,0.122875,0.109381,0.075449,0.052105
7590864,0.259903,0.427580,0.234415,0.296229,0.176269,0.223270,0.313733,0.196050,0.338862,0.281306,...,0.125277,0.292786,0.188385,0.931773,0.163078,0.122875,1.000000,0.198681,0.219276,0.170360
92433757,0.179621,0.199862,0.258891,0.365097,0.226608,0.219379,0.352618,0.209597,0.293110,0.242570,...,0.206374,0.240851,0.180797,0.170098,0.192626,0.109381,0.198681,1.000000,0.093530,0.080740
92286348,0.269245,0.406556,0.269451,0.243480,0.242507,0.247526,0.241478,0.241623,0.409739,0.245746,...,0.244036,0.270740,0.110855,0.204578,0.177162,0.075449,0.219276,0.093530,1.000000,0.111386


## Test on df_test

In [81]:
test_basket_df = pd.read_csv("test_basket_df.csv")

In [93]:
# recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member", "Similarity Score"])

# for order in test_basket_df["Order"].unique():
#     test_order = test_basket_df[test_basket_df["Order"] == order]
#     test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
#     for sku in test_order["SKU"]:
#         df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()  
#         df_.columns = ["SKU", "Similarity Score"]
#         df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True).iloc[1:]
#         test_recc = pd.concat([test_recc, df_]) 
#     test_recc["Order"] = order
#     test_recc["Member"] = test_order["Member"].unique()[0]
#     test_recc = test_recc[["Order", "SKU", "Member", "Similarity Score"]] \
#                     .sort_values("Similarity Score", ascending = False) \
#                     .iloc[:5]
#                     # .sort_values(["SKU", "Similarity Score"], ascending = [True, False]) \
#                     # .drop_duplicates(subset = "SKU") \
                    

#     recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [94]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
        df_.columns = ["SKU", "Similarity Score"]
        df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True)
        recc_count = 0
        for i in range(1, 632):
            if df_.iloc[i]["SKU"] in list(test_recc["SKU"]):
                pass
            else:
                test_recc = pd.concat([test_recc, df_.iloc[i:i+1]]) 
                recc_count += 1
            
            if recc_count >= 3:
                break    
    test_recc = test_recc.sort_values("Similarity Score", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [97]:
recc_df

Unnamed: 0,Order,SKU,Member
0,8101324,15668460,SSCEHNS
1,8101324,15668381,SSCEHNS
2,8101324,15668467,SSCEHNS
3,8101324,21409124,SSCEHNS
4,8101324,34990774,SSCEHNS
...,...,...,...
825,7737427,7590866,SWRNESE
826,7737427,7590867,SWRNESE
827,7737427,15668688,SWRNESE
828,7737427,15668381,SWRNESE


## Evaluation using Recall@5 Metric

In [87]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews
...,...,...,...,...,...
825,7737427,7675055,SWRNESE,2012-12-08,Mosquito Repellent
826,7737427,15668457,SWRNESE,2012-12-08,Brinjals
827,7737427,15668520,SWRNESE,2012-12-08,Bread
828,7737427,7573376,SWRNESE,2012-12-08,Mango Juices


In [91]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [92]:
np.mean(recall_list)

0.10963855421686744

## Prepare Kaggle Submission    

In [114]:
%%time

sku_list = list(df["SKU"].unique())
pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df[df["SKU"] == s1]["Order"]) \
               .intersection(set(df[df["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

sim_matrix = pd.DataFrame(cosine_similarity(co_matrix), index=sku_list, columns=sku_list)

recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_data["Order"].unique():
    test_order = test_data[test_data["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        try:
            df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
            df_.columns = ["SKU", "Similarity Score"]
            df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True)
            recc_count = 0
            for i in range(1, 632):
                if df_.iloc[i]["SKU"] in list(test_recc["SKU"]):
                    pass
                else:
                    test_recc = pd.concat([test_recc, df_.iloc[i:i+1]]) 
                    recc_count += 1
                
                if recc_count >= 3:
                    break
        except:
            pass
    test_recc = test_recc.sort_values("Similarity Score", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

recc_df

CPU times: user 44 s, sys: 62.9 ms, total: 44.1 s
Wall time: 44.1 s


Unnamed: 0,Order,SKU,Member
0,7409204,15668465,SWLCNOE
1,7409204,15669772,SWLCNOE
2,7409204,15668467,SWLCNOE
3,7409204,15668379,SWLCNOE
4,7409204,15668688,SWLCNOE
...,...,...,...
3185,7682167,15668460,SWOSHSO
3186,7682167,15668467,SWOSHSO
3187,7682167,15668468,SWOSHSO
3188,7682167,15668379,SWOSHSO


In [115]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15669772,SWLCNOE
2,3,7409204,15668467,SWLCNOE
3,4,7409204,15668379,SWLCNOE
4,5,7409204,15668688,SWLCNOE
...,...,...,...,...
3185,3186,7682167,15668460,SWOSHSO
3186,3187,7682167,15668467,SWOSHSO
3187,3188,7682167,15668468,SWOSHSO
3188,3189,7682167,15668379,SWOSHSO


In [116]:
recc_df.to_csv("item_based_collab_filtering_recc.csv", index = False)

# Recommendation Engine 2: probability based

## Co-occurence Matrix for Calculating Probabilites

In [71]:
df_train = pd.read_csv("df_train.csv")

In [17]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [38]:
pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df_train[df_train["SKU"] == s1]["Order"]) \
               .intersection(set(df_train[df_train["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

co_matrix

Unnamed: 0,SKU1,SKU2,cooccurrence_frequency
0,15668375,15668375,108
1,15668375,15668467,33
2,15668375,15669863,13
3,15668375,15669778,11
4,15668375,15669767,4
...,...,...,...
200023,92433757,92286348,0
200024,92433757,7610713,0
200025,92286348,92286348,2
200026,92286348,7610713,0


## Test on df_test

In [13]:
test_basket_df = pd.read_csv("test_basket_df.csv")
test_basket_df

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668684,SSCEHNS,2014-03-15,Beans
1,8101324,15668521,SSCEHNS,2014-03-15,Bread
2,8101324,15669811,SSCEHNS,2014-03-15,Almonds
3,8203855,15669873,SSCHNCE,2014-02-03,Whole Spices
4,8203855,15669767,SSCHNCE,2014-02-03,Urad Dal
...,...,...,...,...,...
1118,8160271,15669760,SWRLSWR,2014-02-12,Whole Spices
1119,7737427,7735781,SWRNESE,2012-12-08,Namkeen
1120,7737427,7590864,SWRNESE,2012-12-08,Yogurt & Lassi
1121,7737427,93289487,SWRNESE,2012-12-08,Phenyles & Acids


In [62]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        cond_prob_list = list()
        for basket_item in test_order["SKU"]:
            p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
            cond_prob_list.append(p)
            # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
        prob_list.append(math.prod(cond_prob_list))
    
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [65]:
recc_df

Unnamed: 0,Order,SKU,Member,Probability
0,8101324,15668381,SSCEHNS,0.010054
1,8101324,15668460,SSCEHNS,0.009763
2,8101324,15668379,SSCEHNS,0.009157
3,8101324,15669780,SSCEHNS,0.007631
4,8101324,15668688,SSCEHNS,0.005117
...,...,...,...,...
825,7737427,15668379,SWRNESE,0.005952
826,7737427,15668688,SWRNESE,0.001727
827,7737427,15669764,SWRNESE,0.001231
828,7737427,15669864,SWRNESE,0.001201


## Evaluation using Recall@5 Metric

In [68]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews
...,...,...,...,...,...
825,7737427,7675055,SWRNESE,2012-12-08,Mosquito Repellent
826,7737427,15668457,SWRNESE,2012-12-08,Brinjals
827,7737427,15668520,SWRNESE,2012-12-08,Bread
828,7737427,7573376,SWRNESE,2012-12-08,Mango Juices


In [69]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [70]:
np.mean(recall_list)

0.11927710843373492

## Prepare Kaggle Submission    

In [79]:
%%time

sku_list = list(df["SKU"].unique())

pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df[df["SKU"] == s1]["Order"]) \
               .intersection(set(df[df["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_data["Order"].unique():
    test_order = test_data[test_data["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        try:
            cond_prob_list = list()
            for basket_item in test_order["SKU"]:
                p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
                cond_prob_list.append(p)
                # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
            prob_list.append(math.prod(cond_prob_list))
        except:
            prob_list.append(0)
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)


CPU times: user 1min 1s, sys: 112 ms, total: 1min 1s
Wall time: 1min 1s


In [81]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668379,SWLCNOE
1,2,7409204,15668688,SWLCNOE
2,3,7409204,15668460,SWLCNOE
3,4,7409204,15668381,SWLCNOE
4,5,7409204,15669878,SWLCNOE
...,...,...,...,...
3185,3186,7682167,15668379,SWOSHSO
3186,3187,7682167,15668381,SWOSHSO
3187,3188,7682167,15668468,SWOSHSO
3188,3189,7682167,15668460,SWOSHSO


In [82]:
recc_df.to_csv("probability_based_recc.csv", index = False)

# Recommendation Engine 3: