In [1]:
import pandas as pd
import numpy as np

import itertools
import math

# from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

import warnings
warnings.filterwarnings("ignore")

In [2]:
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

# Train Test Split

The data is split into last orders (for members who have more than 5 orders) and all but last order.
Previous orders are used to train the recommendation system and the last orders are used for testing.

## Last Order per Member

In [23]:
df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [24]:
member_order_count = df[["Member", "Order"]].drop_duplicates()["Member"].value_counts().reset_index()
members_with_more_than_5_orders = member_order_count[member_order_count["count"] > 5]["Member"]
len(members_with_more_than_5_orders)

166

In [25]:
last_orders = df[df["Member"].isin(members_with_more_than_5_orders)][["Member", "Order", "Delivery Date"]] \
                .drop_duplicates() \
                .sort_values(["Order", "Delivery Date"], ascending = [True, False]) \
                .drop_duplicates(subset = "Member") \
                .reset_index(drop = True)["Order"]

In [26]:
len(last_orders)

166

In [27]:
df_train = df[~df["Order"].isin(last_orders)]
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [28]:
df_test = df[df["Order"].isin(last_orders)]
df_test.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
94,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
95,8101324,15668684,SSCEHNS,2014-03-15,Beans
96,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
97,8101324,15668521,SSCEHNS,2014-03-15,Bread
98,8101324,15669865,SSCEHNS,2014-03-15,Other Dals


In [29]:
# df_train.to_csv("df_train.csv", index = False)
# df_test.to_csv("df_test.csv", index = False)

## Simulating Forgotten Items

In [30]:
basket_list = list()
forgotten_list = list()

for order in df_test["Order"].unique():
    y_dummy = ['']*df_test[df_test["Order"] == order].shape[0]
    basket, forgotten, _, _ = train_test_split(df_test[df_test["Order"] == order], y_dummy, test_size = 5, random_state = 101)
    basket_list.append(basket)
    forgotten_list.append(forgotten)

test_basket_df = pd.concat(basket_list, ignore_index = True)
forgotten_items_df = pd.concat(forgotten_list, ignore_index = True)

In [31]:
test_basket_df["Order"].value_counts()

Order
7362753    26
7392553    20
7566535    19
7391877    16
7428900    16
           ..
7453262     3
7460103     3
7513528     3
7352666     3
8101324     3
Name: count, Length: 166, dtype: int64

In [32]:
forgotten_items_df["Order"].value_counts()

Order
8101324    5
7734044    5
7585543    5
7644998    5
7770723    5
          ..
7370908    5
7391877    5
7360479    5
7370657    5
7737427    5
Name: count, Length: 166, dtype: int64

In [33]:
# test_basket_df.to_csv("test_basket_df.csv", index = False)
# forgotten_items_df.to_csv("forgotten_items_df.csv", index = False)

# User Based Collaborative Filtering with item similarity

## Member-SKU (User-Item) Matrix

In [86]:
df_train = pd.read_csv("df_train.csv")
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [87]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [88]:
member_list = list(df_train["Member"].unique())
len(member_list)

638

In [89]:
sku_freq_df = df_train[["Member", "SKU"]].groupby(["Member", "SKU"]) \
                .size() \
                .reset_index(name='frequency') \
                .sort_values(by = ["Member", "frequency"], ascending = [True, False])

sku_freq_df.head()

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,6
20,SSCEHNS,15669772,6
32,SSCEHNS,15669865,6
7,SSCEHNS,15668375,5


In [90]:
sm_matrix = pd.pivot(sku_freq_df, index='SKU', columns='Member', values='frequency').fillna(0)
sm_matrix.head()

Member,SSCEHNS,SSCESNS,SSCEWZO,SSCHNCE,SSCLCSW,SSCLLWS,SSCLSCL,SSCLWOR,SSCNHZN,SSCNLEC,...,SWRHZSH,SWRLHOS,SWRLOEC,SWRLSWR,SWRLWHS,SWRNCEH,SWRNCSS,SWRNEHZ,SWRNESE,SWRNHCS
SKU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6884195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7541573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7543241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7547271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7547296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SKU Similarity Matrix

In [112]:
scaler = StandardScaler()
sm_matrix_scaled = scaler.fit_transform(sm_matrix)

In [113]:
%%time

sim_matrix = pd.DataFrame((euclidean_distances(sm_matrix_scaled)), index=list(sm_matrix.index), columns=list(sm_matrix.index))
sim_matrix.head()

CPU times: user 16.6 ms, sys: 4.73 ms, total: 21.4 ms
Wall time: 16.5 ms


Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,0.0,21.099607,11.446708,17.31919,12.677475,12.662472,12.617875,12.635902,15.919548,15.138056,...,30.73887,13.819715,13.968046,11.003437,10.751899,9.571307,16.062754,13.722294,19.999713,23.164156
15668467,21.099607,0.0,21.111366,24.507853,20.334886,21.794515,21.768634,21.603885,23.557039,23.320022,...,35.337509,22.684192,22.578064,20.874346,20.742854,20.156205,23.719286,21.449013,26.62843,28.859391
15669863,11.446708,21.111366,0.0,17.333514,12.697036,12.682057,12.637529,12.655528,15.93513,15.154442,...,30.746942,14.156492,13.985802,11.025969,10.774957,9.597202,16.078197,13.740368,20.012118,23.174868
15669778,17.31919,24.507853,17.333514,0.0,16.025778,17.741967,18.046339,18.140827,18.104309,19.641837,...,32.370844,19.218051,19.092666,16.956863,16.20778,16.156531,20.674517,18.878348,23.862357,26.570693
15669767,12.677475,20.334886,12.697036,16.025778,0.0,13.803109,13.762208,13.778738,16.84101,16.104296,...,31.031698,15.168966,15.009794,12.298912,12.074392,11.036192,16.976444,14.282121,20.740688,23.806842


In [114]:
mm_scaler = MinMaxScaler()
sim_matrix_scaled = mm_scaler.fit_transform(sim_matrix)

In [115]:
sim_matrix = pd.DataFrame(1 - sim_matrix_scaled, index=sku_list, columns=sku_list)
sim_matrix

Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,1.000000,0.796868,0.889772,0.833817,0.877700,0.877403,0.878031,0.878241,0.846533,0.854571,...,0.708371,0.866799,0.865492,0.893855,0.896385,0.907582,0.845121,0.868229,0.807572,0.778130
15668467,0.796045,1.000000,0.796704,0.764840,0.803829,0.788987,0.789576,0.791825,0.772906,0.775968,...,0.664743,0.781359,0.782580,0.798636,0.800103,0.805377,0.771296,0.794031,0.743794,0.723581
15669863,0.889353,0.796754,1.000000,0.833680,0.877511,0.877213,0.877841,0.878052,0.846382,0.854413,...,0.708295,0.863553,0.865321,0.893638,0.896163,0.907332,0.844973,0.868055,0.807453,0.778028
15669778,0.832588,0.764055,0.833084,1.000000,0.845399,0.828224,0.825557,0.825195,0.825471,0.811304,...,0.692888,0.814768,0.816143,0.836426,0.843807,0.843997,0.800654,0.818716,0.770407,0.745502
15669767,0.877456,0.804230,0.877731,0.846228,1.000000,0.866360,0.866969,0.867228,0.837650,0.845288,...,0.705593,0.853794,0.855460,0.881359,0.883640,0.893437,0.836312,0.862853,0.800443,0.771975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15670172,0.907481,0.805950,0.907582,0.844973,0.893533,0.893315,0.893982,0.894116,0.858808,0.867495,...,0.714441,0.877706,0.879654,0.912565,0.919289,1.000000,0.879091,0.899136,0.817174,0.794883
7590864,0.844733,0.771647,0.845172,0.801622,0.836228,0.835744,0.836329,0.836714,0.811853,0.818575,...,0.692725,0.825602,0.827044,0.847916,0.851784,0.878920,1.000000,0.845995,0.779025,0.764315
92433757,0.867357,0.793504,0.867685,0.818857,0.862220,0.857013,0.857611,0.857910,0.832235,0.837269,...,0.699588,0.845279,0.846867,0.870994,0.873110,0.898579,0.845364,1.000000,0.798722,0.775997
92286348,0.806677,0.743640,0.807290,0.771034,0.799914,0.802575,0.827633,0.800823,0.782803,0.801638,...,0.670549,0.791184,0.792444,0.811001,0.810896,0.816524,0.778553,0.799116,1.000000,0.752661


In [92]:
%%time

sim_matrix = pd.DataFrame((cosine_similarity(sm_matrix)), index=list(sm_matrix.index), columns=list(sm_matrix.index))
sim_matrix.head()

CPU times: user 243 ms, sys: 10.8 ms, total: 254 ms
Wall time: 40.9 ms


Unnamed: 0,6884195,7541573,7543241,7547271,7547296,7547323,7548497,7548498,7548511,7548730,...,93141092,93141093,93156751,93174226,93176429,93176430,93176431,93289485,93289486,93289487
6884195,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.182574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7541573,0.0,1.0,0.0,0.085749,0.047565,0.0,0.0,0.047565,0.028583,0.0,...,0.021437,0.0,0.0,0.0,0.0,0.0,0.0343,0.16169,0.024254,0.054233
7543241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7547271,0.0,0.085749,0.0,1.0,0.5547,0.050252,0.050252,0.0,0.166667,0.068041,...,0.25,0.0,0.0,0.048113,0.068041,0.0,0.0,0.039284,0.0,0.0
7547296,0.0,0.047565,0.0,0.5547,1.0,0.0,0.0,0.0,0.0,0.0,...,0.138675,0.0,0.0,0.0,0.0,0.0,0.0,0.065372,0.0,0.0


## Test on df_test

In [93]:
test_basket_df = pd.read_csv("test_basket_df.csv")

In [94]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
        df_.columns = ["SKU", "Similarity Score"]
        df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True).iloc[1:]
        test_recc = pd.concat([test_recc, df_])
        # recc_count = 0
        # for i in range(1, 632):
        #     if df_.iloc[i]["SKU"] in list(test_recc["SKU"]):
        #         pass
        #     else:
        #         test_recc = pd.concat([test_recc, df_.iloc[i:i+1]]) 
        #         recc_count += 1
            
        #     if recc_count >= 3:
        #         break    
    test_recc = test_recc \
                .sort_values("Similarity Score", ascending = False) \
                .drop_duplicates(subset = "SKU") \
                .iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [95]:
recc_df.head()

Unnamed: 0,Order,SKU,Member
0,8101324,34990774,SSCEHNS
1,8101324,21409124,SSCEHNS
2,8101324,7572306,SSCEHNS
3,8101324,7569805,SSCEHNS
4,8101324,34989441,SSCEHNS


## Evaluation using Recall@5 Metric

In [96]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [97]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [98]:
np.mean(recall_list)

0.1108433734939759