In [1]:
import pandas as pd
import numpy as np

import itertools
import math

# from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

import warnings
warnings.filterwarnings("ignore")

In [2]:
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

# Train Test Split

The data is split into last orders (for members who have more than 5 orders) and all but last order.
Previous orders are used to train the recommendation system and the last orders are used for testing.

## Last Order per Member

In [23]:
df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [24]:
member_order_count = df[["Member", "Order"]].drop_duplicates()["Member"].value_counts().reset_index()
members_with_more_than_5_orders = member_order_count[member_order_count["count"] > 5]["Member"]
len(members_with_more_than_5_orders)

166

In [25]:
last_orders = df[df["Member"].isin(members_with_more_than_5_orders)][["Member", "Order", "Delivery Date"]] \
                .drop_duplicates() \
                .sort_values(["Order", "Delivery Date"], ascending = [True, False]) \
                .drop_duplicates(subset = "Member") \
                .reset_index(drop = True)["Order"]

In [26]:
len(last_orders)

166

In [27]:
df_train = df[~df["Order"].isin(last_orders)]
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [28]:
df_test = df[df["Order"].isin(last_orders)]
df_test.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
94,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
95,8101324,15668684,SSCEHNS,2014-03-15,Beans
96,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
97,8101324,15668521,SSCEHNS,2014-03-15,Bread
98,8101324,15669865,SSCEHNS,2014-03-15,Other Dals


In [29]:
# df_train.to_csv("df_train.csv", index = False)
# df_test.to_csv("df_test.csv", index = False)

## Simulating Forgotten Items

In [30]:
basket_list = list()
forgotten_list = list()

for order in df_test["Order"].unique():
    y_dummy = ['']*df_test[df_test["Order"] == order].shape[0]
    basket, forgotten, _, _ = train_test_split(df_test[df_test["Order"] == order], y_dummy, test_size = 5, random_state = 101)
    basket_list.append(basket)
    forgotten_list.append(forgotten)

test_basket_df = pd.concat(basket_list, ignore_index = True)
forgotten_items_df = pd.concat(forgotten_list, ignore_index = True)

In [31]:
test_basket_df["Order"].value_counts()

Order
7362753    26
7392553    20
7566535    19
7391877    16
7428900    16
           ..
7453262     3
7460103     3
7513528     3
7352666     3
8101324     3
Name: count, Length: 166, dtype: int64

In [32]:
forgotten_items_df["Order"].value_counts()

Order
8101324    5
7734044    5
7585543    5
7644998    5
7770723    5
          ..
7370908    5
7391877    5
7360479    5
7370657    5
7737427    5
Name: count, Length: 166, dtype: int64

In [33]:
# test_basket_df.to_csv("test_basket_df.csv", index = False)
# forgotten_items_df.to_csv("forgotten_items_df.csv", index = False)

# Matrix Factorization

## Member-SKU (User-Item) Matrix

In [99]:
df_train = pd.read_csv("df_train.csv")
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [100]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [101]:
member_list = list(df_train["Member"].unique())
len(member_list)

638

In [102]:
sku_freq_df = df_train[["Member", "SKU"]].groupby(["Member", "SKU"]) \
                .size() \
                .reset_index(name='frequency') \
                .sort_values(by = ["Member", "frequency"], ascending = [True, False])

sku_freq_df.head()

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,6
20,SSCEHNS,15669772,6
32,SSCEHNS,15669865,6
7,SSCEHNS,15668375,5


In [103]:
sku_freq_df.describe()

Unnamed: 0,SKU,frequency
count,17758.0,17758.0
mean,17134140.0,1.522187
std,13008010.0,1.094065
min,6884195.0,1.0
25%,15668450.0,1.0
50%,15669770.0,1.0
75%,15669880.0,2.0
max,93289490.0,14.0


In [104]:
reader = Reader() #rating_scale=(1, 14))
data = Dataset.load_from_df(sku_freq_df[['Member', 'SKU', 'frequency']], reader)

In [105]:
trainset = data.build_full_trainset()

In [106]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16c16bb50>

In [107]:
n_factors = model.n_factors

In [108]:
user_latent_matrix = model.pu 
item_latent_matrix = model.qi

In [109]:
item_latent_matrix.shape

(632, 100)

In [110]:
user_map = {trainset.to_raw_uid(inner_id): latent_vec for inner_id, latent_vec in enumerate(user_latent_matrix)}
item_map = {trainset.to_raw_iid(inner_id): latent_vec for inner_id, latent_vec in enumerate(item_latent_matrix)}

user_latent_df = pd.DataFrame.from_dict(user_map, orient='index')
item_latent_df = pd.DataFrame.from_dict(item_map, orient='index')

user_latent_df.columns = [f'latent_feature_{i}' for i in range(n_factors)]
item_latent_df.columns = [f'latent_feature_{i}' for i in range(n_factors)]

In [111]:
item_latent_df

Unnamed: 0,latent_feature_0,latent_feature_1,latent_feature_2,latent_feature_3,latent_feature_4,latent_feature_5,latent_feature_6,latent_feature_7,latent_feature_8,latent_feature_9,...,latent_feature_90,latent_feature_91,latent_feature_92,latent_feature_93,latent_feature_94,latent_feature_95,latent_feature_96,latent_feature_97,latent_feature_98,latent_feature_99
34986113,0.067684,-0.043690,-0.044371,-0.094790,-0.050114,-0.056162,0.096246,-0.009205,0.170190,0.137179,...,0.037057,0.132163,-0.063037,0.061296,0.111179,-0.045227,0.120960,0.032194,-0.009616,0.010716
7580811,-0.206388,-0.074532,0.046287,0.015729,0.027719,-0.005548,-0.004226,0.028160,-0.029684,-0.192472,...,0.156811,-0.010668,0.044076,-0.127898,0.014835,0.041832,-0.074697,-0.043050,0.152952,0.132425
7575006,-0.047226,-0.055074,0.070420,0.075733,0.217387,-0.141175,-0.222609,0.111839,-0.057669,-0.006713,...,0.004832,0.220039,0.015523,-0.133597,0.118727,-0.022698,0.054080,0.032963,-0.061058,-0.158844
7569801,-0.109029,-0.018995,0.008550,-0.245164,-0.071219,-0.344314,-0.057554,0.002530,-0.019343,-0.170891,...,0.008778,0.163106,-0.144169,-0.086683,0.006287,0.099437,0.152334,-0.112081,-0.065655,-0.123206
15669776,0.056838,-0.118354,0.032317,-0.053723,-0.132798,-0.373153,0.166195,-0.105296,0.030866,-0.472660,...,-0.319242,-0.233867,0.292097,0.172921,-0.049842,-0.185209,-0.123417,-0.245080,0.026498,0.063485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34990370,-0.245591,-0.109506,-0.048398,-0.106051,-0.083740,0.025362,0.146255,0.051841,0.009203,0.005982,...,-0.120446,0.109157,0.079131,0.085645,0.153122,-0.228520,0.001862,0.064173,-0.006707,-0.023397
7649103,-0.015544,-0.040570,-0.092549,0.041551,0.067230,0.000164,-0.052653,0.005266,0.077109,-0.089041,...,-0.113578,-0.057597,-0.113289,0.090934,-0.074112,0.055978,0.080424,0.259606,0.029744,0.093307
7650488,-0.076690,0.025183,-0.103347,0.016935,0.195430,-0.103581,0.106587,0.036333,-0.048858,-0.004660,...,-0.046380,-0.158839,0.037761,-0.036118,-0.111888,-0.107776,-0.015354,-0.110570,-0.044698,-0.079573
34987615,-0.006901,-0.057834,-0.277333,0.113385,0.046782,-0.075775,0.040710,-0.159155,0.062112,0.053296,...,-0.047332,0.001501,0.037825,0.141900,0.036534,-0.059759,-0.044124,-0.054459,-0.058889,-0.083983


In [113]:
ss = StandardScaler()
item_latent_df = pd.DataFrame(ss.fit_transform(item_latent_df)
                            , index = item_latent_df.index
                            , columns = [f'latent_feature_{i}' for i in range(n_factors)]
                            )
item_latent_df

Unnamed: 0,latent_feature_0,latent_feature_1,latent_feature_2,latent_feature_3,latent_feature_4,latent_feature_5,latent_feature_6,latent_feature_7,latent_feature_8,latent_feature_9,...,latent_feature_90,latent_feature_91,latent_feature_92,latent_feature_93,latent_feature_94,latent_feature_95,latent_feature_96,latent_feature_97,latent_feature_98,latent_feature_99
34986113,0.596222,-0.335509,-0.408546,-0.780694,-0.425742,-0.507282,0.840840,-0.103363,1.549034,1.107670,...,0.380765,1.117918,-0.558696,0.491522,0.980217,-0.428781,1.050389,0.295024,-0.095029,0.201912
7580811,-1.846341,-0.601851,0.387931,0.180861,0.296516,-0.068894,-0.090295,0.225209,-0.227534,-1.516438,...,1.431677,-0.099790,0.405452,-1.094351,0.100446,0.365449,-0.677626,-0.380801,1.238823,1.318695
7575006,-0.427867,-0.433813,0.599958,0.702920,2.056560,-1.243624,-2.114199,0.961056,-0.476282,-0.037751,...,0.097969,1.867099,0.148443,-1.142122,1.049143,-0.223246,0.459716,0.301931,-0.517104,-1.353955
7569801,-0.978665,-0.122253,0.056398,-2.088995,-0.621589,-3.003097,-0.584528,-0.000169,-0.135623,-1.344644,...,0.132599,1.381719,-1.288980,-0.748874,0.022391,0.890978,1.327482,-1.000829,-0.554824,-1.026942
15669776,0.499557,-0.980280,0.265200,-0.423396,-1.193018,-3.252886,1.489105,-0.948345,0.310655,-3.746804,...,-2.745958,-2.002667,2.637937,1.427191,-0.490157,-1.705817,-1.107910,-2.195390,0.201281,0.686113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34990370,-2.195721,-0.903867,-0.443930,-0.878661,-0.737782,0.198826,1.304308,0.433456,0.118106,0.063309,...,-1.001407,0.921781,0.720987,0.695622,1.363223,-2.100937,-0.001463,0.582254,-0.071163,-0.111103
7649103,-0.145517,-0.308561,-0.831821,0.405526,0.663158,-0.019421,-0.539104,0.023892,0.721687,-0.693102,...,-0.941143,-0.499878,-1.011027,0.739958,-0.711789,0.494501,0.692383,2.337583,0.227914,0.959759
7650488,-0.690454,0.259251,-0.926683,0.191355,1.852808,-0.918002,0.936679,0.297083,-0.397968,-0.021409,...,-0.351439,-1.363017,0.348605,-0.325025,-1.056740,-0.999403,-0.153518,-0.987252,-0.382878,-0.626568
34987615,-0.068491,-0.457650,-2.455251,1.030501,0.473411,-0.677159,0.326155,-1.421957,0.588386,0.439940,...,-0.359791,0.003958,0.349180,1.167163,0.298594,-0.561352,-0.407606,-0.483278,-0.499310,-0.667040


## SKU similarity matrix

In [116]:
%%time

sim_matrix = pd.DataFrame(cosine_similarity(item_latent_df), index=item_latent_df.index, columns=item_latent_df.index)

CPU times: user 75.8 ms, sys: 7.25 ms, total: 83.1 ms
Wall time: 18.4 ms


In [117]:
scaler = MinMaxScaler()
sim_matrix = pd.DataFrame(scaler.fit_transform(sim_matrix), index=item_latent_df.index, columns=item_latent_df.index)
sim_matrix

Unnamed: 0,34986113,7580811,7575006,7569801,15669776,15669869,15668381,15669765,7586072,15669778,...,7574983,7587702,92978776,34989674,7578351,34990370,7649103,7650488,34987615,92383200
34986113,1.000000,0.357871,0.271969,0.255219,0.280156,0.337536,0.190091,0.301980,0.301878,0.296743,...,0.260931,0.197159,0.316580,0.235631,0.246741,0.341696,0.126865,0.203386,0.224017,0.151145
7580811,0.327707,1.000000,0.209971,0.329681,0.225786,0.144542,0.377519,0.094918,0.086744,0.313537,...,0.252200,0.092052,0.300740,0.177143,0.360560,0.253809,0.282239,0.294865,0.206997,0.143535
7575006,0.260983,0.234030,1.000000,0.299937,0.082365,0.156986,0.247423,0.269643,0.172260,0.260410,...,0.124059,0.266515,0.201240,0.221125,0.309543,0.281292,0.278586,0.272650,0.152881,0.184458
7569801,0.185685,0.299983,0.245958,1.000000,0.182018,0.098289,0.230342,0.408790,0.160440,0.213232,...,0.224116,0.230241,0.350248,0.205007,0.178297,0.315731,0.319181,0.289704,0.208347,0.247736
15669776,0.231695,0.210740,0.035148,0.201498,1.000000,0.171491,0.187001,0.227588,0.183066,0.418553,...,0.299236,0.251809,0.186906,0.410876,0.350172,0.207361,0.196350,0.314415,0.299885,0.287836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34990370,0.331427,0.276171,0.280932,0.364398,0.245773,0.276815,0.221510,0.326117,0.223096,0.389921,...,0.322391,0.300934,0.339024,0.181153,0.354371,1.000000,0.198585,0.250208,0.266307,0.269405
7649103,0.084878,0.281477,0.255136,0.347372,0.210833,0.172470,0.195985,0.270314,0.354885,0.123179,...,0.164562,0.220514,0.294389,0.281619,0.233522,0.172948,1.000000,0.332437,0.265159,0.344944
7650488,0.174562,0.302134,0.257537,0.326850,0.334417,0.268691,0.244130,0.355950,0.254745,0.173166,...,0.077124,0.192775,0.318632,0.301697,0.282362,0.235011,0.340020,1.000000,0.274570,0.239884
34987615,0.190652,0.210010,0.129592,0.244813,0.315841,0.107812,0.246419,0.065226,0.339945,0.316885,...,0.376957,0.232765,0.235314,0.231851,0.162081,0.246514,0.268728,0.269799,1.000000,0.195204


## Test on df_test

In [118]:
test_basket_df = pd.read_csv("test_basket_df.csv")

In [119]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    test_recc = pd.DataFrame(columns = ["SKU", "Similarity Score"])
    for sku in test_order["SKU"]:
        df_ = pd.DataFrame(sim_matrix.loc[sku]).reset_index()
        df_.columns = ["SKU", "Similarity Score"]
        df_ = df_.sort_values("Similarity Score", ascending = False).reset_index(drop = True).iloc[1:]
        test_recc = pd.concat([test_recc, df_])
           
    test_recc = test_recc \
                .sort_values("Similarity Score", ascending = False) \
                .drop_duplicates(subset = "SKU") \
                .iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

## Evaluation using Recall@5 Metric

In [120]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [121]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [122]:
np.mean(recall_list)

0.05903614457831326