In [1]:
import pandas as pd
import numpy as np

import itertools
import math

# from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Order lines that the final recommendations are generated for; the
# submission section iterates over this frame one order at a time.
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
# Purchase history that the final model is fitted on in the submission section.
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
# Convert the date strings to datetimes; dayfirst = True makes pandas read
# ambiguous dates as day/month rather than month/day.
# NOTE(review): 'Delivery Date' is not referenced again in the cells visible
# here — confirm this conversion is still needed.
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

## Build Matrix Factorization Model

In [5]:
# Training rows used for hyper-parameter search and the offline evaluation below.
df_train = pd.read_csv("df_train.csv")
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [6]:
# Count the rows for every (Member, SKU) pair, then order each member's SKUs
# from most to least frequent.
member_sku_counts = df_train.groupby(["Member", "SKU"]).size()
sku_freq_df = (
    member_sku_counts
    .reset_index(name = 'frequency')
    .sort_values(by = ["Member", "frequency"], ascending = [True, False])
)

sku_freq_df.head()

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,6
20,SSCEHNS,15669772,6
32,SSCEHNS,15669865,6
7,SSCEHNS,15668375,5


In [7]:
# Surprise clips every prediction to the Reader's rating_scale, which defaults
# to (1, 5). Purchase frequencies exceed 5 (see sku_freq_df.head()), so with
# the default scale the strongest candidates would all be clipped to the same
# value. Take the scale from the observed frequencies instead.
frequency_scale = (
    int(sku_freq_df['frequency'].min()),
    int(sku_freq_df['frequency'].max()),
)
reader = Reader(rating_scale = frequency_scale)

# The frequency column plays the role of the "rating" for (Member, SKU).
data = Dataset.load_from_df(sku_freq_df[['Member', 'SKU', 'frequency']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x138c0a6d0>

In [8]:
# Build a trainset from every row of `data` (no hold-out split).
trainset = data.build_full_trainset()

In [26]:
# Hyper-parameter values explored by the grid search:
# 4 * 4 * 3 * 4 = 192 combinations.
param_grid = dict(
    n_factors = [50, 100, 150, 200],
    n_epochs = [20, 50, 100, 200],
    lr_all = [0.05, 0.007, 0.01],
    reg_all = [0.02, 0.06, 0.1, 0.15],
)

In [27]:
%%time

# Grid search over param_grid for SVD++ with 5-fold cross-validation, scored
# by FCP. n_jobs = -1 runs the fits in parallel; joblib_verbose = 1 prints
# progress lines.
search_options = {
    'param_grid': param_grid,
    'measures': ['fcp'],
    'cv': 5,
    'n_jobs': -1,
    'joblib_verbose': 1,
}
gs = GridSearchCV(SVDpp, **search_options)

gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 428 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 778 tasks      | elapsed:  4.5min


CPU times: user 7.66 s, sys: 812 ms, total: 8.47 s
Wall time: 7min 7s


[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  7.1min finished


In [28]:
# Report the best cross-validated FCP (Fraction of Concordant Pairs; higher is
# better) and the parameter combination that achieved it.
print("Best FCP Score:", gs.best_score['fcp'])
print("Best Params:", gs.best_params['fcp'])

Best FCP Score: 0.6075844925365755
Best Params: {'n_factors': 200, 'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.15}


In [29]:
# best_estimator['fcp'] is an SVDpp instance configured with the best
# parameters; the search above was created without refit, so it still has to
# be fitted here on the full trainset.
model = gs.best_estimator['fcp']
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x13fd93a50>

## Test on the held-out baskets (`test_basket_df`)

In [30]:
# Every distinct SKU seen in training; this is the pool of recommendation candidates.
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [31]:
# Baskets that recommendations are generated for in the offline evaluation.
test_basket_df = pd.read_csv("test_basket_df.csv")

In [32]:
# Walk through a single order as a worked example before looping over all orders.
test_order = 7374967
test_order_df = test_basket_df[test_basket_df["Order"] == test_order]
test_order_df

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
620,7374967,15669760,SWHZENC,2013-09-24,Whole Spices
621,7374967,15668466,SWHZENC,2013-09-24,Root Vegetables
622,7374967,15669778,SWHZENC,2013-09-24,Other Dals
623,7374967,15668462,SWHZENC,2013-09-24,Gourd & Cucumber
624,7374967,15669866,SWHZENC,2013-09-24,Other Dals
625,7374967,15669869,SWHZENC,2013-09-24,Moong Dal
626,7374967,15668688,SWHZENC,2013-09-24,Root Vegetables
627,7374967,93174226,SWHZENC,2013-09-24,Body Wash


In [33]:
# Member for the example order: the first distinct value in the column
# (assumes an order belongs to a single member — TODO confirm).
test_member = test_order_df["Member"].unique()[0]
test_member

'SWHZENC'

In [34]:
# SKUs already present in the example basket.
test_order_skus = set(test_order_df["SKU"])
print(test_order_skus)

{15669760, 15669866, 15669869, 15668462, 15668688, 15668466, 15669778, 93174226}


In [35]:
# Candidates to score: all training SKUs except those already in the basket.
sku_pred_list = list(set(sku_list) - test_order_skus)
print(len(sku_pred_list))

624


In [36]:
# Score every candidate SKU for the example member and tabulate the estimates.
pred_value_list = [
    model.predict(uid = test_member, iid = candidate_sku).est
    for candidate_sku in sku_pred_list
]

test_pred_df = pd.DataFrame({"SKU": sku_pred_list, "prediction": pred_value_list})

In [37]:
# Keep the five candidates with the highest predicted value.
ranked_pred_df = test_pred_df.sort_values(by = "prediction", ascending = False)
test_recc = ranked_pred_df.head(5)
test_recc

Unnamed: 0,SKU,prediction
444,15668460,2.47129
443,15668458,2.346232
406,7689353,2.289285
355,7590864,2.143011
413,15668378,2.141776


In [38]:
# Build the top-5 recommendations for every order in the held-out baskets.
#
# Per-order frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies all previously collected rows on
# every iteration.
sku_list = list(df_train["SKU"].unique())
candidate_skus = set(sku_list)  # loop-invariant pool of SKUs seen in training
recc_frames = []

# sort = False keeps the orders in first-appearance order.
for test_order, test_order_df in test_basket_df.groupby("Order", sort = False):
    test_member = test_order_df["Member"].iloc[0]
    test_order_skus = set(test_order_df["SKU"])

    # Only score SKUs that are not already in the basket.
    sku_pred_list = list(candidate_skus - test_order_skus)
    pred_value_list = [
        model.predict(uid = test_member, iid = sku).est
        for sku in sku_pred_list
    ]

    test_pred_df = pd.DataFrame({"SKU": sku_pred_list, "prediction": pred_value_list})

    # .assign returns a new frame, so nothing is written into a slice of the
    # sorted frame (column assignment on an .iloc slice can raise
    # SettingWithCopyWarning).
    test_recc = (
        test_pred_df
        .sort_values("prediction", ascending = False)
        .iloc[:5]
        .assign(Order = test_order, Member = test_member)
        [["Order", "SKU", "Member"]]
    )
    recc_frames.append(test_recc)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns.
if recc_frames:
    recc_df = pd.concat(recc_frames, ignore_index = True)
else:
    recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

In [39]:
# Display the collected recommendations (at most five rows per order).
recc_df

Unnamed: 0,Order,SKU,Member
0,8101324,15669870,SSCEHNS
1,8101324,7580823,SSCEHNS
2,8101324,15669832,SSCEHNS
3,8101324,15669767,SSCEHNS
4,8101324,15669775,SSCEHNS
...,...,...,...
825,7737427,7590866,SWRNESE
826,7737427,7689353,SWRNESE
827,7737427,92388167,SWRNESE
828,7737427,34990774,SWRNESE


## Evaluation using Recall@5 Metric

In [40]:
# Ground truth for the evaluation: these SKUs are compared, order by order,
# against the recommended SKUs.
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [41]:
# Per-order recall: the share of an order's forgotten SKUs that also appear
# among the SKUs recommended for that order.
recall_list = []

for order in list(forgotten_items_df["Order"].unique()):
    in_forgotten = forgotten_items_df["Order"] == order
    in_recommended = recc_df["Order"] == order

    forgotten_set = set(forgotten_items_df.loc[in_forgotten, "SKU"])
    recommended_set = set(recc_df.loc[in_recommended, "SKU"])

    hits = forgotten_set & recommended_set
    recall_list.append(len(hits) / len(forgotten_set))

In [42]:
# Average the per-order recalls into a single score.
np.mean(recall_list)

0.12650602409638553

## Prepare Kaggle Submission

### Fit best model

In [27]:
# Same frequency table as for training, now computed on the full purchase
# history: rows per (Member, SKU), each member's SKUs ordered from most to
# least frequent, with a fresh 0..n-1 index.
member_sku_counts = df.groupby(["Member", "SKU"]).size()
sku_freq_df = (
    member_sku_counts
    .reset_index(name = 'frequency')
    .sort_values(by = ["Member", "frequency"], ascending = [True, False])
    .reset_index(drop = True)
)

sku_freq_df.head()

Unnamed: 0,Member,SKU,frequency
0,SSCEHNS,7580823,7
1,SSCEHNS,15668377,7
2,SSCEHNS,15669865,7
3,SSCEHNS,15669772,6
4,SSCEHNS,15669970,6


In [28]:
# Surprise clips every prediction to the Reader's rating_scale, which defaults
# to (1, 5). Purchase frequencies exceed 5 (see sku_freq_df.head()), so with
# the default scale the strongest candidates would all be clipped to the same
# value. Take the scale from the observed frequencies instead.
frequency_scale = (
    int(sku_freq_df['frequency'].min()),
    int(sku_freq_df['frequency'].max()),
)
reader = Reader(rating_scale = frequency_scale)

# The frequency column plays the role of the "rating" for (Member, SKU).
data = Dataset.load_from_df(sku_freq_df[['Member', 'SKU', 'frequency']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x146ab1d10>

In [29]:
# Build a trainset from every row of the full-history `data` (no hold-out split).
trainset = data.build_full_trainset()

In [31]:
# Refit the existing `model` object on the trainset built from the full history.
# NOTE(review): the recorded output of this cell reports an `SVD` instance,
# whereas the estimator selected earlier in this notebook is `SVDpp`. The cell
# appears to have been run against a `model` left over from a different kernel
# state — confirm which algorithm actually produced the submission.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x146914690>

### Generate Recommendations

In [32]:
# Build the top-5 recommendations for every order in the submission set.
#
# Per-order frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies all previously collected rows on
# every iteration.
sku_list = list(df["SKU"].unique())
candidate_skus = set(sku_list)  # loop-invariant pool of SKUs seen in the history
recc_frames = []

# sort = False keeps the orders in first-appearance order.
for test_order, test_order_df in test_data.groupby("Order", sort = False):
    test_member = test_order_df["Member"].iloc[0]
    test_order_skus = set(test_order_df["SKU"])

    # Only score SKUs that are not already in the basket.
    sku_pred_list = list(candidate_skus - test_order_skus)
    pred_value_list = [
        model.predict(uid = test_member, iid = sku).est
        for sku in sku_pred_list
    ]

    test_pred_df = pd.DataFrame({"SKU": sku_pred_list, "prediction": pred_value_list})

    # .assign returns a new frame, so nothing is written into a slice of the
    # sorted frame (column assignment on an .iloc slice can raise
    # SettingWithCopyWarning).
    test_recc = (
        test_pred_df
        .sort_values("prediction", ascending = False)
        .iloc[:5]
        .assign(Order = test_order, Member = test_member)
        [["Order", "SKU", "Member"]]
    )
    recc_frames.append(test_recc)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns.
if recc_frames:
    recc_df = pd.concat(recc_frames, ignore_index = True)
else:
    recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

In [33]:
# Sanity check: number of recommendations per order (the loop keeps the top
# five, so every order is expected to show 5).
recc_df["Order"].value_counts()

Order
7409204    5
8250969    5
7727449    5
8285456    5
7517068    5
          ..
8077922    5
7682148    5
8374672    5
8065270    5
7682167    5
Name: count, Length: 638, dtype: int64

In [34]:
# Duplicate check only: the de-duplicated frame is displayed but not assigned,
# so recc_df itself is left unchanged by this cell.
recc_df.drop_duplicates(subset = ["Order", "SKU"])

Unnamed: 0,Order,SKU,Member
0,7409204,15668465,SWLCNOE
1,7409204,15668594,SWLCNOE
2,7409204,15668381,SWLCNOE
3,7409204,15668449,SWLCNOE
4,7409204,15668494,SWLCNOE
...,...,...,...
3185,7682167,15669789,SWOSHSO
3186,7682167,7689353,SWOSHSO
3187,7682167,15668542,SWOSHSO
3188,7682167,15669872,SWOSHSO


In [35]:
# Turn the row index into an "ID" column for the submission file.
recc_df = (
    recc_df[["Order", "SKU", "Member"]]
    .reset_index()
    .rename(columns = {"index": "ID"})
)
# Shift to 1-based IDs with vectorised arithmetic rather than a per-element
# .apply(lambda ...).
recc_df["ID"] = recc_df["ID"] + 1
recc_df.head()

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15668594,SWLCNOE
2,3,7409204,15668381,SWLCNOE
3,4,7409204,15668449,SWLCNOE
4,5,7409204,15668494,SWLCNOE


In [36]:
# Write the submission file; index = False keeps the pandas row index out of the CSV.
# NOTE(review): the file name says "svd" while the estimator tuned earlier in
# this notebook is SVDpp — confirm the name reflects the model actually used.
recc_df.to_csv("matrix_factorization_svd.csv", index = False)