In [1]:
import pandas as pd
import numpy as np

import itertools
import math

# from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

import warnings
warnings.filterwarnings("ignore")

In [2]:
test_data = pd.read_csv('last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
df = pd.read_csv('all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], dayfirst = True)

# Train Test Split

The data is split into last orders (for members who have more than 5 orders) and all but last order.
Previous orders are used to train the recommendation system and the last orders are used for testing.

## Last Order per Member

In [23]:
df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [24]:
member_order_count = df[["Member", "Order"]].drop_duplicates()["Member"].value_counts().reset_index()
members_with_more_than_5_orders = member_order_count[member_order_count["count"] > 5]["Member"]
len(members_with_more_than_5_orders)

166

In [25]:
last_orders = df[df["Member"].isin(members_with_more_than_5_orders)][["Member", "Order", "Delivery Date"]] \
                .drop_duplicates() \
                .sort_values(["Order", "Delivery Date"], ascending = [True, False]) \
                .drop_duplicates(subset = "Member") \
                .reset_index(drop = True)["Order"]

In [26]:
len(last_orders)

166

In [27]:
df_train = df[~df["Order"].isin(last_orders)]
df_train.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8358896,15668375,SSCEHNS,2013-11-02,Root Vegetables
1,8358896,15668467,SSCEHNS,2013-11-02,Beans
2,8358896,15669863,SSCEHNS,2013-11-02,Moong Dal
3,8358896,15669778,SSCEHNS,2013-11-02,Other Dals
4,8358896,15669767,SSCEHNS,2013-11-02,Urad Dal


In [28]:
df_test = df[df["Order"].isin(last_orders)]
df_test.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
94,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
95,8101324,15668684,SSCEHNS,2014-03-15,Beans
96,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
97,8101324,15668521,SSCEHNS,2014-03-15,Bread
98,8101324,15669865,SSCEHNS,2014-03-15,Other Dals


In [29]:
# df_train.to_csv("df_train.csv", index = False)
# df_test.to_csv("df_test.csv", index = False)

## Simulating Forgotten Items

In [30]:
basket_list = list()
forgotten_list = list()

for order in df_test["Order"].unique():
    y_dummy = ['']*df_test[df_test["Order"] == order].shape[0]
    basket, forgotten, _, _ = train_test_split(df_test[df_test["Order"] == order], y_dummy, test_size = 5, random_state = 101)
    basket_list.append(basket)
    forgotten_list.append(forgotten)

test_basket_df = pd.concat(basket_list, ignore_index = True)
forgotten_items_df = pd.concat(forgotten_list, ignore_index = True)

In [31]:
test_basket_df["Order"].value_counts()

Order
7362753    26
7392553    20
7566535    19
7391877    16
7428900    16
           ..
7453262     3
7460103     3
7513528     3
7352666     3
8101324     3
Name: count, Length: 166, dtype: int64

In [32]:
forgotten_items_df["Order"].value_counts()

Order
8101324    5
7734044    5
7585543    5
7644998    5
7770723    5
          ..
7370908    5
7391877    5
7360479    5
7370657    5
7737427    5
Name: count, Length: 166, dtype: int64

In [33]:
# test_basket_df.to_csv("test_basket_df.csv", index = False)
# forgotten_items_df.to_csv("forgotten_items_df.csv", index = False)

# Probability Based Recommendation Engine

## Co-occurence Matrix for Calculating Probabilites

In [50]:
df_train = pd.read_csv("df_train.csv")

In [51]:
sku_list = list(df_train["SKU"].unique())
len(sku_list)

632

In [52]:
pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df_train[df_train["SKU"] == s1]["Order"]) \
               .intersection(set(df_train[df_train["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

co_matrix.head()

Unnamed: 0,15668375,15668467,15669863,15669778,15669767,15669832,15669772,15669970,15668478,15669861,...,7590868,15668382,7582051,7590866,7735524,15670172,7590864,92433757,92286348,7610713
15668375,108.0,33.0,13.0,11.0,4.0,4.0,11.0,3.0,27.0,12.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15668467,33.0,322.0,31.0,25.0,19.0,18.0,27.0,4.0,62.0,21.0,...,1.0,4.0,1.0,8.0,0.0,0.0,5.0,0.0,1.0,1.0
15669863,13.0,31.0,299.0,45.0,43.0,42.0,47.0,4.0,24.0,43.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15669778,11.0,25.0,45.0,243.0,19.0,23.0,69.0,4.0,21.0,40.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
15669767,4.0,19.0,43.0,19.0,184.0,38.0,29.0,4.0,7.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test on df_test

In [53]:
test_basket_df = pd.read_csv("test_basket_df.csv")
test_basket_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668684,SSCEHNS,2014-03-15,Beans
1,8101324,15668521,SSCEHNS,2014-03-15,Bread
2,8101324,15669811,SSCEHNS,2014-03-15,Almonds
3,8203855,15669873,SSCHNCE,2014-02-03,Whole Spices
4,8203855,15669767,SSCHNCE,2014-02-03,Urad Dal


In [54]:
recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_basket_df["Order"].unique():
    test_order = test_basket_df[test_basket_df["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        cond_prob_list = list()
        for basket_item in test_order["SKU"]:
            p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
            cond_prob_list.append(p)
            # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
        prob_list.append(math.prod(cond_prob_list))
    
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)

In [55]:
recc_df.head()

Unnamed: 0,Order,SKU,Member,Probability
0,8101324,15668381,SSCEHNS,0.010054
1,8101324,15668460,SSCEHNS,0.009763
2,8101324,15668379,SSCEHNS,0.009157
3,8101324,15669780,SSCEHNS,0.007631
4,8101324,15668688,SSCEHNS,0.005117


## Evaluation using Recall@5 Metric

In [56]:
forgotten_items_df = pd.read_csv("forgotten_items_df.csv")
forgotten_items_df.head()

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,8101324,15668377,SSCEHNS,2014-03-15,Root Vegetables
1,8101324,15669778,SSCEHNS,2014-03-15,Other Dals
2,8101324,15669814,SSCEHNS,2014-03-15,Jaggery
3,8101324,15669865,SSCEHNS,2014-03-15,Other Dals
4,8101324,15669970,SSCEHNS,2014-03-15,Cashews


In [57]:
recall_list = list()

for order in list(forgotten_items_df["Order"].unique()):
    forgotten_SKUs = forgotten_items_df[forgotten_items_df["Order"] == order]["SKU"]
    recc_SKUs = recc_df[recc_df["Order"] == order]["SKU"]
    recall = len(set(forgotten_SKUs).intersection(set(recc_SKUs)))/len(set(forgotten_SKUs))
    recall_list.append(recall)

In [58]:
np.mean(recall_list)

0.11927710843373492

## Prepare Kaggle Submission    

In [59]:
%%time

sku_list = list(df["SKU"].unique())

pairs = list(itertools.combinations_with_replacement(sku_list, 2))

s1_list = list()
s2_list = list()
cooccur_list = list()

for s1, s2 in pairs:
    s1_list.append(s1)
    s2_list.append(s2)
    
    freq = len(set(df[df["SKU"] == s1]["Order"]) \
               .intersection(set(df[df["SKU"] == s2]["Order"]))
              )
    cooccur_list.append(freq)

pairwise_cooccurrence = pd.DataFrame({"SKU1": s1_list, "SKU2": s2_list, "cooccurrence_frequency": cooccur_list})

co_matrix = pd.DataFrame(np.zeros((len(sku_list), len(sku_list))), index=sku_list, columns=sku_list)

for _, row in pairwise_cooccurrence.iterrows():
    co_matrix.loc[row["SKU1"], row["SKU2"]] = row["cooccurrence_frequency"]
    co_matrix.loc[row["SKU2"], row["SKU1"]] = row["cooccurrence_frequency"]

recc_df = pd.DataFrame(columns = ["Order", "SKU", "Member"])

for order in test_data["Order"].unique():
    test_order = test_data[test_data["Order"] == order]
    prob_list = list()
    for sku in sku_list:
        try:
            cond_prob_list = list()
            for basket_item in test_order["SKU"]:
                p = co_matrix.loc[sku, basket_item]/co_matrix.loc[basket_item, basket_item]
                cond_prob_list.append(p)
                # print(f"sku: {sku}, basket_item: {basket_item}, probability: {p}")
            prob_list.append(math.prod(cond_prob_list))
        except:
            prob_list.append(0)
    test_recc = pd.DataFrame({"SKU": sku_list, "Probability": prob_list})
    
    test_recc = test_recc.sort_values("Probability", ascending = False).iloc[:5]
    test_recc["Order"] = order
    test_recc["Member"] = test_order["Member"].unique()[0]
    # test_recc = test_recc[["Order", "SKU", "Member"]]

    recc_df = pd.concat([recc_df, test_recc], ignore_index = True)


CPU times: user 1min 1s, sys: 69.7 ms, total: 1min 1s
Wall time: 1min 1s


In [60]:
recc_df = recc_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df.head()

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668379,SWLCNOE
1,2,7409204,15668688,SWLCNOE
2,3,7409204,15668460,SWLCNOE
3,4,7409204,15668381,SWLCNOE
4,5,7409204,15669878,SWLCNOE


In [61]:
# recc_df.to_csv("probability_based_recc.csv", index = False)