In [1]:
import pandas as pd
import numpy as np

import itertools
import math

# from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Orders for which recommendations are generated further down.
test_data = pd.read_csv(filepath_or_buffer='last_orders_subset.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5487 entries, 0 to 5486
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          5487 non-null   int64 
 1   SKU            5487 non-null   int64 
 2   Member         5487 non-null   object
 3   Delivery Date  5487 non-null   object
 4   Name           5487 non-null   object
dtypes: int64(2), object(3)
memory usage: 214.5+ KB


In [3]:
# Order history from which the per-member SKU frequencies are computed.
df = pd.read_csv(filepath_or_buffer='all_except_last_orders.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order          28984 non-null  int64 
 1   SKU            28984 non-null  int64 
 2   Member         28984 non-null  object
 3   Delivery Date  28984 non-null  object
 4   Name           28984 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.1+ MB


In [4]:
# Convert the delivery dates from strings to datetimes, reading ambiguous
# values such as 05/09/13 with the day first.
df['Delivery Date'] = pd.to_datetime(arg=df['Delivery Date'], dayfirst=True)

# Past Purchase Frequency Based Recommendation

Each member is recommended the items that appeared most frequently in their own past orders, giving priority to items that are not already in the order being scored.

In [10]:
# Count the rows of `df` for every (Member, SKU) pair, then list each member's
# SKUs from the most to the least frequently purchased.
sku_freq_df = (
    df[["Member", "SKU"]]
    .groupby(["Member", "SKU"])
    .size()
    .reset_index(name='frequency')
    .sort_values(["Member", "frequency"], ascending=[True, False])
)

sku_freq_df.head(n=5)

Unnamed: 0,Member,SKU,frequency
2,SSCEHNS,7580823,7
8,SSCEHNS,15668377,7
34,SSCEHNS,15669865,7
20,SSCEHNS,15669772,6
40,SSCEHNS,15669970,6


In [11]:
# Preview the first five rows of the orders to be scored.
test_data.head(n=5)

Unnamed: 0,Order,SKU,Member,Delivery Date,Name
0,7409204,15669778,SWLCNOE,05/09/13,Other Dals
1,8076206,15669977,SWOEZES,01/04/14,Almonds
2,7560723,7593949,SSWWRHW,30/06/13,Cream Biscuits
3,8362837,15669764,SWLSCOZ,06/11/13,Besan
4,8202458,15670196,SSRCRSO,03/02/14,Organic F&V


In [12]:
# Keep only the columns needed to match the scored orders against the history.
order_member_sku_df = test_data.loc[:, ["Order", "Member", "SKU"]]
order_member_sku_df.head(n=5)

Unnamed: 0,Order,Member,SKU
0,7409204,SWLCNOE,15669778
1,8076206,SWOEZES,15669977
2,7560723,SSWWRHW,7593949
3,8362837,SWLSCOZ,15669764
4,8202458,SSRCRSO,15670196


In [13]:
# Left-join the scored orders onto the purchase frequencies: `Order` is filled
# when the member bought that SKU in a scored order and is NaN otherwise.
sku_freq_with_curr_order = sku_freq_df.merge(
    order_member_sku_df,
    on=["Member", "SKU"],
    how="left",
)
sku_freq_with_curr_order.head(n=5)

Unnamed: 0,Member,SKU,frequency,Order
0,SSCEHNS,7580823,7,
1,SSCEHNS,15668377,7,
2,SSCEHNS,15669865,7,8069966.0
3,SSCEHNS,15669772,6,
4,SSCEHNS,15669970,6,8069966.0


In [14]:
# Purchase frequencies of a single member, used for a manual spot check.
# NOTE: despite the "SSCEHNS" in the variable name, the filter selects
# member "SWOEZES".
SSCEHNS_past = sku_freq_df.loc[sku_freq_df["Member"].eq("SWOEZES")]

In [15]:
# Rows of the single order (id 8076206) inspected in the manual spot check.
SSCEHNS_order = order_member_sku_df.loc[order_member_sku_df["Order"].eq(8076206)]

In [16]:
# SKUs from the member's history that are absent from the inspected order,
# kept in the order of SSCEHNS_past (which is sorted by descending frequency).
# The order's SKUs are collected once into a set, so every membership test is
# a constant-time lookup instead of rebuilding and scanning a list on each
# iteration.
ordered_skus = set(SSCEHNS_order["SKU"])
recc_sku = [past_sku for past_sku in SSCEHNS_past["SKU"] if past_sku not in ordered_skus]

In [17]:
# Show the first five recommended SKUs from the spot check.
recc_sku[0:5]

[15669869, 15669878, 15669866, 7587667, 7642810]

In [18]:
# Past SKUs of member SWLCNOE that found no match in the scored orders
# (their `Order` is NaN after the left join).
sku_freq_with_curr_order.loc[
    sku_freq_with_curr_order["Order"].isna()
    & sku_freq_with_curr_order["Member"].eq("SWLCNOE")
].head(n=5)

Unnamed: 0,Member,SKU,frequency,Order
10642,SWLCNOE,15668465,4,
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,


In [19]:
# All past SKUs of member SWLCNOE, matched or not, for comparison.
sku_freq_with_curr_order.loc[
    sku_freq_with_curr_order["Member"].eq("SWLCNOE")
].head(n=5)

Unnamed: 0,Member,SKU,frequency,Order
10640,SWLCNOE,15668478,8,7409204.0
10641,SWLCNOE,15668379,6,7409204.0
10642,SWLCNOE,15668465,4,
10643,SWLCNOE,15668457,3,7409204.0
10644,SWLCNOE,15668459,3,
10645,SWLCNOE,15668460,3,
10646,SWLCNOE,15668467,3,
10647,SWLCNOE,15668494,3,
10648,SWLCNOE,15668594,3,
10649,SWLCNOE,15668688,3,7409204.0


In [20]:
# Build the recommendations for every order in `test_data`.
#
# For each (Order, Member) pair the member's previously purchased SKUs are
# ranked so that SKUs absent from this order come first and, within each
# group, the most frequently purchased come first. The first TOP_N rows become
# the recommendations for that order. The result, `recc_df`, has the columns
# Member, SKU and Order.
TOP_N = 5  # cut-off that the original `.head()` applied implicitly

recc_df_list = list()

order_member_pairs = test_data[["Order", "Member"]].drop_duplicates()
for order_id, member in order_member_pairs.itertuples(index=False, name=None):
    member_rows = sku_freq_with_curr_order[sku_freq_with_curr_order["Member"] == member]

    # SKUs bought in *this* order: the left merge tagged them with its id.
    skus_in_order = member_rows.loc[member_rows["Order"] == order_id, "SKU"]

    # One candidate row per SKU; the merge repeats a SKU whenever it matches
    # more than one row of the scored orders.
    candidates = member_rows.drop_duplicates(subset="SKU")

    # Rank on an explicit boolean key. The previous version filled missing
    # orders with the string "1000000000", which left a mixed float/str
    # column whose sort order is not well defined.
    df_temp = (
        candidates
        .assign(_in_order=candidates["SKU"].isin(skus_in_order))
        .sort_values(["_in_order", "frequency"], ascending=[True, False])
        .head(TOP_N)
        .drop(columns=["_in_order", "frequency"])
        .assign(Order=order_id)
    )
    recc_df_list.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when there is nothing to score.
if recc_df_list:
    recc_df = pd.concat(recc_df_list, ignore_index=True)
else:
    recc_df = pd.DataFrame(columns=["Member", "SKU", "Order"])

In [21]:
# Reorder the columns and add a 1-based running "ID" derived from the row
# position of each recommendation.
recc_df = (
    recc_df[["Order", "SKU", "Member"]]
    .reset_index()
    .rename(columns={"index": "ID"})
    .assign(ID=lambda frame: frame["ID"] + 1)
)
recc_df.head(n=5)

Unnamed: 0,ID,Order,SKU,Member
0,1,7409204,15668465,SWLCNOE
1,2,7409204,15668459,SWLCNOE
2,3,7409204,15668460,SWLCNOE
3,4,7409204,15668467,SWLCNOE
4,5,7409204,15668494,SWLCNOE


In [22]:
# recc_df.to_csv("past_frequency_based_recc.csv", index = False)