In [1]:
# Hide deprecation warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import heapq
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

## Load Datasets

In [2]:
df_order_products__prior = pd.read_csv("../data/raw/order_products__prior.csv")
df_order_products__train = pd.read_csv("../data/raw/order_products__train.csv")
df_orders = pd.read_csv("../data/interim/df_orders_clustered.csv") 
df_products = pd.read_csv("../data/raw/products.csv")

## Data Preparation

### User basket products

In [3]:
df_orders_test = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
df_orders_test.head()

Unnamed: 0,index,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cluster
0,10,1187899,1,train,11,4,8,14.0,0
1,25,1492625,2,train,15,1,11,30.0,0
2,49,2196797,5,train,5,0,11,6.0,0
3,74,525192,7,train,21,2,11,6.0,0
4,78,880375,8,train,4,1,14,10.0,0


In [4]:
df_orders_test.shape

(131209, 9)

In [5]:
df_orders_test = df_orders_test[["order_id", "user_id", 'cluster']]
df_orders_test.head()

Unnamed: 0,order_id,user_id,cluster
0,1187899,1,0
1,1492625,2,0
2,2196797,5,0
3,525192,7,0
4,880375,8,0


In [6]:
df_test = df_order_products__train[["order_id", "product_id"]]
df_test.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [7]:
df_test.shape

(1384617, 2)

In [8]:
df_test = df_test.groupby("order_id")["product_id"]\
                         .apply(list).reset_index().rename(columns={"product_id": "products"})
    
df_test.head()

Unnamed: 0,order_id,products
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [9]:
df_test.shape

(131209, 2)

In [10]:
df_test = pd.merge(df_orders_test, df_test, on="order_id")
df_test.head()

Unnamed: 0,order_id,user_id,cluster,products
0,1187899,1,0,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,1492625,2,0,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,2196797,5,0,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,525192,7,0,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,880375,8,0,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [11]:
df_test = df_test[["user_id", "products", "cluster"]]
df_test.head()

Unnamed: 0,user_id,products,cluster
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...",0
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...",0
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482...",0
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176...",0
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...",0


In [12]:
len(df_test)

131209

### Users prior purchases per product

In [13]:
df_orders_train = df_orders.loc[df_orders.eval_set == "prior"]
df_orders_train.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cluster
0,2539329,1,prior,1,2,8,,0
1,2398795,1,prior,2,3,7,15.0,0
2,473747,1,prior,3,3,12,21.0,0
3,2254736,1,prior,4,4,7,29.0,0
4,431534,1,prior,5,4,15,28.0,0


In [14]:
df_orders_train.shape

(3214874, 8)

In [15]:
df_orders_train = df_orders_train[["order_id", "user_id", "cluster"]]
df_orders_train.head()

Unnamed: 0,order_id,user_id,cluster
0,2539329,1,0
1,2398795,1,0
2,473747,1,0
3,2254736,1,0
4,431534,1,0


In [16]:
df_train = pd.merge(df_orders_train, df_order_products__prior[["order_id", "product_id"]],\
                                 on="order_id")
df_train.head()

Unnamed: 0,order_id,user_id,cluster,product_id
0,2539329,1,0,196
1,2539329,1,0,14084
2,2539329,1,0,12427
3,2539329,1,0,26088
4,2539329,1,0,26405


In [17]:
df_train = df_train[["user_id", "product_id", "cluster"]]
df_train = df_train.groupby(["user_id", "product_id", "cluster"])\
                                                      .size().reset_index().rename(columns={0:"quantity"})
df_train.head() 

Unnamed: 0,user_id,product_id,cluster,quantity
0,1,196,0,10
1,1,10258,0,9
2,1,10326,0,1
3,1,12427,0,10
4,1,13032,0,3


In [18]:
df_train.shape

(13307953, 4)

In [19]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13307953 entries, 0 to 13307952
Data columns (total 4 columns):
user_id       int64
product_id    int64
cluster       int64
quantity      int64
dtypes: int64(4)
memory usage: 406.1 MB


## Utility Matrices

In [20]:
clusternumber = len(df_train.cluster.unique())

In [21]:
cluster = []

for i in range(clusternumber):
    cluster.append(df_train.loc[df_train['cluster'] == i].drop('cluster',axis=1))

In [22]:
for i in range(clusternumber):
    cluster[i]["user_id"] = cluster[i]["user_id"].astype("category")
    cluster[i]["product_id"] = cluster[i]["product_id"].astype("category")

In [23]:
utility_matrix = []

for i in range(clusternumber):
    utility_matrix.append(coo_matrix((cluster[i]["quantity"],
                                     (cluster[i]["product_id"].cat.codes.copy(),
                                      cluster[i]["user_id"].cat.codes.copy()))))

In [24]:
for i in range(clusternumber):
    print("Utility matrix {} shape: {}".format(i,utility_matrix[i].shape))

Utility matrix 0 shape: (49434, 164459)
Utility matrix 1 shape: (35129, 7551)
Utility matrix 2 shape: (47065, 34199)


## Popular products and users new products

In [25]:
popular_products = list(df_order_products__prior["product_id"].value_counts().head(10).index)

popular_products

[24852, 13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

In [26]:
print("Most popular products:")
df_products.product_name.loc[df_products.product_id.isin(popular_products)].reset_index(drop=True)

Most popular products:


0    Bag of Organic Bananas
1              Strawberries
2      Organic Strawberries
3      Organic Baby Spinach
4                    Banana
5                     Limes
6        Organic Whole Milk
7      Organic Hass Avocado
8               Large Lemon
9           Organic Avocado
Name: product_name, dtype: object

In [27]:
utility_matrix_T = []

for i in range(clusternumber):
    utility_matrix_T.append(utility_matrix[i].T.tocsr())

In [28]:
users = []

for i in range(clusternumber):
    users.append({uid:i for i, uid in enumerate(cluster[i]["user_id"].cat.categories)})

In [29]:
products = []

for i in range(clusternumber):
    products.append(dict(enumerate(cluster[i]["product_id"].cat.categories)))

In [30]:
def past_products(row):
    return set([products[row["cluster"]][i] for i in \
                utility_matrix_T[row["cluster"]][users[row["cluster"]][row["user_id"]]].indices]) 

def new_products(row):
    return set(row['products']) - set(row['past_products'])

In [31]:
df_test['past_products'] = df_test.apply(past_products, axis=1)
df_test['new_products'] = df_test.apply(new_products, axis=1)

df_test.head()

Unnamed: 0,user_id,products,cluster,past_products,new_products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...",0,"{17122, 196, 14084, 26405, 46149, 13032, 26088...",{27845}
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...",0,"{45066, 2573, 18961, 23, 1559, 32792, 22559, 1...","{5699, 12324, 24838, 12007, 13640, 11913, 3188..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482...",0,"{11777, 28289, 40706, 48775, 20754, 6808, 1398...","{20843, 48204, 19057, 20114, 16185}"
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176...",0,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",{12053}
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...",0,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...","{27104, 5539, 31717, 48230, 22247, 41259, 3780..."


## Recommendation with user to user similarity

#### Example: User ID 1

Get cluster

In [32]:
user_ex = 1
cluster = df_train.cluster.loc[df_train.user_id == user_ex].unique()[0]
cluster

0

Get top similar users

In [33]:
similarities = cosine_similarity(utility_matrix_T[cluster][users[cluster][1]],utility_matrix_T[cluster])

In [34]:
ids = np.argpartition(similarities[0], -11)[-11:]
best = sorted(zip(ids, similarities[0][ids]), key=lambda x: -x[1])[1:]

In [35]:
ids

array([114419,  90743,  30663,  48978, 109459, 109793, 116893, 116975,
         8343, 143364,      0])

In [36]:
best

[(143364, 0.69954805271423959),
 (30663, 0.63279465820501035),
 (109793, 0.62118777059707386),
 (8343, 0.61769493794340791),
 (116893, 0.61481644555118409),
 (116975, 0.61430280375308932),
 (109459, 0.6088063653416409),
 (48978, 0.60432645275039742),
 (90743, 0.59345547322526804),
 (114419, 0.5756524760966798)]

Let's check if they're really similar

In [37]:
ex_user_products = set(utility_matrix_T[cluster][ids[-1]].nonzero()[1])
print("User products history:")
df_products.product_name.loc[df_products.product_id.isin(ex_user_products)].reset_index(drop=True)

User products history:


0     Grade A Pasteurized 2% Milkfat Lowfat Cottage ...
1                Red Velvet Cupcake Cake & Frosting Mix
2                             Men's One Tablets - 30 CT
3                         Skin Trip Coconut Moisturizer
4                                 Chocolate Cashew Milk
5                   Organic Vegetarian Pho Soup Starter
6                             Unsweetened Coconut Juice
7              Turkey Bacon, Uncured, Cherrywood Smoked
8                                  Organic Banana Chips
9                     Cheese Natural Sliced Pepper Jack
10                        Tortillas, White Corn, Grande
11                           Sea Salt Caramel ice Cream
12                                 Organic Power Greens
13                                   Original Detergent
14                        Lotzza Motzza Pepperoni Pizza
15    Maximum Absorbency Small/Medium Incontinence U...
16                             Whole Grain Hot Dog Buns
17                    Organic Steamable Cut Gree

In [38]:
similar_user_products = set(utility_matrix_T[cluster][ids[-2]].nonzero()[1])
print("Most similar user products history:")
df_products.product_name.loc[df_products.product_id.isin(similar_user_products)].reset_index(drop=True)

Most similar user products history:


0     Grade A Pasteurized 2% Milkfat Lowfat Cottage ...
1                               Organic Unsalted Butter
2                       Riserva Ducale Chianti Classico
3                         Skin Trip Coconut Moisturizer
4                   Organic Vegetarian Pho Soup Starter
5                                  Dry-Cured Prosciutto
6                 Instant Coffee Crystals Classic Decaf
7         Veggie & Fruit Snacks, Carrot, Mango & Orange
8                                  Organic Banana Chips
9       Fruit Naturals Cherry Mixed Fruit in 100% Juice
10                 Micro Brewed Black Cherry Cream Soda
11                      San Marzano Tomatoes With Basil
12            Organic Green Tea With Pomegranate & Acai
13                                    Classic Mouthwash
14                             Whole Grain Hot Dog Buns
Name: product_name, dtype: object

In [39]:
print("Recall:",len(similar_user_products.intersection(ex_user_products)) / len(similar_user_products))

Recall: 0.3333333333333333


Quite similar products!

Let's get now the recommendations

In [40]:
ids = ids[:-1]

In [41]:
if len(df_test.products.loc[df_test.user_id == user_ex])>0:
    products_in_basket = df_test.products.loc[df_test.user_id == user_ex].tolist()[0]
else:
    products_in_basket = []
final_recommendations = []
final_valuation = []

for i in range(len(ids)):
    similar_users_products = utility_matrix_T[cluster][ids[i]].nonzero()[1]
    #Mask to filter products already in the user's cart
    mask = np.isin(similar_users_products, products_in_basket, invert=True)
    for j in range(len(similar_users_products[mask])):
        if np.isin(similar_users_products[mask][j], final_recommendations, invert=True):
            final_recommendations.append(similar_users_products[mask][j])
            final_valuation.append(best[-(i+1)][1])
        else:
            index = final_recommendations.index(similar_users_products[mask][j])
            final_valuation[index]+= best[-(i+1)][1]
            
final_recommendations = np.asarray(final_recommendations)
final_valuation = np.asarray(final_valuation)

In [42]:
ind = heapq.nlargest(min(10,len(final_recommendations)), range(len(final_valuation)), final_valuation.take)
final_recommendations = final_recommendations[ind]

In [43]:
print("Recommended products:")
df_products.product_name.loc[df_products.product_id.isin(final_recommendations)].reset_index(drop=True)

Recommended products:


0    Grade A Pasteurized 2% Milkfat Lowfat Cottage ...
1                               Original String Cheese
2               Red Velvet Cupcake Cake & Frosting Mix
3                      Riserva Ducale Chianti Classico
4                        Skin Trip Coconut Moisturizer
5                  Organic Vegetarian Pho Soup Starter
6                      San Marzano Tomatoes With Basil
7            Organic Green Tea With Pomegranate & Acai
8                                    Marshmallow Creme
9                             Whole Grain Hot Dog Buns
Name: product_name, dtype: object

Let's do it now for the rest of the users, or a sample of them

In [44]:
subset = 0.05 #We will make the predictions only in 5% of the data

df_test = df_test.sample(n=int(len(df_test) * subset)).reset_index(drop=True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6560 entries, 0 to 6559
Data columns (total 5 columns):
user_id          6560 non-null int64
products         6560 non-null object
cluster          6560 non-null int64
past_products    6560 non-null object
new_products     6560 non-null object
dtypes: int64(2), object(3)
memory usage: 256.3+ KB


In [45]:
def recall_user2user(row):
    cluster = df_train.cluster.loc[df_train.user_id == row['user_id']].unique()[0]
    similarities = cosine_similarity(utility_matrix_T[cluster][users[cluster][row["user_id"]]],utility_matrix_T[cluster])
    ids = np.argpartition(similarities[0], -11)[-11:]
    best = sorted(zip(ids, similarities[0][ids]), key=lambda x: -x[1])[1:]
    
    ids = ids[:-1]

    if len(df_test.products.loc[df_test.user_id == row['user_id']])>0:
        products_in_basket = df_test.products.loc[df_test.user_id == row['user_id']].tolist()[0]
    else:
        products_in_basket = []

    final_recommendations = []
    final_valuation = []

    for i in range(len(ids)):
        similar_users_products = utility_matrix_T[cluster][ids[i]].nonzero()[1]
        #Mask to filter products already in the user's cart
        mask = np.isin(similar_users_products, products_in_basket, invert=True)
        for j in range(len(similar_users_products[mask])):
            if np.isin(similar_users_products[mask][j], final_recommendations, invert=True):
                final_recommendations.append(similar_users_products[mask][j])
                final_valuation.append(best[-(i+1)][1])
            else:
                index = final_recommendations.index(similar_users_products[mask][j])
                final_valuation[index]+= best[-(i+1)][1]

    final_recommendations = np.asarray(final_recommendations)
    final_valuation = np.asarray(final_valuation)
    
    ind = heapq.nlargest(min(10,len(final_recommendations)), range(len(final_valuation)), final_valuation.take)
    final_recommendations = set(final_recommendations[ind])
    
    return final_recommendations


df_test['Recommendations'] = df_test.apply(recall_user2user, axis=1)

In [46]:
df_test.head()

Unnamed: 0,user_id,products,cluster,past_products,new_products,Recommendations
0,68472,"[39814, 14678, 31474, 21525, 13424, 44088, 198...",0,"{16283, 42629, 39814, 13575, 23719, 19887, 387...","{13424, 44088, 21525, 14678}","{30306, 17123, 20856, 13510, 32840, 38569, 162..."
1,43902,"[45066, 24390, 33787, 33754, 34551, 23537, 276...",0,"{260, 2309, 1158, 11782, 24964, 24841, 14218, ...","{14947, 45541, 24390, 44632, 23537, 17300, 439...","{47520, 44835, 21801, 44777, 23915, 16715, 210..."
2,151361,"[36976, 24852, 25055, 26604, 35221, 32740]",1,"{1025, 8193, 23044, 13829, 4614, 10246, 6664, ...",{},"{18560, 34434, 33798, 14951, 17611, 33707, 155..."
3,83091,"[35221, 43631, 36386, 49355, 14947, 47185, 424...",0,"{40963, 27652, 23051, 44048, 3605, 35221, 2703...","{48226, 37606, 49319, 11259, 35164, 16959}","{47520, 43554, 27523, 3782, 36199, 44775, 4341..."
4,111854,"[4198, 19495, 15164, 42535, 37436, 39139, 46274]",0,"{37512, 28427, 2708, 6550, 17819, 19495, 21288...","{46274, 39139, 4198, 42535, 37436}","{195, 41193, 19403, 41071, 15091, 26230, 13112..."


In [47]:
df_test = df_test[['user_id','cluster','products','Recommendations']]
df_test.columns = ['User','Cluster','Products in basket','Recommendations']

In [48]:
df_test.sort_values('User').head()

Unnamed: 0,User,Cluster,Products in basket,Recommendations
5031,13,0,"[27435, 27086, 4210, 47078, 19934]","{40389, 24741, 39208, 26091, 35827, 26965, 161..."
4442,46,0,"[24852, 42987, 24097, 4605, 21781, 13733]","{11041, 24741, 38360, 39208, 38185, 28074, 167..."
320,79,0,"[27845, 13176, 19057, 21137, 44910, 32747, 342...","{27715, 39075, 23127, 21801, 21933, 21038, 469..."
4546,96,0,"[31371, 43304, 32578, 27966, 39275, 2228, 2629...","{39075, 18967, 24741, 28074, 3979, 7021, 31222..."
1954,138,0,[42475],"{30306, 21801, 6155, 13966, 17937, 32531, 2194..."
