# Final Project - Recommender System

Clayton Cohn<br>
Rosario Fabien<br>
Richard Olekanma

In [1]:
# Original dataset taken from: https://www.kaggle.com/mkechinov/ecommerce-events-history-in-cosmetics-shop?select=2019-Dec.csv
# Original dataset attributable to: https://rees46.com/

import pandas as pd

# NOTE(review): absolute, machine-specific directory. None of the visible cells
# reference this constant (the CSV is read via a relative filename) — confirm
# whether it is still needed before relying on it.
DATA_PATH = "/Users/claytoncohn/Dropbox/New/DePaul/DSC478/FinalProject/"

In [2]:
# Clayton Cohn's portion: item-based collaborative filtering recommender system

# First import the data (I pre-cleaned it in FinalProjectClean.py)
import numpy as np

# header=0 marks the file's first line as a header row, which the explicit
# column names below then replace.
purchase_columns = ["product_id", "user_id"]
df = pd.read_csv("cosmetics_data.csv", header=0, names=purchase_columns)
print(df.shape)
df.head(n=10)

(79819, 2)


Unnamed: 0,product_id,user_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [3]:
# Create dictionary of all users and the products each user bought (repeat purchases appear more than once)
# Each key is user
# Each value is array of products purchased by that user

# Map each user_id to the list of product_ids on that user's rows, in file
# order; a product bought several times appears several times in the list.
# Walking the two columns in lockstep avoids iterrows(), which builds a full
# Series object for every row, and setdefault() appends to the stored list
# directly, so nothing has to be written back into the dictionary.
users = {}
for user, product in zip(df["user_id"], df["product_id"]):
    users.setdefault(user, []).append(product)

In [4]:
# Code snippet taken from:
# https://stackoverflow.com/questions/7971618/python-return-first-n-keyvalue-pairs-from-dict

# Helper function to grab first n (key,value) pairs of given dictionary
from itertools import islice

def take(n, iterable):
    """Return a list holding at most the first n items produced by iterable."""
    leading_items = islice(iterable, n)
    return list(leading_items)

# Report how many distinct users there are, then show the first ten
# (user, purchased products) pairs as a sanity check.
print("{} total users.".format(len(users)))

head = take(10, users.items())
for row in head:
    print(row)

19209 total users.
(0, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
(1, [14, 15, 14, 15, 145, 145, 14, 15, 14, 15, 145])
(2, [16])
(3, [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])
(4, [33, 34])
(5, [35])
(6, [36, 37])
(7, [38, 39, 40, 41])
(8, [42, 43, 44, 45, 46, 47, 34, 42, 43, 44, 45, 46, 34, 42, 43, 44, 45, 46, 34])
(9, [48, 49, 50, 51])


In [5]:
# Create an array from above users dictionary.
# Each entry will look like this: [user,product,count]
# An entry of [0,0,3] means that user 0 bought product 0 a total of 3 times

import numpy as np

# Flatten the users dictionary into [user, product, count] rows.
# np.unique(..., return_counts=True) yields each distinct product together
# with its number of occurrences in one pass, instead of re-scanning the
# user's whole purchase list once per distinct product.
data = []
for user, purchased in users.items():
    prod_unique, prod_counts = np.unique(np.array(purchased), return_counts=True)
    for product_id, count in zip(prod_unique, prod_counts):
        data.append([user, product_id, count])

In [6]:
# Number of distinct (user, product) pairs, followed by the first 25 rows.
n_combinations = len(data)
print("{} user/product combinations.".format(n_combinations))
data[:25]

77470 user/product combinations.


[[0, 0, 1],
 [0, 1, 1],
 [0, 2, 1],
 [0, 3, 1],
 [0, 4, 1],
 [0, 5, 1],
 [0, 6, 1],
 [0, 7, 1],
 [0, 8, 1],
 [0, 9, 1],
 [0, 10, 1],
 [0, 11, 1],
 [0, 12, 1],
 [0, 13, 1],
 [1, 14, 4],
 [1, 15, 4],
 [1, 145, 3],
 [2, 16, 1],
 [3, 17, 1],
 [3, 18, 1],
 [3, 19, 1],
 [3, 20, 1],
 [3, 21, 1],
 [3, 22, 1],
 [3, 23, 1]]

In [7]:
# Convert above array to a Pandas DataFrame

# One row per (user, product) pair with the number of times it was bought.
df_data = pd.DataFrame(data=data, columns=["user_id", "product_id", "count"])
print(df_data.shape)
# Seeded so the preview shows the same rows on every run (33 is the value the
# SEED constant takes in the later train/test split cell).
df_data.sample(10, random_state=33)

(77470, 3)


Unnamed: 0,user_id,product_id,count
61548,14594,709,1
56184,13110,2,1
57420,13443,168,1
25816,5694,888,1
25901,5713,47,1
30590,6752,44,1
40338,9059,324,1
63387,15102,430,1
73539,18079,797,1
74477,18350,965,2


In [8]:
# Convert DataFrame to Pivot Table

# Users become rows, products become columns, cells hold the purchase count.
# pivot_table aggregates with the mean by default; df_data holds one row per
# (user, product) pair, so the aggregated value is simply that pair's count.
df_mat_user = df_data.pivot_table(
    values="count",
    index="user_id",
    columns="product_id",
)
df_mat_user.head(n=10)

product_id,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Swap NaNs for 0s
# This works fine, as NaN means no purchases and 0 means no purchases

# User/product pairs with no purchase become explicit zero counts.
df_mat_user = df_mat_user.fillna(value=0)
df_mat_user.head(n=10)

product_id,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Must split users for validation purposes
# Item-item similarity matrix will be based on the training set
# hold-out (test) set of users will be validated with same item-item similarity matrix 
#    created from training set

from sklearn.model_selection import train_test_split

# Fixed seed so the user split is the same on every run.
SEED = 33

# Rows (users) are shuffled and 20% of them are held out as the test set.
split_frames = train_test_split(df_mat_user, test_size=0.2, random_state=SEED)
X_train, X_test = split_frames
X_train.head()

product_id,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Renamed variables to be more intuitive

# Friendlier aliases for the two splits (same objects, no copies made).
users_train, users_test = X_train, X_test
# Products-by-users view of the training split.
products = users_train.transpose()
print(products.shape)
products

(1000, 15367)


user_id,8537,17145,3326,2919,18004,6819,947,3222,18964,16727,...,2044,13710,18131,2109,16486,10435,57,578,5848,2439
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Compute item-item similarity matrix
# Using cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Sparse (CSR) form of the products-by-users training matrix.
products_s = sparse.csr_matrix(products)
# Pairwise cosine similarity between the rows of products_s, i.e. between
# products; entry [i, j] compares the purchase vectors of products i and j.
item_item_mat = cosine_similarity(products_s)
item_item_mat

array([[1.        , 0.24935373, 0.34384262, ..., 0.        , 0.01673889,
        0.        ],
       [0.24935373, 1.        , 0.34347103, ..., 0.00841794, 0.01018924,
        0.        ],
       [0.34384262, 0.34347103, 1.        , ..., 0.        , 0.01224898,
        0.        ],
       ...,
       [0.        , 0.00841794, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01673889, 0.01018924, 0.01224898, ..., 0.        , 1.        ,
        0.02738955],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02738955,
        1.        ]])

In [13]:
# Recommender function

# Function takes a user, identifies user's favorite product, returns top n closest 
#     products to user's favorite product based on item-item similarity matrix

import operator

def recommend(user, users, item_item_mat,n,verbose=True):
    """Recommend the products most similar to a user's most-purchased product.

    Parameters
    ----------
    user : label
        Row label of the user in ``users``.
    users : pandas.DataFrame
        User-by-product purchase counts, one row per user. Column positions
        are used to index ``item_item_mat``, so they must be in the same
        order as its rows/columns.
    item_item_mat : 2-D array
        Square item-item similarity matrix.
    n : int
        Maximum number of recommendations to return.
    verbose : bool, default True
        Print the user's favorite product and how often it was purchased.

    Returns
    -------
    pandas.Series
        Up to ``n`` similarity scores in descending order, indexed by the
        product's position in ``item_item_mat``. The favorite product itself
        is never part of the result.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of ``users``.
    """
    # Row lookup: same Series as users.T[user], without transposing the whole
    # user-by-product frame on every call.
    u = users.loc[user]

    # max() keeps the first maximum, so ties go to the lowest product position.
    prod_index, prod_count = max(enumerate(u), key=operator.itemgetter(1))
    if verbose:
        print("User {}'s favorite product is {} (purchased {} times)".format(user,prod_index,prod_count))

    item_item = pd.Series(data=item_item_mat[prod_index])

    # Exclude the favorite by position rather than by dropping the top-ranked
    # row: when another product ties with it (or its similarity row is all
    # zeros) the favorite is not guaranteed to sort first, and it would then
    # be handed back as a "recommendation".
    candidates = item_item[item_item.index != prod_index]

    # A stable sort keeps tied products in a reproducible order.
    ranked = candidates.sort_values(ascending=False, kind="mergesort")

    top = ranked.iloc[:max(n, 0)]

    return top

In [14]:
# Test call to demonstrate functionality of recommender
# The user id is fixed to the SEED value (it is not randomly drawn); the lookup
# only works if that id is a row of users_train. n = 5 recommendations are returned.

TEST_USER = SEED

top_n = recommend(TEST_USER, users_train, item_item_mat, 5)
top_n

User 33's favorite product is 143 (purchased 3.0 times)


266    0.631754
989    0.362554
267    0.236810
144    0.205782
269    0.190693
dtype: float64

In [15]:
# Function to evaluate quality of recommendations
# Function takes user, recommendations as arguments, and checks those 
#    recommendations against user's actual purchase history
# Binary return type: 1 = user DID buy at least one recommended product
#                     0 = user DID NOT buy any of the recommended products
# Evaluation will be done on both training and test sets for each user

def didPurchase(user, users, products, top_n_ind):
    """Tell whether a user bought at least one of the recommended products.

    Parameters
    ----------
    user : label
        Row label of the user in ``users``.
    users : pandas.DataFrame
        User-by-product purchase counts, one row per user.
    products : object
        Not read by this function; kept so existing calls keep working.
    top_n_ind : iterable
        Product identifiers to look up in the user's purchase row.

    Returns
    -------
    int
        1 if any listed product has a purchase count above zero, else 0.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of ``users``.
    """
    # Row lookup: same Series as users.T[user], without transposing the whole
    # user-by-product frame on every call (this runs once per user during
    # validation).
    u = users.loc[user]
    # any() stops at the first purchased recommendation, like the original break.
    return int(any(u[i] > 0 for i in top_n_ind))

In [16]:
# Isolate the indexes (product_ids) of recommended products
# The Series index holds the recommended product positions; keep only those.
top_n_ind = top_n.index.tolist()
top_n_ind

[266, 989, 267, 144, 269]

In [17]:
# Test call to see if user actually bought any of the recommendations
# 1 means the test user bought at least one recommended product, 0 means none.
purchased_flag = didPurchase(TEST_USER, users_train, products, top_n_ind)
print(purchased_flag)

1


In [18]:
# Will check each user's recommendations against purchase history

# TRAINING SET VALIDATION

# Count the training users who bought at least one of their 5 recommendations.
# NOTE(review): each user's recommendations are checked against the same
# purchase row that was used to pick their favorite product, so this measures
# co-purchase within known history rather than prediction of unseen purchases.
bought_recs_train = 0
# Only the user ids are needed, so walk the index instead of materialising
# every row with iterrows().
for index in users_train.index:
    top_n = recommend(index, users_train, item_item_mat, 5, verbose=False)
    top_n_ind = list(top_n.index)
    if didPurchase(index, users_train, products, top_n_ind):
        bought_recs_train += 1

bought_recs_train

3956

In [19]:
# Get percentage of users (training) who actually purchased a recommended product
# The printed value is a fraction between 0 and 1, not a number out of 100.
n_train_users = users_train.shape[0]
pct_purchased_train = bought_recs_train / n_train_users
print("% of users who purchased recommended item (training):", pct_purchased_train)

% of users who purchased recommended item (training): 0.2574347628034099


In [20]:
# TEST SET VALIDATION

# Count the held-out users who bought at least one of their 5 recommendations.
# item_item_mat was built from the training users only, so the similarities
# are not influenced by these users' purchases.
# NOTE(review): each user's recommendations are still checked against the same
# purchase row that was used to pick their favorite product.
bought_recs_test = 0
# Only the user ids are needed, so walk the index instead of materialising
# every row with iterrows().
for index in users_test.index:
    top_n = recommend(index, users_test, item_item_mat, 5, verbose=False)
    top_n_ind = list(top_n.index)
    if didPurchase(index, users_test, products, top_n_ind):
        bought_recs_test += 1

bought_recs_test

834

In [21]:
# Get percentage of users (test) who actually purchased a recommended product
# The printed value is a fraction between 0 and 1, not a number out of 100.
n_test_users = users_test.shape[0]
pct_purchased_test = bought_recs_test / n_test_users
print("% of users who purchased recommended item (testing):", pct_purchased_test)

% of users who purchased recommended item (testing): 0.2170744403956273


In [22]:
'''
This is rather substantial. It says that 1/5 of the time, consumers purchased at least one item that was recommended
to them. Although I have no way of telling how this number stacks up against Amazon, it is nevertheless significant.
Obviously there are caveats. For one thing, I only included the <5% most frequently purchased products. I imagine it
is harder to recommend products when the matrix is sparser. Also, I read that I can improve the item-item similarity
matrix by only counting cosine similarities of items both purchased by the same user.
'''

'\nThis is rather substantial. It says that 1/5 of the time, consumers purchased at least one item that was recommended\nto them. Although I have no way of telling how this number stacks up against Amazon, it is nervertheless significant.\nObviously there are caveats. For one thing, I only included the <5% most frequently purchased products. I imagine it\nis harder to recommend products when the matrix is sparser. Also, I read that I can improve the item-item similarity\nmatrix by only counting cosine similarities of items both purchased by the same user.\n'