# SOURCE CODE
# https://gist.github.com/victorkohler/f48ea6512058719ba52053851fedc745

In [72]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Data Preprocessing for Matrix User x Item

In [73]:
# Read the train and test splits from CSV files in the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [74]:
# (rows, columns) of the test split.
test_data.shape

(3782335, 12)

In [75]:
# Stack the train and test logs into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and, like append, keeps each frame's original
# row labels (so labels are not unique in the combined frame).
train = pd.concat([train_data, test_data])
train.shape

(19715327, 12)

In [76]:
# Read the item metadata CSV (comma-separated) using pandas' Python parsing engine.
item_metadata = pd.read_csv('item_metadata.csv', sep=',', engine='python')

In [77]:
# Keep only click-out events that carry a reference.
is_clickout = train['action_type'] == 'clickout item'
has_reference = train['reference'].notnull()
train_v2 = train[is_clickout & has_reference]
train_v2.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
29,01R92KASN25O,62991f7c78f27,1541055672,8,clickout item,7818446,DE,"Luebeck, Germany",mobile,,7818446|51315|2133708|119638|86359|133581|6688...,163|75|93|98|169|201|129|117|108|170|133|80|87...
32,026J2T90PT57,67c4d45f56146,1541105688,1,clickout item,2681512,MX,"Ezequiel Montes, Mexico",mobile,,6010044|5433188|5156000|4780400|5137162|992819...,38|35|54|37|73|37|37|30|71|75|64|26|25|37|40|3...
39,032W52V15J3A,e84393cf62d13,1541102603,7,clickout item,929533,HR,"Cancun, Mexico",mobile,,3132957|100226|1954167|9462680|2776177|929533|...,25|324|67|52|57|21|90|240|64|37|70|60|167|45|6...
73,03F93632Y45U,f09aa370de0af,1541109160,34,clickout item,8496296,US,"San Diego, USA",mobile,,9112592|102414|9140448|5654906|4341120|9739716...,263|114|263|273|122|194|236|161|219|113|110|19...
393,08527D0MD8VC,3e2a95e5727f9,1541099104,320,clickout item,8266350,ES,"Lisbon, Portugal",desktop,,3505518|1834297|4837106|3990396|2042495|486747...,47|50|45|40|45|40|40|50|50|45|50|45|38|49|48|3...


In [78]:
# Inspect column dtypes before converting `reference` and `step` to integers.
train_v2.dtypes

user_id            object
session_id         object
timestamp           int64
step                int64
action_type        object
reference          object
platform           object
city               object
device             object
current_filters    object
impressions        object
prices             object
dtype: object

In [79]:
# Cast the click-out reference and the step counter to integers.
# train_v2 is a filtered slice of `train`, so assigning columns onto it in
# place triggers pandas' SettingWithCopyWarning; `assign` builds a new frame
# instead and keeps the original column order.
train_v2 = train_v2.assign(
    reference=train_v2["reference"].astype(int),
    step=train_v2["step"].astype(int),
)
train_v2.dtypes

user_id            object
session_id         object
timestamp           int64
step                int64
action_type        object
reference           int64
platform           object
city               object
device             object
current_filters    object
impressions        object
prices             object
dtype: object

In [80]:
# Preview the click-out rows after the dtype conversion.
train_v2.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
29,01R92KASN25O,62991f7c78f27,1541055672,8,clickout item,7818446,DE,"Luebeck, Germany",mobile,,7818446|51315|2133708|119638|86359|133581|6688...,163|75|93|98|169|201|129|117|108|170|133|80|87...
32,026J2T90PT57,67c4d45f56146,1541105688,1,clickout item,2681512,MX,"Ezequiel Montes, Mexico",mobile,,6010044|5433188|5156000|4780400|5137162|992819...,38|35|54|37|73|37|37|30|71|75|64|26|25|37|40|3...
39,032W52V15J3A,e84393cf62d13,1541102603,7,clickout item,929533,HR,"Cancun, Mexico",mobile,,3132957|100226|1954167|9462680|2776177|929533|...,25|324|67|52|57|21|90|240|64|37|70|60|167|45|6...
73,03F93632Y45U,f09aa370de0af,1541109160,34,clickout item,8496296,US,"San Diego, USA",mobile,,9112592|102414|9140448|5654906|4341120|9739716...,263|114|263|273|122|194|236|161|219|113|110|19...
393,08527D0MD8VC,3e2a95e5727f9,1541099104,320,clickout item,8266350,ES,"Lisbon, Portugal",desktop,,3505518|1834297|4837106|3990396|2042495|486747...,47|50|45|40|45|40|40|50|50|45|50|45|38|49|48|3...


In [81]:
# Work on the full click-out table; substitute train_v2.head(10000) here for
# a quick run on a small sample.
train_v3 = train_v2
train_v3.shape

(1861792, 12)

In [82]:
# Only the (user, clicked item) pairs are needed for the interaction matrix.
interaction_columns = ['user_id', 'reference']
data = train_v3.loc[:, interaction_columns]
data.head()

Unnamed: 0,user_id,reference
29,01R92KASN25O,7818446
32,026J2T90PT57,2681512
39,032W52V15J3A,929533
73,03F93632Y45U,8496296
393,08527D0MD8VC,8266350


In [83]:
# Count how many times each user clicked out on each item.
# The aggregation reads from train_v3 instead of overwriting `data` from
# itself: the previous form renamed the very columns it grouped by, so
# re-running the cell raised a KeyError on 'user_id' / 'reference'.
data = (
    train_v3.groupby(['user_id', 'reference'])
    .size()
    .reset_index(name='click_count')
    .rename(columns={'user_id': 'user', 'reference': 'item'})
)
data.head()

Unnamed: 0,user,item,click_count
0,0001VQMGUI65,477811,1
1,0001VQMGUI65,950829,1
2,0001VQMGUI65,2019467,1
3,0001VQMGUI65,3133074,2
4,0003QTCX5MJX,2195060,1


In [84]:
# Encode users and items as pandas categoricals, then expose their integer
# category codes as the numeric user_id and item_id columns.
data = data.assign(
    user=data['user'].astype("category"),
    item=data['item'].astype("category"),
)
data = data.assign(
    user_id=data['user'].cat.codes,
    item_id=data['item'].cat.codes,
)
data.head()

Unnamed: 0,user,item,click_count,user_id,item_id
0,0001VQMGUI65,477811,1,0,61469
1,0001VQMGUI65,950829,1,0,76199
2,0001VQMGUI65,2019467,1,0,125448
3,0001VQMGUI65,3133074,2,0,167748
4,0003QTCX5MJX,2195060,1,1,133147


# ALS Collaborative Filtering Model

In [85]:
# The implicit library expects data as an item-user matrix, so we
# create two matrices: one for fitting the model (item-user)
# and one for recommendations (user-item). Values are the click counts as
# floats; rows/columns are the category codes computed above.
# NOTE(review): which orientation `fit` expects depends on the installed
# implicit release — confirm against the version in use before upgrading.
sparse_item_user = sparse.csr_matrix((data['click_count'].astype(float), (data['item_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['click_count'].astype(float), (data['user_id'], data['item_id'])))

# Initialize the ALS model (20 latent factors, regularization 0.1, 20 iterations).
# NOTE(review): no random seed is set here, so results may differ between runs — confirm.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying the click counts by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model on the alpha-scaled item-user matrix
model.fit(data_conf)

100%|██████████| 20.0/20 [00:31<00:00,  1.39s/it]


In [86]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Query item, given as an item_id category code (not a raw item reference),
# and the number of similar items to return.
item_id = 50
n_similar = 10

# Use implicit to get similar items as (item code, score) pairs.
similar = model.similar_items(item_id, n_similar)

# Category codes index directly into the categorical's categories, so the
# original item reference is a constant-time lookup; the previous version
# scanned the whole `data` frame with a boolean mask for every result.
item_lookup = data['item'].cat.categories

# Print the original reference of each similar item
for idx, score in similar:
    print(item_lookup[idx])

5073
6444110
13468
147422
101797
555191
2397228
939960
6863
974163


In [87]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for the user whose user_id category code is 30
user_id = 30

# Use the implicit recommender; it yields (item code, score) pairs.
recommended = model.recommend(user_id, sparse_user_item)

# Category codes index directly into the categorical's categories, so the
# original item reference is a constant-time lookup; the previous version
# scanned the whole `data` frame with a boolean mask for every result.
item_lookup = data['item'].cat.categories

# Translate item codes back to item references and collect the scores
items = [item_lookup[idx] for idx, _ in recommended]
scores = [score for _, score in recommended]

# Create a dataframe of item references and scores
recommendations = pd.DataFrame({'item': items, 'score': scores})

print(recommendations)

      item     score
0  1455251  0.000002
1     5742  0.000002
2   106769  0.000002
3  3248926  0.000001
4     8796  0.000001
5  4290518  0.000001
6  5055268  0.000001
7   104710  0.000001
8  1711789  0.000001
9  1345387  0.000001


# Data Preprocessing for Side Information - Feature Engineering

In [88]:
# List the distinct action types; each non-click-out type becomes one
# per-user indicator feature in the cells below.
train['action_type'].unique()

array(['search for destination', 'change of sort order',
       'filter selection', 'interaction item image',
       'interaction item info', 'interaction item deals', 'clickout item',
       'search for item', 'search for poi'], dtype=object)

In [89]:
# One row per distinct user across the combined train + test log.
user_id_table = pd.DataFrame({'user_id': train['user_id'].unique()})
user_id_table.shape

(948041, 1)

In [90]:
def _action_flag_table(events, action_type, flag_column):
    """Return the rows of `events` with the given action type, flagged with 1.

    Parameters
    ----------
    events : pandas.DataFrame
        Event log with at least 'user_id' and 'action_type' columns.
    action_type : str
        Value of 'action_type' to select.
    flag_column : str
        Name of the constant indicator column to add.

    Returns
    -------
    pandas.DataFrame
        Columns 'user_id', 'action_type' and `flag_column` (always 1), one
        row per matching event.
    """
    matching = events.loc[events['action_type'] == action_type, ['user_id', 'action_type']]
    # assign() returns a new frame, which avoids the SettingWithCopyWarning
    # raised when a column is set on a filtered slice in place.
    return matching.assign(**{flag_column: 1})


# One flag table per non-click-out action type.
search_for_destination_table = _action_flag_table(train, 'search for destination', 'search_for_destination')
change_of_sort_order_table = _action_flag_table(train, 'change of sort order', 'change_of_sort_order')
filter_selection_table = _action_flag_table(train, 'filter selection', 'filter_selection')
interaction_item_image_table = _action_flag_table(train, 'interaction item image', 'interaction_item_image')
interaction_item_info_table = _action_flag_table(train, 'interaction item info', 'interaction_item_info')
interaction_item_deals_table = _action_flag_table(train, 'interaction item deals', 'interaction_item_deals')
search_for_item_table = _action_flag_table(train, 'search for item', 'search_for_item')
search_for_poi_table = _action_flag_table(train, 'search for poi', 'search_for_poi')

In [91]:
def _flag_per_user(flag_table, flag_column):
    """Collapse a flag table to one row per user_id.

    The value is the number of distinct entries in `flag_column` for that
    user; since the flag is the constant 1, the result is 1 for every user
    present in the table.
    """
    return flag_table.groupby('user_id')[flag_column].nunique().reset_index()


search_for_destination_table = _flag_per_user(search_for_destination_table, 'search_for_destination')
change_of_sort_order_table = _flag_per_user(change_of_sort_order_table, 'change_of_sort_order')
filter_selection_table = _flag_per_user(filter_selection_table, 'filter_selection')
interaction_item_image_table = _flag_per_user(interaction_item_image_table, 'interaction_item_image')
interaction_item_info_table = _flag_per_user(interaction_item_info_table, 'interaction_item_info')
interaction_item_deals_table = _flag_per_user(interaction_item_deals_table, 'interaction_item_deals')
search_for_item_table = _flag_per_user(search_for_item_table, 'search_for_item')
search_for_poi_table = _flag_per_user(search_for_poi_table, 'search_for_poi')
search_for_poi_table.shape

(111924, 2)

In [92]:
# Preview the per-user indicator table for the 'search for poi' action.
search_for_poi_table.head()

Unnamed: 0,user_id,search_for_poi
0,0008BO33KUQ0,1
1,001JFV0AYE6Z,1
2,001LT4KVZ14I,1
3,001TD1JLUE4D,1
4,0022E2NEUGQX,1


In [93]:
# Column names of one of the per-user indicator tables.
interaction_item_image_table.columns.tolist()

['user_id', 'interaction_item_image']

In [95]:
# Left-join every per-user indicator table onto the full user list, in the
# same order as before, so each user gets one column per action type.
feature_tables = [
    search_for_destination_table,
    change_of_sort_order_table,
    filter_selection_table,
    interaction_item_image_table,
    interaction_item_info_table,
    interaction_item_deals_table,
    search_for_item_table,
    search_for_poi_table,
]

side_information_table = user_id_table
for feature_table in feature_tables:
    side_information_table = side_information_table.merge(feature_table, how='left', on='user_id')

# Users absent from a feature table never performed that action: NaN -> 0.
side_information_table = side_information_table.fillna(0)
side_information_table.head()

Unnamed: 0,user_id,search_for_destination,change_of_sort_order,filter_selection,interaction_item_image,interaction_item_info,interaction_item_deals,search_for_item,search_for_poi
0,008UN2RY0DMY,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,01R92KASN25O,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,026J2T90PT57,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,032W52V15J3A,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,03F93632Y45U,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


# Create User x Item Table

In [None]:
# Wide user x item table of click counts: one row per user_id code, one
# 'item_<code>' column per item_id code, 0 where the user never clicked.
# NOTE(review): pivot materialises a dense frame; with the number of users
# and items seen above this may not fit in memory — confirm before running.
user_id_item_table = (
    data.pivot(index='user_id', columns='item_id', values='click_count')
    .reset_index()
    .fillna(0)
    .add_prefix('item_')
    # add_prefix also prefixed the user_id column; restore its name.
    .rename(columns={'item_user_id': 'user_id'})
)
user_id_item_table.head()

In [None]:
# DataFrames produced by this notebook:
# - user_id_item_table: one row per user and one column per item, holding click counts
# - side_information_table: user_id plus one 0/1 indicator column per action type