In [None]:
# SOURCE CODE
# https://gist.github.com/victorkohler/f48ea6512058719ba52053851fedc745

In [None]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

In [47]:
train = pd.read_csv('train.csv')
np.shape(train)
item_metadata = pd.read_csv('item_metadata.csv', sep=',', engine='python')

In [48]:
train_v2 = train[train['action_type']=='clickout item']

In [49]:
train_v2["reference"]= train_v2["reference"].astype(int)
train_v2["step"]= train_v2["step"].astype(int)
train_v2.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


user_id            object
session_id         object
timestamp           int64
step                int64
action_type        object
reference           int64
platform           object
city               object
device             object
current_filters    object
impressions        object
prices             object
dtype: object

In [50]:
train_v2.head()
#np.shape(train_v2)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
29,01R92KASN25O,62991f7c78f27,1541055672,8,clickout item,7818446,DE,"Luebeck, Germany",mobile,,7818446|51315|2133708|119638|86359|133581|6688...,163|75|93|98|169|201|129|117|108|170|133|80|87...
32,026J2T90PT57,67c4d45f56146,1541105688,1,clickout item,2681512,MX,"Ezequiel Montes, Mexico",mobile,,6010044|5433188|5156000|4780400|5137162|992819...,38|35|54|37|73|37|37|30|71|75|64|26|25|37|40|3...
39,032W52V15J3A,e84393cf62d13,1541102603,7,clickout item,929533,HR,"Cancun, Mexico",mobile,,3132957|100226|1954167|9462680|2776177|929533|...,25|324|67|52|57|21|90|240|64|37|70|60|167|45|6...
73,03F93632Y45U,f09aa370de0af,1541109160,34,clickout item,8496296,US,"San Diego, USA",mobile,,9112592|102414|9140448|5654906|4341120|9739716...,263|114|263|273|122|194|236|161|219|113|110|19...
393,08527D0MD8VC,3e2a95e5727f9,1541099104,320,clickout item,8266350,ES,"Lisbon, Portugal",desktop,,3505518|1834297|4837106|3990396|2042495|486747...,47|50|45|40|45|40|40|50|50|45|50|45|38|49|48|3...


In [51]:
#train_v3 = train_v2.head(10000)
train_v3 = train_v2
train_v3.head()
np.shape(train_v3)

(1586586, 12)

In [52]:
data = train_v3[['user_id','reference']]
data.head()
#np.shape(data)

Unnamed: 0,user_id,reference
29,01R92KASN25O,7818446
32,026J2T90PT57,2681512
39,032W52V15J3A,929533
73,03F93632Y45U,8496296
393,08527D0MD8VC,8266350


In [53]:
data=data.groupby(['user_id','reference']).size().reset_index()
data.columns = ['user', 'item', 'click_count']
data.head()

Unnamed: 0,user,item,click_count
0,0001VQMGUI65,477811,1
1,0001VQMGUI65,950829,1
2,0001VQMGUI65,2019467,1
3,0001VQMGUI65,3133074,2
4,0003QTCX5MJX,2195060,1


In [54]:
# Create a numeric user_id and artist_id column
data['user'] = data['user'].astype("category")
data['item'] = data['item'].astype("category")
data['user_id'] = data['user'].cat.codes
data['item_id'] = data['item'].cat.codes
data.head()

Unnamed: 0,user,item,click_count,user_id,item_id
0,0001VQMGUI65,477811,1,0,59075
1,0001VQMGUI65,950829,1,0,73109
2,0001VQMGUI65,2019467,1,0,119719
3,0001VQMGUI65,3133074,2,0,159473
4,0003QTCX5MJX,2195060,1,1,126989


In [55]:
# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((data['click_count'].astype(float), (data['item_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['click_count'].astype(float), (data['user_id'], data['item_id'])))

# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)

100%|██████████| 20.0/20 [00:29<00:00,  1.27s/it]


In [58]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Find the 10 most similar to items 50
item_id = 50 #items 50
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print(data.item.loc[data.item_id == idx].iloc[0])

5073
3215196
1964139
7061118
5017
3448484
23179
23191
110132
4024536


In [59]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for user with id 30
user_id = 30

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item)

items = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    items.append(data.item.loc[data.item_id == idx].iloc[0])
    scores.append(score)

# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'item': items, 'score': scores})

print(recommendations)

      item     score
0  1455251  0.056667
1    44118  0.053937
2    43775  0.052289
3    92313  0.044441
4    87943  0.042117
5    44902  0.040587
6    44208  0.038802
7    92077  0.038547
8  3053490  0.033541
9  5213052  0.032212


In [None]:
# Apply the model to the testing data
test = pd.read_csv('test.csv')
np.shape(test)
train_v2 = train[train['action_type']=='clickout item']
train_v2["reference"]= train_v2["reference"].astype(int)
train_v2["step"]= train_v2["step"].astype(int)
train_v2.dtypes
train_v2.head()
#np.shape(train_v2)
#train_v3 = train_v2.head(10000)
train_v3 = train_v2
train_v3.head()
np.shape(train_v3)
data = train_v3[['user_id','reference']]
data.head()
#np.shape(data)
data=data.groupby(['user_id','reference']).size().reset_index()
data.columns = ['user', 'item', 'click_count']
data.head()
# Create a numeric user_id and artist_id column
data['user'] = data['user'].astype("category")
data['item'] = data['item'].astype("category")
data['user_id'] = data['user'].cat.codes
data['item_id'] = data['item'].cat.codes
data.head()