In [41]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn import neighbors

In [42]:
N_NEIGHBORS = 25

In [43]:
dat = pd.read_csv('../data/retailrocket-recommender-system-dataset/test-events-scored.csv')
dat.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,score
0,1433223239808,1377281,view,251467,,1
1,1433224244282,1370216,view,176721,,1
2,1433224070841,1398644,view,135256,,1
3,1433221622167,1342963,view,306886,,1
4,1433223146605,1282360,view,106564,,1


In [44]:
dat.shape

(275613, 6)

# Compute scores for each visitorid/itemid

In [45]:
scores = dat.groupby('visitorid itemid'.split()).score.sum()
scores.head()

visitorid  itemid
1264934    19417     1
           103508    1
           223429    1
           225257    1
           243626    1
Name: score, dtype: int64

In [46]:
iids = pd.read_csv(
    '../data/retailrocket-recommender-system-dataset/iids-encoding.csv', 
    index_col='itemid', 
)
iids.head()

Unnamed: 0_level_0,iid
itemid,Unnamed: 1_level_1
15,0
19,1
25,2
42,3
147,4


# Add the iid column and drop irrelevant itemidsd

In [47]:
dat = pd.merge(scores.reset_index(), iids, left_on='itemid', right_index=True, how='inner')
dat.head()

Unnamed: 0,visitorid,itemid,score,iid
3,1264934,225257,1,6436
831,1265532,225257,1,6436
2890,1266932,225257,1,6436
4610,1268120,225257,1,6436
7351,1270084,225257,1,6436


In [48]:
dat.shape

(67631, 4)

# Load and fit the model

In [49]:
Xsparse = sparse.load_npz('../data/retailrocket-recommender-system-dataset/Xsparse.npz')
Xsparse

<14535x13428 sparse matrix of type '<class 'numpy.float64'>'
	with 24315 stored elements in Compressed Sparse Row format>

In [50]:
from sklearn import neighbors
model = neighbors.KNeighborsRegressor(metric='cosine', n_neighbors=N_NEIGHBORS)
model.fit(Xsparse, Xsparse.todense())

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='cosine',
          metric_params=None, n_jobs=None, n_neighbors=25, p=2,
          weights='uniform')

# Format test into an sklearn feature matrix

In [51]:
scores_test = dat.groupby('visitorid iid'.split()).score.sum()
scores_test.head()

visitorid  iid  
1264934    6436     1
           6967     1
1264938    10957    1
1264939    1782     1
1264942    8168     1
Name: score, dtype: int64

In [52]:
Xtest_shape = (
    len(scores_test.index.levels[0]),
    Xsparse.shape[1]
)
Xtest = np.zeros(shape=Xtest_shape)
Xtest[scores_test.index.codes[0], scores_test.index.codes[1]] = scores_test.values
Xtest

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
Xtest.shape

(46224, 13428)

In [54]:
Xtest_sparse = sparse.csr_matrix(Xtest)
Xtest_sparse

<46224x13428 sparse matrix of type '<class 'numpy.float64'>'
	with 67631 stored elements in Compressed Sparse Row format>

# Predict scores for our visitors

In [55]:
%%time
ypreds = [
    model.predict(Xtest_sparse[i:i+1000])
    for i in range(0, Xtest_sparse.shape[0], 1000)
]

CPU times: user 57 s, sys: 49.2 s, total: 1min 46s
Wall time: 1min 50s


In [56]:
ypred = np.vstack(ypreds)
ypred

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
from sklearn import metrics

In [58]:
%%time
metrics.mean_squared_error(ypred, Xtest)

CPU times: user 8.26 s, sys: 16.1 s, total: 24.4 s
Wall time: 26.8 s


0.005851921322267257

In [60]:
ypred_vids, ypred_iids  = ypred.nonzero()
ypred_scores = ypred[ypred_vids, ypred_iids]

In [61]:
vid_to_visitorid = scores_test.index.levels[0]
ypred_visitorids = vid_to_visitorid[ypred_vids]

In [88]:
iid_to_itemid = iids.reset_index().set_index('iid').squeeze()
ypred_itemids = iid_to_itemid[ypred_iids]

In [89]:
result = pd.DataFrame({
    'visitorid': ypred_visitorids,
    'itemid': ypred_itemids,
    'score': ypred_scores
})

In [90]:
result.head()

Unnamed: 0_level_0,visitorid,itemid,score
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
338,1264934,12836,0.48
766,1264934,28367,0.64
1021,1264934,36904,0.68
1281,1264934,46232,0.64
1349,1264934,48557,0.44


In [92]:
result.shape

(4801723, 3)

In [91]:
result.to_csv('../data/retailrocket-recommender-system-dataset/pred-test-scores.csv', index=False)