In [98]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn import neighbors

# Set params

In [99]:
N_NEIGHBORS = 25

In [100]:
dat = pd.read_csv('../data/retailrocket-recommender-system-dataset/train-scores-encoded.csv')
dat.head()

Unnamed: 0,visitorid,itemid,score,iid
0,75,257575,25,7400
1,172,10034,20,266
2,172,465522,23,13389
3,186,49029,17,1369
4,264,161949,17,4567


In [102]:
scores = dat.groupby('visitorid iid'.split()).score.sum()
scores.head()

visitorid  iid  
75         7400     25
172        266      20
           13389    23
186        1369     17
264        4567     17
Name: score, dtype: int64

# Format into a sklearn feature matrix

In [111]:
scores.index.codes[0]

FrozenNDArray([0, 1, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 16, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 35, 36, 37, 38, 39, 39, 40, 41, 41, 41, 42, 43, 44, 45, 46, 46, 46, 47, 48, 49, 49, 49, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 77, 77, 77, 78, ...], dtype='int16')

In [112]:
scores.index.codes[1]

FrozenNDArray([7400, 266, 13389, 1369, 4567, 13244, 8279, 504, 11475, 2685, 11998, 7043, 5891, 11089, 12423, 6976, 26, 590, 8649, 4935, 11042, 9493, 2667, 11415, 12864, 3904, 8919, 11674, 2475, 11436, 5144, 11960, 10097, 5177, 10424, 2336, 6883, 3104, 385, 9507, 9861, 10006, 9799, 1258, 2776, 504, 10211, 198, 221, 3263, 12523, 6291, 715, 11253, 11815, 5306, 9321, 9636, 4864, 1946, 2999, 4787, 6516, 11641, 6616, 2803, 9823, 7423, 4851, 4671, 2137, 3206, 3918, 10758, 9470, 1278, 3607, 3923, 2915, 6900, 1035, 3328, 12214, 5128, 900, 5270, 8550, 10977, 8508, 3684, 13045, 205, 4479, 8052, 10440, 1809, 3094, 8353, 10959, 1401, ...], dtype='int16')

In [113]:
scores.index.levels[0]

Int64Index([     75,     172,     186,     264,     270,     419,     420,
                539,     627,     745,
            ...
            1263992, 1264026, 1264132, 1264213, 1264264, 1264334, 1264440,
            1264468, 1264709, 1264860],
           dtype='int64', name='visitorid', length=14535)

In [114]:
scores.index.levels[1]

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            13418, 13419, 13420, 13421, 13422, 13423, 13424, 13425, 13426,
            13427],
           dtype='int64', name='iid', length=13428)

In [103]:
X_shape = (
    len(scores.index.levels[0]),
    len(scores.index.levels[1]),
)
X = np.zeros(shape=X_shape)
X[scores.index.codes[0], scores.index.codes[1]] = scores.values
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [104]:
X.shape

(14535, 13428)

In [105]:
from sklearn import metrics
model = neighbors.KNeighborsRegressor(metric='cosine', n_neighbors=N_NEIGHBORS)

In [106]:
%%time
model.fit(X[:1460], X[:1460])
ypred = model.predict(X[:1460])
print(f'MSE: {metrics.mean_squared_error(X[:1460], ypred)}')

MSE: 0.050003230297762595
CPU times: user 4.05 s, sys: 1.9 s, total: 5.95 s
Wall time: 4.5 s


In [107]:
Xsparse = sparse.csr_matrix(X)

In [108]:
%%time
model.fit(Xsparse[:1460], X[:1460])
ypred = model.predict(Xsparse[:1460])
print(f'MSE: {metrics.mean_squared_error(X[:1460], ypred)}')

MSE: 0.050003230297762595
CPU times: user 1.79 s, sys: 1.58 s, total: 3.37 s
Wall time: 3.53 s


# Store training data for kneighbors and just fit on load

In [110]:
sparse.save_npz('../data/retailrocket-recommender-system-dataset/Xsparse.npz', Xsparse)