## SLIM - Cython practice

In [2]:
import numpy as np
import scipy.sparse as sp
from DataParser import DataParser
from Notebooks_utils.data_splitter import train_test_holdout
import time

In [3]:
parser = DataParser()
URM_all = parser.get_URM_all()
URM_train, URM_test = train_test_holdout(URM_all, train_perc = 0.85)

In [4]:
URM_train

<7947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 96146 stored elements in Compressed Sparse Row format>

In [6]:
n_users, n_items = URM_train.shape

In [7]:
item_item_S = np.zeros((n_items, n_items), dtype = np.float)
item_item_S

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
URM_train_coo = URM_train.tocoo()

sample_index = np.random.randint(URM_train_coo.nnz)
sample_index

In [None]:
user_id = URM_train_coo.row[sample_index]
item_id = URM_train_coo.col[sample_index]
rating = URM_train_coo.data[sample_index]

(user_id, item_id, rating)

In [None]:
#compute the prediction

predicted_rating = URM_train[user_id].dot(item_item_S[:,item_id])[0]
predicted_rating

We compute the prediction error and update the item-item similarity 

In [None]:
prediction_error = rating - predicted_rating
prediction_error

In [None]:
#update: Only those we used to compute the prediction, i.e., only the items in the profile of the sampled user.
items_in_user_profile = URM_train.indices[URM_train.indptr[user_id]:URM_train.indptr[user_id+1]]
items_in_user_profile

In [None]:
item_item_S[items_in_user_profile,item_id]

In [None]:
learning_rate = 1e-4
update = prediction_error * learning_rate
update

In [None]:
item_item_S[items_in_user_profile,item_id] += update

In [None]:
len(items_in_user_profile)

In [None]:
n_items

In [None]:
#the model is changeing and so also the predictions
predicted_rating = URM_train[user_id].dot(item_item_S[:,item_id])[0]
predicted_rating

In [None]:
URM_train_coo = URM_train.tocoo()
item_item_S = np.zeros((n_items, n_items), dtype = np.float)

learning_rate = 1e-6
loss = 0.0

start_time = time.time()
for sample_num in range(100000):
    
    # Randomly pick sample
    sample_index = np.random.randint(URM_train_coo.nnz)

    user_id = URM_train_coo.row[sample_index]
    item_id = URM_train_coo.col[sample_index]
    rating = URM_train_coo.data[sample_index]

    # Compute prediction
    predicted_rating = URM_train[user_id].dot(item_item_S[:,item_id])[0]
        
    # Compute prediction error, or gradient
    prediction_error = rating - predicted_rating
    loss += prediction_error**2
    
    # Update model, in this case the similarity
    items_in_user_profile = URM_train[user_id].indices
    item_item_S[items_in_user_profile,item_id] += prediction_error * learning_rate
    
    # Print some stats
    if (sample_num +1)% 5000 == 0:
        elapsed_time = time.time() - start_time
        samples_per_second = sample_num/elapsed_time
        print("Iteration {} in {:.2f} seconds, loss is {:.2f}. Samples per second {:.2f}".format(sample_num+1, elapsed_time, loss/sample_num, samples_per_second))

In [8]:
item_item_S

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
%load_ext Cython

In [10]:


%%cython

import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, n_epochs):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_row = URM_train_coo.row
    cdef int[:] URM_train_col = URM_train_coo.col
    cdef double[:] URM_train_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr

    cdef double[:,:] item_item_S = np.zeros((n_items, n_items), dtype = np.float)
    cdef double learning_rate = learning_rate_input
    cdef double loss = 0.0
    cdef long start_time
    cdef double rating, predicted_rating, prediction_error
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, seen_item_id
    
    for n_epoch in range(n_epochs):
        
        loss = 0.0
        start_time = time.time()
        
        for sample_num in range(n_interactions):

            # Randomly pick sample
            index = rand() % n_interactions

            user_id = URM_train_row[index]
            item_id = URM_train_col[index]
            rating = URM_train_data[index]

            # Compute prediction
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for index in range(start_profile, end_profile):
                seen_item_id = URM_train_indices[index]
                predicted_rating += item_item_S[seen_item_id,item_id]

            # Compute prediction error, or gradient
            prediction_error = rating - predicted_rating
            loss += prediction_error**2

            # Update model, in this case the similarity
            for index in range(start_profile, end_profile):
                seen_item_id = URM_train_indices[index]
                item_item_S[seen_item_id,item_id] += prediction_error * learning_rate

#             # Print some stats
#             if (sample_num +1)% 1000000 == 0:
#                 elapsed_time = time.time() - start_time
#                 samples_per_second = sample_num/elapsed_time
#                 print("Iteration {} in {:.2f} seconds, loss is {:.2f}. Samples per second {:.2f}".format(sample_num+1, elapsed_time, loss/sample_num, samples_per_second))

            
        elapsed_time = time.time() - start_time
        samples_per_second = sample_num/elapsed_time
     
        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/sample_num, samples_per_second))

    return np.array(item_item_S), loss/sample_num, samples_per_second



In [None]:
n_items = URM_train.shape[1]
learning_rate = 1e-3
    
item_item_S, loss, samples_per_second = train_multiple_epochs(URM_train, learning_rate, 10)

Epoch 1 complete in in 2.53 seconds, loss is 8.986E-01. Samples per second 37987.66
Epoch 2 complete in in 1.13 seconds, loss is 7.578E-01. Samples per second 85368.40
Epoch 3 complete in in 0.63 seconds, loss is 6.699E-01. Samples per second 152690.94
Epoch 4 complete in in 1.12 seconds, loss is 6.055E-01. Samples per second 85841.44
Epoch 5 complete in in 0.62 seconds, loss is 5.549E-01. Samples per second 154538.71
Epoch 6 complete in in 1.10 seconds, loss is 5.158E-01. Samples per second 87335.69
Epoch 7 complete in in 0.58 seconds, loss is 4.810E-01. Samples per second 165849.69
Epoch 8 complete in in 1.06 seconds, loss is 4.516E-01. Samples per second 90519.94
Epoch 9 complete in in 0.55 seconds, loss is 4.265E-01. Samples per second 174185.19
