In [None]:
from math import sqrt
import numpy as np
import pandas as pd
from collections import defaultdict
import requests

from polara import get_movielens_data
from polara import RecommenderData
from polara import RecommenderModel

from tqdm import tqdm_notebook
%matplotlib inline

# Data

We will use only rating information. The preparation steps are the same as in `polara_intro.ipynb`.

In [None]:
# Load the MovieLens ratings table through polara's helper; its first three
# columns are handed to RecommenderData in the next cell.
data = get_movielens_data()

In [None]:
# The first three column names of `data` are passed positionally as the field
# names (verified via `data_model.fields` in the next cell).
# NOTE(review): seed=0 is presumably there to make the data split reproducible — confirm.
data_model = RecommenderData(data, *data.columns[:3], seed=0)
# NOTE(review): presumably switches off the warm-start (unseen test users) scenario — confirm against polara docs.
data_model.warm_start = False
data_model.prepare()

Check correct fields mapping:

In [None]:
# Display the field mapping so the column assignment can be checked by eye.
data_model.fields

# Popularity baseline

In [None]:
from polara import PopularityModel

In [None]:
# Baseline model bound to the data model prepared above.
pop = PopularityModel(data_model)

In [None]:
# Fit the baseline, then show its evaluation result (last expression of the cell).
pop.build()
pop.evaluate()

# Submissions

When your model is ready, use the function below to submit your results.  
Use *your name* or some unique label for submission in order to identify your score on the leaderboard.

In [None]:
def save_and_submit(name, recommendations):
    '''Save top-n recommendations to an .npz file and upload it to the leaderboard.

    Parameters
    ----------
    name : str
        Submission label, also used as the file name. The '.npz' extension
        is added when missing.
    recommendations : array-like
        Matrix of top-n recommendations; stored under the key ``recs``.

    Returns
    -------
    tuple
        ``(status_code, reason)`` of the HTTP response.
    '''
    # Resolve the file name once so that saving and uploading always refer to
    # the same file, even when `name` already ends with '.npz'.
    filename = f'{name}'
    if not filename.endswith('.npz'):
        filename = f'{filename}.npz'
    np.savez(filename, recs=recommendations)

    url = "http://recsysvalley.azurewebsites.net/upload"
    # The context manager closes the file handle even if the request fails.
    with open(filename, 'rb') as upload:
        r = requests.post(url, files={'upload': upload})
    return r.status_code, r.reason

# Warm up - PureSVD model

In [None]:
from polara import SVDModel

In [None]:
# PureSVD model bound to the same data model as the popularity baseline.
svd = SVDModel(data_model)

What would be the procedure to tune SVD?

In [None]:
# TODO: implement the tuning procedure.
# The cells below expect this cell to define `svd_res`, a mapping from rank to
# evaluation score: it is plotted, and its idxmax() is assigned to `svd.rank`.
# NOTE(review): the iALS parameter grid further down reads `max_rank`, which is
# not defined anywhere else in this notebook — presumably it should be set here too.

In [None]:
# Plot the tuning scores ordered by index; `svd_res` must come from the tuning cell above.
pd.Series(svd_res).sort_index().plot()

In [None]:
# Select the rank with the highest score, then read the model's
# recommendations for submission.
svd.rank = pd.Series(svd_res).idxmax()
recs = svd.recommendations

In [None]:
# Upload under the label 'svd_baseline'; the cell displays (status_code, reason).
save_and_submit('svd_baseline', recs)

# iALS (WRMF) model

In order to use this model you need to install an external library: https://github.com/benfred/implicit/.

In [None]:
from polara.recommender.external.implicit.ialswrapper import ImplicitALS
from polara.evaluation.pipelines import random_grid
from polara.evaluation.pipelines import set_config

In [None]:
als = ImplicitALS(data_model) # iALS model bound to the same data model as the baselines

Hyper-parameter tuning can be implemented with random search on a parameter grid.

<img src="random_grid.png" alt="Grid search versus random search layout" />

You can find more details in: 
*James Bergstra and Yoshua Bengio. “Random search for hyper-parameter optimization”. In: Journal of Machine Learning Research 13.Feb (2012), pp. 281–305.*

In [None]:
# Search space for iALS tuning: each key names a hyper-parameter and each
# value lists the candidate settings to draw from.
# `max_rank` has to be defined in the notebook namespace before this cell runs.
als_params = {
    'alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
    'epsilon': [0.01, 0.03, 0.1, 0.3, 1],
    'weight_func': [None, np.sign, np.sqrt, np.log2, np.log10],
    'regularization': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3],
    'rank': range(10, max_rank+1, 10),
}

In [None]:
# Draw candidate configurations from the parameter lists and show how many were produced.
als_params_grid = random_grid(als_params)
len(als_params_grid)

In [None]:
als.verbose = False  # keep per-build output quiet while sweeping the grid

# A plain dict is sufficient: every configuration is written exactly once.
# Unlike defaultdict(dict), a stray lookup of a missing key raises KeyError
# instead of silently inserting an empty-dict "score".
als_grid_results = {}
for params in tqdm_notebook(als_params_grid):
    # apply the sampled values in the same key order as `als_params`
    set_config(als, als_params.keys(), params)
    als.build()
    # score of a configuration: true positives reported by the 'hits' evaluation
    als_grid_results[params] = als.evaluate('hits').true_positive

In [None]:
# Collect the search results into a Series: index = evaluated parameter
# configurations, values = their scores.
als_res = pd.Series(data=list(als_grid_results.values()), index=list(als_grid_results.keys()))

In [None]:
# Plot the scores in the order the configurations were stored.
als_res.plot()

In [None]:
# Show the configuration with the highest score.
als_res.idxmax()

Don't forget to apply the best configuration:

In [None]:
# Re-apply the best-scoring configuration, using the same parameter-name
# order (als_params.keys()) as during the search.
set_config(als, als_params.keys(), als_res.idxmax())

In [None]:
# Rebuild with the selected configuration, then take the recommendations for submission.
als.build()
als_recs = als.recommendations

In [None]:
# Upload under the label 'als_baseline'; the cell displays (status_code, reason).
save_and_submit('als_baseline', als_recs)

# SGD-based Matrix Factorization

In [None]:
from polara.lib.optimize import sgd_step

In [None]:
# Display the imported object to confirm that sgd_step is available.
sgd_step

In [None]:
def unbiased_sgdmf(user_idx, item_idx, feedback,
                   rank=10, lrate=0.005, reg=0.05,
                   num_epochs=25, tol=1e-4,
                   seed=None, verbose=True):
    '''Learn user and item factors by iterating SGD epochs over the feedback.

    Parameters
    ----------
    user_idx, item_idx : numpy arrays of int
        Row (user) and column (item) index of every observation; the number
        of users/items is taken as the largest index plus one.
    feedback : numpy array
        Observed values, aligned with `user_idx` and `item_idx`.
    rank : int
        Number of latent factors.
    lrate : float
        Learning rate handed to `sgd_step`.
    reg : float
        Regularization constant handed to `sgd_step`.
    num_epochs : int
        Maximum number of passes over the data.
    tol : float
        Stop once the relative change of the epoch error drops below this.
    seed : int or None
        Seed for factor initialization; None uses numpy's global random state.
    verbose : bool
        Print the RMSE after every epoch.

    Returns
    -------
    (P, Q) : tuple of numpy arrays
        User factors of shape (n_users, rank) and item factors of shape
        (n_items, rank).
    '''
    n_users = user_idx.max() + 1
    n_items = item_idx.max() + 1

    # initialization with random numbers;
    # compare against None so that seed=0 is honoured as a real seed
    random_state = np.random.RandomState(seed) if seed is not None else np.random
    P = random_state.normal(scale=0.1, size=(n_users, rank))
    Q = random_state.normal(scale=0.1, size=(n_items, rank))

    last_err = np.finfo(np.float64).max
    for epoch in range(num_epochs):
        # make a single SGD pass over all observations; sgd_step is expected
        # to update P and Q in place and return the accumulated squared error
        # (it is turned into an RMSE below)
        new_err = sgd_step(user_idx, item_idx, feedback, P, Q, lrate, reg)

        # control the progress by the relative change of the error;
        # a previous error of exactly zero means there is nothing left to improve
        err_delta = (abs(last_err - new_err) / last_err) if last_err else 0.0

        if verbose:
            rmse = sqrt(new_err / len(feedback))
            print('Epoch {} RMSE: {}'.format(epoch+1, rmse))

        last_err = new_err
        if err_delta < tol:
            break
    return P, Q

In [None]:
class SGDMatrixFactorization(RecommenderModel):
    '''Basic matrix factorization model trained with stochastic gradient descent.

    `build` learns user and item factor matrices from the training data with
    `unbiased_sgdmf`. Scores for test users are the product of their learned
    user factors (rows selected by `test_users`) with the item factors, so
    the test user indices must be valid rows of the learned user-factor matrix.
    '''

    def __init__(self, *args, **kwargs):
        # zero-argument super() avoids naming the class explicitly (the old
        # call referenced an undefined class name and raised NameError)
        super().__init__(*args, **kwargs)
        self.rank = 10
        self.lrate = 0.005 # learning rate
        self.reg = 0.05 # regularization constant
        self.num_epochs = 25
        self.tol = 1e-4
        self.verbose = True
        self.seed = 0
        self.method = 'SGD-MF'
        self.factors = dict.fromkeys(self.data.fields[:2]) # for user and item factors

    def build(self):
        '''Train the factor matrices and store them in `self.factors`.'''
        userid, itemid, feedback = self.data.fields

        # shuffle the training data
        trainset = self.data.training.sample(frac=1, random_state=self.seed)
        useridx = trainset[userid].values
        itemidx = trainset[itemid].values
        # separate name so the field label is not shadowed by the value array
        feedback_values = trainset[feedback].values
        # sgd learning parameters
        config = dict(rank=self.rank,
                      lrate=self.lrate,
                      reg=self.reg,
                      num_epochs=self.num_epochs,
                      tol=self.tol,
                      seed=self.seed,
                      verbose=self.verbose)
        # performing sgd
        user_factors, item_factors = unbiased_sgdmf(useridx, itemidx, feedback_values, **config)
        # store learned factor matrices
        self.factors[userid] = user_factors
        self.factors[itemid] = item_factors

    def slice_recommendations(self, test_data, shape, start, stop, test_users=None):
        '''Score all items for the test users in positions start:stop.

        Returns a tuple of the score matrix (one row per selected test user)
        and the matching slice of test data.

        Raises
        ------
        TypeError
            If `test_users` is not provided: the learned user factors are
            looked up by these indices.
        '''
        if test_users is None:
            raise TypeError('test_users is required to select the learned user factors')
        slice_data = self._slice_test_data(test_data, start, stop)

        P = self.factors[self.data.fields.userid]
        Q = self.factors[self.data.fields.itemid]

        # rows of P for the selected users times all item factors
        scores = P[test_users[start:stop], :].dot(Q.T)
        return scores, slice_data

In [None]:
# SGD-based model bound to the same data model as the other baselines.
sgd = SGDMatrixFactorization(data_model)

In [None]:
# Train the factors; with verbose=True (the class default) the RMSE is printed per epoch.
sgd.build()

In [None]:
# Read the trained model's recommendations for submission.
sgd_recs = sgd.recommendations

In [None]:
# Upload under the label 'sgd_baseline'; the cell displays (status_code, reason).
save_and_submit('sgd_baseline', sgd_recs)