In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import math
import wandb
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.trial import TrialState
# from models.optuna.SVD import SVD_
from models.optuna.SVD import get_trainset

import os
import sys

import surprise
from surprise import SVD, NMF, KNNBaseline, SVDpp, KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV

In [3]:

def extract_users_items_predictions(data_pd):
    users, movies = [
        np.squeeze(arr)
        for arr in np.split(
            data_pd.Id.str.extract("r(\d+)_c(\d+)").values.astype(int) - 1, 2, axis=-1
        )
    ]
    predictions = data_pd.Prediction.values
    return users, movies, predictions


In [4]:
data_pd = pd.read_csv('./data/data_train.csv')
sub_pd = pd.read_csv('./data/sampleSubmission.csv')

train_users, train_movies, train_predictions = extract_users_items_predictions(
    data_pd)  # use whole data bc doing gridsearchcv

train_df = pd.DataFrame()
train_df['users'] = train_users
train_df['movies'] = train_movies
train_df['ratings'] = train_predictions

data = Dataset.load_from_df(train_df, Reader(rating_scale=(1, 5)))
# required to directly call fit 
trainset = data.build_full_trainset()

sub_users, sub_movies, sub_preds_wrong = extract_users_items_predictions(
    sub_pd)

sub_data = zip(sub_users, sub_movies)

In [14]:
# help(trainset); 
# trainset.ur
for t in trainset.all_items():
    print(t)
    break

0


In [22]:

class SVDpp_(SVDpp):
    """Wrapper class for SVD to be used with Optuna"""

    def __init__(
        self,
        trainset=None,
        n_factors=20,
        n_epochs=20,
        init_mean=0,
        init_std_dev=0.1,
        lr_all=0.007,
        reg_all=0.02,
        lr_bu=None,
        lr_bi=None,
        lr_pu=None,
        lr_qi=None,
        lr_yj=None,
        reg_bu=None,
        reg_bi=None,
        reg_pu=None,
        reg_qi=None,
        reg_yj=None,
        random_state=None,
        verbose=False,
        *args,
        **kwargs
    ):
        SVDpp.__init__(
            self,
            n_factors,
            n_epochs,
            init_mean,
            init_std_dev,
            lr_all,
            reg_all,
            lr_bu,
            lr_bi,
            lr_pu,
            lr_qi,
            lr_yj,
            reg_bu,
            reg_bi,
            reg_pu,
            reg_qi,
            reg_yj,
            random_state,
            verbose,
            *args,
            **kwargs
        )
        self.lr_all = lr_all
        self.reg_all = reg_all
        if trainset is None:
            trainset = get_trainset()
        if type(trainset) != surprise.trainset.Trainset:
            trainset = trainset.build_full_trainset()
        self.trainset = trainset

    def get_params(self, deep=None):
        params_lst = [
            "trainset",
            "n_factors",
            "n_epochs",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "lr_yj",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
            "reg_yj",
            "random_state",
            "verbose",
        ]
        return {key: getattr(self, key) for key in params_lst}

    def set_params(self, **params):
        for key, val in params.items():
            setattr(self, key, val)

    def fit(self, X, y):
        return SVDpp.fit(self, self.trainset)

    def predict(self, X):
        def _predict(it):
            return SVDpp.predict(self, it[0], it[1]).est

        from multiprocessing import Pool
        pool = Pool()
        res_ls = pool.map(_predict, X)
#         res_ls = [SVDpp.predict(self, u, m).est for (u, m) in X]
        return res_ls


In [23]:
algo = SVDpp_(trainset, random_state=42, n_epochs=1)

In [24]:
X = np.column_stack((train_users, train_movies))
y = train_predictions
X_test = np.column_stack((sub_users, sub_movies))
X.shape, train_users.shape, X

((1176952, 2),
 (1176952,),
 array([[  43,    0],
        [  60,    0],
        [  66,    0],
        ...,
        [9993,  999],
        [9996,  999],
        [9999,  999]]))

In [25]:
algo.fit(X, y)

<__main__.SVDpp_ at 0x7f764a20c130>

In [28]:
res = algo.predict(X)
res

AttributeError: Can't pickle local object 'SVDpp_.predict.<locals>._predict'

In [19]:
test_res = algo.predict(X_test)

In [20]:
test_res = [r.est for r in test_res]
# X_test

In [21]:
sub_pd['Prediction'] = test_res
sub_pd.to_csv('./results/svd_optuna.csv', index=False)

In [33]:
! head results/svd_optuna.csv

Id,Prediction
r37_c1,3.3465079344840216
r73_c1,2.303056471943172
r156_c1,4.375156289223989
r160_c1,3.2265289939442705
r248_c1,3.0306463340157492
r256_c1,3.8671480155252054
r284_c1,4.062109412655746
r400_c1,3.1878873949063395
r416_c1,4.782712801817294


In [26]:
! pwd

/cluster/home/galtintas/cil


In [31]:
sub_pd['Prediction'] = test_res
sub_pd

Unnamed: 0,Id,Prediction
0,r37_c1,3.346508
1,r73_c1,2.303056
2,r156_c1,4.375156
3,r160_c1,3.226529
4,r248_c1,3.030646
...,...,...
1176947,r9974_c1000,3.728241
1176948,r9977_c1000,3.490701
1176949,r9978_c1000,4.094847
1176950,r9982_c1000,2.537528


In [35]:
sub_data

<zip at 0x2b65a1876c00>