# Surprise: [Getting Started](https://surprise.readthedocs.io/en/stable/getting_started.html#getting-started)

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
# surprise
from surprise import SVD, NormalPredictor, KNNBasic , Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

サンプルデータを取得

In [5]:
# Load the built-in MovieLens-100k ratings dataset, selected by name.
data = Dataset.load_builtin(name='ml-100k')

アルゴリズムの定義

In [3]:
# SVD algorithm.
# A fixed random_state seeds the model's random initialisation, so repeated
# fits of this estimator (and the scores reported from them) are reproducible.
algo1 = SVD(random_state=42)

交差検証

In [4]:
%%time

# 3-fold cross-validation of the SVD model on the full dataset,
# reporting RMSE and MAE for every fold (verbose prints the summary table).
cross_validate(
    algo=algo1,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

None

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9426  0.9478  0.9440  0.9448  0.0022  
MAE (testset)     0.7452  0.7480  0.7435  0.7455  0.0019  
Fit time          5.57    5.79    5.36    5.58    0.17    
Test time         0.47    0.39    0.38    0.41    0.04    
CPU times: user 18.3 s, sys: 153 ms, total: 18.5 s
Wall time: 18.7 s


※ RMSE(Root Mean Squared Error)は二乗平均平方根誤差 $\sqrt{\frac{1}{N}\sum (r_{ui} - \hat{r}_{ui})^2}$ で、値が小さいほど予測精度が高い。

訓練データとテストデータに分割

In [6]:
# Hold out 20% of the ratings as a test set. The fixed random_state makes the
# split identical on every run, so scores computed on `trainset`/`testset`
# by different models stay comparable and reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

モデルの訓練

In [7]:
%%time

# Train the SVD model on the training split only; `testset` stays unseen.
algo1.fit(trainset)

None

CPU times: user 4.44 s, sys: 16.7 ms, total: 4.46 s
Wall time: 4.47 s


訓練データでのRMSE(予測誤差)

In [8]:
# Score the model on the very ratings it was trained on:
# turn the training set into a test-style list, predict it, then report RMSE.
_trainset = trainset.build_testset()
train_predictions = algo1.test(_trainset)
accuracy.rmse(train_predictions)

None

RMSE: 0.6857


テストデータでRMSE(予測誤差)をチェック

In [9]:
# Predict the held-out test ratings with the trained model.
predictions = algo1.test(testset)

# Print and return the RMSE of those predictions.
accuracy.rmse(predictions, verbose=True)

None

RMSE: 0.9334


In [12]:
# Tabulate the test-set predictions, leaving out the `details` column.
df = pd.DataFrame(predictions).drop(columns=['details'])

# Display ten randomly chosen predictions.
df.sample(n=10)

Unnamed: 0,uid,iid,r_ui,est
6538,727,164,5.0,3.362982
6961,222,450,3.0,2.215884
9969,488,468,5.0,3.074466
10470,409,303,4.0,3.381236
14263,655,874,4.0,2.84675
5769,271,285,4.0,4.148748
3267,767,207,5.0,4.151665
19790,721,995,3.0,3.100397
8449,625,655,3.0,3.515316
2963,7,204,5.0,4.407245


テストデータでは訓練データよりRMSEが高く(0.6857 → 0.9334)、精度が悪くなっている(訓練データへの過学習が示唆される)

Grid Search

In [2]:
%%time

# hyper parameter
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# grid search
gs1 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs1.fit(data)

None

NameError: name 'data' is not defined

In [12]:
# Best RMSE score found by the grid search (lower is better).
gs1.best_score['rmse']

0.9641699295027255

In [13]:
# Hyper-parameter combination that produced the best RMSE in the grid search.
gs1.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

In [14]:
%%time

# Take the estimator the grid search selected as best for RMSE.
algo2 = gs1.best_estimator['rmse']

# Train the selected estimator on the training split so it can be
# evaluated on the same `testset` as `algo1`.
algo2.fit(trainset)

None

CPU times: user 2.27 s, sys: 4.57 ms, total: 2.28 s
Wall time: 2.28 s


In [15]:
# Predict the held-out test ratings with the tuned model and report its RMSE.
tuned_predictions = algo2.test(testset)
accuracy.rmse(tuned_predictions)

None

RMSE: 0.9553
