# Surprise: [Get Starting!](https://surprise.readthedocs.io/en/stable/getting_started.html#getting-started)

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
# surprise
from surprise import SVD, NormalPredictor, KNNBasic , Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

サンプルデータを取得

In [2]:
# Load movielens-100k 
data = Dataset.load_builtin('ml-100k')

アルゴリズムの定義

In [3]:
# SVD algorithm.
algo1 = SVD()

交差検証

In [5]:
%%time

# cross-validation
cross_validate(algo1, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

None

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9442  0.9520  0.9367  0.9443  0.0062  
MAE (testset)     0.7464  0.7492  0.7394  0.7450  0.0041  
Fit time          4.21    4.41    4.58    4.40    0.15    
Test time         0.28    0.23    0.28    0.26    0.02    
CPU times: user 14.4 s, sys: 69.1 ms, total: 14.5 s
Wall time: 14.5 s


訓練データ、本番データに分割

In [6]:
trainset, testset = train_test_split(data, test_size=.2)

モデルの訓練

In [7]:
%%time

algo1.fit(trainset)

None

CPU times: user 4.79 s, sys: 6.97 ms, total: 4.8 s
Wall time: 4.8 s


本番データでRMSEの正解率をチェック

In [8]:
predictions = algo1.test(testset)
accuracy.rmse(predictions)

None

RMSE: 0.9341


Grid Search

In [11]:
%%time

# hyper parameter
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# grid search
gs1 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs1.fit(data)

None

CPU times: user 44 s, sys: 145 ms, total: 44.1 s
Wall time: 44.2 s


In [12]:
# RMSE score
gs1.best_score['rmse']

0.9641699295027255

In [13]:
# best parameter
gs1.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

In [14]:
%%time

# choice best model
algo2 = gs1.best_estimator['rmse']

# train algorithm
algo2.fit(trainset)

None

CPU times: user 2.27 s, sys: 4.57 ms, total: 2.28 s
Wall time: 2.28 s


In [15]:
# predict
accuracy.rmse(algo2.test(testset))

None

RMSE: 0.9553


### 自作アプリを念頭に置いたランダムのサンプルデータを作成

In [16]:
# item vector
user_row = 20000
user_num = 1000

# DataFrame dict
df_dict = {
    'wordbook': [1 if np.random.rand() >= .8 else 0 for i in range(user_row)],
    'user': [f'user_{np.random.randint(1, user_num + 1)}' for u in range(user_row)],
    'rating': [1 if np.random.rand() >= .8 else 0 for r in range(user_row)],
}

# df
df = pd.DataFrame(df_dict)

# dataset
reader = Reader(rating_scale=(1, 1))
sample_data = Dataset.load_from_df(df, reader=reader)

# algorithm
algo3 = NormalPredictor()

# cross validator
cross_validate(algo3, sample_data, cv=10, verbose=True)

None

Evaluating RMSE, MAE of algorithm NormalPredictor on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8994  0.8925  0.9036  0.8978  0.9014  0.8969  0.8902  0.9019  0.8911  0.9058  0.8981  0.0051  
MAE (testset)     0.8090  0.7965  0.8165  0.8060  0.8125  0.8045  0.7925  0.8135  0.7940  0.8205  0.8065  0.0092  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.00    


In [17]:
%%time

# parmas
params = {
    'k': [10000, 1000, 700, 500, 300],
    'min_k': [1000, 100, 70, 50, 30, 10, 1],
}

# grid search
gs2 = GridSearchCV(KNNBasic, params, cv=3)
gs2.fit(sample_data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [18]:
# score
gs2.best_score

{'rmse': 0.8980714273697354, 'mae': 0.8065495090081979}

In [19]:
# params
gs2.best_params

{'rmse': {'k': 10000, 'min_k': 1000}, 'mae': {'k': 10000, 'min_k': 1000}}

In [20]:
# choice model
algo4 = gs2.best_estimator['rmse']

# split train and test
train_sample, test_sample = train_test_split(sample_data)

# train
algo4.fit(train_sample)

None

Computing the msd similarity matrix...
Done computing similarity matrix.


In [21]:
# predict
accuracy.rmse(algo4.test(test_sample))

None

RMSE: 0.8968
