In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, NormalPredictor, KNNBasic, accuracy
from surprise.model_selection import GridSearchCV, cross_validate

# MovieLens

In [2]:
loc = '~/.surprise_data/ml-100k/ml-100k/u.data'
names = ['uid', 'iid', 'r_ui', 'timestamp']

mv = pd.read_csv(loc, sep='\t', names=names)
mv = mv.drop(['timestamp'], axis=1)

mv.head()

Unnamed: 0,uid,iid,r_ui
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
mv[['r_ui']].describe()

Unnamed: 0,r_ui
count,100000.0
mean,3.52986
std,1.125674
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


# 自作アプリのサンプルデータで推薦

In [4]:
# item vector
rating_row = 2000
user_num = 30
item_num = 50

# DataFrame dict
df_dict = {
    'uid': [np.random.randint(1, user_num + 1) for u in range(rating_row)],
    'iid': [np.random.randint(1, item_num + 1) for i in range(rating_row)],
    'r_ui': [2 if np.random.rand() <= .1 else 1 for r in range(rating_row)],
}

# df
df = pd.DataFrame(df_dict)

df.head()

Unnamed: 0,uid,iid,r_ui
0,15,25,1
1,13,16,1
2,19,18,1
3,13,40,1
4,15,2,1


In [5]:
df.shape

(2000, 3)

In [6]:
# dataset
reader = Reader(rating_scale=(1, 2))
sample_data = Dataset.load_from_df(df, reader=reader)

# predict

In [7]:
# algorithm
n_pre = NormalPredictor()

# cross validator
cross_validate(n_pre, sample_data, cv=10, verbose=True)

None

Evaluating RMSE, MAE of algorithm NormalPredictor on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.3653  0.3392  0.3517  0.3685  0.3596  0.3289  0.3330  0.3478  0.3327  0.3395  0.3466  0.0135  
MAE (testset)     0.2400  0.2184  0.2278  0.2290  0.2278  0.2011  0.2065  0.2109  0.2008  0.2104  0.2173  0.0127  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


In [8]:
from collections import defaultdict

# data
train = sample_data.build_full_trainset()
test = train.build_anti_testset()

# algorithm, predictions
svd= SVD()
svd.fit(train)
pre = svd.test(test)

# ratings dict
# li = defaultdict(list)
# for u_id, i_id, r, est, _ in pre:
#     li[u_id].append((i_id, est))

In [9]:
rating = pd.DataFrame(pre).drop(['details'], axis=1)

rating.sample(10)

Unnamed: 0,uid,iid,r_ui,est
264,16,8,1.086,1.01477
307,22,49,1.086,1.1558
46,8,47,1.086,1.227733
223,28,15,1.086,1.188812
162,2,44,1.086,1.191343
61,24,18,1.086,1.147693
44,8,3,1.086,1.212069
245,6,31,1.086,1.0
277,10,24,1.086,1.053888
220,28,40,1.086,1.0


In [10]:
rating[['est']].describe()

Unnamed: 0,est
count,391.0
mean,1.092078
std,0.077157
min,1.0
25%,1.021537
50%,1.082801
75%,1.143221
max,1.31593


In [11]:
recom = rating.sort_values('est', ascending=False).iloc[:40]

recom.shape

(40, 4)

In [12]:
recom.head()

Unnamed: 0,uid,iid,r_ui,est
326,30,24,1.086,1.31593
353,17,49,1.086,1.304878
218,28,25,1.086,1.29801
112,1,24,1.086,1.291708
295,18,33,1.086,1.287071


# GridSearch

In [25]:
%%time

# parmas
params = {
    'k': [10000, 1000, 700, 500, 300],
    'min_k': [1000, 100, 70, 50, 30, 10, 1],
}

# grid search
gs2 = GridSearchCV(KNNBasic, params, cv=3)
gs2.fit(sample_data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [26]:
# score
gs2.best_score

{'rmse': 0.9464380185544904, 'mae': 0.8957502094892256}

In [27]:
# params
gs2.best_params

{'rmse': {'k': 10000, 'min_k': 1000}, 'mae': {'k': 10000, 'min_k': 1000}}

In [31]:
# choice model
knn = gs2.best_estimator['rmse']

# train
train = sample_data.build_full_trainset()
knn.fit(train)

None

Computing the msd similarity matrix...
Done computing similarity matrix.


In [33]:
# predict
test = train.build_anti_testset()
accuracy.rmse(knn.test(test))

None

RMSE: 0.8957


In [37]:
prediction = pd.DataFrame(test, columns=('uid', 'iid', 'r_ui'))

prediction.sample(10)

Unnamed: 0,uid,iid,r_ui
74176,299,89,0.10425
33429,159,331,0.10425
116424,175,319,0.10425
50519,99,289,0.10425
80992,222,214,0.10425
120444,221,104,0.10425
109746,169,372,0.10425
74467,218,81,0.10425
69458,48,437,0.10425
57345,45,228,0.10425
