In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [17]:
# Load the built-in "ml-100k" dataset (prompt=False: do not ask interactively).
data = Dataset.load_builtin("ml-100k", prompt=False)
data.raw_ratings[:10] # each entry: (user, item, rating, timestamp)

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
# SVD model from Surprise, constructed with its default hyperparameters.
md = SVD()

In [5]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold;
# verbose=True prints the summary table, and the result dict is displayed below it.
cross_validate(md, data, measures=["rmse", "mae"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9365  0.9344  0.9317  0.9446  0.9292  0.9353  0.0053  
MAE (testset)     0.7389  0.7371  0.7330  0.7456  0.7329  0.7375  0.0047  
Fit time          3.25    3.24    3.25    3.25    3.24    3.25    0.01    
Test time         0.10    0.09    0.09    0.10    0.14    0.10    0.02    


{'test_rmse': array([0.93654357, 0.9344329 , 0.93166484, 0.9445908 , 0.9292072 ]),
 'test_mae': array([0.73891276, 0.73709967, 0.73300477, 0.74556982, 0.7328737 ]),
 'fit_time': (3.25142502784729,
  3.2400131225585938,
  3.2453083992004395,
  3.2537081241607666,
  3.240626573562622),
 'test_time': (0.0957484245300293,
  0.09275364875793457,
  0.09374427795410156,
  0.09773802757263184,
  0.1436154842376709)}

In [6]:
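# Content-based filtering
# Content-based filtering recommends items similar to what the user already likes, based on their previous behavior and explicit feedback.
# e.g.) Compare the list of movies I have watched so far with other users' watch lists, and recommend movies watched by users whose taste is similar to mine.
# NOTE: comparing against other users' histories, as in this example, is strictly speaking user-based collaborative filtering; pure content-based filtering compares item attributes instead.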

In [9]:
import numpy as np


In [10]:
# Reload the built-in "ml-100k" dataset and convert the raw rating tuples
# (user, item, rating, timestamp) into a single integer array, one row per rating.
data = Dataset.load_builtin("ml-100k", prompt=False)
raw_data = np.array(data.raw_ratings, dtype=int)

In [11]:
# Shift user ids (column 0) and movie ids (column 1) down by one so they can be
# used directly as zero-based array indices.
# The array is rebuilt from data.raw_ratings first so this cell is idempotent:
# decrementing the existing array in place would shift the ids again on every
# re-run, eventually producing -1 indices that silently wrap around.
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data[:, :2] -= 1

In [12]:
# Column-wise maximum of the user (col 0) and movie (col 1) index columns.
# NOTE: despite their names, n_users / n_movies hold the largest index, not a
# count, which is why 1 is added to get the matrix dimensions.
n_users, n_movies = raw_data[:, :2].max(axis=0)
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [13]:
# Adjacency matrix: adj_matrix[user][movie] is 1 when that user rated that movie,
# and 0 otherwise.
# np.zeros is used instead of np.ndarray(shape): the latter only allocates memory
# without initializing it, so cells for unrated pairs could contain arbitrary
# values and corrupt the similarity computations that follow.
adj_matrix = np.zeros(shape, dtype=int)
# Columns 0 and 1 of raw_data are the user and movie indices; mark every rated
# (user, movie) pair in a single vectorized assignment.
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [14]:
# Treat user 0 as "me" and search for the other user whose row in adj_matrix
# has the largest dot product with mine (for 0/1 rows this is the number of
# movies both users have rated).
my_id = 0
my_vector = adj_matrix[my_id]
best_match = best_match_id = -1
best_match_vector = []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # never compare a user with themselves
    similarity = np.dot(my_vector, user_vector)
    # Strict '>' keeps the earliest user when several share the top score.
    if similarity > best_match:
        best_match, best_match_id, best_match_vector = similarity, user_id, user_vector
print("Best Match : {}, Best Match ID: {}".format(best_match, best_match_id))

Best Match : 183, Best Match ID: 275


In [15]:
# Recommend every movie index where my entry is below 1 (not rated by me)
# while the best-matching user's entry is above 0 (rated by them).
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1. and theirs > 0.
]
print(recommend_list)


[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

In [None]:
# Recommendation using Euclidean distance
