In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.6 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619421 sha256=750c5020396a7165bebd94a2171bdfcf9fb07be709a4c8a74c3b6bac6aa73041
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
from surprise import SVD, Dataset
from surprise.model_selection import cross_validate

In [4]:
data = Dataset.load_builtin('ml-100k', prompt=False)

In [7]:
data.raw_ratings[:10]

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [8]:
model = SVD()       # Singular Value Decomposition

In [9]:
cross_validate(model, data, measures=['rmse','mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9348  0.9341  0.9369  0.9436  0.9290  0.9357  0.0048  
MAE (testset)     0.7359  0.7350  0.7374  0.7450  0.7323  0.7371  0.0043  
Fit time          4.86    4.97    4.90    4.88    4.98    4.92    0.05    
Test time         0.28    0.17    0.27    0.16    0.26    0.23    0.05    


{'fit_time': (4.8615522384643555,
  4.971941947937012,
  4.899038314819336,
  4.876251459121704,
  4.984079837799072),
 'test_mae': array([0.73592382, 0.73504609, 0.73738253, 0.74502672, 0.73225574]),
 'test_rmse': array([0.93483677, 0.93406044, 0.9368528 , 0.94363531, 0.92895857]),
 'test_time': (0.2834916114807129,
  0.17489266395568848,
  0.26822710037231445,
  0.16018438339233398,
  0.2553684711456299)}

### 컨텐츠 기반 필터링(Contents-based Filtering)
- 이전의 행동과 명시적 피드백을 통해 좋아하는 것과 유사한 항목을 추천
    *   내가 지금까지 시청한 영화 목록과 다른 사용자의 시청 목록을 비교해 나와 비슷한 취향의 사용자가 시청한 영화를 추천

- 유사도를 기반으로 추천
- 장점

    *   많은 수의 사용자를 대상으로 쉽게 확장 가능
    *   사용자가 관심을 갖지 않던 상품 추천 가능

- 단점

    *   입력 특성을 직접 설계해야 하기 때문에 많은 도메인 지식이 필요
    *   사용자의 기존 관심사항을 기반으로만 추천 가능



In [10]:
import numpy as np

In [11]:
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data[:, 0] -= 1
raw_data[:, 1] -= 1

In [15]:
n_users = np.max(raw_data[:,0]) + 1
n_movies = np.max(raw_data[:,1]) + 1
shape = (n_users, n_movies)
shape

(943, 1682)

In [17]:
adj_matrix = np.ndarray(shape, dtype=int)
for user_id, movie_id, _, _ in raw_data:
    adj_matrix[user_id, movie_id] = 1.
adj_matrix[:3]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id != user_id:
        similarity = np.dot(my_vector, user_vector)
        if similarity > best_match:
            best_match = similarity
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match}, Best match ID: {best_match_id}')

Best match: 183, Best match ID: 275


In [19]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
    log1, log2 = log
    if log1 < 1. and log2 > 0.:
        recommend_list.append(i)

print(recommend_list)

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

- 유클리드 거리를 사용해 추천

In [21]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id != user_id:
        euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
        if euclidean_dist < best_match:
            best_match = euclidean_dist
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 14.8324, Best match ID: 737


In [22]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
    log1, log2 = log
    if log1 < 1. and log2 > 0.:
        recommend_list.append(i)

print(recommend_list)

[297, 312, 317, 342, 356, 366, 379, 384, 392, 402, 404, 407, 417, 422, 428, 433, 448, 454, 469, 473, 495, 510, 516, 526, 527, 549, 567, 602, 635, 649, 650, 654, 658, 661, 664, 696, 731, 746, 750, 754, 915, 918, 925, 929, 950, 968, 1015, 1046]


- Cosine 유사도를 사용해 추천

In [23]:
def cos_similarity(v1, v2):
    norm1 = np.sqrt(np.sum(np.square(v1)))
    norm2 = np.sqrt(np.sum(np.square(v2)))
    return np.dot(v1, v2) / (norm1 * norm2)

In [28]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 0.5691, Best match ID: 915


In [29]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
    log1, log2 = log
    if log1 < 1. and log2 > 0.:
        recommend_list.append(i)

print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


- 기존 방법에 명시적 피드백(사용자가 평가한 영화 점수)을 추가해 실험

In [26]:
adj_matrix = np.ndarray(shape, dtype=int)
for user_id, movie_id, rating, _ in raw_data:
    adj_matrix[user_id, movie_id] = rating
adj_matrix[:3]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id != user_id:
        euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
        if euclidean_dist < best_match:
            best_match = euclidean_dist
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 55.0636, Best match ID: 737


In [30]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 0.5691, Best match ID: 915


### 협업 필터링(Collaborative Filtering)
- 사용자와 항목의 유사성을 동시에 고려해 추천
- 기존에 내 관심사가 아닌 항목이라도 추천 가능
- 자동으로 임베딩 학습 가능
- 장점
    *   자동으로 임베딩을 학습하기 때문에 도메인 지식이 필요 없음
    *   기존의 관심사가 아니더라도 추천 가능

- 단점
    *   학습과정에 나오지 않은 항목은 임베딩을 만들 수 없음
    *   추가 특성을 사용하기 어려움



In [31]:
from surprise import KNNBasic, SVD, SVDpp, NMF

In [32]:
data = Dataset.load_builtin('ml-100k', prompt=False)

- KNN을 사용한 협업 필터링

In [33]:
model = KNNBasic()
cross_validate(model, data, measures=['rmse','mae'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9800  0.9797  0.9762  0.9787  0.9832  0.9795  0.0023  
MAE (testset)     0.7726  0.7742  0.7703  0.7764  0.7752  0.7738  0.0021  
Fit time          0.44    0.59    0.76    0.54    0.46    0.56    0.11    
Test time         7.05    7.65    8.06    6.60    3.82    6.64    1.49    


{'fit_time': (0.4386742115020752,
  0.5883979797363281,
  0.7593333721160889,
  0.5438756942749023,
  0.46398329734802246),
 'test_mae': array([0.77261384, 0.77423153, 0.77030587, 0.77637327, 0.77523267]),
 'test_rmse': array([0.98000128, 0.97969146, 0.97618095, 0.97865275, 0.98317495]),
 'test_time': (7.050502777099609,
  7.653592109680176,
  8.056213617324829,
  6.601367950439453,
  3.8239586353302)}

- SVD를 사용한 협업 필터링

In [34]:
model = SVD()
cross_validate(model, data, measures=['rmse','mae'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9357  0.9411  0.9296  0.9343  0.9386  0.9358  0.0039  
MAE (testset)     0.7392  0.7392  0.7319  0.7386  0.7384  0.7375  0.0028  
Fit time          9.60    11.40   10.92   9.81    6.05    9.55    1.88    
Test time         0.28    0.47    0.28    0.28    0.17    0.30    0.09    


{'fit_time': (9.600859642028809,
  11.399492740631104,
  10.917721271514893,
  9.808423519134521,
  6.045348405838013),
 'test_mae': array([0.73922017, 0.73917761, 0.73190594, 0.73862617, 0.73838514]),
 'test_rmse': array([0.93570542, 0.94106488, 0.929596  , 0.93425397, 0.93857106]),
 'test_time': (0.2796800136566162,
  0.46613192558288574,
  0.27910518646240234,
  0.27753233909606934,
  0.17382359504699707)}

- NMF를 사용한 협업 필터링

In [35]:
model = NMF()
cross_validate(model, data, measures=['rmse','mae'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9635  0.9584  0.9600  0.9639  0.9617  0.9615  0.0021  
MAE (testset)     0.7566  0.7541  0.7540  0.7582  0.7577  0.7561  0.0018  
Fit time          9.91    11.66   11.54   9.94    5.80    9.77    2.12    
Test time         0.23    0.28    0.24    0.26    0.14    0.23    0.05    


{'fit_time': (9.909843921661377,
  11.661888360977173,
  11.536036252975464,
  9.936559438705444,
  5.795465707778931),
 'test_mae': array([0.75661459, 0.75410728, 0.75398331, 0.75815653, 0.7576732 ]),
 'test_rmse': array([0.96350435, 0.95838918, 0.95996249, 0.96391066, 0.96173705]),
 'test_time': (0.23318195343017578,
  0.2808096408843994,
  0.24248576164245605,
  0.25680065155029297,
  0.14276790618896484)}

- SVD++를 사용한 협업 필터링

In [36]:
model = SVDpp()
cross_validate(model, data, measures=['rmse','mae'], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9186  0.9192  0.9085  0.9234  0.9217  0.9183  0.0052  
MAE (testset)     0.7202  0.7207  0.7124  0.7227  0.7250  0.7202  0.0042  
Fit time          319.07  322.21  317.27  320.70  165.04  288.86  61.93   
Test time         6.00    7.82    6.10    6.17    3.17    5.85    1.50    


{'fit_time': (319.06866455078125,
  322.21330070495605,
  317.27006101608276,
  320.6989974975586,
  165.03698921203613),
 'test_mae': array([0.72018084, 0.72074389, 0.71244714, 0.72273544, 0.72495623]),
 'test_rmse': array([0.91858513, 0.91922002, 0.90853413, 0.92342253, 0.92174196]),
 'test_time': (5.9994447231292725,
  7.817309617996216,
  6.0964813232421875,
  6.167213678359985,
  3.170262336730957)}

### 하이브리드
- 컨텐츠 기반 필터링과 협업 필터링을 조합한 방식
- 많은 하이브리드 방식이 존재
- 실습에서는 협업 필터링으로 임베딩을 학습하고 컨텐츠 기반 필터링으로 유사도 기반 추천을 수행하는 추천 엔진 개발

In [37]:
from sklearn.decomposition import randomized_svd, non_negative_factorization

In [41]:
data = Dataset.load_builtin('ml-100k', prompt=False)
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data[:,0] -= 1
raw_data[:,1] -= 1

In [42]:
n_users = np.max(raw_data[:,0]) + 1
n_movies = np.max(raw_data[:,1]) + 1
shape = (n_users, n_movies)
shape

(943, 1682)

In [43]:
adj_matrix = np.ndarray(shape, dtype=int)
for user_id, movie_id, rating, _ in raw_data:
    adj_matrix[user_id, movie_id] = rating
adj_matrix[:3]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [45]:
U, S, V = randomized_svd(adj_matrix, n_components=2, random_state=2022)
S = np.diag(S)
U.shape, S.shape, V.shape

((943, 2), (2, 2), (2, 1682))

In [46]:
np.matmul(np.matmul(U,S), V)

array([[ 3.91732681e+00,  1.47276647e+00,  7.98262097e-01, ...,
         6.24908131e-04,  1.41100873e-02,  1.36545892e-02],
       [ 1.85777220e+00,  3.96191003e-01,  5.05705346e-01, ...,
         5.38862953e-03,  1.77236113e-03,  5.26953921e-04],
       [ 8.94989110e-01,  1.71578372e-01,  2.51738344e-01, ...,
         2.92094718e-03,  5.39927727e-04, -1.25740674e-04],
       ...,
       [ 9.92051598e-01,  2.10814839e-01,  2.70363048e-01, ...,
         2.89019118e-03,  9.34212690e-04,  2.66605341e-04],
       [ 1.30425376e+00,  5.27669950e-01,  2.50080149e-01, ...,
        -4.20679042e-04,  5.30526027e-03,  5.28070254e-03],
       [ 2.82999393e+00,  9.70812205e-01,  6.15871593e-01, ...,
         2.02091472e-03,  8.67740394e-03,  8.03107566e-03]])

- 사용자 기반 추천
- 나와 비슷한 취향을 가진 다른 사용자의 행동을 추천
- 사용자 특징 벡터의 유사도 사용

In [47]:
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(U):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 1.0000, Best match ID: 235


In [49]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
    log1, log2 = log
    if log1 < 1. and log2 > 0.:
        recommend_list.append(i)

print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


- 항목 기반 추천
- 내가 본 항목과 비슷한 항목을 추천
- 항목 특징 벡터의 유사도 사용

In [58]:
my_id, my_vector = 0, V.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(V.T):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 1.0000, Best match ID: 1287


In [59]:
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
    if adj_matrix[i, my_id] > 0.9:
        recommend_list.append(i)

print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 

- 비음수 행렬 분해를 사용한 하이브리드 추천

In [53]:
A, B, iter = non_negative_factorization(adj_matrix, n_components=2)



In [54]:
C=np.matmul(A,B)
A.shape, B.shape, C.shape

((943, 2), (2, 1682), (943, 1682))

- 사용자 기반 추천

In [55]:
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(U):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 1.0000, Best match ID: 235


In [56]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
    log1, log2 = log
    if log1 < 1. and log2 > 0.:
        recommend_list.append(i)

print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


- 항목 기반 추천

In [60]:
my_id, my_vector = 0, V.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(V.T):
    if my_id != user_id:
        cos_sim = cos_similarity(my_vector, user_vector)
        if cos_sim > best_match:
            best_match = cos_sim
            best_match_id = user_id
            best_match_vector = user_vector

print(f'Best match: {best_match:.4f}, Best match ID: {best_match_id}')

Best match: 1.0000, Best match ID: 1287


In [61]:
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
    if adj_matrix[i, my_id] > 0.9:
        recommend_list.append(i)

print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 