# Collaborative Filtering 실습

In [64]:
import os
import pandas as pd
import seaborn as sns
import scipy
import numpy as np
import random
from matplotlib import pyplot as plt
from datetime import datetime
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import math
import warnings
drive.mount('/content/drive')
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 유사도 계산 실습

- 사용하게 될 유사도를 구현해봅니다
- 모델을 구현할 때는 라이브러리를 사용하거나 혹은 모델 패키지 내부에 구현되있는 경우가 많지만 개념을 잡기 위해 구현해보는 것이 필요합니다
- 유사도 항목
    1. cosine
    2. jaccard
    3. pearson correlation

In [65]:
# cosine similarity

def square_rooted(x):
    return math.sqrt(sum([a*a for a in x]))

def cosine(x, y):
    numerator = np.inner(a, b)
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator / float(denominator), 3)

In [66]:
a = [3, 45, 7, 2]
b = [2, 54, 13, 15]
cosine(a, b)

0.972

In [67]:
# cosine 유사도는 각 행렬이 연산가능한 크기여야 구할 수 있음
a = [3, 45, 7, 2, 3]
b = [2, 54, 13, 15]
cosine(a, b)

ValueError: ignored

In [68]:
# jacard similarity
def jaccard(x, y):
    
    intersection_cardinality = len(set(x).intersection(set(y))) # 교집합
    union_cardinality = len(set(x).union(set(y)))  # 합집합
    return round(intersection_cardinality / float(union_cardinality), 3)

In [69]:
a = [0,1,2,5,6]
b = [2,3,5,7,9]
jaccard(a, b)

0.25

In [70]:
a = [0,1,2,5,6,10]
b = [2,3,5,7,9]
jaccard(a, b)

0.222

In [71]:
# pearson correlation
def pearson_correlation(x, y):
    mean_x = sum(x)/len(x)
    mean_y = sum(y)/len(y)

    deviation_x = [i - mean_x for i in x]
    deviation_y = [i - mean_y for i in y]

    numerator = np.inner(deviation_x, deviation_y)
    denominator = square_rooted(deviation_x) * square_rooted(deviation_y)

    return round(numerator / denominator, 3)

In [72]:
a = [3, 45, 7, 2]
b = [2, 54, 13, 15]
pearson_correlation(np.array(a), np.array(b))

0.968

In [73]:
a = [3, 45, 7, 2, 3]
b = [2, 54, 13, 15]
pearson_correlation(a, b)

ValueError: ignored

# Memory-based CF 구현

## sparse matrix 만들기

In [74]:
# 학습 데이터가 들어있는 디렉토리 경로
path = '/content/drive/MyDrive/recomm_study/recomm_code/Recommend_learningspoons/data/ml-latest-small-20220921T022859Z-001/ml-latest-small/'
ratings_df = pd.read_csv(path + 'ratings.csv', encoding='utf-8')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [75]:
# 각각의 User 집합, Item 집합 생성
user_id_list = sorted(list(set(ratings_df['userId'].values)))
movie_id_list = sorted(list(set(ratings_df['movieId'].values)))
print('유저 수: ', len(user_id_list), '영화 수: ', len(movie_id_list))

유저 수:  610 영화 수:  9724


In [76]:
# userId를 인덱스, movieId를 컬럼으로하는 pivot_table 생성
user_item_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [77]:
# 모르는 데이터(평가하지 않은 데이터) --> 0으로 채워줌
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# train 데이터와 test 데이터를 나눠줍니다

train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [79]:
# train 데이터만으로 다시 추천 데이터 생성
user_item_matrix = train_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User-based CF

In [80]:
# train데이터의 user similarity matrix
# 유저끼리 유사도를 구해줍니다
# 행렬의 차원은 유저수 x 유저수

user_sim = cosine_similarity(user_item_matrix, user_item_matrix)
user_sim_df = pd.DataFrame(user_sim, columns=user_item_matrix.index, index=user_item_matrix.index)
print(user_sim_df.shape)
user_sim_df

(610, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.016301,0.002372,0.159681,0.077843,0.086387,0.129031,0.142718,0.076267,0.011988,...,0.072078,0.134184,0.188766,0.057567,0.134504,0.152000,0.216784,0.238994,0.085660,0.126228
2,0.016301,1.000000,0.000000,0.004447,0.021457,0.000000,0.033068,0.030285,0.000000,0.077491,...,0.203789,0.000000,0.014237,0.000000,0.000000,0.017086,0.015708,0.026126,0.035100,0.062231
3,0.002372,0.000000,1.000000,0.002876,0.006938,0.001567,0.000000,0.005876,0.000000,0.000000,...,0.003541,0.006292,0.017135,0.000000,0.000000,0.008137,0.021943,0.013369,0.000000,0.018756
4,0.159681,0.004447,0.002876,1.000000,0.095364,0.062998,0.072072,0.054389,0.015945,0.032033,...,0.081442,0.086043,0.246229,0.041505,0.083509,0.147993,0.105575,0.116951,0.038205,0.093053
5,0.077843,0.021457,0.006938,0.095364,1.000000,0.232372,0.050577,0.457225,0.000000,0.020130,...,0.029949,0.360573,0.087434,0.136533,0.133002,0.074504,0.097953,0.111661,0.181444,0.041444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.152000,0.017086,0.008137,0.147993,0.074504,0.058634,0.166350,0.095844,0.065156,0.076395,...,0.132795,0.093870,0.235955,0.044477,0.108305,1.000000,0.115738,0.217882,0.051468,0.157066
607,0.216784,0.015708,0.021943,0.105575,0.097953,0.107113,0.122464,0.166779,0.016895,0.009983,...,0.062090,0.147275,0.161968,0.089290,0.112441,0.115738,1.000000,0.210962,0.156866,0.105044
608,0.238994,0.026126,0.013369,0.116951,0.111661,0.128532,0.272866,0.153214,0.061451,0.061855,...,0.149167,0.143084,0.181972,0.108152,0.151388,0.217882,0.210962,1.000000,0.097806,0.265188
609,0.085660,0.035100,0.000000,0.038205,0.181444,0.164772,0.048867,0.387803,0.000000,0.022308,...,0.000000,0.287308,0.022737,0.154484,0.071690,0.051468,0.156866,0.097806,1.000000,0.047442


### Average Rating

In [81]:
# user_id = 15 에 대한 test 데이터 평점 예측
user_id=15
user_test_df = test_df[test_df['userId'] == user_id]
user_test_df

Unnamed: 0,userId,movieId,rating,timestamp
1557,15,122904,2.0,1510571949
1484,15,3535,3.5,1510572486
1561,15,134853,4.5,1510572481
1500,15,5445,4.0,1510571793
1553,15,115713,2.0,1510572009
1497,15,4886,3.5,1510577956
1547,15,109487,4.0,1510571878
1504,15,5989,5.0,1510571938
1467,15,2011,5.0,1510572060
1558,15,122922,2.0,1510572670


In [82]:
# average rating
result = []

for _, row in user_test_df.iterrows():
  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  # movie_id가 user_item_matrix에 있는 movie_id라면
  if movie_id in user_item_matrix.columns:

    # 해당 영화에 평점을 매긴 다른 유저들 가져옴
    user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]]
  
    #다른 유저들의 해당 영화 평점 합
    numerator = user_movie_matrix[movie_id].sum()

    # 다른 유저들의 총 인원 수
    denominator = len(user_movie_matrix)

    # 평균이 pred_rating이 됨
    pred_rating = numerator / denominator

    result.append([user_id, movie_id, rating, pred_rating])

result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

Unnamed: 0,userId,movieId,rating,predictedRating
0,15.0,122904.0,2.0,3.965116
1,15.0,3535.0,3.5,3.704545
2,15.0,134853.0,4.5,3.771429
3,15.0,5445.0,4.0,3.666667
4,15.0,115713.0,2.0,4.047619
5,15.0,4886.0,3.5,3.93
6,15.0,109487.0,4.0,4.042373
7,15.0,5989.0,5.0,3.912088
8,15.0,2011.0,5.0,3.5
9,15.0,122922.0,2.0,3.75


In [83]:
# train data의 전체 평균 평점
global_avg = train_df['rating'].mean()
print(global_avg)

3.503954480091238


In [84]:
# 전체 test 데이터에 있는 user들에 대해 average rating
result = []

for _, row in tqdm(test_df.iterrows()):

  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  # 영화가 user_item_matrix에 있고 유저가 user_item_matrix에 있다면
  # 위 코드 진행
  if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
    
    user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]]
    numerator = user_movie_matrix[movie_id].sum()
    denominator = len(user_movie_matrix)

    pred_rating = numerator / denominator

    result.append([user_id, movie_id, rating, pred_rating])
  
  # 그렇지 않다면 그냥 그 train 데이터의 전체 영화에 평점의 평균값을 pred_rating으로 
  else:
    result.append([user_id, movie_id, rating, global_avg])

result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

0it [00:00, ?it/s]

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.959770
1,384.0,2763.0,3.0,3.742857
2,52.0,58559.0,5.0,4.270492
3,600.0,719.0,2.5,2.833333
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.142857
20164,177.0,6787.0,3.0,4.083333
20165,103.0,969.0,4.0,3.857143
20166,42.0,2875.0,3.0,3.083333


In [85]:
# rmse, mae
rmse = np.sqrt(mean_squared_error(result_df['rating'].values, result_df['predictedRating'].values))
mae = mean_absolute_error(result_df['rating'].values, result_df['predictedRating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 0.9792027101742625
mae : 0.7553096185597311


### Weighted Average Rating

In [86]:
# test데이터에 user_similarity를 가중치로 활용한 weighted average rating

result = []

for _, row in tqdm(test_df.iterrows()):

  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:

    # 해당 영화에 평점을 매긴 user들과 그 유저들이 매긴 평점을 가져옴
    movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
    user_ids = movie_ratings.index 
    
    # 추천 대상 유저(user_id)와 그 이웃들(user_ids) 간의 유사도 가져옴
    similarities = user_sim_df[user_ids].loc[user_id]
    numerator = np.inner(movie_ratings.values, similarities) # 가중합 계산
    denominator = similarities.sum()

    # 만약 user_ids의 수가 0이라면(공통으로 매긴 유저가 없다면) 아래 평점 예측은 건너뜀
    if denominator == 0:
      continue
    
    pred_rating = numerator / denominator

    result.append([user_id, movie_id, rating, pred_rating])

  else:
    result.append([user_id, movie_id, rating, global_avg])

weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
weighted_average_result_df

0it [00:00, ?it/s]

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.908533
1,384.0,2763.0,3.0,3.765776
2,52.0,58559.0,5.0,4.288992
3,600.0,719.0,2.5,2.555142
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20160,20.0,5015.0,4.0,3.237979
20161,177.0,6787.0,3.0,4.039030
20162,103.0,969.0,4.0,3.725923
20163,42.0,2875.0,3.0,3.061219


In [87]:
# rmse, mae
rmse = np.sqrt(mean_squared_error(weighted_average_result_df['rating'].values, weighted_average_result_df['predictedRating'].values))
mae = mean_absolute_error(weighted_average_result_df['rating'].values, weighted_average_result_df['predictedRating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 0.9713599605151955
mae : 0.7488152459430106


### k-Nearest Neighborhood CF (user-based)

In [88]:
# test 데이터 전체 중 유사도가 높은 유저 K명에 대해서만 weighted Average Rating

k=20
result = []

for _, row in tqdm(test_df.iterrows()):
  
  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:

    movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
    user_ids = movie_ratings.index

    # 유사도 값 높은 K명의 이웃에 대한 유사도와 그들이 매긴 해당 영화의 평점 가져옴
    candidate_sim = user_sim_df[user_ids].loc[user_id].sort_values(ascending=False)[:k]
    candidate_movie_ratings = movie_ratings[candidate_sim.index]

    # 앞의 가중평균 구하는 과정 반복
    numerator = np.inner(candidate_movie_ratings.values, candidate_sim)
    denominator = candidate_sim.sum()

    # 이웃의 수가 0인 경우 --> global_avg
    if denominator == 0:
      result.append([user_id, movie_id, rating, global_avg])
      continue
    
    pred_rating = numerator / denominator

    result.append([user_id, movie_id, rating, pred_rating])
  
  else:
    result.append([user_id, movie_id, rating, global_avg])

k_weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
k_weighted_average_result_df

0it [00:00, ?it/s]

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.463541
1,384.0,2763.0,3.0,3.819279
2,52.0,58559.0,5.0,4.648799
3,600.0,719.0,2.5,2.464960
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.237979
20164,177.0,6787.0,3.0,4.020931
20165,103.0,969.0,4.0,3.569079
20166,42.0,2875.0,3.0,3.061219


In [89]:
# rmse, mae
rmse = np.sqrt(mean_squared_error(k_weighted_average_result_df['rating'].values, k_weighted_average_result_df['predictedRating'].values))
mae = mean_absolute_error(k_weighted_average_result_df['rating'].values, k_weighted_average_result_df['predictedRating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 0.9705218301732897
mae : 0.7473291624002141


In [90]:
# k의 개수를 변화시켜가면서 수행
# k의 수가 늘어날수록 오차가 줄어듦

num_neighbors = [5,10,15,20,25,30]
for k in num_neighbors:
  
  result = []
  for _, row in tqdm(test_df.iterrows()):
  
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:

      movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
      user_ids = movie_ratings.index

      # 유사도 값 높은 K명의 이웃에 대한 유사도와 그들이 매긴 해당 영화의 평점 가져옴
      candidate_sim = user_sim_df[user_ids].loc[user_id].sort_values(ascending=False)[:k]
      candidate_movie_ratings = movie_ratings[candidate_sim.index]

      # 앞의 가중평균 구하는 과정 반복
      numerator = np.inner(candidate_movie_ratings.values, candidate_sim)
      denominator = candidate_sim.sum()

      # 이웃의 수가 0인 경우 --> global_avg
      if denominator == 0:
        result.append([user_id, movie_id, rating, global_avg])
        continue
      
      pred_rating = numerator / denominator

      result.append([user_id, movie_id, rating, pred_rating])
    
    else:
      result.append([user_id, movie_id, rating, global_avg])
  df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])

  rmse = np.sqrt(mean_squared_error(df['rating'].values, df['predictedRating'].values))
  mae = mean_absolute_error(df['rating'].values, df['predictedRating'].values)

  print(f'이웃의 수가 {k}명인 경우 RMSE / MAE')
  print(f'rmse : {rmse}')
  print(f'mae : {mae}')

0it [00:00, ?it/s]

이웃의 수가 5명인 경우 RMSE / MAE
rmse : 0.9994386349816933
mae : 0.7710822767489532


0it [00:00, ?it/s]

이웃의 수가 10명인 경우 RMSE / MAE
rmse : 0.9778507790039508
mae : 0.7538149341895605


0it [00:00, ?it/s]

이웃의 수가 15명인 경우 RMSE / MAE
rmse : 0.972627970100798
mae : 0.7489336522963584


0it [00:00, ?it/s]

이웃의 수가 20명인 경우 RMSE / MAE
rmse : 0.9705218301732897
mae : 0.7473291624002141


0it [00:00, ?it/s]

이웃의 수가 25명인 경우 RMSE / MAE
rmse : 0.9700602200076299
mae : 0.7473819965134048


0it [00:00, ?it/s]

이웃의 수가 30명인 경우 RMSE / MAE
rmse : 0.9698934969216365
mae : 0.7472077525749342


## Item-based CF
- 아이템끼리의 유사도를 사용하여 평점을 예측합니다
- 이전에 배운 TF-IDF와 아이템 벡터 생성 방법은 다르지만 결과적으로 같은 아이템을 벡터로 표현한다는 점에서는 유사합니다

In [91]:
# train 데이터의 item_user_matrix 만들기
# --> item_sim 구할 것

item_user_matrix = train_df.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)
item_user_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
item_sim = cosine_similarity(item_user_matrix, item_user_matrix)
item_sim_df = pd.DataFrame(data=item_sim, columns=item_user_matrix.index, 
                           index=item_user_matrix.index)
item_sim_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.338970,0.315604,0.030125,0.261765,0.295128,0.235605,0.093550,0.170233,0.280456,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.338970,1.000000,0.249558,0.094187,0.238732,0.208502,0.224759,0.120127,0.010906,0.274381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.315604,0.249558,1.000000,0.000000,0.341006,0.240170,0.307270,0.269336,0.273639,0.194403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.030125,0.094187,0.000000,1.000000,0.181493,0.051524,0.251309,0.162301,0.000000,0.104297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.261765,0.238732,0.341006,0.181493,1.000000,0.250941,0.473463,0.117417,0.303564,0.180550,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Weighted Average Rating

In [93]:
# IBCF - 모든 이웃 고려

k=20
result = []

for _, row in tqdm(test_df.iterrows()):

  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:

    # 해당 유저가 봤던 다른 영화 리스트를 가져옴
    item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
    movie_ids = item_ratings.index

    item_sim = item_sim_df[movie_ids].loc[movie_id]
    numerator = np.inner(item_ratings.values, item_sim)
    denominator = item_sim.sum()

    if denominator == 0:
      result.append([user_id, movie_id, rating, global_avg])
      continue
    
    pred_rating = numerator / denominator
    result.append([user_id, movie_id, rating, pred_rating])
  
  else:
    result.append([user_id, movie_id, rating, global_avg])

item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
item_based_result_df

0it [00:00, ?it/s]

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,4.079030
1,384.0,2763.0,3.0,2.963573
2,52.0,58559.0,5.0,4.571206
3,600.0,719.0,2.5,2.959931
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.563257
20164,177.0,6787.0,3.0,3.388352
20165,103.0,969.0,4.0,4.082881
20166,42.0,2875.0,3.0,3.678894


In [94]:
# rmse, mae
rmse = np.sqrt(mean_squared_error(item_based_result_df['rating'].values, item_based_result_df['predictedRating'].values))
mae = mean_absolute_error(item_based_result_df['rating'].values, item_based_result_df['predictedRating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 0.9255275896882756
mae : 0.7132160614874838


### k-Nearest Neighborhood CF (item-based)

In [95]:
# 아이템 유사도가 높은 K개의 영화에 대해서만 계산에 활용해 예측
k=20
result = []

for _, row in tqdm(test_df.iterrows()):

  user_id = row['userId']
  movie_id = row['movieId']
  rating = row['rating']

  if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
    item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
    movie_ids = item_ratings.index

    candidate_sim = item_sim_df[movie_ids].loc[movie_id].sort_values(ascending=False)[:k]
    candidate_item_ratings = item_ratings[candidate_sim.index]

    numerator = np.inner(candidate_item_ratings.values, candidate_sim)
    denominator = candidate_sim.sum()

    if denominator == 0:
      result.append([user_id, movie_id, rating, global_avg])
      continue
    
    pred_rating = numerator / denominator

    result.append([user_id, movie_id, rating, pred_rating])
  
  else:
    result.append([user_id, movie_id, rating, global_avg])

knn_item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
knn_item_based_result_df

0it [00:00, ?it/s]

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,4.552831
1,384.0,2763.0,3.0,2.969577
2,52.0,58559.0,5.0,4.627186
3,600.0,719.0,2.5,2.782147
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.383256
20164,177.0,6787.0,3.0,3.218865
20165,103.0,969.0,4.0,4.216606
20166,42.0,2875.0,3.0,3.381899


In [96]:
# rmse, mae
rmse = np.sqrt(mean_squared_error(knn_item_based_result_df['rating'].values, knn_item_based_result_df['predictedRating'].values))
mae = mean_absolute_error(knn_item_based_result_df['rating'].values, knn_item_based_result_df['predictedRating'].values)

print(f'rmse : {rmse}')
print(f'mae : {mae}')

rmse : 0.8737044719760972
mae : 0.6635459435076628


In [97]:
# 이웃 item 수를 조절해가며 성능 테스트
# k=20이 최적의 성능을 보였음
k_list = [5,10,15,20,25,30,35,40]

for k in k_list:
  
  result = []
  for _, row in tqdm(test_df.iterrows()):

    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
      item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
      movie_ids = item_ratings.index

      candidate_sim = item_sim_df[movie_ids].loc[movie_id].sort_values(ascending=False)[:k]
      candidate_item_ratings = item_ratings[candidate_sim.index]

      numerator = np.inner(candidate_item_ratings.values, candidate_sim)
      denominator = candidate_sim.sum()

      if denominator == 0:
        result.append([user_id, movie_id, rating, global_avg])
        continue
      
      pred_rating = numerator / denominator

      result.append([user_id, movie_id, rating, pred_rating])
    
    else:
      result.append([user_id, movie_id, rating, global_avg])
  
  df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
  rmse = np.sqrt(mean_squared_error(df['rating'].values, df['predictedRating'].values))
  mae = mean_absolute_error(df['rating'].values, df['predictedRating'].values)

  print(f'이웃의 수가 {k}명인 경우 RMSE / MAE')
  print(f'rmse : {rmse}')
  print(f'mae : {mae}')

0it [00:00, ?it/s]

이웃의 수가 5명인 경우 RMSE / MAE
rmse : 0.9033788210088529
mae : 0.6845453245137015


0it [00:00, ?it/s]

이웃의 수가 10명인 경우 RMSE / MAE
rmse : 0.8801080838563009
mae : 0.669108162454168


0it [00:00, ?it/s]

이웃의 수가 15명인 경우 RMSE / MAE
rmse : 0.8743742998270422
mae : 0.6644269629862735


0it [00:00, ?it/s]

이웃의 수가 20명인 경우 RMSE / MAE
rmse : 0.8737044719760972
mae : 0.6635459435076628


0it [00:00, ?it/s]

이웃의 수가 25명인 경우 RMSE / MAE
rmse : 0.8749839358342814
mae : 0.6649160660842618


0it [00:00, ?it/s]

이웃의 수가 30명인 경우 RMSE / MAE
rmse : 0.877136694881195
mae : 0.6668230275925994


0it [00:00, ?it/s]

이웃의 수가 35명인 경우 RMSE / MAE
rmse : 0.878400555202748
mae : 0.6680583198779592


0it [00:00, ?it/s]

이웃의 수가 40명인 경우 RMSE / MAE
rmse : 0.8799185246063966
mae : 0.6696879953525937
