In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# df
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('/content/drive/MyDrive/recomm_sample/u.user.csv',sep='|',names=u_cols,encoding='latin_1')

i_cols = ['movie_id','title','release date','video release date','IMDB URL','unknown','Action','Adventure','Animation','children\s',
          'comedy','crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance',
          'Sci-Fi','Thriller','war','western']
movies = pd.read_csv('/content/drive/MyDrive/recomm_sample/u.item.csv',sep='|',names=i_cols,
                     encoding='latin_1')

r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv('/content/drive/MyDrive/recomm_sample/u.data.csv',sep='\t',names=r_cols,
                      encoding='latin_1')

ratings_2 = ratings.drop('timestamp',axis=1)
movies_2 = movies[['movie_id','title']]

In [3]:
# 모델 평가

def RMSE(y_true,y_pred):
  return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

x = ratings_2.copy()
y = ratings_2['user_id']  
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.25,stratify=y)   

def score(model):
  id_pairs = zip(X_test['user_id'],X_test['movie_id'])
  y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])   # ratings_2의 (user_id,movie_id) pair에 모델의 예측값 계산 후 y_pred에 append
  y_true = np.array(X_test['rating'])
  return RMSE(y_true,y_pred)

rating_matrix = X_train.pivot(index='user_id',columns='movie_id',values='rating')

# 협업 필터링 : 대상 user와 취향이 비슷한 user들을 찾고, 추천 대상이 되는 item 중 대상 user가 구매하지 않았으면 그 제품을 추천

## 1. 코사인 유사도를 활용한 CF
## 2. 이웃을 고려한 CF
## 3. 사용자의 평가경향을 고려한 CF
## 4. 사용자간 신뢰도를 고려한 CF
## 5. 아이템 기반 CF(IBCF)

In [4]:
# 1. 코사인 유사도를 사용하여 협업필터링

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,index=rating_matrix.index,
                               columns=rating_matrix.index)  # user간 cosine유사도 계산 --> index / columns 동일하게 설정
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.144650,0.044115,0.019356,0.287802,0.354398,0.307888,0.287610,0.061899,0.324644,...,0.292826,0.111102,0.223834,0.137346,0.188147,0.134129,0.225364,0.142376,0.145144,0.281408
2,0.144650,1.000000,0.070248,0.116262,0.038168,0.203116,0.080441,0.077041,0.094100,0.142957,...,0.127292,0.203296,0.251339,0.275774,0.243928,0.173073,0.212673,0.076659,0.115715,0.062430
3,0.044115,0.070248,1.000000,0.145935,0.028944,0.065845,0.044388,0.067296,0.024002,0.062204,...,0.029558,0.000000,0.143753,0.019207,0.074173,0.000000,0.139233,0.110617,0.053041,0.021232
4,0.019356,0.116262,0.145935,1.000000,0.000000,0.039226,0.090611,0.104685,0.059238,0.053997,...,0.038906,0.050278,0.113799,0.081262,0.033283,0.000000,0.140725,0.182004,0.107897,0.052401
5,0.287802,0.038168,0.028944,0.000000,1.000000,0.168342,0.248288,0.163924,0.029959,0.133866,...,0.261331,0.067384,0.069741,0.056168,0.116148,0.081828,0.183721,0.119662,0.070618,0.223941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.134129,0.173073,0.000000,0.000000,0.081828,0.110947,0.090331,0.071177,0.052359,0.060641,...,0.050723,0.407066,0.207324,0.183876,0.381266,1.000000,0.023141,0.160870,0.000000,0.117645
940,0.225364,0.212673,0.139233,0.140725,0.183721,0.249942,0.230171,0.135274,0.077207,0.275372,...,0.289458,0.101934,0.141146,0.072177,0.104111,0.023141,1.000000,0.117728,0.150737,0.166948
941,0.142376,0.076659,0.110617,0.182004,0.119662,0.155248,0.057423,0.116737,0.181292,0.054572,...,0.060056,0.118776,0.260214,0.101805,0.198360,0.160870,0.117728,1.000000,0.061339,0.056270
942,0.145144,0.115715,0.053041,0.107897,0.070618,0.300281,0.217352,0.192237,0.121450,0.196848,...,0.185756,0.049422,0.088738,0.079118,0.036456,0.000000,0.150737,0.061339,1.000000,0.160415


In [5]:
# 코사인 유사도 추천 모델
# 취향이 비슷한 이웃을 고려하지 않음(전체 사용자를 대상으로 평점 예측)

def cf_simple(user_id,movie_id):
  if movie_id in rating_matrix:
    sim_scores = user_similarity[user_id].copy()  # 유저별 다른 유저와의 코사인 유사도값 가져옴
    movie_ratings = rating_matrix[movie_id].copy() # 영화별로 사용자의 평점 값 가져옴(없으면 NaN)
    non_rating_idx = movie_ratings[movie_ratings.isnull()].index   # movie_id에 대해 null값을 가진 인덱스 체크
    movie_ratings = movie_ratings.dropna()  # movie_ratings 중 NaN값 제거
    sim_scores = sim_scores.drop(non_rating_idx)   # sim_scores의 user 중 non_rating_idx인 유저의 sim_scores 제거
    mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
  
  else:
    mean_rating = 3.0
  return mean_rating

score(cf_simple)

1.0146708535159759

In [None]:
# 피어슨 상관계수를 사용한 코드
# 왜 NULL값 뜨는지 체크

matrix_dummy_2 = rating_matrix.copy().fillna(0)
user_similarity_2 = matrix_dummy_2.corr(method='pearson')
user_similarity_2 = pd.DataFrame(user_similarity_2,index=rating_matrix.index,columns=rating_matrix.index)
print(user_similarity_2)

def cf_pearson(user_id,movie_id):
  if movie_id in rating_matrix:
    sim_scores_2 = user_similarity_2[user_id].copy()
    movie_ratings_2 = rating_matrix[movie_id].copy()
    non_rating_idx_2 = movie_ratings_2[movie_ratings_2.isnull()].index
    movie_ratings_2 = movie_ratings_2.dropna()
    sim_scores_2 = sim_scores_2.drop(non_rating_idx_2)
    mean_rating_2 = np.dot(sim_scores_2,movie_ratings_2) / sim_scores_2.sum()

  else : mean_rating_2 = 3.0
  return mean_rating_2

score(cf_pearson)

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.108796  0.139995  0.121464  0.063397  0.029341  0.203727   
2        0.108796  1.000000  0.063950  0.275490  0.151494  0.024148  0.145407   
3        0.139995  0.063950  1.000000  0.173285  0.077487 -0.031997  0.170890   
4        0.121464  0.275490  0.173285  1.000000  0.182386  0.036085  0.187888   
5        0.063397  0.151494  0.077487  0.182386  1.000000 -0.018960  0.173284   
...           ...       ...       ...       ...       ...       ...       ...   
939      0.063717  0.169103  0.071314  0.177265  0.185941  0.013619  0.132234   
940      0.107940  0.196255  0.174249  0.230739  0.189979  0.007142  0.173940   
941      0.034619  0.269565  0.044443  0.189969  0.149693 -0.023982  0.066586   
942      0.053339  0.167761  0.074183  0.162659  0.086931  0.132567  0.111252   
943      0.047383  0.148549 

nan

In [None]:
# 2. 이웃을 고려한 CF
# 이웃을 정하는 기준 : K-NN 쓰거나, 유사도 기준(예 : 0.8보다 커야!) Thresholding
#              --> 정해진 기준을 넘는 사용자가 없어 추천을 못하는 경우가 있으므로 주로 K-NN기법 사용


# KNN으로 이웃 개수 지정

def score(model,neighbor_size=0):
  id_pairs = zip(X_test['user_id'],X_test['movie_id'])
  y_pred = np.array([model(user,movie,neighbor_size) for (user,movie) in id_pairs])   # ratings_2의 (user_id,movie_id) pair에 모델의 예측값 계산 후 y_pred에 append
  y_true = np.array(X_test['rating'])
  return RMSE(y_true,y_pred)


def cf_knn(user_id,movie_id,neighbor_size=0):
  if movie_id in rating_matrix:
    sim_scores = user_similarity[user_id].copy()  
    movie_ratings = rating_matrix[movie_id].copy() 
    non_rating_idx = movie_ratings[movie_ratings.isnull()].index   
    movie_ratings = movie_ratings.dropna()  
    sim_scores = sim_scores.drop(non_rating_idx)

    # neighbor_size 지정하지 않은 경우
    if neighbor_size == 0:
      mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()

    # neighbor_size 지정해준 경우
    else:
      if len(sim_scores) > 1:    # sim_scores에 있는 유저 수가 최소 2는 되어야 함!
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores)  # users를 유사도 낮은것부터 정렬     np.argsort()를 통해 인덱스 리턴
        sim_scores = sim_scores[user_idx][-neighbor_size:]   # 유사도가 큰 이웃부터 유사도 점수 가져옴
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]  # 유사도가 큰 이웃 부터 영화별 평점 가져옴
        mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      
      else:
        mean_rating = 3.0
  else:
    mean_rating = 3.0

  return mean_rating

score(cf_knn,neighbor_size=30)

1.0075316999865278

In [None]:
# 실제로 유저에게 추천하는 기능 구현
rating_matrix = ratings.pivot_table(values='rating',index='user_id',columns='movie_id')    # 여기서는 train/test split X
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns= rating_matrix.index)

def recomm_movie(user_id,n_items,neighbor_size=30):
  user_movie = rating_matrix.loc[user_id].copy()   # 각 추천 받을 user의 평점 정보 copy
  for movie in rating_matrix:
    if pd.notnull(user_movie.loc[movie]) :  # 추천받을 유저가 이미 영화를 본경우(nan이 아닌경우) 
      user_movie.loc[movie] = 0             # 0 리턴(추천리스트에서 제외할 것임!)
    
    else:            # 영화를 보지 않은 경우(nan인 경우)
      user_movie.loc[movie] = cf_knn(user_id,movie,neighbor_size)  # cf_knn 가져와서 대상 user의 예상 평점 체크
  
  movie_sort = user_movie.sort_values(ascending=False)[:n_items]   # 예상 평점 높은 것부터 n_items개 추천
  recomm_movies = movies.loc[movie_sort.index]
  recomm = recomm_movies['title']
  return recomm

recomm_movie(user_id=2,n_items=5,neighbor_size=30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1467                                     Cure, The (1995)
1189                              That Old Feeling (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
318                       Everyone Says I Love You (1996)
Name: title, dtype: object

In [None]:
# 최적의 neighbor_size 개수 결정

# neighbor_size가 지나치게 크면 --> 개인의 취향 반영 X
#      ''              작으면 --> 소수의 평가에 지나치게 의존하여 오버피팅
# 도메인에 따라서 최적 이웃의 개수가 다르다!

rating_matrix = X_train.pivot(index='user_id',columns='movie_id',values='rating')
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns= rating_matrix.index)

size_list = [10,20,30,40,50,60]

for neighbor_size in size_list : 
  print(f'Neighbor_size = {neighbor_size} | RMSE = {np.round(score(cf_knn,neighbor_size),4)}')

# 이 데이터셋에서는 40~50개 정도로 설정하는게 좋을듯!

Neighbor_size = 10 | RMSE = 1.0223
Neighbor_size = 20 | RMSE = 1.0103
Neighbor_size = 30 | RMSE = 1.0075
Neighbor_size = 40 | RMSE = 1.007
Neighbor_size = 50 | RMSE = 1.0072
Neighbor_size = 60 | RMSE = 1.008


In [None]:
# 3. 사용자의 평가경향을 고려한 CF 
# 사용자에 따라 전체적으로 높게 평가한 User가 있고, 반대도 있음
 
# a = 평균 2.0  영화 x에 3.0
# b = 평균 4.0  영화 x에 3.0
# --> a의 3점은 높은 가중치를, b의 3점은 낮은 가중치를 줘야!

# 추천 대상 user의 경우에도 이를 고려해야!
# 대상 user 평점평균 3.0
# 이웃들 평점평균 4.0
# cf_knn 값이 3.5라면 3.5-1(4점-3점) = 2.5가 되어야!

# 1. 각 user의 평점평균 계산
# 2. 각 item 평점 - 각 user의 평점평균 = 평점편차
# 3. 각 user의 이웃의 item들에 대한 평점편차&유사도 가중평균 --> 해당 user의 해당 item에 대한 편차 예측값 계산
# 4. 편차 예측값 + 각 user의 평점평균
# 5. item에 대한 예측값을 구할수 없는 경우 --> 해당 user의 평점평균으로 대체

rating_mean = rating_matrix.mean(axis=1)    # 1. 각 user별 평점평균
rating_bias = (rating_matrix.T - rating_mean).T  # 2. 각 item(movie)별 평점 - user별 평점평균 = 평점편차(rating_matrix의 형태로 다시 T)

def cf_knn_bias(user_id,movie_id,neighbor_size=0):
  if movie_id in rating_bias : 
    sim_scores = user_similarity[user_id].copy()   # user별 코사인 유사도
    movie_ratings = rating_bias[movie_id].copy()   # item별 평점편차
    non_rating_idx = movie_ratings[movie_ratings.isnull()].index  
    movie_ratings = movie_ratings.drop(non_rating_idx)
    sim_scores = sim_scores.drop(non_rating_idx)

    if neighbor_size == 0 :    # 이웃이 없는 경우 --> 자신의 평점평균 + 편차 예측값
      prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum() # 3
      prediction = prediction + rating_mean[user_id] # 4.
    
    else :    # 이웃이 있는 경우
      if len(sim_scores) > 1:
        neighbor_size = min(neighbor_size,len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]

        prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()   # 이웃들의 item들에 대한 평점편차 코사인 유사도 내적  / 코사인 유사도 총합
        prediction = prediction + rating_mean[user_id]    # 4.
      
      else:
        prediction = rating_mean[user_id]
    
  else:
    prediction = rating_mean[user_id]
  
  return prediction

score(cf_knn_bias,30)

0.9425566101585063

In [None]:
# 위 cf_knn_bias 활용하여 5개 item 추천

rating_matrix = ratings.pivot_table(values='rating',index='user_id',columns='movie_id')    # 여기서는 train/test split X
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns= rating_matrix.index)

def recomm_movie(user_id,n_items,neighbor_size=30):
  user_movie = rating_matrix.loc[user_id].copy()   # 각 추천 받을 user의 평점 정보 copy
  for movie in rating_matrix:
    if pd.notnull(user_movie.loc[movie]) :  # 추천받을 유저가 이미 영화를 본경우(nan이 아닌경우) 
      user_movie.loc[movie] = 0             # 0 리턴(추천리스트에서 제외할 것임!)
    
    else:            # 영화를 보지 않은 경우(nan인 경우)
      user_movie.loc[movie] = cf_knn_bias(user_id,movie,neighbor_size)  # cf_knn_bias로 대상 user의 평가 경향을 고려한 예상 평점 체크
  
  movie_sort = user_movie.sort_values(ascending=False)[:n_items]   # 예상 평점 높은 것부터 n_items개 추천
  recomm_movies = movies.loc[movie_sort.index]
  recomm = recomm_movies['title']
  return recomm

recomm_movie(user_id=2,n_items=5,neighbor_size=30)

movie_id
851                              Bloody Child, The (1996)
1467                                     Cure, The (1995)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1642                                    Angel Baby (1995)
1177                                   Major Payne (1994)
Name: title, dtype: object

In [None]:
# 4. 신뢰도 가중 CF
# 대상 user와 다른 user간 평가한 item 개수가 일정 개수 이상인 놈들만 이웃으로 활용하겠다!
# item 역시 평가한 user 수가 일정 수준 이상인 놈들만 가져와서 활용하겠다!

rating_binary = np.array((rating_matrix > 0).astype(float))   # full rating_matrix 중 평점이 있는 item은 1로!
rating_binary_2 = rating_binary.T   # 아래 내적곱을 위해 Transpose
counts = np.dot(rating_binary,rating_binary_2)   
counts = pd.DataFrame(counts,index=rating_matrix.index,columns=rating_matrix.index).fillna(0)

In [None]:
rating_binary

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [None]:
counts   # 내적곱을 통해 user간 공통 평가 영화수 계산
         # 대각요소는 user 자신이 평가한 총 영화 개수

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,272.0,18.0,8.0,7.0,80.0,96.0,145.0,34.0,5.0,77.0,...,77.0,15.0,48.0,19.0,36.0,15.0,52.0,10.0,27.0,83.0
2,18.0,62.0,9.0,7.0,5.0,32.0,18.0,6.0,6.0,16.0,...,15.0,14.0,33.0,20.0,24.0,11.0,17.0,7.0,12.0,9.0
3,8.0,9.0,54.0,13.0,1.0,10.0,14.0,7.0,2.0,8.0,...,3.0,2.0,16.0,6.0,10.0,2.0,15.0,4.0,9.0,2.0
4,7.0,7.0,13.0,24.0,2.0,6.0,12.0,8.0,2.0,4.0,...,3.0,1.0,8.0,6.0,7.0,1.0,10.0,4.0,8.0,4.0
5,80.0,5.0,1.0,2.0,175.0,42.0,102.0,23.0,4.0,35.0,...,58.0,6.0,15.0,5.0,21.0,7.0,28.0,6.0,17.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,15.0,11.0,2.0,1.0,7.0,14.0,16.0,4.0,1.0,6.0,...,7.0,18.0,23.0,10.0,29.0,49.0,5.0,6.0,3.0,12.0
940,52.0,17.0,15.0,10.0,28.0,54.0,68.0,18.0,6.0,51.0,...,45.0,7.0,26.0,13.0,18.0,5.0,107.0,7.0,23.0,28.0
941,10.0,7.0,4.0,4.0,6.0,11.0,7.0,7.0,3.0,6.0,...,3.0,6.0,16.0,7.0,14.0,6.0,7.0,22.0,4.0,5.0
942,27.0,12.0,9.0,8.0,17.0,40.0,46.0,11.0,4.0,24.0,...,27.0,4.0,12.0,6.0,10.0,3.0,23.0,4.0,79.0,19.0


In [None]:
counts[1]

user_id
1      272.0
2       18.0
3        8.0
4        7.0
5       80.0
       ...  
939     15.0
940     52.0
941     10.0
942     27.0
943     83.0
Name: 1, Length: 943, dtype: float64

In [None]:
def cf_knn_bias_sig(user_id,movie_id,neighbor_size=0) :
  if movie_id in rating_bias:
    sim_scores = user_similarity[user_id].copy()   
    movie_ratings = rating_bias[movie_id].copy()   
    no_rating = movie_ratings.isnull()
    common_counts = counts[user_id]  # item 별로 대상 유저와 공통으로 평가한 유저 수 가져오기
    low_significance = common_counts < sig_level    # 공통으로 평가한 item이 적은 유저는 빼겠다!
    non_rating_idx = movie_ratings[no_rating | low_significance].index   # | : or 연산자!
    movie_ratings = movie_ratings.drop(non_rating_idx)
    sim_scores = sim_scores.drop(non_rating_idx)

    if neighbor_size == 0:
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]

    
    elif len(sim_scores) > min_ratings :   # 해당 item 평가한 유저 수가 min_ratings 이상인 경우만 추천하겠다!
      neighbor_size = min(neighbor_size,len(sim_scores))
      sim_scores = np.array(sim_scores)
      movie_ratings = np.array(movie_ratings)
      user_idx = np.argsort(sim_scores)
      sim_scores = sim_scores[user_idx][-neighbor_size:]
      movie_ratings = movie_ratings[user_idx][-neighbor_size:]
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]
      
    else: prediction = rating_mean[user_id]
  
  else:
    prediction = rating_mean[user_id]
  
  return prediction

sig_level = 3
min_ratings = 2
score(cf_knn_bias_sig,30)

0.9382618178391758

In [None]:
# prediction 값 : 1보다 작을수도 있고, 5보다 클 수 도 있음!
# --> 1이하는 1로, 5 이상은 5로 수정하는 코드 작성

def cf_knn_bias_sig(user_id,movie_id,neighbor_size=0) :
  if movie_id in rating_bias:
    sim_scores = user_similarity[user_id].copy()   
    movie_ratings = rating_bias[movie_id].copy()   
    no_rating = movie_ratings.isnull()
    common_counts = counts[user_id]  # item 별로 대상 유저와 공통으로 평가한 유저 수 가져오기
    low_significance = common_counts < sig_level    # 공통으로 평가한 item이 적은 유저는 빼겠다!
    non_rating_idx = movie_ratings[no_rating | low_significance].index   # | : or 연산자!
    movie_ratings = movie_ratings.drop(non_rating_idx)
    sim_scores = sim_scores.drop(non_rating_idx)

    if neighbor_size == 0:
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]

    
    elif len(sim_scores) > min_ratings :   # 해당 item 평가한 유저 수가 min_ratings 이상인 경우만 추천하겠다!
      neighbor_size = min(neighbor_size,len(sim_scores))
      sim_scores = np.array(sim_scores)
      movie_ratings = np.array(movie_ratings)
      user_idx = np.argsort(sim_scores)
      sim_scores = sim_scores[user_idx][-neighbor_size:]
      movie_ratings = movie_ratings[user_idx][-neighbor_size:]
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]
      
    else: prediction = rating_mean[user_id]
  
  else:
    prediction = rating_mean[user_id]
  
 # prediction 관련 수정 
  if prediction <= 1.0 :
    prediction = 1.0
  elif prediction >= 5.0 :
    prediction = 5.0
  else : prediction

  return prediction

sig_level = 3
min_ratings = 2
score(cf_knn_bias_sig,30)

0.9351860984653748

In [None]:
# 5. 아이템 기반 CF(IBCF)

# 유저기반 CF(UBCF) - 비슷한 취향의 이웃을 선정하는 방식
# 아이템기반 CF(IBCF) - 사용자들의 평가 패턴을 바탕으로 item 간 유사도를 계산하여 사용자의 특정 item에 대한 예측 평점 계산

rating_matrix = X_train.pivot(index='user_id',columns='movie_id',values = 'rating')
rating_matrix_t = rating_matrix.T
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy,matrix_dummy)     # 유저간 유사도 계산
item_similarity = pd.DataFrame(item_similarity,index=rating_matrix_t.index,
                               columns=rating_matrix_t.index)

# score모델은 맨 위 있던 기본 모델 로드
def score(model):
  id_pairs = zip(X_test['user_id'],X_test['movie_id'])
  y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])  
  y_true = np.array(X_test['rating'])
  return RMSE(y_true,y_pred)

# user_similarity --> item similartiy , movie_ratings --> user_ratings로 변경된 것 확인!
def cf_ibcf(user_id,movie_id):
  if movie_id in item_similarity:
    sim_scores = item_similarity[movie_id]
    user_ratings = rating_matrix_t[user_id]
    non_rating_idx = user_ratings[user_ratings.isnull()].index
    user_ratings = user_ratings.dropna()
    sim_scores = sim_scores.drop(non_rating_idx)
    mean_ratings = np.dot(sim_scores,user_ratings) / sim_scores.sum()
  
  else:
    mean_ratings = 3.0
  return mean_ratings

score(cf_ibcf)

1.0134262743096738