## 추천시스템이 왜 필요한가?

## 우리 조의 최종 목표

## 기존의 해결방법

### 1. 콘텐츠 기반 필터링(Content based filtering)



### 2. 협업 필터링(Collaborative Filtering)

#1. 최근접 이웃(Nearest Neighbor) 협업 필터링



#2. 잠재 요인(Latent Factor) 협업 필터링




## 행렬분해 연습

In [1]:
import numpy as np

#원본 행렬 R생성, 분해 행렬 P와 Q의 초기화, 잠재 요인 차원K는 3으로 설정.
R = np.array([[4,np.NaN,np.NaN,2,np.NaN],
             [np.NaN, 5, np.NaN, 3, 1],
             [np.NaN, np.NaN, 3, 4, 4],
             [5,2,1,2,np.NaN]])
num_users, num_items = R.shape
print(num_users)
print(R.shape)
K=3

#P와 Q 행렬의 크기를 지정하고 정규 분포를 가진 임의의 값으로 입력합니다.
np.random.seed(1) #random성 제어, 첫 번째 값을 1로
P = np.random.normal(scale=1./K, size=(num_users, K)) # np.random.normal()은 정규분포에서 샘플링 해옴. loc은 평균, scale은 표준편차
Q = np.random.normal(scale=1./K, size=(num_items, K))

4
(4, 5)


In [2]:
# get_rmse()함수는 실제 R행렬과 예측 행렬의 오차를 구하는 함수이다.
# 실제 R행렬의 널이 아닌 행렬 값의 위치 인덱스를 추출해 이 인덱스에 있는 실제 R 행렬 값과 분해된 P,Q를 이용해 다시 조합된 예측 행렬 값의 RMSE값을 반환

from sklearn.metrics import mean_squared_error

def get_rmse(R,P,Q,non_zeros) :
    error = 0
    #두 개의 분해된 행렬p와 Q.T의 내적으로 예측 R행렬 생성
    full_pred_matrix = np.dot(P,Q.T)
    
    #실제 R행렬에서 널이 아닌 값의 위치 인덱스 추출해 실제 R행렬과 예측 행렬의 RMSE추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros] #np.nonzero함수를 사용해서 요소들 중 0이 아닌 값들의 index를 반환해 주는 함수를 사용한 것.. 같다.
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    print(x_non_zero_ind)
    print(y_non_zero_ind)
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    
    #mse(평균제곱오차) : 오차의 제곱에 대한 평균을 취한 값으로 통계적 추정의 정확성의 질 평가, 작을수록 추정의 정확성이 높아짐
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    #rmse(root mse)
    rmse = np.sqrt(mse)
    
    return rmse

In [3]:
#SGD(경사하강법)을 기반으로 행렬 분해를 수행.

#R>0인 행 위치, 열 위치, 값을 non_zeros 리스트에 저장
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]
print("non_zeros = ", non_zeros)

#SGD를 반복해서 업데이트할 횟수
steps = 1000
#SGD의 학습률
learning_rate = 0.01
#L2 Regularization 계수
r_lambda=0.01

#SGD기법으로 P와 Q의 행렬을 계속 업데이트
for step in range(steps) :
    for i,j,r in non_zeros :
        #실제 값과 예측 값의 차이인 오류 값 구함
        eij = r -np.dot(P[i,:],Q[j,:].T)
        
        #Regularization을 반영한 SGD 업데이트 공식 적용
        P[i,:] = P[i,:] + learning_rate *(eij * Q[j,:] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate *(eij * P[i,:] - r_lambda*Q[j,:])
        
    rmse=get_rmse(R,P,Q,non_zeros)
    if(step%50) == 0 :
        print("### iteration step : ", step, "rmse : ", rmse)

non_zeros =  [(0, 0, 4.0), (0, 3, 2.0), (1, 1, 5.0), (1, 3, 3.0), (1, 4, 1.0), (2, 2, 3.0), (2, 3, 4.0), (2, 4, 4.0), (3, 0, 5.0), (3, 1, 2.0), (3, 2, 1.0), (3, 3, 2.0)]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
### iteration step :  0 rmse :  3.2388050277987723
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
### iteration step :  600 rmse :  0.01670132290188466
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3,

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]


In [4]:
pred_matrix = np.dot(P,Q.T)
print('예측 행렬:\n', np.round(pred_matrix,3))

예측 행렬:
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


### 1. 아이템 기반 - 최근접 이웃 협업 필터링

In [5]:
import pandas as pd
import numpy as np

In [121]:
import glob
from tqdm import tqdm
import dask.dataframe as dd

In [122]:
ratings = pd.read_csv('../DataSet/ratings_small/ratings_small.csv', low_memory=False)
movies = pd.read_csv('../DataSet/movies_metadata/movies_metadata.csv', low_memory=False)

print(ratings.shape)
print(movies.shape)

(100004, 4)
(45466, 24)


In [7]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [123]:
ratings = ratings[['userId', 'movieId', 'rating']] #timestamp는 사용하지 않음.
#ratings.astype('float')
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')  #행은 사용자, 열은 아이템이 오도록 재구성
print(ratings_matrix.shape)
ratings_matrix.head(3)  #희소행렬의 모습이 보임

(671, 9066)


movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


사용자가 평점을 매개지 않은 영화가 칼럼으로 변환되면서 NaN의 값으로 할당되었다. rating의 경우 0.5가 최소점 5.0이 최대임을 알 수 있다. 
이를 통해서 NaN은 이 값에 지장을 주지 않기 위해서 0으로 변환할 것이다.
그러나 칼럼명이 현재 movieId가 숫자로 할당되어 있어 사용자가 평점을 준 영화가 어떤 영환지 알 수 없다. 식별할 수 있도록 movies_metadata를 이용할 것이다.

In [11]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


### ratings 전처리과정

In [124]:
ratings.movieId = pd.to_numeric(ratings.movieId, errors = 'coerce')
ratings.userId = pd.to_numeric(ratings.userId, errors = 'coerce')
ratings.rating = pd.to_numeric(ratings.rating, errors = 'coerce')
len(ratings)
df = ratings
print(df)

        userId  movieId  rating
0            1       31     2.5
1            1     1029     3.0
2            1     1061     3.0
3            1     1129     2.0
4            1     1172     4.0
...        ...      ...     ...
99999      671     6268     2.5
100000     671     6269     4.0
100001     671     6365     4.0
100002     671     6385     2.5
100003     671     6565     3.5

[100004 rows x 3 columns]


결측치 처리 및 movie id를 count했을 때 개수에 대해 70퍼센트의 것들만 가져오고 적은 수의 평가가 있는 영화는 제외하겠습니다.

In [127]:
df = df[pd.notnull(df['rating'])]
f=['count','mean']
df_movie_summary = df.groupby('movieId')['rating'].agg(f)
#한번에 형변환 처리
df_movie_summary.index = df_movie_summary.index.map(int)
#quantile 사분위수
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index
print('Movie minimum times of review: {}'.format(movie_benchmark))

Movie minimum times of review: 7.0


In [129]:
#userId에 대해서 영화 리뷰가 너무 적은 것은 제외
df_cust_summary = df.groupby('userId')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index
print('Customer minimum times of review: {}'.format(cust_benchmark))

Customer minimum times of review: 138.0


In [130]:
#위에서 제외할 리스트들을 넣어주어 drop실행(df==ratings)
df = df[~df['movieId'].isin(drop_movie_list)]
df = df[~df['userId'].isin(drop_cust_list)]
df

Unnamed: 0,userId,movieId,rating
147,4,10,4.0
148,4,34,5.0
149,4,112,5.0
150,4,141,5.0
151,4,153,4.0
...,...,...,...
99686,665,5479,2.0
99688,665,5502,4.0
99689,665,5679,3.0
99690,665,5952,5.0


In [131]:
#(ratings_matrix)
df_p = pd.pivot_table(df,values='rating',index='userId',columns='movieId')
print(df_p)

movieId  1       2       3       4       5       6       7       9       \
userId                                                                    
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
15          2.0     2.0     NaN     NaN     4.5     4.0     NaN     NaN   
17          NaN     NaN     NaN     NaN     NaN     4.5     NaN     NaN   
19          3.0     3.0     3.0     3.0     NaN     3.0     3.0     3.0   
21          NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
...         ...     ...     ...     ...     ...     ...     ...     ...   
652         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
654         5.0     3.0     NaN     NaN     4.0     NaN     NaN     NaN   
659         NaN     NaN     NaN     NaN     NaN     3.0     NaN     NaN   
664         3.5     NaN     NaN     NaN     NaN     4.0     NaN     NaN   
665         NaN     3.0     3.0     NaN     3.0     NaN     NaN     NaN   

movieId  10      11     

### movies_metadata 전처리 과정

In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [14]:
movies.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [133]:
#meta == movies_small
meta = movies[['id', 'original_title', 'genres', 'popularity', 'original_language']]
meta = meta.rename(columns={'id':'movieId'})

#문자열 컬럼을 숫자형 컬럼으로 바꿔줌
meta.movieId = pd.to_numeric(meta.movieId, errors = 'coerce')
meta.popularity = pd.to_numeric(meta.popularity, errors = 'coerce')

print(meta.shape)
meta.head()

(45466, 5)


Unnamed: 0,movieId,original_title,genres,popularity,original_language
0,862.0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,en
1,8844.0,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",17.015539,en
2,15602.0,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",11.7129,en
3,31357.0,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.859495,en
4,11862.0,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",8.387519,en


In [134]:
#contains = meta['movieId'].str.contains("-|~")
#meta[contains]

AttributeError: Can only use .str accessor with string values!

In [151]:
meta.isna().sum() #처리 필요

movieId               3
original_title        0
genres                0
popularity            6
original_language    11
dtype: int64

In [154]:
upd_meta = meta.dropna(subset=['movieId','original_language'])
print(upd_meta.shape)
upd_meta.isna().sum()

(45452, 5)


movieId              0
original_title       0
genres               0
popularity           3
original_language    0
dtype: int64

위에서 보듯 이상한 데이터들이 조금 보입니다. 이는 형변환을 막게 될 것이므로 삭제하겠습니다.

In [110]:
#movies = movies.drop([19730,29503,35587])

In [112]:
#contains_ = movies['movieId'].str.contains("-|~")
#movies[contains]

  movies[contains]


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


삭제 완료

In [113]:
#movies_small=movies[['movieId', 'title']]

#형변환
#movies_small=movies_small.astype({'movieId':'float'})
#movies_small=movies_small.astype({'movieId':'int'})
#print(movies_small.shape)

(45463, 2)


In [159]:
#movieId로 조인하기(merge는 how=inner가 default이므로 영화중에 rating을 받은 것만 결과로 나오게 된다.)
rating_movies = pd.merge(df,upd_meta, on='movieId')
print(rating_movies)

#columns="title"로 title칼럼으로 피벗 수행
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='original_title')

#NaN값을 모두 0으로 변환(0.5가 최소값이므로 영향이 없는 수로 변환)
ratings_matrix = ratings_matrix.fillna(0)

print(ratings_matrix.shape)
ratings_matrix.head(3)

       userId  movieId  rating           original_title  \
0           4      112     5.0  Italiensk for begyndere   
1          15      112     2.5  Italiensk for begyndere   
2          19      112     3.0  Italiensk for begyndere   
3          21      112     4.0  Italiensk for begyndere   
4          73      112     3.5  Italiensk for begyndere   
...       ...      ...     ...                      ...   
27915     514      452     3.0                Idioterne   
27916     537      452     4.0                Idioterne   
27917     564      452     4.0                Idioterne   
27918     659      452     4.0                Idioterne   
27919     518      711     4.0        Finding Forrester   

                                                  genres  popularity  \
0      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    5.396508   
1      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    5.396508   
2      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    5.396508   
3  

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1.5,3.0,0.0,1.0,0.0,4.0,0.0,2.0,0.0,0.0,...,1.5,0.0,2.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0
17,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


### 코사인 유사도
두 벡터 간의 코사인 각도를 이용하여 구할 수 있는 두 백터의 유사도를 의미합니다. 직관적으로 이해하면 두 벡터가 가리키는 방향이 얼마나 유사한지를 의미하는 것으로 -1부터 1의 값이 나옵니다. -1의 경우 반대의 방향을 의미합니다.
코사인 유사도를 구하는 메서드를 사용하기 위해 사이킷런의 cosine_similarity()를 이용하겠습니다. 이 메서드는 행을 기준으로 서로 다른 행을 비교해서 유사도를 산출합니다. ratings_matrix는 userId가 기준인 행 레벨 데이터입니다. 그대로 메서드를 적용한다면 사용자 간의 유사도가 나오게 됩니다. 우리의 목적에 맞게 영화 간의 유사도를 산출할 수 있도록 transpose()를 사용해서 original_title이 기준이 되도록 해보겠습니다.

In [160]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head()

userId,4,15,17,19,21,22,23,26,30,33,...,627,641,646,647,648,652,654,659,664,665
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
...Più forte ragazzi!,0.0,1.5,0.5,0.0,0.0,3.0,5.0,0.0,4.0,0.0,...,3.5,0.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,3.0
10 Items or Less,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
12 + 1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
from sklearn.metrics.pairwise import cosine_similarity

item_similar = cosine_similarity(ratings_matrix_T,ratings_matrix_T)
item_similar

array([[1.        , 0.13582218, 0.2636923 , ..., 0.23655765, 0.25074757,
        0.16764382],
       [0.13582218, 1.        , 0.        , ..., 0.10889433, 0.19226394,
        0.19182581],
       [0.2636923 , 0.        , 1.        , ..., 0.        , 0.0522837 ,
        0.        ],
       ...,
       [0.23655765, 0.10889433, 0.        , ..., 1.        , 0.20661288,
        0.31624064],
       [0.25074757, 0.19226394, 0.0522837 , ..., 0.20661288, 1.        ,
        0.32292315],
       [0.16764382, 0.19182581, 0.        , ..., 0.31624064, 0.32292315,
        1.        ]])

In [162]:
#item_similar(넘파이 행렬)을 영화명에 매핑해서 DF로 변환
item_sim_df = pd.DataFrame(data=item_similar, index = ratings_matrix.columns, columns = ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

(1113, 1113)


original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
...Più forte ragazzi!,1.0,0.135822,0.263692,0.120818,0.171982,0.236732,0.394005,0.38125,0.306216,0.18622,...,0.451111,0.163723,0.222788,0.268742,0.17976,0.154875,0.264004,0.236558,0.250748,0.167644
10 Items or Less,0.135822,1.0,0.0,0.028023,0.0,0.06772,0.267863,0.120392,0.129064,0.0,...,0.140357,0.0,0.309476,0.117199,0.0,0.195237,0.265033,0.108894,0.192264,0.191826
10 Things I Hate About You,0.263692,0.0,1.0,0.049533,0.0,0.0,0.105031,0.095449,0.177437,0.129663,...,0.170746,0.0,0.083445,0.0,0.040397,0.095566,0.0,0.0,0.052284,0.0


In [163]:
item_sim_df["1984"].sort_values(ascending=False)[:6]

original_title
1984                                 1.000000
Lara Croft: Tomb Raider              0.725476
The Grudge 2                         0.651974
The Rolling Stones: Gimme Shelter    0.638886
L.A. Story                           0.598741
Teenage Mutant Ninja Turtles         0.575829
Name: 1984, dtype: float64

In [164]:
item_sim_df["Rocky III"].sort_values(ascending=False)[1:6]

original_title
Rocky V                    0.803035
Rocky IV                   0.738168
Sweet Sixteen              0.715339
The Discovery of Heaven    0.681300
Two Weeks Notice           0.595490
Name: Rocky III, dtype: float64

영화 조디악의 경우 상위 5개를 불러왔지만 유사도가 기본적으로 낮게 나오는 모습을 볼 수 있습니다. 이유가 무엇일지 알아봐야 할 필요가 있어 보입니다. 그리고 평가를 하기 위해서는 어떻게 바꿔야 할지 생각 해볼 필요가 있음.

## 아이템 기반 최근접 이웃 협업 필터링
앞에서는 영화의 유사도를 생성하여 영화를 추천했습니다. 이는 개인적인 취향을 반영하지 않고 영화 간의 유사도만을 가지고 추천한 것입니다. 이번에는 영화 유사도 데이터를 이용해 최근접 이웃 협업 필터링으로 개인에게 최적화된 영화 추천을 구현해보겠습니다. 개인화된 영화 추천의 가장 큰 특징은 아직 관람하지 않은 영화를 추천한다는 것입니다. 아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이터를 기반으로 해, 새롭게 모든 영화의 예측 평점을 계산한 후에 높은 예측 평점을 가진 영화를 추천하는 방식입니다.

In [165]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred=ratings_arr.dot(item_sim_arr)/np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

In [166]:
ratings_pred = predict_rating(ratings_matrix.values,item_sim_df.values)  #df.values는 numpy array로 변환해서 row를 묶어 반환
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.600986,0.412774,0.551024,0.635409,0.522853,0.574602,0.603745,0.597341,0.582198,0.698483,...,0.595059,0.450856,0.567788,0.461517,0.536166,0.57477,0.494891,0.51456,0.554305,0.494849
15,1.607318,1.771052,1.409177,1.262935,1.088924,1.605284,1.488737,1.506685,1.338929,1.343862,...,1.607,1.350031,1.468574,1.821281,1.393528,1.234661,1.598563,1.353772,1.237474,1.345447
17,0.575904,0.599487,0.446195,0.262593,0.364301,0.516698,0.546777,0.503308,0.48561,0.425758,...,0.542726,0.483557,0.482046,0.572384,0.638878,0.441777,0.566861,0.532056,0.398699,0.495836


예측 평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적한 값이기 때문에 기존에 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생합니다. 예측 평점이 실제 평점에 비해 작을 수 있습니다. 이는 내적 결과를 코사인 유사도 벡터 합으로 나누었기 때문에 생기는 현상입니다.
예측 평가 지표로 MSE를 사용하여 실제 평점과 얼마나 차이가 있는지 확인해보겠습니다. 결측치를 0으로 점수를 부여했습니다. 평점을 주지 않아 0으로 세팅된 영화 유사도에 기반해서 예측된 점수를 평가해야 하므로 실제로 0.5와 5.0사이의 값을 받은 영화에 대해서만 오차 정도를 측정하겠습니다.

In [168]:
from sklearn.metrics import mean_squared_error

def get_mse(pred,actual):
    pred=pred[actual.nonzero()].flatten()
    actual=actual[actual.nonzero()].flatten()
    return mean_squared_error(pred,actual)
#flatten()은 모든 차원의 array를 1차로 바꿔줌.

print('아이템 기반 모든 최근접 이웃 MSE : ',get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 최근접 이웃 MSE :  7.838968867414304


이제 이 오차를 줄여야 합니다. 앞의 predict_rating()함수는 해당 영화와 다른 모든 영화와의 유사도 벡터를 적용했으므로 영화의 개수가 많아 상대적으로 평점 예측이 떨어집니다. 특정 영화와 가장 비슷한 유사도를 가지는 영화에 대해서만 유사도 벡터를 적용하는 함수로 변경하겠습니다.

In [169]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20) : #top-n 유사도
    #사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred=np.zeros(ratings_arr.shape)
    
    #사용자-아이템 평점 행렬의 열 크기만큼 루프 수행
    for col in range(ratings_arr.shape[1]) :
        #유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 인덱스 반환
        top_n_items = [np.argsort(item_sim_arr[:,col][:-n-1:-1])]
        #개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]) :
            pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
            
            pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
        
    return pred

In [28]:
item_sim_df.describe()

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
count,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,...,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0
mean,0.041523,0.035513,0.047828,0.094189,0.086282,0.070016,0.041055,0.022872,0.011733,0.004199,...,0.081169,0.094189,0.075681,0.104087,0.091184,0.067819,0.094189,0.073542,0.010826,0.040299
std,0.111495,0.10678,0.130566,0.219805,0.184091,0.103784,0.081966,0.077118,0.068605,0.043023,...,0.109991,0.219805,0.154236,0.1929,0.120973,0.081345,0.219805,0.098624,0.065735,0.133307
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012473,0.04295,0.0,0.00792,0.0,0.0
75%,0.0,0.0,0.0,0.061076,0.08806,0.112835,0.055797,0.0,0.0,0.0,...,0.146209,0.061076,0.087615,0.136493,0.165345,0.117944,0.061076,0.133383,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [171]:
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=20)

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


In [172]:
ratings_pred.shape

(202, 1113)

In [173]:
#NaN은 없는 것으로 확인되나 infinite값이 있어 해결중
#x1 = np.isfinite(ratings_pred)
#x1

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [174]:
#np.where(x1==False)

(array([], dtype=int64), array([], dtype=int64))

In [57]:
#x2 = np.isfinite(ratings_matrix.values)
#x2

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [58]:
#np.where(x2==False) #ratings_matrix는 당연히.. 무한값은 없다

(array([], dtype=int64), array([], dtype=int64))

In [69]:
#무한값 결정해주기
#np.isinf(ratings_pred)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [70]:
#np.where(np.isinf(ratings_pred)) #없..다?

(array([], dtype=int64), array([], dtype=int64))

In [71]:
#결측치는?
#np.where(np.isnan(ratings_pred)) # 이건 또 왜나와..

(array([  0,   0,   0, ..., 670, 670, 670], dtype=int64),
 array([  28,   30,   33, ..., 2779, 2780, 2792], dtype=int64))

In [117]:
# 값을 먼저 보고 NaN값을 뭘로 대체할지 한번 보자
#print(ratings_pred)
#print(ratings_pred.max())
#print(ratings_pred.min())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.19443802 0.18815014 0.08974532 ... 0.1904832  0.         0.60408816]
 [0.         0.         0.         ... 0.09237598 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.11666281 0.06902584 0.05384719 ... 0.08384381 0.         0.20199738]
 [0.19841133 0.17710295 0.18289535 ... 0.09757325 0.40774844 0.36279213]]
5.000000000000001
0.0


In [80]:
#평균값으로 결정
#ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )

In [72]:
#np.where(np.isinf(ratings_matrix))

(array([], dtype=int64), array([], dtype=int64))

In [73]:
#np.where(np.isnan(ratings_matrix))

(array([], dtype=int64), array([], dtype=int64))

In [175]:
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

아이템 기반 인접 TOP-20 이웃 MSE:  7.2294626903832055


원래 더 낮게 나와야하는데 predict을 한 후 결측치가 생기고 이를 처리하는 과정에서 오차율이 더 높아졌습니다. N을 더 높여서 테스트해보겠습니다.

In [182]:
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=40)
#ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-40 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-40 이웃 MSE:  7.330725237828074


In [177]:
#이번엔 10
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=10)
#ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-10 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-10 이웃 MSE:  8.088476106846011


n이 낮아질수록 오차가 올라가고 있다.

In [181]:
#이번엔 100
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=100)
#ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-100 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-100 이웃 MSE:  7.647356094606713


n이 40에 비해서 오차율이 더 올라갔습니다.

In [183]:
#n=20으로 다시 저장
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=20)
#ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-20 이웃 MSE:  7.2294626903832055


In [184]:
# 계산된 예측 평점 데이터는 DataFrame으로 재생성(n=20)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.191017,0.318609,0.112902,0.022123,0.135047,0.178339,0.536466,0.134087,0.185998,0.081452,...,0.160203,0.258438,0.193002,0.241174,0.388571,0.225998,0.207174,0.317493,0.120991,0.251980
15,1.739182,1.993753,1.116478,1.715129,1.034918,1.984736,1.496901,1.581750,0.985641,0.906718,...,1.690220,1.685933,1.462063,2.097257,1.517784,1.201975,1.510456,1.336638,1.279895,1.104599
17,0.696663,0.227148,0.946727,0.319768,0.362056,0.286194,0.542344,0.399042,0.241727,0.397720,...,0.584377,0.397869,0.501297,0.933461,0.727634,0.468791,0.359893,0.314368,0.347012,0.227859
19,0.471175,0.436296,0.176260,0.319449,0.342821,0.352811,0.440380,0.559142,0.418646,0.344353,...,0.550160,0.537076,0.412129,0.367800,0.333808,0.480430,0.497465,0.507235,0.498980,0.468155
21,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,0.487271,0.261881,0.231242,2.360800,0.644233,1.951225,0.477998,0.478884,0.517801,0.317891,...,0.355968,0.767570,0.640306,0.442952,0.442988,0.490558,0.625835,0.586396,0.592406,0.556648
654,2.327084,1.035609,1.483953,1.037159,0.852834,1.060513,1.481309,2.365403,1.183751,2.728009,...,2.269645,1.596130,1.499693,2.009248,1.291221,1.099297,1.441187,1.331919,1.546110,1.049877
659,0.752375,0.531280,0.256475,0.408198,0.411552,0.484432,0.663210,1.126484,0.520079,0.510286,...,0.898432,1.061380,0.563235,0.668215,0.485516,0.460066,0.707125,0.672557,0.660830,0.519335
664,0.524844,0.169036,0.724330,0.245044,0.265014,0.210069,0.421155,0.311283,0.170656,0.304736,...,0.454669,0.291818,0.389325,0.741903,0.570152,0.352771,0.264507,0.230421,0.257181,0.163593


이제 특정 사용자에 대해 영화를 추천해보겠습니다.(userid=15인 사람에게 추천) 먼저 어떤 영화를 좋아하는지 보기 위해서 사용자의 평점을 높은순으로 나열.

In [187]:
ratings_matrix

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1.5,3.0,0.0,1.0,0.0,4.0,0.0,2.0,0.0,0.0,...,1.5,0.0,2.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0
17,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
654,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,4.0,...,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
user_rating_id = ratings_matrix.loc[15, :]
user_rating_id[ user_rating_id > 0].sort_values(ascending=False)[:10]

original_title
Men in Black II                 5.0
The Man with the Golden Arm     5.0
Mr. Smith Goes to Washington    5.0
And Then There Were None        5.0
Astérix aux Jeux Olympiques     5.0
This Is 40                      5.0
Horrible Bosses                 5.0
Basquiat                        5.0
Deux frères                     5.0
The Projected Man               5.0
Name: 15, dtype: float64

제가 아는 영화는 맨인블랙정도인데 SF영화의 성격들을 선호하지 않는가 싶습니다.
이 사용자에게 아이템 기반 협업 필터링을 통해 영화를 추천해보겠습니다.

In [189]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [191]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies
    
# 사용자가 관람하지 않는 영화명 추출   
unseen_list = get_unseen_movies(ratings_matrix, 15)

In [192]:
# 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천 
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 15, unseen_list, top_n=10)

In [193]:
# 평점 데이타를 DataFrame으로 생성. 
recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
original_title,Unnamed: 1_level_1
The Halliday Brand,3.887316
Wuthering Heights,3.304914
Stitches,3.004306
Vesničko má středisková,2.86778
Lord of Illusions,2.743384
Ot 180 i Vyshe,2.683705
Atlantis: Milo's Return,2.595453
Tomorrow Never Dies,2.580165
The Tooth Fairy,2.543549
North by Northwest,2.51462


# 행렬분해를 이용한 잠재요인 협업 필터링

In [196]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    # 두개의 분해된 행렬 P와 Q.T의 내적 곱으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    # 실제 R 행렬에서 널이 아닌 값의 위치 인덱스 추출하여 실제 R 행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [195]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    # P와 Q 매트릭스의 크기를 지정하고 정규분포를 가진 랜덤한 값으로 입력합니다. 
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0
       
    # R > 0 인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 저장. 
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]
   
    # SGD기법으로 P와 Q 매트릭스를 계속 업데이트. 
    for step in range(steps):
        for i, j, r in non_zeros:
            # 실제 값과 예측 값의 차이인 오류 값 구함
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # Regularization을 반영한 SGD 업데이트 공식 적용
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])
       
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0 :
            print("### iteration step : ", step," rmse : ", rmse)
            
    return P, Q

In [198]:
#위에서 만든 rating_matrix 그대로 사용

P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)
pred_matrix = np.dot(P, Q.T)

### iteration step :  0  rmse :  3.4647302484655094
### iteration step :  10  rmse :  0.7880894383861908
### iteration step :  20  rmse :  0.6249785911066269
### iteration step :  30  rmse :  0.44420440280355133
### iteration step :  40  rmse :  0.3233129746717334
### iteration step :  50  rmse :  0.25303563133048645
### iteration step :  60  rmse :  0.21088864530306578
### iteration step :  70  rmse :  0.18372511923781362
### iteration step :  80  rmse :  0.1649953117041632
### iteration step :  90  rmse :  0.15141442381942236
### iteration step :  100  rmse :  0.14118341767210854
### iteration step :  110  rmse :  0.13323490747702219
### iteration step :  120  rmse :  0.12690079812572003
### iteration step :  130  rmse :  0.12174618013660547
### iteration step :  140  rmse :  0.11747799043483685
### iteration step :  150  rmse :  0.11389223788825312
### iteration step :  160  rmse :  0.11084263666197429
### iteration step :  170  rmse :  0.10822148902899847
### iteration step :  180 

In [199]:
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,3.329265,5.322346,3.004986,3.094407,2.344547,5.360562,3.099452,4.076202,4.549152,4.496541,...,4.296037,2.932863,4.514137,3.850993,3.988845,3.781286,4.189713,4.067219,4.048085,4.826816
15,1.435404,3.001976,1.769875,1.000734,1.916531,3.9733,2.437361,2.013053,3.086206,3.166524,...,1.583277,1.925818,1.990993,2.024934,2.91649,2.924575,3.013902,3.03693,1.408105,3.502764
17,0.668335,3.576109,1.920631,1.216108,1.107525,4.156899,2.624392,3.70246,3.265211,3.381696,...,2.452833,3.550034,3.988167,2.991019,2.039528,2.312803,3.95413,3.6266,2.252287,4.036779


In [200]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [201]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [204]:
# 사용자가 관람하지 않는 영화명 추출   
unseen_list = get_unseen_movies(ratings_matrix, 15)

# 잠재 요인 협업 필터링으로 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 15, unseen_list, top_n=10)

# 평점 데이타를 DataFrame으로 생성. 
recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
original_title,Unnamed: 1_level_1
Yella,4.53207
Frankenstein,4.256326
Pirates of the Caribbean: Dead Man's Chest,4.046469
Un éléphant ça trompe énormément,4.017994
The Dreamers,3.991105
Totally Blonde,3.931061
Solar Crisis,3.899069
Sweet Sixteen,3.886358
Alien,3.817802
Once in a Lifetime: The Extraordinary Story of the New York Cosmos,3.74561


아이템 기반 협업 필터링의 결과와 달라진 모습들이 보입니다. 같은 user에 대해서 추천했지만 아이템 기반에서는 고전과 거친 느낌을, 잠재요인 협업 필터링에서는 오래된 영화이면서 공포적인 요소들이 있는 영화들을 추천했습니다.