## 추천시스템이 왜 필요한가?

## 우리 조의 최종 목표

## 기존의 해결방법

### 1. 콘텐츠 기반 필터링(Content based filtering)



### 2. 협업 필터링(Collaborative Filtering)

#1. 최근접 이웃(Nearest Neighbor) 협업 필터링



#2. 잠재 요인(Latent Factor) 협업 필터링




## 행렬분해 연습

In [1]:
import numpy as np

#원본 행렬 R생성, 분해 행렬 P와 Q의 초기화, 잠재 요인 차원K는 3으로 설정.
R = np.array([[4,np.NaN,np.NaN,2,np.NaN],
             [np.NaN, 5, np.NaN, 3, 1],
             [np.NaN, np.NaN, 3, 4, 4],
             [5,2,1,2,np.NaN]])
num_users, num_items = R.shape
print(num_users)
print(R.shape)
K=3

#P와 Q 행렬의 크기를 지정하고 정규 분포를 가진 임의의 값으로 입력합니다.
np.random.seed(1) #random성 제어, 첫 번째 값을 1로
P = np.random.normal(scale=1./K, size=(num_users, K)) # np.random.normal()은 정규분포에서 샘플링 해옴. loc은 평균, scale은 표준편차
Q = np.random.normal(scale=1./K, size=(num_items, K))

4
(4, 5)


In [2]:
# get_rmse()함수는 실제 R행렬과 예측 행렬의 오차를 구하는 함수이다.
# 실제 R행렬의 널이 아닌 행렬 값의 위치 인덱스를 추출해 이 인덱스에 있는 실제 R 행렬 값과 분해된 P,Q를 이용해 다시 조합된 예측 행렬 값의 RMSE값을 반환

from sklearn.metrics import mean_squared_error

def get_rmse(R,P,Q,non_zeros) :
    error = 0
    #두 개의 분해된 행렬p와 Q.T의 내적으로 예측 R행렬 생성
    full_pred_matrix = np.dot(P,Q.T)
    
    #실제 R행렬에서 널이 아닌 값의 위치 인덱스 추출해 실제 R행렬과 예측 행렬의 RMSE추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros] #np.nonzero함수를 사용해서 요소들 중 0이 아닌 값들의 index를 반환해 주는 함수를 사용한 것.. 같다.
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    print(x_non_zero_ind)
    print(y_non_zero_ind)
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    
    #mse(평균제곱오차) : 오차의 제곱에 대한 평균을 취한 값으로 통계적 추정의 정확성의 질 평가, 작을수록 추정의 정확성이 높아짐
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    #rmse(root mse)
    rmse = np.sqrt(mse)
    
    return rmse

In [3]:
#SGD(경사하강법)을 기반으로 행렬 분해를 수행.

#R>0인 행 위치, 열 위치, 값을 non_zeros 리스트에 저장
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]
print("non_zeros = ", non_zeros)

#SGD를 반복해서 업데이트할 횟수
steps = 1000
#SGD의 학습률
learning_rate = 0.01
#L2 Regularization 계수
r_lambda=0.01

#SGD기법으로 P와 Q의 행렬을 계속 업데이트
for step in range(steps) :
    for i,j,r in non_zeros :
        #실제 값과 예측 값의 차이인 오류 값 구함
        eij = r -np.dot(P[i,:],Q[j,:].T)
        
        #Regularization을 반영한 SGD 업데이트 공식 적용
        P[i,:] = P[i,:] + learning_rate *(eij * Q[j,:] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate *(eij * P[i,:] - r_lambda*Q[j,:])
        
    rmse=get_rmse(R,P,Q,non_zeros)
    if(step%50) == 0 :
        print("### iteration step : ", step, "rmse : ", rmse)

non_zeros =  [(0, 0, 4.0), (0, 3, 2.0), (1, 1, 5.0), (1, 3, 3.0), (1, 4, 1.0), (2, 2, 3.0), (2, 3, 4.0), (2, 4, 4.0), (3, 0, 5.0), (3, 1, 2.0), (3, 2, 1.0), (3, 3, 2.0)]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
### iteration step :  0 rmse :  3.2388050277987723
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
### iteration step :  600 rmse :  0.01670132290188466
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3,

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[

[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
[0, 3, 1, 3, 4, 2, 3, 4, 0, 1, 2, 3]


In [4]:
pred_matrix = np.dot(P,Q.T)
print('예측 행렬:\n', np.round(pred_matrix,3))

예측 행렬:
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


### 1. 아이템 기반 - 최근접 이웃 협업 필터링

In [5]:
import pandas as pd
import numpy as np

In [102]:
ratings = pd.read_csv('../DataSet/ratings_small/ratings_small.csv', low_memory=False)
movies = pd.read_csv('../DataSet/movies_metadata/movies_metadata.csv', low_memory=False)

print(ratings.shape)
print(movies.shape)

(100004, 4)
(45466, 24)


In [7]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [10]:
ratings = ratings[['userId', 'movieId', 'rating']] #timestamp는 사용하지 않음.
#ratings.astype('float')
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')  #행은 사용자, 열은 아이템이 오도록 재구성
print(ratings_matrix.shape)
ratings_matrix.head(3)  #희소행렬의 모습이 보임

(671, 9066)


movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [11]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


사용자가 평점을 매개지 않은 영화가 칼럼으로 변환되면서 NaN의 값으로 할당되었다. rating의 경우 0.5가 최소점 5.0이 최대임을 알 수 있다. 
이를 통해서 NaN은 이 값에 지장을 주지 않기 위해서 0으로 변환할 것이다.
그러나 칼럼명이 현재 movieId가 숫자로 할당되어 있어 사용자가 평점을 준 영화가 어떤 영환지 알 수 없다. 식별할 수 있도록 movies_metadata를 이용할 것이다.

In [12]:
print(movies.shape)
movies.head(3)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [14]:
movies.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [103]:
#movies_metadata 정리부터
movies.rename(columns={'id':'movieId'}, inplace=True)  #칼럼명 맞추기
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [105]:
contains = movies['movieId'].str.contains("-|~")
movies[contains]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [108]:
movies['movieId'].isnull().sum()

0

위에서 보듯 이상한 데이터들이 조금 보입니다. 이는 형변환을 막게 될 것이므로 삭제하겠습니다.

In [110]:
movies = movies.drop([19730,29503,35587])

In [112]:
contains_ = movies['movieId'].str.contains("-|~")
movies[contains]

  movies[contains]


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


삭제 완료

In [113]:
movies_small=movies[['movieId', 'title']]

#형변환
#movies_small=movies_small.astype({'movieId':'float'})
movies_small=movies_small.astype({'movieId':'int'})
print(movies_small.shape)

(45463, 2)


In [18]:
#movieId로 조인하기(merge는 how=inner가 default이므로 영화중에 rating을 받은 것만 결과로 나오게 된다.)
rating_movies = pd.merge(ratings,movies_small, on='movieId')
print(rating_movies)

#columns="title"로 title칼럼으로 피벗 수행
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')

#NaN값을 모두 0으로 변환(0.5가 최소값이므로 영향이 없는 수로 변환)
ratings_matrix = ratings_matrix.fillna(0)

print(ratings_matrix.shape)
ratings_matrix.head(3)

       userId  movieId  rating              title
0           1     1371     2.5          Rocky III
1           4     1371     4.0          Rocky III
2           7     1371     3.0          Rocky III
3          19     1371     4.0          Rocky III
4          21     1371     3.0          Rocky III
...       ...      ...     ...                ...
44989     652   129009     4.0     Love Is a Ball
44990     653     2103     3.0            Solaris
44991     659      167     4.0              K-PAX
44992     659      563     3.0  Starship Troopers
44993     665      129     3.0      Spirited Away

[44994 rows x 4 columns]
(671, 2794)


title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 코사인 유사도
두 벡터 간의 코사인 각도를 이용하여 구할 수 있는 두 백터의 유사도를 의미합니다. 직관적으로 이해하면 두 벡터가 가리키는 방향이 얼마나 유사한지를 의미하는 것으로 -1부터 1의 값이 나옵니다. -1의 경우 반대의 방향을 의미합니다.
코사인 유사도를 구하는 메서드를 사용하기 위해 사이킷런의 cosine_similarity()를 이용하겠습니다. 이 메서드는 행을 기준으로 서로 다른 행을 비교해서 유사도를 산출합니다. ratings_matrix는 userId가 기준인 행 레벨 데이터입니다. 그대로 메서드를 적용한다면 사용자 간의 유사도가 나오게 됩니다. 우리의 목적에 맞게 영화 간의 유사도를 산출할 수 있도록 transpose()를 사용해서 original_title이 기준이 되도록 해보겠습니다.

In [19]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Gator Bait,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Twas the Night Before Christmas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And God Created Woman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00 Schneider - Jagd auf Nihil Baxter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

item_similar = cosine_similarity(ratings_matrix_T,ratings_matrix_T)
item_similar

array([[1.        , 0.        , 0.51370361, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.03818018, 0.        ,
        0.        ],
       [0.51370361, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03818018, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [21]:
#item_similar(넘파이 행렬)을 영화명에 매핑해서 DF로 변환
item_sim_df = pd.DataFrame(data=item_similar, index = ratings_matrix.columns, columns = ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

(2794, 2794)


title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,1.0,0.0,0.513704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.118456,0.0,0.0,0.0,0.0
'Gator Bait,0.0,1.0,0.0,0.0,0.0,0.20739,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.182018,0.0,0.03818,0.0,0.0
'Twas the Night Before Christmas,0.513704,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
item_sim_df["Zombie Holocaust"].sort_values(ascending=False)[:6]

title
Zombie Holocaust            1.000000
Doppelganger                0.891067
Germany Year Zero           0.887148
A Woman Is a Woman          0.812600
Far Away                    0.742201
The Legend of Suriyothai    0.728811
Name: Zombie Holocaust, dtype: float64

In [114]:
item_sim_df["Rocky III"].sort_values(ascending=False)[1:6]

title
Rocky V                    0.642748
Rocky IV                   0.556627
The Discovery of Heaven    0.554050
Sweet Sixteen              0.552371
Blood Diamond              0.466559
Name: Rocky III, dtype: float64

영화 조디악의 경우 상위 5개를 불러왔지만 유사도가 기본적으로 낮게 나오는 모습을 볼 수 있습니다. 이유가 무엇일지 알아봐야 할 필요가 있어 보입니다. 그리고 평가를 하기 위해서는 어떻게 바꿔야 할지 생각 해볼 필요가 있음.

## 아이템 기반 최근접 이웃 협업 필터링
앞에서는 영화의 유사도를 생성하여 영화를 추천했습니다. 이는 개인적인 취향을 반영하지 않고 영화 간의 유사도만을 가지고 추천한 것입니다. 이번에는 영화 유사도 데이터를 이용해 최근접 이웃 협업 필터링으로 개인에게 최적화된 영화 추천을 구현해보겠습니다. 개인화된 영화 추천의 가장 큰 특징은 아직 관람하지 않은 영화를 추천한다는 것입니다. 아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이터를 기반으로 해, 새롭게 모든 영화의 예측 평점을 계산한 후에 높은 예측 평점을 가진 영화를 추천하는 방식입니다.

In [24]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred=ratings_arr.dot(item_sim_arr)/np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

In [89]:
ratings_pred = predict_rating(ratings_matrix.values,item_sim_df.values)  #df.values는 numpy array로 변환해서 row를 묶어 반환
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.009557,0.013195,0.012196,0.002702,0.003641,0.004884,0.009138,0.006112,0.0,0.0,...,0.009424,0.002702,0.002797,0.004952,0.004044,0.007195,0.002702,0.010546,0.013637,0.011634
2,0.062819,0.077315,0.06577,0.044663,0.041264,0.071216,0.166476,0.101221,0.059825,0.334201,...,0.128946,0.044663,0.058667,0.049208,0.077035,0.100981,0.044663,0.107261,0.05838,0.068548
3,0.052401,0.047788,0.060794,0.01763,0.019313,0.035306,0.088865,0.050562,0.029333,0.126047,...,0.044671,0.01763,0.023071,0.020874,0.040708,0.065095,0.01763,0.055708,0.140506,0.035313


예측 평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적한 값이기 때문에 기존에 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생합니다. 예측 평점이 실제 평점에 비해 작을 수 있습니다. 이는 내적 결과를 코사인 유사도 벡터 합으로 나누었기 때문에 생기는 현상입니다.
예측 평가 지표로 MSE를 사용하여 실제 평점과 얼마나 차이가 있는지 확인해보겠습니다. 결측치를 0으로 점수를 부여했습니다. 평점을 주지 않아 0으로 세팅된 영화 유사도에 기반해서 예측된 점수를 평가해야 하므로 실제로 0.5와 5.0사이의 값을 받은 영화에 대해서만 오차 정도를 측정하겠습니다.

In [90]:
np.where(np.isnan(ratings_pred))

(array([], dtype=int64), array([], dtype=int64))

In [91]:
from sklearn.metrics import mean_squared_error

def get_mse(pred,actual):
    pred=pred[actual.nonzero()].flatten()
    actual=actual[actual.nonzero()].flatten()
    return mean_squared_error(pred,actual)
#flatten()은 모든 차원의 array를 1차로 바꿔줌.

print('아이템 기반 모든 최근접 이웃 MSE : ',get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 최근접 이웃 MSE :  10.083700027356814


이제 이 오차를 줄여야 합니다. 앞의 predict_rating()함수는 해당 영화와 다른 모든 영화와의 유사도 벡터를 적용했으므로 영화의 개수가 많아 상대적으로 평점 예측이 떨어집니다. 특정 영화와 가장 비슷한 유사도를 가지는 영화에 대해서만 유사도 벡터를 적용하는 함수로 변경하겠습니다.

In [27]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20) : #top-n 유사도
    #사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred=np.zeros(ratings_arr.shape)
    
    #사용자-아이템 평점 행렬의 열 크기만큼 루프 수행
    for col in range(ratings_arr.shape[1]) :
        #유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 인덱스 반환
        top_n_items = [np.argsort(item_sim_arr[:,col][:-n-1:-1])]
        #개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]) :
            pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
            
            pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
        
    return pred

In [28]:
item_sim_df.describe()

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
count,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,...,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0
mean,0.041523,0.035513,0.047828,0.094189,0.086282,0.070016,0.041055,0.022872,0.011733,0.004199,...,0.081169,0.094189,0.075681,0.104087,0.091184,0.067819,0.094189,0.073542,0.010826,0.040299
std,0.111495,0.10678,0.130566,0.219805,0.184091,0.103784,0.081966,0.077118,0.068605,0.043023,...,0.109991,0.219805,0.154236,0.1929,0.120973,0.081345,0.219805,0.098624,0.065735,0.133307
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012473,0.04295,0.0,0.00792,0.0,0.0
75%,0.0,0.0,0.0,0.061076,0.08806,0.112835,0.055797,0.0,0.0,0.0,...,0.146209,0.061076,0.087615,0.136493,0.165345,0.117944,0.061076,0.133383,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
item_sim_df.isnull().any()

title
!Women Art Revolution                   False
'Gator Bait                             False
'Twas the Night Before Christmas        False
...And God Created Woman                False
00 Schneider - Jagd auf Nihil Baxter    False
                                        ...  
xXx                                     False
¡Three Amigos!                          False
À nos amours                            False
Ödipussi                                False
Şaban Oğlu Şaban                        False
Length: 2794, dtype: bool

In [88]:
ratings_matrix.values.dtype

dtype('float64')

In [79]:
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df_text.values, n=20)

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


In [87]:
ratings_pred.dtype

dtype('float64')

In [40]:
ratings_pred.shape

(671, 2794)

In [44]:
ratings_pred_doub = ratings_pred.astype(np.complex64)

In [45]:
ratings_pred_doub.dtype

dtype('complex64')

In [56]:
#NaN은 없는 것으로 확인되나 infinite값이 있어 해결중
x1 = np.isfinite(ratings_pred)
x1

array([[ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True, False,  True],
       ...,
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True, False,  True]])

In [63]:
np.where(x1==False)

(array([  0,   0,   0, ..., 670, 670, 670], dtype=int64),
 array([  28,   30,   33, ..., 2779, 2780, 2792], dtype=int64))

In [57]:
x2 = np.isfinite(ratings_matrix.values)
x2

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [58]:
np.where(x2==False) #ratings_matrix는 당연히.. 무한값은 없다

(array([], dtype=int64), array([], dtype=int64))

In [69]:
#무한값 결정해주기
np.isinf(ratings_pred)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [70]:
np.where(np.isinf(ratings_pred)) #없..다?

(array([], dtype=int64), array([], dtype=int64))

In [71]:
#결측치는?
np.where(np.isnan(ratings_pred)) # 이건 또 왜나와..

(array([  0,   0,   0, ..., 670, 670, 670], dtype=int64),
 array([  28,   30,   33, ..., 2779, 2780, 2792], dtype=int64))

In [117]:
# 값을 먼저 보고 NaN값을 뭘로 대체할지 한번 보자
print(ratings_pred)
print(ratings_pred.max())
print(ratings_pred.min())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.19443802 0.18815014 0.08974532 ... 0.1904832  0.         0.60408816]
 [0.         0.         0.         ... 0.09237598 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.11666281 0.06902584 0.05384719 ... 0.08384381 0.         0.20199738]
 [0.19841133 0.17710295 0.18289535 ... 0.09757325 0.40774844 0.36279213]]
5.000000000000001
0.0


In [80]:
#평균값으로 결정
ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )

In [72]:
np.where(np.isinf(ratings_matrix))

(array([], dtype=int64), array([], dtype=int64))

In [73]:
np.where(np.isnan(ratings_matrix))

(array([], dtype=int64), array([], dtype=int64))

In [81]:
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

아이템 기반 인접 TOP-20 이웃 MSE:  11.198950505653512


원래 더 낮게 나와야하는데 predict을 한 후 결측치가 생기고 이를 처리하는 과정에서 오차율이 더 높아졌습니다. N을 더 높여서 테스트해보겠습니다.

In [92]:
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df_text.values, n=40)
ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-40 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-40 이웃 MSE:  9.51435187837491


In [83]:
#이번엔 10
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df_text.values, n=10)
ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-40 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-40 이웃 MSE:  12.957392552538005


n이 낮아질수록 오차가 올라가고 있다.

In [84]:
#이번엔 100
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df_text.values, n=100)
ratings_pred=np.nan_to_num(ratings_pred, nan=np.nanmean(ratings_pred) )
print('아이템 기반 인접 TOP-100 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-100 이웃 MSE:  9.749952233171873


In [101]:
#이번엔 40
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df_text.values, n=50)
ratings_pred=np.nan_to_num(ratings_pred, nan=0 )
print('아이템 기반 인접 TOP-100 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values ))

  pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
  pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-100 이웃 MSE:  9.684453719357604


n이 40에 비해서 오차율이 더 올라갔습니다.

In [94]:
# 계산된 예측 평점 데이터는 DataFrame으로 재생성(n=40)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.224192,0.210559,0.105975,0.121282,0.104295,0.232328,0.217078,0.299323,0.0,0.171682,...,0.465011,0.121282,0.141387,0.160977,0.262873,0.352887,0.121282,0.241769,0.000000,0.733436
3,0.000000,0.000000,0.000000,0.168366,0.144784,0.143299,0.082734,0.160364,0.0,0.000000,...,0.318201,0.168366,0.226100,0.145516,0.100595,0.118091,0.168366,0.117247,0.000000,0.000000
4,0.093153,0.000000,0.110082,0.110235,0.094795,0.179432,0.094795,0.139994,0.0,0.000000,...,0.117088,0.110235,0.087724,0.095274,0.233537,0.142336,0.110235,0.245650,0.000000,0.577916
5,0.179354,0.102996,0.084780,0.097025,0.083436,0.131794,0.089396,0.168986,0.0,0.137346,...,0.186037,0.097025,0.057909,0.128782,0.159898,0.212280,0.097025,0.141891,0.000000,0.326999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.179354,0.102996,0.084780,0.097025,0.083436,0.131794,0.089396,0.168986,0.0,0.137346,...,0.186037,0.097025,0.057909,0.128782,0.159898,0.212280,0.097025,0.141891,0.000000,0.326999
668,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
669,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
670,0.134515,0.077247,0.063585,0.072769,0.062577,0.098846,0.067047,0.126740,0.0,0.103009,...,0.139528,0.072769,0.043432,0.096586,0.119923,0.159210,0.072769,0.106418,0.000000,0.245249


이제 특정 사용자에 대해 영화를 추천해보겠습니다.(userid=9인 사람에게 추천) 먼저 어떤 영화를 좋아하는지 보기 위해서 사용자의 평점을 높은순으로 나열.

In [95]:
user_rating_id = ratings_matrix.loc[9, :]
user_rating_id[ user_rating_id > 0].sort_values(ascending=False)[:10]

title
The Hidden Fortress         5.0
Men in Black II             5.0
Mothra vs. Godzilla         5.0
Once Were Warriors          5.0
Terminator Salvation        5.0
A Brief History of Time     4.0
Solaris                     4.0
Wedlock                     4.0
The Million Dollar Hotel    4.0
The Green Mile              4.0
Name: 9, dtype: float64

제가 아는 영화는 맨인블랙 모쓰라 vs 고질라 정도인데 SF영화의 성격들을 선호하지 않는가 싶습니다.
이 사용자에게 오차가 많이 나오긴하지만 아이템 기반 협업 필터링을 통해 영화를 추천해보겠습니다.

In [96]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [97]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies
    
# 사용자가 관람하지 않는 영화명 추출   
unseen_list = get_unseen_movies(ratings_matrix, 9)

In [98]:
# 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천 
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

In [99]:
# 평점 데이타를 DataFrame으로 생성. 
recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
The Discreet Charm of the Bourgeoisie,0.24662
The Promise,0.24662
Changeling,0.24662
Addams Family Values,0.24662
Ring of Fire II: Blood and Steel,0.24662
Bedazzled,0.24662
Kiss Kiss Bang Bang,0.24662
Aamdani Atthanni Kharcha Rupaiya,0.24662
Omagh,0.24662
House of Cards,0.24662


정말 절망적인 스코어입니다. 기본적으로 24%정도도 넘기지 못하는 쓰지 못하는 학습기를 만든 것 같습니다.