# SGD를 이용해 행렬 분해를 수행하는 예제를 파이썬으로 구현

In [18]:
import numpy as np

# 원본 행렬 R생성, 분해 행렬 P와 Q초기화, 잠재 요인 차원 K는 3으로 설정
R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
              [np.NaN, 5, np.NaN, 3, 1],
              [np.NaN, np.NaN, 3, 4, 4],
              [5, 2, 1, 2, np.NaN]])
num_users, num_items = R.shape
k = 3

print('사용자-아이템\n', R)
print('\nnum_users = {}, num_items = {}, k = {}'.format(num_users, num_items, k))

# P와 Q행렬의 크기를 지정하고 정뷰 분포를 가진 임의의 값으로 입력
np.random.seed(1)
P = np.random.normal(scale = 1./k, size = (num_users, k))
Q = np.random.normal(scale = 1./k, size = (num_items, k))

print('\n사용자-잠재요인\n', P)
print('\n잠재요인-아이템\n', Q)
print('\n행렬곱을 위해 Q를 전치함\n', Q.T)

사용자-아이템
 [[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]

num_users = 4, num_items = 5, k = 3

사용자-잠재요인
 [[ 0.54144845 -0.2039188  -0.17605725]
 [-0.35765621  0.28846921 -0.76717957]
 [ 0.58160392 -0.25373563  0.10634637]
 [-0.08312346  0.48736931 -0.68671357]]

잠재요인-아이템
 [[-0.1074724  -0.12801812  0.37792315]
 [-0.36663042 -0.05747607 -0.29261947]
 [ 0.01407125  0.19427174 -0.36687306]
 [ 0.38157457  0.30053024  0.16749811]
 [ 0.30028532 -0.22790929 -0.04096341]]

행렬곱을 위해 Q를 전치함
 [[-0.1074724  -0.36663042  0.01407125  0.38157457  0.30028532]
 [-0.12801812 -0.05747607  0.19427174  0.30053024 -0.22790929]
 [ 0.37792315 -0.29261947 -0.36687306  0.16749811 -0.04096341]]


In [6]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    # 두 개의 분해된 행렬 P와, Q, T의 내적으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    # 실제 R행렬에서 널이 아닌 값의 위치 인덱스 추출해 실제 R 행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]     
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [11]:
# R > 0인 행 위치, 열 위치, 값을 non_zeros 리스트에 저장
non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

# SGD 기법으로 P와 q 매트릭스를 계속 업데이터
for step in range(steps):
    for i, j, r in non_zeros:
        # 실제 값과 예측 값의 차이인 오류 값 구함
        eij = r - np.dot(P[i, :], Q[j, :].T)
        # Regularization을 반영한 SGD 업데이트 공식 적용
        P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
        Q[j, :] = Q[i, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
        
    rmse = get_rmse(R, P, Q, non_zeros)
    if (step % 50) == 0:
        print('### iteration step : ', step, 'rmse : ', rmse)

### iteration step :  0 rmse :  1.2586324357232976
### iteration step :  50 rmse :  1.2548760897863982
### iteration step :  100 rmse :  1.2511823749911724
### iteration step :  150 rmse :  1.2475555881812777
### iteration step :  200 rmse :  1.2440005872317994
### iteration step :  250 rmse :  1.2405224011038414
### iteration step :  300 rmse :  1.2371259349564172
### iteration step :  350 rmse :  1.2338157547317115
### iteration step :  400 rmse :  1.230595937394563
### iteration step :  450 rmse :  1.2274699746387123
### iteration step :  500 rmse :  1.2244407194758475
### iteration step :  550 rmse :  1.2215104259286989
### iteration step :  600 rmse :  1.2189201699307273
### iteration step :  650 rmse :  6.037097462814543
### iteration step :  700 rmse :  1.2183753045216366
### iteration step :  750 rmse :  1.2160842876973332
### iteration step :  800 rmse :  1.2140317032868786
### iteration step :  850 rmse :  1.212152313554194
### iteration step :  900 rmse :  1.210420987283429


In [12]:
pred_matrix = np.dot(P, Q.T)
print('예측 행렬 :\n', pred_matrix)

예측 행렬 :
 [[2.78523689 1.48854503 1.411302   2.01101028 1.95305283]
 [2.87772222 2.34294956 2.31113444 2.55838252 2.53577798]
 [4.22621382 3.06141907 2.99207382 3.53070168 3.47991831]
 [2.66615514 1.41367759 1.33906779 1.91832847 1.86232501]]


# 콘텐츠 기반 필터링 실습 - TMDB 5000 영화 데이터 세트

## 장르 속성을 이용한 영화 콘텐츠 기반 필터링

콘텐츠 기반 필터링은 사용자가 특정 영화를 감상하고 그 영화를 좋아했다면 그 영화와 비슷한 특성/속성, 구성 요소 등을 가진 다른 영화를 추천하는 것이다

## 데이터 로딩 및 가공

In [19]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

movies = pd.read_csv('./data/tmdb_5000_movie_dataset/tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


id, title, genres, vote_average, vote_count, popularity, keywords, overview만 가져오기

In [21]:
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 
                   'keywords', 'overview']]

pd.set_option('max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


genres컬럼의 문자열을 분해해서 개별 장르를 파이썬 리스트 객체로 추출

In [22]:
from ast import literal_eval

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

장르명만 리스트 객체로 추출

In [24]:
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [y['name'] for y in x])
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


## 장르 콘텐츠 유사도 측정

genres 컬럼을 문자열로 변환한 뒤 사이킷런의 CountVectorizer를 이용해 피처 벡터 형태 행렬로 만들기

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer를 적용하기 위해 공백 문자로 word 단위가 구분되는 문자열로 반환
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' '.join(x)))
count_vect = CountVectorizer(min_df = 0.0, ngram_range = (1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


피처 벡터화된 행렬에 cosine_similairities()를 적용한 후 반환된 코사인 유사도 행렬의 크기 및 앞 2개 데이터만 추출

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:2])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [30]:
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


## 장르 콘텐츠 필터링을 이용한 영화 추천

In [31]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    # 인자로 입력된 movies_df DataFrame에서 'title'컬럼이 입력된 title_name 값이 DataFrame 추출
    title_movie = df[df['title'] == title_name]
    
    # title_name을 가진 DataFrame의 index 객체를 ndarray로 반환하고 
    # sorted_ind 인자로 입력된 genre_sim_sorted_ind 객체에서 유사도 순으로 top_n개의 index 추출
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, :(top_n)]
    
    # 추출된 top_n index 출력, top_n index는 2차원 데이터
    # dataframe에서 index로 사용하기 위해서는 1차원 array로 변경
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

In [32]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [33]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending = False)[:10]

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [34]:
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print('C : ', round(C, 3), 'm : ', round(m, 3))

C :  6.092 m :  370.2


In [36]:
percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return ((v / (v + m)) * R) + ((m / (m + v)) * C)

movies_df['weighted_vote'] = movies.apply(weighted_vote_average, axis = 1)

In [37]:
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values('weighted_vote', ascending = False)[:10]

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [39]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    title_movie = df[df['title'] == title_name]
    title_index = title_movie.index.values
    
    # top_n의ㅜ 2배에 해당하는 장르 유서성이 높은 인덱스 추출
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    similar_indexes = similar_indexes.reshape(-1)
    
    # 기준 영화 인덱스는 제외
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    # top_n의 2배에 해당하는 후보군에서 weighted_vote가 높은 순으로 top_n만큼 추출
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending = False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427


# 아이템 기반 최근접 이웃 협업 필터링 학습

최근접 이웃 협업 필터링은 사용자 기반과 아이템 기반으로 분류한다

이 중 일반적으로 추천 정확도가 더 뛰어난 아이템 기반의 협업 필터링을 구현하자

## 데이터 가공 및 변환

In [1]:
import pandas as pd
import numpy as np

movies = pd.read_csv('./data/ml_latest_small/movies.csv')
ratings = pd.read_csv('./data/ml_latest_small/ratings.csv')

print(movies.shape)
print(ratings.shape)

display(movies.head(1))
display(ratings.head(1))

(9742, 3)
(100836, 4)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703


ratings.csv의 DataFrame인 ratings를 이용해 아이템 기반의 최근접 이웃 협업 필터링을 구현해 보자

In [3]:
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', columns = 'movieId', index = 'userId')

ratings_matrix.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


=> NaN값이 많은 이유는 사용자가 평점을 매기지 않은 영화가 컬럼으로 변환되면서 NaN으로 값이 할당됐기 때문이다

<br>

가독성을 높이기 위해 컬럼명을 movied가 아닌 영화명 title로 변경 -> ratings와 movies를 조인해 title를 가져온다

NaN값은 0으로 변환

In [7]:
# title 컬럼을 얻기 위해 movies와 조인
rating_movies = pd.merge(movies, ratings, on = 'movieId')

# columns = 'title'로 title 컬럼으로 피벗 수행
ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns = 'title')


# NaN값을 모두 0으로 변환
ratings_matrix = ratings_matrix.fillna(0)

ratings_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 영화 간 유사도 산출

변환된 사용자-영화 평점 행렬 데이터(ratings_matrix)세트를 이용해 영화 간의 유사도를 측정

영화 간의 유사도는 코사인 유사도를 기반으로 하고 사이킷런의 cosine_similarity()을 이용해 측정

cosine_similarity()함수는 행을 기준으로 서로 다른 행을 비교해 유사도를 산출한다 -> 지금은 영화간 유사도가 아닌 사용자 간의 유사도를 알 수 있는 DataFrame 형태이다

=> DataFrame의 행과 열의 위치를 변경하자

In [10]:
ratings_matrix_T = ratings_matrix.transpose()

ratings_matrix_T.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


ratings_matrix를 전치 행렬 형식으로 변경한 데이터 세트를 기반으로 영화의 코사인 유사도를 구하자

더 직관적인 영화의 유사도 값을 표현하기 위해 cosine_similarity()로 반환된 넘파이 행렬에 영화명을 매핑해 DataFrame로 변환

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

# cosine_similarity()로 반환된 넘파이 행렬에 영화명을 매핑해 DataFrame로 변환
item_sim_df = pd.DataFrame(item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)

print(item_sim_df.shape)
display(item_sim_df.head(2))

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


영화의 유사도 DataFrame(item_sim_df)를 이용해 영화 '대부'와 유사도가 높은 상위 6개 영화를 추출

In [14]:
item_sim_df['Godfather, The (1972)'].sort_values(ascending = False)[:6]

title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64

다른 영화 '인셉션'과 유사도가 높은 영화를 추출

In [16]:
item_sim_df['Inception (2010)'].sort_values(ascending = False)[1:6] # 자기 자신 빼고

title
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Name: Inception (2010), dtype: float64

이번에는 이 아이템 기반 유사도 데이터(item_sim_df)를 이용해 개인에게 특화된 영화 추천 알고리즘 만들기

## 아이템 기반 최근접 이웃 협업 필터링으로 개인화된 영화 추천

개인화된 영화 추천의 가장 큰 특징은 개인이 아직 관람하지 않은 영화를 추천한다는 것이다

아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이터를 기반으로 해 새롭게 모든 영화의 예측 평점을 계산한 후 높은 예측 평점을 가진 영화를 추천하는 방식이다

영화 간의 유사도를 가지는 DataFrame인 item_sim_df와 사용지-영화 평점 DataFrmae인 ratings_matrix변수를 계속 활용해 사용자별로 최적화된 평점 스코어를 예측하는 함수 생성

In [17]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis = 1)])
    return ratings_pred

predict_rating() 함수를 이용해 개인화된 예측 평점 구하기

In [19]:
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501
4,0.049145,0.277628,0.160448,0.206892,0.309632,0.042337,0.130048,0.116442,0.099785,0.097432,...,0.051269,0.076051,0.055563,0.054137,0.008343,0.159242,0.100941,0.062253,0.146054,0.231187
5,0.007278,0.066951,0.041879,0.01388,0.024842,0.01824,0.026405,0.018673,0.021591,0.018841,...,0.009689,0.022246,0.01336,0.012378,0.0,0.025839,0.023712,0.018012,0.028133,0.052315


=> 예측 평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적한 값이기 때문에 기존에 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많아 발생

=> 예측 평점이 실제 평점에 비해 작을 수 있다 -> 내적 결과를 코사인 유사도 벡터 합으로 나누었기 때문이다

<br>
이 예측 결과가 원래의 실제 평점과 얼마나 차이가 나는지 확인

앞에서 개인화된 예측 점수는 평점을 주지 않은 영화에 대해서도 아이템 유사도에 기반해 평점을 예측했다

=> 실제와 예측 평점의 차이는 기존에 평점이 부여된 데이터에 대해서만 오차 정도를 측정한다

=> 예측 평가 지표인 MSE를 계산하는 get_mse()함수를 만들고 결과 확인

In [20]:
from sklearn.metrics import mean_squared_error

# 사용자가 평점을 부여한 영화에 대해서만 예측 성능 평가 mse를 구함
def get_mse(pred, actual):
    # 평점이 있는 실제 영화만 추출
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print('아이템 기반 모든 최근접 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 최근접 이웃 MSE :  9.895354759094706


=> 많은 영화의 유사도 벡터를 이용하다 보니 상대적으로 평점 예측이 떨어졌다

특정 영화와 가장 비슷한 유사도를 가지는 영화에 대해서만 유사도 벡터를 적용하는 함수로 변경한다

개별 예측값을 구하기 위해 행, 열 별로 for루프를 반복 수행하면서 TOP_N 유사도 벡터를 계산

In [21]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n = 20):
    # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred = np.zeros(ratings_arr.shape)
    
    # 사용자-아이템 평점 행렬의 열 크기만큼 루프 수행
    for col in range(ratings_arr.shape[1]):
        # 유사도 행렬에서 유사도가 큰 순으로 n개의 데이터 행렬의 인덱스 반환
        top_n_itmes = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        # 개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_itmes].dot(ratings_arr[row, :][top_n_itmes].T)
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_itmes]))
            
    return pred

predict_rating_topsim()함수를 이용해 예측 평점을 계산하고 실제 평점과의 MSE 구하기

In [22]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n = 20)\

print('아이템 기반 최근접 Top-20 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))

# 계산된 예측 평점 데이터는 DataFrame으로 재생성
ratings_pred_matrix = pd.DataFrame(ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

아이템 기반 최근접 Top-20 이웃 MSE :  3.695009387428144


특정 사용자 userId = 9인 사용자에 대해 영화 추천

사용자 9가 어떤 영화를 좋아하는지 확인 -> 사용자가 평점을 준 영화를 평점이 높은 순으로 나열

In [23]:
user_rating_id = ratings_matrix.loc[9, :]
user_rating_id[user_rating_id > 0].sort_values(ascending = False)[:10]

title
Adaptation (2002)                                                                 5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Producers, The (1968)                                                             5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Back to the Future (1985)                                                         5.0
Austin Powers in Goldmember (2002)                                                5.0
Minority Report (2002)                                                            4.0
Witness (1985)                                                                    4.0
Name: 9, dtype: float64

=> '반지의 제왕', '오스틴 파워' 등 대작 영화나 어드벤처 영화, 코미디 영화 등 전반적으로 흥행성이 좋은 영화에 높은 평점을 주고 있다

<br>

이 사용자에게 아이템 기반 협업 필터링을 통해 영화 추천

사용자가 이미 평점을 준 영화를 제외하고 추천할 수 있도록 평점을 주지 않은 영화를 리스트 객체로 반환하는 함수 생성

In [27]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화 정보를 추출해 Series로 반환
    # 반환된 user_rating은 영화명을 인덱스로 가지는 Series 객체임
    user_rating = ratings_matrix.loc[userId, :]
    
    # user_rating이 0보다 크면 기존에 관람한 영화이다
    # 대상 인덱스를 추출해 list 객체로 만듬
    already_seen = user_rating[user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듦
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 영화는 movies_list에서 제외함
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

사용자가 영화의 평점을 주지 않은 추천 대상 영화 정보와 predict_rating_tiosim()에서 추출한 사용자별 아이템 유사도에 기반한 예측 평점 데이터 세트를 이용해 최종적으로 사용자에게 영화를 추천하는 함수인 recomm_movie_by_userid()를 만들기

In [28]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n = 10):
    # 예측 평점 DataFrame에서 사용자id 인덱스와 unseen_list로 들어온 영화명 컬럼을 추출해 가장 예측 평점이 높은 순으로 정렬
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending = False)[:top_n]
    return recomm_movies

# 사용자가 관람하지 않ㅇ른 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 아이템 기반의 최근접 이웃 협업 필터링으로 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n = 10)

# 평점 데이터를 DataFrame로 생성
recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711


=> '슈렉', '스파이더 맨', '인디아나 존스-2편', '매트릭스'등 다양하지만 높은 흥행성을 가진 작품을 추천

# 행렬 분해를 이용한 잠재 요인 협업 필터링 실습

SGD 기반의 행렬 분해를 구현하고 이를 기반으로 사용자에게 영화를 추천

In [39]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    # 두 개의 분해된 행렬 P와, Q, T의 내적으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    # 실제 R행렬에서 널이 아닌 값의 위치 인덱스 추출해 실제 R 행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]     
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [40]:
def matrix_factorization(R, K, steps = 200, learning_rate = 0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    # P와 Q 매트릭스의 크기를 지정하고 정규 분포를 가진 랜덤한 값으로 입력
    np.random.seed(1)
    P = np.random.normal(scale = 1./K, size = (num_users, K))
    Q = np.random.normal(scale = 1./K, size = (num_items, K))
    
    # R >0 인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 저장
    non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]
    
    # SGD 기법으로 P와 Q 매트릭스를 계속 업데이트
    for step in range(steps):
        for i, j, r in non_zeros:
            # 실제값과 예측 값의 차이인 오류 값 구함
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # regularization을 반영한 SGD 업데이트 공식 적용
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
            
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0:
            print('## iteration step : ', step, 'rmse : ', rmse)
    
    return P, Q

영화 평점 행렬 데이터를 새롭게 DataFrame으로 로딩한 뒤에 다시 사용자-아이템 평점 행렬로 만들기

In [41]:
import pandas as pd
import numpy as np

movies = pd.read_csv('./data/ml_latest_small/movies.csv')
ratings = pd.read_csv('./data/ml_latest_small/ratings.csv')

ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index = 'userId', columns = 'movieId')

# title 컬럼을 얻기 위해 movies와 조인 수행
rating_movies = pd.merge(ratings, movies, on = 'movieId')

# columns = 'title'로 title 컬럼으로 pivot 수행
ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns = 'title')

만들어진 사용자-아이템 평점 행렬을 matrix_factorization()함수를 이용해 행렬 분해

In [42]:
P, Q = matrix_factorization(ratings_matrix.values, K = 50, steps = 200, learning_rate = 0.01, r_lambda = 0.01)

pred_matrix = np.dot(P, Q.T)

## iteration step :  0 rmse :  2.9023619751336867
## iteration step :  10 rmse :  0.7335768591017927
## iteration step :  20 rmse :  0.5115539026853442
## iteration step :  30 rmse :  0.37261628282537446
## iteration step :  40 rmse :  0.2960818299181014
## iteration step :  50 rmse :  0.2520353192341642
## iteration step :  60 rmse :  0.22487503275269854
## iteration step :  70 rmse :  0.20685455302331537
## iteration step :  80 rmse :  0.19413418783028685
## iteration step :  90 rmse :  0.18470082002720403
## iteration step :  100 rmse :  0.17742927527209104
## iteration step :  110 rmse :  0.17165226964707486
## iteration step :  120 rmse :  0.1669518194687172
## iteration step :  130 rmse :  0.16305292191997542
## iteration step :  140 rmse :  0.15976691929679643
## iteration step :  150 rmse :  0.1569598699945732
## iteration step :  160 rmse :  0.15453398186715428
## iteration step :  170 rmse :  0.15241618551077643
## iteration step :  180 rmse :  0.15055080739628307
## iteratio

예측 사용자-아이템 평점 행렬을 영화 타이틀을 컬러명으로 가지는 DataFrame으로 변경

In [47]:
ratings_pred_matrix = pd.DataFrame(pred_matrix, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941


예측 사용자-아이템 평점 행렬 정보를 이용해 개인화된 영화 추천을 하자

사용자 아이디 9번에 대한 영화 추천을 이번에는 잠재 요인 협업 필터링으로 추천

In [48]:
# 시용자가 관람하지 않은 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 잠재 요인 협업 필터링으로 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n = 10)

# 평점 데이터를 DataFrame으로 생성
recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601


=> 앞 아이템 기반 협업 필터링 결과와는 추천된 영화가 많이 다르다