In [26]:
import numpy as np
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and the reconstruction ``P . Q.T`` on selected cells.

    Only the cells listed in ``non_zeros`` take part in the error; all other
    cells of ``R`` are ignored.

    Parameters
    ----------
    R : array indexable as ``R[rows, cols]``
        Matrix holding the actual values.
    P, Q : 2-D arrays
        Factor matrices; the prediction for cell ``(i, j)`` is the dot product
        of row ``i`` of ``P`` and row ``j`` of ``Q`` (i.e. ``np.dot(P, Q.T)[i, j]``).
    non_zeros : sequence of tuples
        Each tuple starts with ``(row_index, col_index, ...)``; any further
        items are ignored here.

    Returns
    -------
    float
        Root mean squared error over the listed cells.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (there is nothing to average over).
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty; RMSE is undefined without observed entries")

    # Row / column positions of the cells that take part in the error.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    actual = np.asarray(R)[row_idx, col_idx]

    # Predict only the requested cells: row-wise dot products of the matching
    # P and Q rows. This equals np.dot(P, Q.T)[row_idx, col_idx] without
    # materialising the full dense prediction matrix.
    predicted = np.sum(np.asarray(P)[row_idx] * np.asarray(Q)[col_idx], axis=1)

    mse = np.mean((actual - predicted) ** 2)
    return np.sqrt(mse)

In [27]:
# 행렬 분해
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize ``R`` into ``P`` and ``Q`` with SGD so that ``P . Q.T`` approximates ``R``.

    Only cells where ``R > 0`` are treated as observed and used for training;
    every other cell (zeros, negatives, NaN - which compares False) is skipped.

    Parameters
    ----------
    R : 2-D array-like
        Matrix to factorize; its shape is read as ``(num_users, num_items)``.
    K : int
        Number of latent factors (columns of ``P`` and ``Q``).
    steps : int, default 200
        Number of full passes over the observed cells.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization coefficient.

    Returns
    -------
    (P, Q) : tuple of ndarray
        ``P`` has shape ``(num_users, K)`` and ``Q`` has shape ``(num_items, K)``.

    Notes
    -----
    The training RMSE is printed every 10th step via ``get_rmse``.
    Initialization is deterministic (fixed seed 1).
    """
    R = np.asarray(R)
    num_users, num_items = R.shape

    # Use a private generator instead of np.random.seed(1): it yields the same
    # initial P and Q but does not reseed the process-wide NumPy RNG.
    rng = np.random.RandomState(1)
    P = rng.normal(scale=1. / K, size=(num_users, K))
    Q = rng.normal(scale=1. / K, size=(num_items, K))

    # Collect (row, col, value) for every cell with R > 0. np.nonzero walks the
    # matrix in row-major order, i.e. the same order as a nested i/j loop.
    with np.errstate(invalid='ignore'):  # NaN > 0 is simply False
        rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Repeatedly sweep over the observed cells, updating P and Q with SGD.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual value and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :])

            # Regularized gradient steps. The Q update reads the P row that was
            # just updated on the previous line (sequential update).
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # Report the training error every 10th step; the RMSE is only computed
        # when it is actually going to be printed.
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print(f'iteration step: {step}, rmse: {rmse}')
    return P, Q

[과제] 주어진 데이터로 행렬 분해 기반의 잠재요인 협업필터링 추천을 아래와 같이 수행하세요.

- 사용자 정의 함수 matrix_factorization으로 행렬 분해를 수행(steps=200, K=50, L2 규제 계수 r_lambda=0.01)하고, 예측 사용자-아이템 평점 행렬을 DataFrame으로 작성
- 예측 평점 행렬 정보를 이용하여 개인화된 영화 추천

In [28]:
import pandas as pd
import numpy as np

# Load the user ratings and the movie metadata from the MovieLens small dataset.
ratings = pd.read_csv('dataset/ml-latest-small/ratings.csv')
movies = pd.read_csv('dataset/ml-latest-small/movies.csv')


In [29]:
# Display the movie metadata table (movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [30]:
# Display the raw ratings table (userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [31]:
# The timestamp column is not needed, so keep only the user / movie / rating columns.
ratings = ratings[['userId', 'movieId', 'rating']]

# Join with the movie metadata to attach the title to every rating.
rating_movies = ratings.merge(movies, on='movieId')

# Pivot into a user-by-title matrix of ratings (one row per userId, one column per title).
ratings_matrix = rating_movies.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

In [44]:
# Display the user-by-title rating matrix; NaN marks movies the user has not rated.
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Factorize the observed rating matrix (K=50 latent factors, 200 SGD passes,
# L2 coefficient 0.01), then rebuild the dense user-by-item prediction matrix.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=200,
    learning_rate=0.01,
    r_lambda=0.01,
)
pred_matrix = P @ Q.T

iteration step: 0, rmse: 2.9023619751336867
iteration step: 10, rmse: 0.7335768591017927
iteration step: 20, rmse: 0.5115539026853442
iteration step: 30, rmse: 0.37261628282537446
iteration step: 40, rmse: 0.29608182991810134
iteration step: 50, rmse: 0.2520353192341642
iteration step: 60, rmse: 0.22487503275269854
iteration step: 70, rmse: 0.20685455302331537
iteration step: 80, rmse: 0.19413418783028683
iteration step: 90, rmse: 0.18470082002720403
iteration step: 100, rmse: 0.17742927527209104
iteration step: 110, rmse: 0.1716522696470749
iteration step: 120, rmse: 0.1669518194687172
iteration step: 130, rmse: 0.16305292191997542
iteration step: 140, rmse: 0.15976691929679643
iteration step: 150, rmse: 0.1569598699945732
iteration step: 160, rmse: 0.15453398186715428
iteration step: 170, rmse: 0.15241618551077643
iteration step: 180, rmse: 0.1505508073962831
iteration step: 190, rmse: 0.1488947091323209


In [56]:
# Label the predicted ratings with the same user ids (rows) and titles (columns)
# as the observed rating matrix.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    columns=ratings_matrix.columns,
    index=ratings_matrix.index,
)

ratings_pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.564130,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.311890,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.780760,1.997043,0.924908,2.970700,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.887580,1.042618,2.293890,0.396941
4,2.628629,3.035550,2.575746,3.706912,3.430636,0.706441,3.330280,1.978826,4.560368,2.775710,...,1.046116,2.912178,2.479592,2.231915,1.888629,2.211364,0.645603,1.585734,3.542892,0.591540
5,2.116148,3.084761,2.747679,3.783490,3.946990,0.883259,1.958953,1.757317,2.054312,2.775258,...,0.956159,3.893975,2.717024,2.002443,2.053337,3.983639,2.099626,1.423718,2.490428,0.531403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.153469,3.536398,3.128222,3.875599,3.821362,1.010045,2.284450,2.287194,4.148796,3.487517,...,1.105429,3.647394,3.342759,2.446343,2.406133,3.613071,1.849264,1.526109,2.860815,0.673099
607,2.502048,3.577547,3.095692,4.081789,4.038996,1.095224,3.698198,1.904581,5.347442,3.114650,...,0.963407,3.230442,3.526719,2.084052,2.119598,4.135631,3.124109,2.362195,3.421672,0.758574
608,2.154503,3.019060,2.679379,3.567550,3.483444,0.909505,2.387003,1.644313,3.090541,3.197815,...,0.818661,3.372644,2.936734,2.049884,2.097775,4.452331,3.504461,1.906708,2.419560,0.701739
609,2.566479,3.285659,2.910122,3.717481,3.665580,1.036884,2.908776,1.844237,2.868990,3.175409,...,1.003584,3.033540,2.943151,2.173251,2.234396,3.839926,2.520343,1.603675,2.970382,0.636880


In [34]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels (movies) that ``userId`` has no positive rating for.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        One row per user and one column per movie; a value greater than 0 means
        the user has already rated (seen) that movie. NaN and 0 count as unseen.
    userId : label
        Row label of the user in ``ratings_matrix``.

    Returns
    -------
    list
        Column labels of the unseen movies, in the column order of ``ratings_matrix``.

    Raises
    ------
    KeyError
        If ``userId`` is not in the index of ``ratings_matrix``.
    """
    # All ratings of this user as a Series indexed by the column labels.
    user_rating = ratings_matrix.loc[userId, :]

    # Movies with a rating above 0 have already been seen. A set keeps the
    # membership test below O(1) instead of scanning a list for every movie.
    already_seen = set(user_rating[user_rating > 0].index)

    # Keep every movie that is not in the seen set, preserving column order.
    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [35]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings for ``userId`` among ``unseen_list``.

    ``pred_df`` is indexed by user id with one column per movie. The result is
    a Series of predicted scores indexed by movie label, sorted in descending order.
    """
    # Predicted scores of this user, restricted to the candidate movies.
    candidate_scores = pred_df.loc[userId, unseen_list]

    # Highest predicted score first, then cut to the requested length.
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [45]:
# Collect the titles that user 602 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix=ratings_matrix, userId=602)
unseen_list

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '...All the Marbles (1981)',
 '...And Justice for All (1979)',
 '00 Schneider - Jagd auf Nihil Baxter (1994)',
 '1-900 (06) (1994)',
 '10 (1979)',
 '10 Cent Pistol (2015)',
 '10 Cloverfield Lane (2016)',
 '10 Items or Less (2006)',
 '10 Things I Hate About You (1999)',
 '10 Years (2011)',
 '10,000 BC (2008)',
 '100 Girls (2000)',
 '100 Streets (2016)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "101 Dalmatians II: Patch's London Adventure (2003)",
 '101 Reykjavik (101 Reykjavík) (2000)',
 '102 Dalmatians (2000)',
 '10th & Wolf (2006)',
 '10th Kingdom, The (2000)',
 '10th Victim, The (La decima vittima) (1965)',
 '11\'09"01 - September 11 (2002)',
 '11:14 (2

In [46]:
# Recommend the 10 unseen movies with the highest predicted rating for user 602,
# using the rating matrix predicted by matrix factorization.
recomm_movies = recomm_movie_by_userid(
    pred_df=ratings_pred_matrix,
    userId=602,
    unseen_list=unseen_list,
    top_n=10,
)
recomm_movies

title
Heathers (1989)                                          6.185448
No Country for Old Men (2007)                            6.009282
Platoon (1986)                                           5.682322
Lord of the Rings: The Return of the King, The (2003)    5.527965
Name of the Rose, The (Name der Rose, Der) (1986)        5.381674
Requiem for a Dream (2000)                               5.374450
Godfather: Part II, The (1974)                           5.270633
Sin City (2005)                                          5.187519
Shining, The (1980)                                      5.162690
City of God (Cidade de Deus) (2002)                      5.122047
Name: 602, dtype: float64

In [47]:
# Turn the predicted scores into a one-column DataFrame indexed by title.
recomm_movies = pd.DataFrame(
    recomm_movies.values,
    columns=['pred_score'],
    index=recomm_movies.index,
)
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Heathers (1989),6.185448
No Country for Old Men (2007),6.009282
Platoon (1986),5.682322
"Lord of the Rings: The Return of the King, The (2003)",5.527965
"Name of the Rose, The (Name der Rose, Der) (1986)",5.381674
Requiem for a Dream (2000),5.37445
"Godfather: Part II, The (1974)",5.270633
Sin City (2005),5.187519
"Shining, The (1980)",5.16269
City of God (Cidade de Deus) (2002),5.122047


In [54]:
# Join with the movie metadata on title to also see the genres of the recommendations.
recomm_movies1 = recomm_movies.merge(movies, on='title')[['title', 'genres', 'pred_score']]

In [55]:
# Display the recommended titles with their genres and predicted scores.
recomm_movies1

Unnamed: 0,title,genres,pred_score
0,Heathers (1989),Comedy,6.185448
1,No Country for Old Men (2007),Crime|Drama,6.009282
2,Platoon (1986),Drama|War,5.682322
3,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,5.527965
4,"Name of the Rose, The (Name der Rose, Der) (1986)",Crime|Drama|Mystery|Thriller,5.381674
5,Requiem for a Dream (2000),Drama,5.37445
6,"Godfather: Part II, The (1974)",Crime|Drama,5.270633
7,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,5.187519
8,"Shining, The (1980)",Horror,5.16269
9,City of God (Cidade de Deus) (2002),Action|Adventure|Crime|Drama|Thriller,5.122047
