In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

# Data 불러오기

In [3]:
data = pd.read_csv('./dataset/merged_data.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genres             2291 non-null   object 
 1   id                 2291 non-null   int64  
 2   original_language  2291 non-null   object 
 3   overview           2291 non-null   object 
 4   popularity         2291 non-null   float64
 5   spoken_languages   2290 non-null   object 
 6   title              2291 non-null   object 
 7   vote_average       2291 non-null   float64
 8   vote_count         2291 non-null   int64  
 9   score              2291 non-null   float64
 10  cast               2290 non-null   object 
 11  crew               2291 non-null   object 
 12  keywords           2266 non-null   object 
 13  movieId            2291 non-null   int64  
 14  rating             2291 non-null   float64
dtypes: float64(4), int64(3), object(8)
memory usage: 268.6+ KB


In [5]:
data.drop(['rating'], axis=1, inplace=True)

- 이미 ratings 파일과 merge되었기 때문에, 해당 열을 다시 삭제

In [6]:
len(data) == len(data['title'].unique())

False

- title이 같은 영화가 있음
- movieId를 사용하는 것이 가장 적절하나, 편의상 중복을 삭제하여 title을 사용

In [7]:
data.drop_duplicates('title')

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId
0,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1
1,Adventure Fantasy Family,8844,en,When siblings Judy and Peter discover an encha...,17.015539,English Français,Jumanji,6.9,2413,6.704621,RobinWilliams JonathanHyde KirstenDunst Bradle...,JoeJohnston,board game disappearance based on children's b...,2
2,Action Crime Drama Thriller,949,en,"Obsessive master thief, Neil McCauley leads a ...",17.924927,English Español,Heat,7.7,1886,7.310584,AlPacino RobertDeNiro ValKilmer JonVoight TomS...,MichaelMann,robbery detective bank obsession chase shootin...,6
3,Adventure Action Thriller,710,en,James Bond must unmask the mysterious head of ...,14.686036,English Pусский Español,GoldenEye,6.6,1194,6.338302,PierceBrosnan SeanBean IzabellaScorupco FamkeJ...,MartinCampbell,cuba falsely accused secret identity computer ...,10
4,Drama Crime,524,en,The life of the gambling paradise – Las Vegas ...,10.137389,English,Casino,7.8,1343,7.267167,RobertDeNiro SharonStone JoePesci JamesWoods D...,MartinScorsese,poker drug abuse 1970s overdose illegal prosti...,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,ScienceFiction Thriller,406990,en,In a world where families are limited to one c...,60.581223,English,What Happened to Monday,7.3,598,6.592786,NoomiRapace MarwanKenzari ChristianRubeck Glen...,TommyWirkola,chase false identity overpopulation investigat...,173925
2287,Action Thriller,341013,en,An undercover MI6 agent is sent to Berlin duri...,14.455104,svenska English Deutsch Pусский,Atomic Blonde,6.1,748,5.923143,CharlizeTheron JamesMcAvoy SofiaBoutella Eddie...,DavidLeitch,berlin spy undercover cold war double agent un...,173941
2288,Action Drama History Thriller War,374720,en,The miraculous evacuation of Allied soldiers f...,30.938854,English Français Deutsch,Dunkirk,7.5,2712,7.240418,FionnWhitehead TomGlynn-Carney JackLowden Harr...,ChristopherNolan,france beach world war ii evacuation german pi...,174055
2289,Action ScienceFiction Thriller Adventure,335988,en,"Autobots and Decepticons are at war, with huma...",39.186819,English,Transformers: The Last Knight,6.2,1440,6.065291,MarkWahlberg JoshDuhamel LauraHaddock AnthonyH...,MichaelBay,knight transformers,174585


In [8]:
ratings = pd.read_csv('./dataset/ratings.csv')

In [9]:
ratings.drop('timestamp', axis=1, inplace=True)

In [10]:
data = pd.merge(data, ratings)
data.shape

(16441173, 16)

In [11]:
data.head()

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId,userId,rating
0,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1,8,4.0
1,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1,9,4.5
2,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1,12,4.0
3,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1,20,4.0
4,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1,24,4.0


In [12]:
del ratings  # 메모리를 위해 삭제

# Pivoting

In [13]:
pivot = data.pivot_table('rating', index = 'title', columns = 'userId').fillna(0)

In [14]:
pivot.shape

(2240, 268071)

In [15]:
pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,270887,270888,270889,270890,270891,270892,270893,270894,270895,270896
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SVD

여기서 이제 SVD를 사용합니다.   

**SVD(Singular Value Decomposion), 특이값 분해**란 m x n 크기의 데이터 행렬 A를 아래와 같이 분해하는 것을 말합니다.

![15](https://user-images.githubusercontent.com/24634054/73115129-93138c00-3f65-11ea-9a10-80abc59a8494.JPG)


출처 : https://ratsgo.github.io/from%20frequency%20to%20semantics/2017/04/06/pcasvdlsa/

행렬 U와 V에 속한 열벡터는 특이벡터(singular vector)로 불리고, 이 특이벡터들은 서로 직교하는 성질을 가지고 있습니다.
또한, 가운데 시그마 기호로 보이는 것은 이것도 행렬인데요. 이 행렬은 대각행렬(diagonal matrix) 성질을 가지고 있습니다. 그래서 대각 성분이 행렬 A의 특이값이고 나머지 성분이 0입니다.


그리고 사이킷런에서 제공해주는 truncated SVD는 이러한 SVD의 변형입니다.
**truncated SVD**는 시그마 행렬의 대각원소(특이값) 가운데 상위 n개만 골라낸 것입니다. 이렇게 하면 기존 행렬 A의 성질을 100% 원복할 수는 없지만, (그 만큼 데이터 정보를 압축) 행렬 A와 거의 근사한 값이 나오게 됩니다.



여기서는 scikit learn의 TruncatedSVD를 사용해봅니다.

In [16]:
SVD = TruncatedSVD(n_components=12)
matrix = SVD.fit_transform(pivot)
matrix.shape

(2240, 12)

In [17]:
matrix[0]

array([130.67406083, 130.87384737,  24.08061694,  11.74414542,
       -58.1655653 ,  17.13143127,  54.2301383 ,  -2.6455148 ,
        44.42035077,  16.51306256, -29.49733042, -21.72965473])

12개의 component로 차원을 축소했습니다. 

이제 이렇게 나온 데이터를 활용해서 피어슨 상관계수를 구합니다.

- 피어슨 상관관계

두 배열 간에 상관 관계를 알고 싶을 때 사용한다. 안경을 낀 것(x)과 성적(y)이 어느 정도 상관관계가 있는지 알아보고 싶을 때 상관계수를 구해보면 된다. 상관계수는 -1부터 1까지로 결과 값이 리턴되며 -1이나 1에 가까울수록 상관관계가 높은 것이다. 1은 그래프에서 / 과 같이 양의 기울기를 가지므로 안경을 낄수록 성적이 높아진다고 볼 수 있고, -1의 상관계수를 가진다면 안경을 낄수록 성적이 낮아진다고 해석할 수 있다.

한가지 해석에 있어 주의할 점은 상관계수가 높다는 것이 인과관계와는 상관이 없다는 것이다. 위 예의 경우 상관계수가 1일때 안경과 성적은 높은 상관이 있지만, 안경을 끼었기 때문에 성적이 높다고 말할 수는 없는 것이다.

# Pearson correlation coefficient

In [18]:
corr = np.corrcoef(matrix)
corr.shape

(2240, 2240)

이렇게 나온 상관계수를 이용해서 특정 영화와 상관계수가 높은 영화를 뽑아줍니다.

In [19]:
corr = pd.DataFrame(corr, index=pivot.index, columns=pivot.index)

In [20]:
corr.head()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,1.0,0.819926,0.512793,0.590139,0.228836,0.490681,0.889897,0.938079,0.638365,0.63509,...,0.698143,0.758212,0.777417,0.775423,0.67959,0.501247,0.424128,0.445938,0.362402,0.607811
10 Cloverfield Lane,0.819926,1.0,0.336559,0.671115,0.135876,0.349848,0.931906,0.925536,0.458316,0.861918,...,0.499568,0.941343,0.90019,0.873563,0.883838,0.579782,0.448809,0.781435,0.477892,0.680096
10 Things I Hate About You,0.512793,0.336559,1.0,0.520702,0.738086,0.377348,0.300275,0.379802,0.855679,0.279365,...,0.846069,0.489487,0.384635,0.413246,0.434078,0.613582,0.681555,0.329145,0.502511,0.544531
"10,000 BC",0.590139,0.671115,0.520702,1.0,0.477053,0.359093,0.589049,0.673495,0.713199,0.764876,...,0.762932,0.792157,0.63312,0.811271,0.8588,0.585512,0.919166,0.841547,0.951757,0.979843
101 Dalmatians,0.228836,0.135876,0.738086,0.477053,1.0,0.366891,0.072688,0.13108,0.719041,0.122594,...,0.64063,0.310165,0.141931,0.330282,0.374245,0.569043,0.626093,0.255556,0.506515,0.526107


In [21]:
movie_title = list(pivot.index)

In [22]:
title = ["X-Men Origins: Wolverine", "Harry Potter and the Half-Blood Prince", "The Dark Knight Rises", "The Avengers"]

In [23]:
for i in title :
    print(i)
    print()
    print(corr[corr[i] >= 0.9][i].sort_values(ascending=False)[1:11])
    print('===========================================================')

X-Men Origins: Wolverine

title
Transformers: Revenge of the Fallen    0.986754
Jumper                                 0.984875
2012                                   0.983883
The Incredible Hulk                    0.983065
Hancock                                0.979430
Hellboy II: The Golden Army            0.977167
Terminator Salvation                   0.976557
Prince of Persia: The Sands of Time    0.974874
Clash of the Titans                    0.974731
Iron Man 2                             0.973994
Name: X-Men Origins: Wolverine, dtype: float64
Harry Potter and the Half-Blood Prince

title
Harry Potter and the Deathly Hallows: Part 1    0.994434
Harry Potter and the Deathly Hallows: Part 2    0.989271
Kung Fu Panda                                   0.976989
How to Train Your Dragon                        0.976828
Harry Potter and the Order of the Phoenix       0.971708
Despicable Me                                   0.965319
Tangled                                         0.963

# User에게 개인 추천해주기
위에서는 하나의 영화에 대해서 비슷한 영화를 추천해주는 것을 적용했습니다.   
하지만, 보통 추천 시스템은 사용자에게 추천을 해주어야 합니다.  

사용자에게 추천을 해주기 위해서 사용자 맞춤 협업 필터링 행렬 분해를 적용해보겠습니다.

In [24]:
pivot_t = pivot.T
pivot_t.head()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


여기까지는 앞서 했던 것과 똑같습니다. 사용자-영화 pivot table을 만드는 것이죠.   

이제 아래와 같이 데이터를 조금 변경해서 진행하겠습니다.

1. pivot table을 matrix로 변환
2. np.mean(axis = 1)을 통해 각 사용자들이 매기는 평점 평균을 구함
3. 1에서 구한 값과 2에서 구한 값을 빼서 사용자-평균 데이터 값을 변경

In [25]:
pivot_t.shape

(268071, 2240)

In [26]:
matrix = pivot_t.values
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# user_ratings_mean은 사용자의 평균 평점 
user_ratings_mean = np.mean(matrix, axis=1)
user_ratings_mean

array([0.04709821, 0.01696429, 0.01116071, ..., 0.13102679, 0.03214286,
       0.42299107])

In [28]:
user_ratings_mean.shape

(268071,)

In [29]:
user_ratings_mean.reshape(-1, 1)

array([[0.04709821],
       [0.01696429],
       [0.01116071],
       ...,
       [0.13102679],
       [0.03214286],
       [0.42299107]])

In [30]:
# matrix_user_mean : 사용자-영화에 대해 사용자 평균 평점을 뺀 것.
matrix_user_mean = matrix - user_ratings_mean.reshape(-1, 1)
matrix_user_mean

array([[-0.04709821, -0.04709821, -0.04709821, ..., -0.04709821,
        -0.04709821, -0.04709821],
       [-0.01696429, -0.01696429, -0.01696429, ..., -0.01696429,
        -0.01696429, -0.01696429],
       [-0.01116071, -0.01116071, -0.01116071, ..., -0.01116071,
        -0.01116071, -0.01116071],
       ...,
       [-0.13102679, -0.13102679, -0.13102679, ..., -0.13102679,
        -0.13102679, -0.13102679],
       [-0.03214286, -0.03214286, -0.03214286, ..., -0.03214286,
        -0.03214286, -0.03214286],
       [-0.42299107, -0.42299107, -0.42299107, ..., -0.42299107,
        -0.42299107, -0.42299107]])

In [31]:
matrix_user_mean.shape

(268071, 2240)

여기까지 진행하면 초기에 만들었던 user-movie pivot table 값이 matrix_user_mean 변경되었습니다.

즉, 아래와 같이 변경된 것이죠.

1. 사용자가 영화에 대해 평점을 매긴 값이 존재
2. 사용자 각각 평균 평점을 구해서
3. 사용자의 영화에 대한 평점을 조금 변경 -> 1에서 구한 값 - 2에서 구한 값


**이제 SVD를 이용해 Matrix Factorization**을 진행해봅니다. 

앞서서는 scikit learn을 이용해 TruncatedSVD를 이용했는데요. 이번에는 scipy를 이용해 Truncated SVD를 구해봅니다.

이 둘의 차이점은 scikit learn에서 제공해주는 TruncatedSVD는 U, Sigma, Vt 반환 값을 제공하지 않습니다.   
하지만, Scipy를 이용하면 이 반환값들을 제공받을 수 있죠.

Scipy에서 제공해주는 Truncate SVD는 scipy.sparse.linalg.svds를 이용하면 됩니다. 저는 이것을 이용했습니다.

이렇게 사용하면 반환값이 U 행렬, Sigma 행렬, V 전치 행렬(Vt)가 나오게 됩니다.

In [32]:
# scipy에서 제공해주는 svd.  
# U 행렬, sigma 행렬, V 전치 행렬을 반환.

U, sigma, Vt = svds(matrix_user_mean, k = 12)   # 잠재 요인 12개

In [33]:
print(U.shape)
print(sigma.shape)
print(Vt.shape)

(268071, 12)
(12,)
(12, 2240)


현재 이 Sigma 행렬은 0이 아닌 값만 1차원 행렬로 표현된 상태입니다.  
**즉, 0이 포함된 대칭행렬로 변환할 때는 numpy의 diag를 이용해야 합니다.**

In [34]:
sigma = np.diag(sigma)

In [35]:
sigma.shape

(12, 12)

In [36]:
sigma[0]

array([1148.24472481,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ])

In [37]:
sigma[1]

array([   0.        , 1151.26892988,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ])

이렇게 대칭 행렬로 변환이 되었습니다.   

현재 까지 상황을 정리하면 아래와 같습니다.   

1. 원본 user-movie 평점 행렬이 있었음
2. 이를 user의 평균 점수를 빼서 matrix_user_mean 이라는 행렬로 만듬
3. 2번의 값을 SVD를 적용해 U, Sigma, Vt 행렬을 구했음
4. Sigma 행렬은 현재 0이 포함이 되지 않은 값으로만 구성되어 있음. 이를 대칭행렬로 변환


자! 이제 여기서 matrix_user_mean을 SVD를 적용해 분해를 한 상태입니다.  
이제, 다시 원본 행렬로 복구시켜야겠죠?

원본 행렬로 복구시키는 방법은 아래와 같습니다.

- U, Sigma, Vt의 내적을 수행 

즉, np.dot(np.dot(U, sigma), Vt)를 수행하면 됩니다. 

그리고 아까 사용자 평균을 빼주었으니 여기서는 더해줍니다. 


In [38]:
# U, Sigma, Vt의 내적을 수행하면, 다시 원본 행렬로 복원이 된다. 
# 거기에 + 사용자 평균 rating을 적용한다. 
svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [39]:
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, index=pivot_t.index ,columns=pivot_t.columns)
df_svd_preds.head()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.253611,0.05215,0.18281,0.008102,0.016455,0.296292,0.133124,0.151607,0.05408,-0.014915,...,0.158877,-0.026364,0.22536,0.008453,-0.029188,-0.094558,0.026358,-0.032445,-0.019465,-0.00394
2,-0.008488,-0.001153,-0.079465,0.010509,0.256213,-0.019643,0.003141,0.003116,-0.023264,0.012865,...,-0.087864,0.009299,-0.024136,-0.003596,0.008651,0.00397,-0.015508,0.012225,0.010596,0.009826
3,-0.003647,-0.005665,0.035903,-0.004423,0.036754,0.12582,0.011462,-0.002085,0.009749,0.00179,...,-0.031877,-0.001437,0.02558,-0.018753,-0.004382,-0.037371,-0.015769,-0.000308,-0.003332,-0.023167
4,0.030982,-0.004697,0.562967,-0.01928,0.130967,-0.041254,-0.005617,-0.010966,0.098969,-0.006086,...,0.178542,0.000231,0.019882,-0.023974,-0.003052,0.162643,0.077521,-0.001388,-0.005244,-0.024774
5,-0.060771,-0.019485,-0.186304,0.003862,0.018974,0.668252,-0.006662,-0.03095,-0.038382,0.003281,...,-0.04894,0.003562,-0.019289,0.004186,0.003649,-0.018705,-0.03634,0.005517,0.003236,-0.016079


In [40]:
df_svd_preds.tail()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
270892,-0.002269,-0.007513,0.192867,0.013469,0.283871,0.127054,-0.037542,-0.043569,-0.002581,0.023647,...,-0.070082,0.026232,0.107797,-0.035202,0.015129,-0.089698,-0.066379,0.030811,0.014353,-0.043311
270893,-0.161807,0.039457,0.643499,-0.007803,0.16433,0.698483,-0.122283,-0.071838,-0.057053,-0.040489,...,0.502216,-0.022389,0.0281,0.066709,-0.014788,0.399938,0.179948,-0.033299,-0.005977,0.018517
270894,0.158585,0.139395,-0.23334,-0.024434,-0.238247,0.850472,0.272544,0.249419,-0.241206,0.001343,...,-0.13177,-0.019266,0.246269,0.084372,-0.008966,0.258049,-0.071176,-0.024193,-0.040589,-0.02246
270895,0.045371,-0.004745,-0.008726,-0.009276,-0.101455,0.135628,0.07476,0.038397,0.039914,0.024873,...,-0.137656,0.010547,0.020722,-0.047577,0.002048,-0.130066,-0.067115,0.014135,-0.002464,-0.052603
270896,1.296788,-0.097836,1.328736,-0.089833,0.338089,1.094168,0.23659,0.452577,0.382968,-0.174731,...,1.333903,-0.162516,-0.121211,-0.021139,-0.157389,0.308499,0.223937,-0.187935,-0.147026,0.041306


In [41]:
df_svd_preds.shape

(268071, 2240)

자! 이제 함수를 하나 만듭니다. 이 함수의 기능은 아래와 같습니다.

- 인자로 사용자 아이디, 영화 정보 테이블, 평점 테이블 등을 받음
- 사용자 아이디에 SVD로 나온 결과의 영화 평점이 가장 높은 데이터 순으로 정렬
- 사용자가 본 데이터를 제외
- 사용자가 안 본 영화에서 평점이 높은 것을 추천

In [42]:
def recommend_movies(df_svd_preds, data, user_id, num_recommendations=5):
      
    # 최종적으로 만든 pred_df에서 사용자 index에 따라 영화 데이터 정렬 -> 영화 평점이 높은 순으로 정렬 됌
    sorted_user_predictions = df_svd_preds.loc[user_id].sort_values(ascending=False)
    
    # 원본 평점 데이터에서 user id에 해당하는 데이터를 뽑아낸다. 
    user_history = data[data.userId == user_id].sort_values(['rating'], ascending=False)
    
    # rating data와 merge하면서 늘어난 중복 row를 삭제
    data = data.drop_duplicates('title')
    
    # 원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터를 추출
    recommendations = data[~data['title'].isin(user_history['title'])]
    
    # 사용자의 영화 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다. 
    recommendations = recommendations.merge(sorted_user_predictions.reset_index())
    
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns={user_id: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]
                      
    return user_history, recommendations

In [43]:
user_id = 302

In [44]:
sorted_user_predictions = df_svd_preds.loc[user_id].sort_values(ascending=False)
sorted_user_predictions

title
Star Wars                  1.662909
Return of the Jedi         1.315186
Toy Story                  1.225965
The Empire Strikes Back    1.207525
Raiders of the Lost Ark    1.031315
                             ...   
Collateral                -0.147977
Scarface                  -0.150737
Mulholland Drive          -0.160860
Natural Born Killers      -0.161316
28 Days Later             -0.176492
Name: 302, Length: 2240, dtype: float64

In [45]:
a = sorted_user_predictions.reset_index()
a

Unnamed: 0,title,302
0,Star Wars,1.662909
1,Return of the Jedi,1.315186
2,Toy Story,1.225965
3,The Empire Strikes Back,1.207525
4,Raiders of the Lost Ark,1.031315
...,...,...
2235,Collateral,-0.147977
2236,Scarface,-0.150737
2237,Mulholland Drive,-0.160860
2238,Natural Born Killers,-0.161316


In [46]:
type(a)

pandas.core.frame.DataFrame

In [47]:
already_rated, predictions = recommend_movies(df_svd_preds, data, 330, 10)

In [48]:
len(already_rated)

298

In [49]:
already_rated.head(10)

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId,userId,rating
12700581,Action Adventure Animation Family,9806,en,Bob Parr has given up his superhero days to lo...,22.220214,Français English,The Incredibles,7.4,5290,7.264912,CraigT.Nelson HollyHunter SamuelL.Jackson Jaso...,BradBird,secret identity secret hero island wretch supe...,8961,330,5.0
14326502,Horror Drama,13310,sv,"Set in 1982 in the suburb of Blackeberg, Stock...",7.274237,svenska,Let the Right One In,7.5,997,6.929319,KåreHedebrant LinaLeandersson PerRagnar Henrik...,TomasAlfredson,female nudity vampire castration bullying chil...,61240,330,5.0
4852021,Drama War,600,en,A pragmatic U.S. Marine observes the dehumaniz...,13.94148,English Tiếng Việt,Full Metal Jacket,7.9,2595,7.573079,MatthewModine AdamBaldwin VincentD'Onofrio R.L...,StanleyKubrick,suicide prostitute helicopter based on novel j...,1222,330,5.0
1990475,Comedy Drama,9571,en,The adventures of a group of Texas teens on th...,6.8652,English,Dazed and Confused,7.4,588,6.6434,JasonLondon RoryCochrane WileyWiggins SashaJen...,RichardLinklater,1970s texas high school comedy coming of age s...,441,330,5.0
14130046,Drama Action Crime Thriller,155,en,Batman raises the stakes in his war on crime. ...,123.167259,English 普通话,The Dark Knight,8.3,12269,8.20838,ChristianBale MichaelCaine HeathLedger AaronEc...,ChristopherNolan,dc comics crime fighter secret identity scarec...,58559,330,5.0
1218627,Thriller Crime,680,en,"A burger-loving hit man, his philosophical par...",140.950236,English Español Français,Pulp Fiction,8.3,8670,8.172161,JohnTravolta SamuelL.Jackson UmaThurman BruceW...,QuentinTarantino,transporter brothel drug dealer boxer massage ...,296,330,5.0
13988118,Crime Drama Thriller,6977,en,"Llewelyn Moss stumbles upon dead bodies, $2 mi...",15.565484,English Español,No Country for Old Men,7.7,3083,7.443121,TommyLeeJones JavierBardem JoshBrolin WoodyHar...,JoelCoen EthanCoen,texas drug traffic hitman united states–mexico...,55820,330,5.0
9816234,Comedy Crime,37136,en,When the incompetent Officer Frank Drebin seek...,15.698222,English,The Naked Gun: From the Files of Police Squad!,7.1,1020,6.657741,LeslieNielsen PriscillaPresley RicardoMontalba...,DavidZucker,baseball aquarium queen elisabeth ii terrorism...,3868,330,5.0
2421748,Fantasy Animation Family,9479,en,Tired of scaring humans every October 31 with ...,17.730913,English,The Nightmare Before Christmas,7.6,2135,7.265222,DannyElfman ChrisSarandon CatherineO'Hara Will...,HenrySelick,holiday fire santa claus magic halloween skele...,551,330,5.0
8536584,Comedy,813,en,"Alcoholic pilot, Ted Striker has developed a f...",13.063203,English,Airplane!,7.1,1104,6.681895,RobertHays JulieHagerty KareemAbdul-Jabbar Llo...,JimAbrahams DavidZucker JerryZucker,chicago alcohol cataclysm guitar medicine taxi...,2791,330,5.0


In [50]:
predictions

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId,userId,rating,Predictions
147,Adventure Action,89,en,When Dr. Henry Jones Sr. suddenly goes missing...,14.788987,Deutsch ελληνικά English,Indiana Jones and the Last Crusade,7.6,3221,7.364694,HarrisonFord SeanConnery DenholmElliott Alison...,StevenSpielberg,saving the world venice holy grail library rid...,1291,9,3.5,3.253061
8,Drama Crime Thriller,629,en,"Held in an L.A. interrogation room, Verbal Kin...",16.302466,Español English Français Magyar,The Usual Suspects,8.1,3334,7.81416,StephenBaldwin GabrielByrne ChazzPalminteri Ke...,BryanSinger,law relatives theft criminal criminal mastermi...,50,12,5.0,3.247964
175,ScienceFiction Comedy Crime,816,en,As a swingin' fashion photographer by day and ...,11.749159,English,Austin Powers: International Man of Mystery,6.5,1033,6.239165,MikeMyers ElizabethHurley MichaelYork MimiRoge...,JayRoach,android undercover missile group therapy airpl...,1517,12,4.0,2.923533
26,Comedy,8467,en,Lloyd and Harry are two men whose stupidity is...,9.844558,English,Dumb and Dumber,6.5,1894,6.335634,JimCarrey JeffDaniels LaurenHolly TeriGarr Mik...,PeterFarrelly BobbyFarrelly,gas station motel utah stupidity pill prank ci...,231,11,2.5,2.84973
544,Adventure Action ScienceFiction Thriller,36658,en,Professor Charles Xavier and his team of genet...,0.913096,English Deutsch Italiano,X2,6.8,3572,6.671981,PatrickStewart HughJackman IanMcKellen HalleBe...,BryanSinger,mutant marvel comic superhero based on comic s...,6333,9,4.0,2.69225
226,Comedy Drama,2108,en,"Five high school students, all different stere...",12.627678,English,The Breakfast Club,7.8,2189,7.439022,EmilioEstevez AnthonyMichaelHall JuddNelson Mo...,JohnHughes,high school tardy hall teenager detention,1968,1,4.0,2.656939
119,Drama Crime,769,en,"The true story of Henry Hill, a half-Irish, ha...",15.424092,Italiano English,GoodFellas,8.2,3211,7.892608,RobertDeNiro RayLiotta JoePesci LorraineBracco...,MartinScorsese,prison based on novel florida 1970s mass murde...,1213,12,4.0,2.482377
59,Fantasy Action,268,en,The Dark Knight of Gotham City begins his war ...,19.10673,English Français,Batman,7.0,2145,6.767489,JackNicholson MichaelKeaton KimBasinger Michae...,TimBurton,double life dc comics dual identity chemical c...,592,10,4.0,2.454745
765,Drama Mystery Thriller,1124,en,A mysterious story of two magicians whose inte...,16.94556,English,The Prestige,8.0,4510,7.790929,HughJackman ChristianBale MichaelCaine Scarlet...,ChristopherNolan,competition secret obsession magic dying and d...,48780,11,4.5,2.375017
70,Drama Crime,627,en,"Renton, deeply immersed in the Edinburgh drug ...",19.348466,English,Trainspotting,7.8,2737,7.501405,EwanMcGregor EwenBremner JonnyLeeMiller Robert...,DannyBoyle,london england alcohol sex based on novel drug...,778,12,5.0,2.350469


## my try
- 위와 추천 영화가 큰 차이없으나, 예상 평점을 예측하려면 위의 방법을 사용해야 함

In [51]:
U[0]

array([ 2.01260133e-03, -1.10262833e-03,  6.17532951e-04,  9.06296586e-04,
        9.79917030e-04, -9.29785879e-05,  4.34308390e-04, -6.00951587e-04,
        8.12000692e-04, -7.53762716e-05, -1.45220542e-03, -6.04132993e-04])

In [52]:
Vt[:, 0]

array([-0.02476449, -0.00300269, -0.00676206,  0.03744929,  0.01141148,
        0.03205645,  0.02808727,  0.01222225,  0.01426039,  0.00954598,
       -0.03761323, -0.00294891])

In [53]:
np.dot(U[0], Vt[:, 0])

6.355306192167007e-05

In [54]:
table = np.dot(U, Vt)

In [55]:
table_mean = table + user_ratings_mean.reshape(-1, 1)   # 

In [56]:
my_preds = pd.DataFrame(table_mean, index=pivot_t.index ,columns=pivot_t.columns)
my_preds.head()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.047162,0.047107,0.047207,0.047097,0.047113,0.047195,0.047121,0.047131,0.047122,0.047084,...,0.047174,0.04708,0.047186,0.047089,0.047078,0.047009,0.047106,0.047078,0.047086,0.047079
2,0.01698,0.016962,0.016892,0.016969,0.01715,0.016933,0.016976,0.016976,0.016951,0.016973,...,0.016891,0.016971,0.016939,0.016962,0.016971,0.016955,0.01694,0.016973,0.01697,0.016965
3,0.011152,0.011158,0.011176,0.011161,0.011181,0.011208,0.011165,0.011157,0.011169,0.011164,...,0.011136,0.011162,0.011175,0.01115,0.011161,0.011131,0.011154,0.011163,0.011162,0.011148
4,0.07232,0.072317,0.0726,0.072292,0.072353,0.072166,0.072321,0.072308,0.072351,0.072309,...,0.072339,0.072312,0.072347,0.072293,0.07231,0.072382,0.072318,0.072312,0.072302,0.072278
5,0.033412,0.033457,0.033347,0.03349,0.033491,0.033799,0.03345,0.03344,0.033466,0.033483,...,0.033449,0.033483,0.033452,0.033479,0.033483,0.03343,0.033474,0.033485,0.033489,0.033478


In [57]:
already_rated, my_predictions = recommend_movies(my_preds, data, 330, 10)

In [58]:
my_predictions

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId,userId,rating,Predictions
26,Comedy,8467,en,Lloyd and Harry are two men whose stupidity is...,9.844558,English,Dumb and Dumber,6.5,1894,6.335634,JimCarrey JeffDaniels LaurenHolly TeriGarr Mik...,PeterFarrelly BobbyFarrelly,gas station motel utah stupidity pill prank ci...,231,11,2.5,0.420751
175,ScienceFiction Comedy Crime,816,en,As a swingin' fashion photographer by day and ...,11.749159,English,Austin Powers: International Man of Mystery,6.5,1033,6.239165,MikeMyers ElizabethHurley MichaelYork MimiRoge...,JayRoach,android undercover missile group therapy airpl...,1517,12,4.0,0.420619
226,Comedy Drama,2108,en,"Five high school students, all different stere...",12.627678,English,The Breakfast Club,7.8,2189,7.439022,EmilioEstevez AnthonyMichaelHall JuddNelson Mo...,JohnHughes,high school tardy hall teenager detention,1968,1,4.0,0.420439
761,Comedy,496,en,Kazakh journalist Borat Sagdiyev travels to Am...,8.278336,English עִבְרִית қазақ,Borat: Cultural Learnings of America for Make ...,6.5,1617,6.313435,SachaBaronCohen KenDavitian Luenell PamelaAnde...,LarryCharles,male nudity usa california prostitute journali...,48385,20,3.5,0.420306
147,Adventure Action,89,en,When Dr. Henry Jones Sr. suddenly goes missing...,14.788987,Deutsch ελληνικά English,Indiana Jones and the Last Crusade,7.6,3221,7.364694,HarrisonFord SeanConnery DenholmElliott Alison...,StevenSpielberg,saving the world venice holy grail library rid...,1291,9,3.5,0.420287
59,Fantasy Action,268,en,The Dark Knight of Gotham City begins his war ...,19.10673,English Français,Batman,7.0,2145,6.767489,JackNicholson MichaelKeaton KimBasinger Michae...,TimBurton,double life dc comics dual identity chemical c...,592,10,4.0,0.420267
485,Comedy Drama,9428,en,An estranged family of former child prodigies ...,9.707542,English Italiano,The Royal Tenenbaums,7.4,1317,6.958398,GeneHackman AnjelicaHuston BenStiller GwynethP...,WesAnderson,forgiveness child prodigy terminal illness dys...,4979,8,4.0,0.420163
544,Adventure Action ScienceFiction Thriller,36658,en,Professor Charles Xavier and his team of genet...,0.913096,English Deutsch Italiano,X2,6.8,3572,6.671981,PatrickStewart HughJackman IanMcKellen HalleBe...,BryanSinger,mutant marvel comic superhero based on comic s...,6333,9,4.0,0.420136
119,Drama Crime,769,en,"The true story of Henry Hill, a half-Irish, ha...",15.424092,Italiano English,GoodFellas,8.2,3211,7.892608,RobertDeNiro RayLiotta JoePesci LorraineBracco...,MartinScorsese,prison based on novel florida 1970s mass murde...,1213,12,4.0,0.420099
25,Comedy,11017,en,Billy Madison is the 27 year-old son of Bryan ...,6.638116,Français Español English,Billy Madison,6.2,464,5.918881,AdamSandler DarrenMcGavin BridgetteWilson Brad...,TamraDavis,woman director,216,20,3.5,0.42006


In [59]:
pd.concat([predictions['title'], my_predictions['title']], axis=1)

Unnamed: 0,title,title.1
8,The Usual Suspects,
25,,Billy Madison
26,Dumb and Dumber,Dumb and Dumber
59,Batman,Batman
70,Trainspotting,
119,GoodFellas,GoodFellas
147,Indiana Jones and the Last Crusade,Indiana Jones and the Last Crusade
175,Austin Powers: International Man of Mystery,Austin Powers: International Man of Mystery
226,The Breakfast Club,The Breakfast Club
485,,The Royal Tenenbaums
