# Overview와 Tagline을 이용한 유사도 측정

줄거리와 태그라인의 단어 유사도를 통해 영화끼리의 유사도를 측정해보겠다.

In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

#필요한 라이브러리들 가져오기

데이터의 크기를 줄이지 않고 진행하였더니 MemoryError가 발생하여 데이터의 크기를 줄이는 작업을 행했다.

In [2]:
# Load the movie metadata and the link table for the reduced movie set
md = pd.read_csv('../movies_metadata.csv')
link_small = pd.read_csv('../links_small.csv')
# Keep only the TMDB ids that are present, as integers
link_small = link_small[link_small['tmdbId'].notnull()]['tmdbId'].astype('int')
# NOTE(review): presumably these three rows hold malformed 'id' values that
# would make the integer cast below fail - confirm against the CSV
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')
# Restrict the metadata to the reduced set; copy so that later column
# assignments on `movies` act on an independent frame, not a slice of `md`
movies = md[md['id'].isin(link_small)].copy()

영화의 설명을 벡터화시킨다. TfidfVectorizer를 이용한다.

In [3]:
# Missing taglines become a blank so the concatenation below never yields NaN
movies['tagline'] = movies['tagline'].fillna(' ')
# Join overview and tagline with a separating space so the last word of the
# overview and the first word of the tagline are not fused into one token.
# A missing overview is treated as empty text instead of propagating NaN,
# which would otherwise be vectorized later as the literal word "nan".
movies['description'] = movies['overview'].fillna('') + ' ' + movies['tagline']
# Display the combined text
movies['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
40224    From the mind behind Evangelion comes a hit la...
40503    The band stormed Europe in 1963, and, in 1964,...
44821    When Molly Hale's sadness of her father's disa...
44826    All your favorite Pokémon characters are back,...
45265    While holidaying in the French Alps, a Swedish...
Name: description, Length: 9099, dtype: object

In [4]:
# Build TF-IDF features over unigrams and bigrams, ignoring English stop words.
# min_df=1 keeps every term (an integer 0 is rejected by parameter validation
# in recent scikit-learn releases, which require integer values >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# Cast the column to unicode strings before handing it to the vectorizer
tfidf_matrix = tf.fit_transform(movies['description'].values.astype('U'))

코사인 유사도를 이용해 두 영화 사이의 유사도를 구한다. 

In [5]:
# TfidfVectorizer L2-normalizes each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity. Passing a single
# matrix compares it against itself, giving a movies-by-movies score matrix.
cosine_sim = linear_kernel(tfidf_matrix)
# Similarity of the first movie to every movie (itself included)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [6]:
# Renumber the rows 0..n-1 (the old labels are kept in an 'index' column) so
# that a row's label matches its position in the similarity matrix
movies = movies.reset_index()
titles = movies['title']
# Lookup table mapping a movie title to its row position
indces = pd.Series(data=movies.index, index=titles)

영화 제목을 인덱스로, 행 번호를 값으로 갖는 Series(`indces`)를 만든다. 아래 함수는 이 Series로 제목에 해당하는 행 번호를 찾고, 선형 커널로 구한 코사인 유사도 행렬에서 그 영화의 유사도 점수를 가져온다.

In [7]:
def getrecommandations(title, top_n=30):
    """Return the titles of the movies most similar to *title*.

    Similarity is read from the precomputed ``cosine_sim`` matrix; the row
    to use is looked up by title in ``indces``.

    Args:
        title: Movie title to look up. When several movies share the title,
            the first one in ``indces`` is used.
        top_n: Maximum number of recommendations to return (default 30).

    Returns:
        A slice of ``titles`` ordered from most to least similar, excluding
        the queried movie itself.

    Raises:
        KeyError: If *title* is not present in ``indces``.
        ValueError: If *top_n* is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    index = indces[title]
    # A duplicated title makes the lookup return a Series of positions
    # instead of a single integer; fall back to the first match.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Stable sort, highest score first; ties keep their original row order
    ranked = sorted(enumerate(cosine_sim[index]), key=lambda pair: pair[1], reverse=True)
    # Drop the queried movie by position rather than assuming it sorts first:
    # another movie with an identical description also scores 1.0
    movies_indices = [pos for pos, _ in ranked if pos != index][:top_n]
    return titles.iloc[movies_indices]

0번 인덱스의 영화와 다른 모든 영화가 각각 얼마나 비슷한지를 나타내는 점수가 (인덱스, 점수) 쌍의 목록으로 나온다.

In [8]:
# Show (row position, similarity score) pairs for the first movie
list(enumerate(cosine_sim[0]))

[(0, 0.9999999999999998),
 (1, 0.006804755671748422),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.006787806467693802),
 (17, 0.012752661542485257),
 (18, 0.0),
 (19, 0.0),
 (20, 0.004436513622462535),
 (21, 0.0),
 (22, 0.0035114754610620865),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.01477880238197027),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.008376177570083292),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.003515789361404667),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0041709614499456414),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.006490627286836496),
 (49, 0.004844838982980996),
 (50, 0.0),
 (51, 0.0),
 (52, 0.00810046370192087),
 (53, 0.008466183599812202),
 (54, 0.00835779370046548),
 (55, 0.0),
 (56, 0.014066359858020279),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0032263952735497176),
 (6

가장 유사하다고 판별된 영화를 30개(변경 가능) 선정한다.

In [9]:
# Example: list the titles ranked most similar to 'The Dark Knight'
getrecommandations('The Dark Knight')

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 