# 영화 추천 시스템 - 줄거리, 감독, 영화배우

In [18]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [19]:
movie = pd.read_csv('data/movies/movies_metadata.csv', low_memory=False)
info = pd.read_csv('data/movies/credits.csv')
info.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [20]:
movie.shape, info.shape

((45466, 24), (45476, 3))

In [21]:
df = movie[['id','title','overview']]
df[df.id=='1997-08-20']

Unnamed: 0,id,title,overview
19730,1997-08-20,,Released


In [22]:
df.dropna(inplace=True)
df['id'] = df.id.astype(int)

In [23]:
df

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
45461,439050,Subdue,Rising and falling between a man and woman.
45462,111109,Century of Birthing,An artist struggles to finish his work while a...
45463,67758,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,227506,Satan Triumphant,"In a small town live two brothers, one a minis..."


In [24]:
info['id'] = info.id.astype(int)

from ast import literal_eval
info['cast'] = info.cast.apply(literal_eval)
info.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [25]:
info.cast[862][0]['name']

'Ad van Kempen'

In [26]:
import re

In [27]:
def get_3cast(x):
    cast = []
    for item in x:
        if item['name'] not in cast:
            cast.append(item['name'])
    cast = cast if len(cast) <= 3 else cast[:3]
    cast = list(map(lambda x: re.sub(' ','',x), cast))
    return ' '.join(cast)

In [28]:
df['cast3'] = info.cast.apply(get_3cast)

In [29]:
info['crew'] = info.crew.apply(literal_eval)

In [30]:
def get_director(x):
    for item in x:
        if item['job'] == 'Director':
            return item['name'].replace(' ','')
    return ''

In [31]:
df['director'] = info.crew.apply(get_director)

In [32]:
df.reset_index(inplace=True)
df.head(3)

Unnamed: 0,index,id,title,overview,cast3,director
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",TomHanks TimAllen DonRickles,JohnLasseter
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,RobinWilliams JonathanHyde KirstenDunst,JoeJohnston
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,WalterMatthau JackLemmon Ann-Margret,HowardDeutch


## DTM 변환

In [33]:
df['total'] = df.overview + ' ' + df.director + ' ' + df.cast3
df.head(3)

Unnamed: 0,index,id,title,overview,cast3,director,total
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",TomHanks TimAllen DonRickles,JohnLasseter,"Led by Woody, Andy's toys live happily in his ..."
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,RobinWilliams JonathanHyde KirstenDunst,JoeJohnston,When siblings Judy and Peter discover an encha...
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,WalterMatthau JackLemmon Ann-Margret,HowardDeutch,A family wedding reignites the ancient feud be...


In [34]:
# Overview + 감독 + 주연배우
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df.total)
tfidf_matrix.shape

(44506, 137201)

In [35]:
# 영화의 타이틀과 인덱스를 가진 테이블
indices = pd.Series(df.index, index=df.title).drop_duplicates()     #중복값 제거
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [36]:
# 코사인 유사도
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [37]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df.title.iloc[movie_indices]

In [38]:
get_recommendations('The Dark Knight Rises')

12447                                      The Dark Knight
149                                         Batman Forever
1314                                        Batman Returns
10092                                        Batman Begins
15444                           Batman: Under the Red Hood
583                                                 Batman
21036    Batman Unmasked: The Psychology of the Dark Kn...
9203                    Batman Beyond: Return of the Joker
17930                                     Batman: Year One
4344                                          Criminal Law
Name: title, dtype: object

In [39]:
get_recommendations('Toy Story')

15282                                     Toy Story 3
2979                                      Toy Story 2
10271                          The 40 Year Old Virgin
24314                                       Small Fry
23644                     Andy Hardy's Blonde Trouble
42566                Andy Kaufman Plays Carnegie Hall
8303                                        The Champ
28891                                      Hot Splash
41887    Andy Peters: Exclamation Mark Question Point
26943                      Life Begins for Andy Hardy
Name: title, dtype: object