# 영화 추천 시스템 - 줄거리, 감독, 주연배우 포함

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the movie metadata and the credits (cast/crew) tables from CSV.
# low_memory=False makes pandas parse the metadata file in a single pass
# instead of in chunks.
movie = pd.read_csv('data/movies/movies_metadata.csv', low_memory=False)
info = pd.read_csv('data/movies/credits.csv')
# Preview the first two credit rows.
info.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [3]:
# (rows, columns) of the metadata and credits frames.
movie.shape, info.shape

((45466, 24), (45476, 3))

In [4]:
# Keep only the columns the recommender needs. .copy() makes df an
# independent frame, so the in-place clean-up done on it afterwards does not
# operate on a slice of `movie` (the chained-assignment hazard that the
# global warnings filter would otherwise hide).
df = movie[['id', 'title', 'overview']].copy()
# Show the malformed row whose id holds a date string instead of a number.
df[df['id'] == '1997-08-20']

Unnamed: 0,id,title,overview
19730,1997-08-20,,Released


In [5]:
# Discard rows with a missing id, title or overview, then store id as int
# so it can be used as a join key.
df = df.dropna()
df['id'] = df['id'].astype(int)

In [6]:
# Cast the credits join key to int, attach the cast/crew columns to each
# movie, and index the result by movie id.
# NOTE(review): neither frame is de-duplicated on id before the join, so any
# repeated id would produce repeated rows - worth verifying.
info['id'] = info['id'].astype(int)
df = df.merge(info, on='id').set_index('id')
df.head(3)

Unnamed: 0_level_0,title,overview,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [None]:
# When running on Colab, enable the line below (keeps only the first
# 20,000 rows).
# df = df.head(20000)

In [7]:
# Inspect the raw (not yet parsed) cast value of the row indexed by id 862.
df.cast[862]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

- 주연 배우

In [8]:
from ast import literal_eval

# Turn the stringified list-of-dicts in `cast` into real Python objects.
# Only strings are parsed: literal_eval raises ValueError on a value that is
# already a list, so without the guard this cell could not be re-run.
df['cast'] = df['cast'].apply(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)
df.head(3)

Unnamed: 0_level_0,title,overview,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [9]:
# Name stored in the first cast entry of the row indexed by id 862.
df.cast[862][0]['name']

'Tom Hanks'

In [10]:
import re

# Small demo of the name normalisation used for cast names:
# remove spaces and lowercase each string.
a = ['a B', 'c D']
a = [re.sub(' ', '', text).lower() for text in a]
a

['ab', 'cd']

In [11]:
def get_3cast(x):
    """Return up to three distinct cast names as one space-separated token string.

    Each name has its spaces removed and is lowercased, so a full name becomes
    a single token (e.g. 'Tom Hanks' -> 'tomhanks').

    Args:
        x: iterable of dicts describing cast members; the 'name' key is read.

    Returns:
        The first three distinct names in their original order, normalised and
        joined with single spaces. An empty string if there are no names.
    """
    names = []
    for member in x:
        name = member.get('name')
        # Skip entries without a usable name and names already collected.
        if not name or name in names:
            continue
        names.append(name)
        # Only three names are kept, so stop scanning once we have them.
        if len(names) == 3:
            break
    return ' '.join(name.replace(' ', '').lower() for name in names)

In [12]:
# Build the lead-actor feature from the parsed cast entries.
df['cast3'] = df['cast'].apply(get_3cast)

- 감독

In [13]:
# Turn the stringified list-of-dicts in `crew` into real Python objects.
# Only strings are parsed: literal_eval raises ValueError on a value that is
# already a list, so without the guard this cell could not be re-run.
df['crew'] = df['crew'].apply(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)

In [14]:
# Inspect the parsed crew entries of the row indexed by id 862.
df.crew[862]

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [15]:
def get_director(x):
    """Return the normalised name of the first crew entry whose job is 'Director'.

    Args:
        x: iterable of dicts describing crew members; the 'job' and 'name'
            keys are read.

    Returns:
        The director's name with spaces removed and lowercased, or an empty
        string when no entry has the job 'Director'.
    """
    directors = (
        member['name'].replace(' ', '').lower()
        for member in x
        if member['job'] == 'Director'
    )
    # Lazily take the first match; fall back to '' when there is none.
    return next(directors, '')

In [16]:
# Build the director feature from the parsed crew entries.
df['director'] = df['crew'].apply(get_director)

In [17]:
# Move id back into a regular column, which leaves df with a default
# positional index.
df = df.reset_index()
df.head(3)

Unnamed: 0,id,title,overview,cast,crew,cast3,director
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tomhanks timallen donrickles,johnlasseter
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",robinwilliams jonathanhyde kirstendunst,joejohnston
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",waltermatthau jacklemmon ann-margret,howarddeutch


## DTM 변환

In [18]:
# Text fed to the vectorizer: overview, director token and lead-actor
# tokens, separated by single spaces.
df['total'] = (
    df['overview']
    + ' ' + df['director']
    + ' ' + df['cast3']
)
df.head(3)

Unnamed: 0,id,title,overview,cast,crew,cast3,director,total
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tomhanks timallen donrickles,johnlasseter,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",robinwilliams jonathanhyde kirstendunst,joejohnston,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",waltermatthau jacklemmon ann-margret,howarddeutch,A family wedding reignites the ancient feud be...


In [19]:
# TF-IDF features over overview + director + lead actors.
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are excluded from the vocabulary.
tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df['total'])
tfidf_matrix.shape

(44581, 137079)

In [20]:
# Lookup table: movie title -> row position in df (and in tfidf_matrix).
# Series.drop_duplicates() compares VALUES, and the row positions are already
# unique, so it would not remove repeated titles. De-duplicate on the index
# (the titles) instead, keeping the first occurrence, so that indices[title]
# always yields a single position rather than a Series.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [21]:
# Cosine similarity between every pair of movies.
from sklearn.metrics.pairwise import linear_kernel

# linear_kernel is a plain dot product of the TF-IDF rows; it equals cosine
# similarity only when the rows are unit length (presumably ensured by the
# vectorizer's default normalisation - confirm).
# NOTE(review): the result is an n x n array over all movies; for the row
# count reported by tfidf_matrix.shape that is about 2e9 entries, so check
# available memory (or subset df first).
cosine_sim = linear_kernel(tfidf_matrix)

In [22]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Args:
        title: movie title to look up in the module-level `indices` table.
        cosine_sim: square similarity matrix whose row/column positions match
            the rows of `df`. The default is bound to the module-level matrix
            at the time this function is defined.
        top_n: number of recommendations to return (default 10).

    Returns:
        A Series of titles from `df`, ordered from most to least similar,
        labelled with their row positions. The queried movie itself is never
        included.

    Raises:
        KeyError: if `title` is not present in `indices`.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions;
    # fall back to the first occurrence so we always work with one row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: descending similarity, with ties
    # kept in ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it ranks first.
    ranked = ranked[ranked != idx][:top_n]
    return df['title'].iloc[ranked]

In [23]:
# Example: titles ranked most similar to 'The Dark Knight Rises'.
get_recommendations('The Dark Knight Rises')

12491                                      The Dark Knight
149                                         Batman Forever
1323                                        Batman Returns
10128                                        Batman Begins
15509                           Batman: Under the Red Hood
583                                                 Batman
21110    Batman Unmasked: The Psychology of the Dark Kn...
9237                    Batman Beyond: Return of the Joker
18002                                     Batman: Year One
4361                                          Criminal Law
Name: title, dtype: object

In [24]:
# Example: titles ranked most similar to 'Toy Story'.
get_recommendations('Toy Story')

15347                                     Toy Story 3
2990                                      Toy Story 2
24387                                       Small Fry
10307                          The 40 Year Old Virgin
23714                     Andy Hardy's Blonde Trouble
42644                Andy Kaufman Plays Carnegie Hall
28975                                      Hot Splash
8338                                        The Champ
41966    Andy Peters: Exclamation Mark Question Point
37854    Superstar: The Life and Times of Andy Warhol
Name: title, dtype: object