# 영화추천 시스템
- 코사인 유사도 활용
- TfidfVectorizer 사용
- 영화의 줄거리

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
movie = pd.read_csv('data/movies/movies_metadata.csv', low_memory=False)
movie.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [3]:
movie.shape

(45466, 24)

In [4]:
df = movie[['title', 'overview']]
df.head(3)

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...


In [5]:
df.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

### 데이터 전처리

In [6]:
# null data 확인
df.isnull().sum()

title         6
overview    954
dtype: int64

In [7]:
# 결측치 제거
df.dropna(inplace=True)
df.shape

(44506, 2)

In [8]:
# 중복 확인
df.title.nunique()

41371

In [9]:
# 중복 제거
df.drop_duplicates(subset=['title'], inplace=True)
df.shape

(41371, 2)

In [10]:
df.tail(3)

Unnamed: 0,title,overview
45462,Century of Birthing,An artist struggles to finish his work while a...
45464,Satan Triumphant,"In a small town live two brothers, one a minis..."
45465,Queerama,50 years after decriminalisation of homosexual...


In [11]:
df.iloc[41370,0]

'Queerama'

In [12]:
# 인덱스 정리(인덱스와 몇번째 행의 값을 일치)
df.set_index('title', inplace=True)
df.reset_index(inplace=True)
df.tail(3)

Unnamed: 0,title,overview
41368,Century of Birthing,An artist struggles to finish his work while a...
41369,Satan Triumphant,"In a small town live two brothers, one a minis..."
41370,Queerama,50 years after decriminalisation of homosexual...


In [13]:
# 20,000 개의 영화로 추천시스템 만들기
df = df.head(20000)

### 텍스트 전처리

In [14]:
# 숫자, 구둣점 제거
df['clean_doc'] = df.overview.str.replace('[^A-Za-z ]', '')
df.head(3)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andys toys live happily in his ro...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,A family wedding reignites the ancient feud be...


### DTM 변환

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(stop_words='english')
dtm = tvect.fit_transform(df.overview)
dtm.shape

(20000, 47999)

In [16]:
dtm_clean = tvect.fit_transform(df.clean_doc)
dtm_clean.shape

(20000, 54842)

### 영화의 타이틀과 인덱스를 가진 테이블

In [17]:
indices = pd.Series(df.index, index=df.title)
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [18]:
indices['Jumanji']

1

### 코사인 유사도 - 유사 영화 도출

In [19]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(dtm, dtm)
cosine_clean = linear_kernel(dtm_clean, dtm_clean)

In [20]:
cosine_sim.shape, cosine_clean.shape

((20000, 20000), (20000, 20000))

- Overview로 도출

In [22]:
index = indices['The Dark Knight Rises']
index

17362

In [24]:
sim_scores = pd.Series(cosine_sim[index])
sim_scores.head()

0    0.000000
1    0.006117
2    0.000000
3    0.000000
4    0.000000
dtype: float64

In [26]:
sim_scores.sort_values(ascending=False).head(11).tail(10)

12041    0.319791
149      0.313563
1311     0.301648
14858    0.295149
583      0.277144
8966     0.237472
17165    0.208331
18775    0.205745
3042     0.193898
19171    0.187890
dtype: float64

In [27]:
movie_indices = sim_scores.sort_values(ascending=False).head(11).tail(10).index
movie_indices

Int64Index([12041, 149, 1311, 14858, 583, 8966, 17165, 18775, 3042, 19171], dtype='int64')

In [28]:
recommended_movies = df.title.iloc[movie_indices]
recommended_movies

12041                            The Dark Knight
149                               Batman Forever
1311                              Batman Returns
14858                 Batman: Under the Red Hood
583                                       Batman
8966          Batman Beyond: Return of the Joker
17165                           Batman: Year One
18775    Batman: The Dark Knight Returns, Part 1
3042                Batman: Mask of the Phantasm
19171    Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object

In [29]:
def get_recommendation(title, cos_sim):
    index = indices[title]
    sim_scores = pd.Series(cosine_sim[index])
    movie_indices = sim_scores.sort_values(ascending=False).head(11).tail(10).index
    return df.title.iloc[movie_indices]

In [30]:
get_recommendation('Toy Story', cosine_sim)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object

- clean doc으로 부터 가져오기

In [31]:
get_recommendation('Toy Story', cosine_clean)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object