## 콘텐츠 기반 추천 - 영화 추천
- 영화의 줄거리 --> TfidfVectorizer --> 코사인 유사도

In [1]:
import numpy as np
import pandas as pd

#### 1. 데이터 탐색

In [3]:
# Load the movie metadata CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass, so each column's
# dtype is inferred from the whole column instead of chunk by chunk.
df = pd.read_csv('data/movies_metadata.csv', low_memory=False)
# Preview the first two rows.
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
# Keep only the two columns the recommender uses: the title (lookup key)
# and the plot overview (the text that gets vectorised).
df = df.loc[:, ['title', 'overview']]
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [5]:
# (rows, columns) of the frame after keeping only title and overview.
df.shape

(45466, 2)

#### 2. 데이터 전처리

In [6]:
# Check for missing values: number of NaN entries per column.
df.isna().sum()

title         6
overview    954
dtype: int64

In [8]:
# Remove missing values: discard every row in which either the title or
# the overview is NaN.
df = df.dropna(axis=0, how='any')
df.shape

(44506, 2)

In [10]:
# Check for duplicates: distinct-value count per column. A count below the
# number of rows means that column contains repeated values.
tuple(df[column].nunique() for column in ('title', 'overview'))

(41371, 44303)

In [11]:
# Remove duplicates in two passes, keeping the first occurrence each time:
# first rows sharing an overview, then rows sharing a title.
df = (
    df.drop_duplicates(subset=['overview'])
      .drop_duplicates(subset=['title'])
)
df.shape

(41218, 2)

In [12]:
# Tidy the index: renumber the rows 0..n-1 so that row labels equal row
# positions again after the rows removed earlier. Later cells use df.index
# values as positions into the similarity matrix, so the two must agree.
# reset_index(drop=True) discards the old gappy labels directly; the frame
# already has the column order (title, overview), so no set_index/reset_index
# round trip is needed.
df.reset_index(drop=True, inplace=True)
df.tail(3)

Unnamed: 0,title,overview
41215,Century of Birthing,An artist struggles to finish his work while a...
41216,Satan Triumphant,"In a small town live two brothers, one a minis..."
41217,Queerama,50 years after decriminalisation of homosexual...


- 모든 데이터(41,218건)로 N×N 코사인 유사도 행렬을 만들면 메모리 문제가 발생할 소지가 있음
- 20000 건의 데이터로 영화 추천시스템 만들기

In [13]:
# Work with the first 20000 rows only, to keep the pairwise similarity
# matrix built later at a manageable size.
df = df.iloc[:20000]

#### 3. Feature 변환

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser that ignores scikit-learn's built-in English stop words.
tvect = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the overviews and turn each overview into one
# TF-IDF row; the result has one row per movie and one column per term.
overview_tv = tvect.fit_transform(df.overview)
# (number of movies, vocabulary size)
overview_tv.shape

(20000, 48037)

In [15]:
# Lookup table for searching: maps a movie title to its row label in df.
# Titles were de-duplicated earlier, so each title maps to a single label.
indices = pd.Series(df.index, index=df.title)

In [16]:
# Example lookup: row label of 'Jumanji', reused by the cells below as the
# query movie.
index = indices['Jumanji']
index

1

#### 4. 코사인 유사도

In [17]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows. TfidfVectorizer L2-normalises
# each row by default (norm='l2', not overridden above), so the dot product
# equals the cosine similarity.
# NOTE: the result is a dense movies x movies array; for 20000 movies in
# float64 that is roughly 3.2 GB.
cos_sim_overview = linear_kernel(overview_tv, overview_tv)

In [18]:
# Row `index` of the similarity matrix: the query movie's similarity to every
# movie, as a Series whose labels are the row positions.
sim_scores = pd.Series(cos_sim_overview[index])
sim_scores.head()

0    0.016035
1    1.000000
2    0.049245
3    0.000000
4    0.000000
dtype: float64

In [21]:
# The 5 movies most similar to the query movie, best match first.
# The query's own entry is dropped by label rather than by assuming it sorts
# first: another overview can tie with it (e.g. an overview made only of stop
# words yields an all-zero row), and a positional head/tail slice would then
# return the query movie itself and lose a real match.
movie_indices = sim_scores.drop(index).nlargest(5).index
df.title[movie_indices]

6050          Brainscan
8558            Quintet
16402    The Dark Angel
9219          Word Wars
16063            DeVour
Name: title, dtype: object

In [24]:
def get_recommendation(title, cos_sim=None, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Exact movie title; must be a label of the module-level
            ``indices`` Series.
        cos_sim: Square similarity matrix whose row/column positions match
            the row labels of ``df``. When omitted, the module-level
            ``cos_sim_overview`` is used; it is looked up at call time so a
            recomputed matrix is picked up without redefining this function.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        pandas.Series of titles ordered from most to least similar, labelled
        by their row label in ``df``. The query movie is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    if cos_sim is None:
        # Resolve late: a default bound at definition time would go stale if
        # the matrix is rebuilt, while `indices` and `df` are read per call.
        cos_sim = cos_sim_overview

    index = indices[title]
    sim_scores = pd.Series(cos_sim[index])

    # Remove the query movie by label instead of assuming it sorts first;
    # ties (another score of 1.0, or an all-zero row) would otherwise let the
    # query appear in its own recommendations.
    candidates = sim_scores.drop(index)
    movie_indices = candidates.nlargest(top_n).index
    return df.title.loc[movie_indices]

In [25]:
# Example: recommendations for 'Toy Story' using the default similarity matrix.
get_recommendation('Toy Story')

14687               Toy Story 3
2941                Toy Story 2
9972     The 40 Year Old Virgin
1054      Rebel Without a Cause
11003    For Your Consideration
1907                  Condorman
3000            Man on the Moon
483                      Malice
11195              Factory Girl
16372                 Group Sex
Name: title, dtype: object