# 데이터 불러오기 및 전처리

### 크롤링한 TMDB 영화 데이터 불러오기

In [None]:
import pandas as pd

In [None]:
# Load the crawled TMDB movie records from the JSON dump into a DataFrame.
movie_df = pd.read_json('tmdb_crawling.json')

In [None]:
# Preview the first two rows.
movie_df.head(2)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew,success,status_code,status_message
0,0.0,/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",204.942,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Toy Story,0.0,8.0,14977.0,"[{'id': 779, 'name': 'martial arts'}, {'id': 9...","[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 7, 'known...",,,
1,0.0,/jVeKTyFBkm6CHm2bZMJ0KXRCyzp.jpg,"{'id': 495527, 'name': 'Jumanji Collection', '...",65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.sonypictures.com/movies/jumanji/,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,20.481,/6aGn2X51bahFoOI8wE1h2VGTgcH.jpg,"[{'id': 559, 'logo_path': '/jqWioYeGSyTLuHth01...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Roll the dice and unleash the excitement!,Jumanji,0.0,7.2,8732.0,"[{'id': 7035, 'name': 'giant insect'}, {'id': ...","[{'adult': False, 'gender': 2, 'id': 2157, 'kn...","[{'adult': False, 'gender': 2, 'id': 511, 'kno...",,,


### 전처리

- 리스트, 딕셔너리 등으로 저장되어 있던 데이터를 필요한 형태로 전처리

In [None]:
def _field_extractor(key, limit=None, job=None):
    """Build a per-cell function that pulls ``key`` out of a list of dicts.

    The returned function yields ``[]`` for any cell that is not a list.
    When ``job`` is given, only entries whose ``'job'`` equals it are kept;
    when ``limit`` is given, only the first ``limit`` values are returned.
    """
    def extract(cell):
        if not isinstance(cell, list):
            return []
        values = [entry[key] for entry in cell if job is None or entry['job'] == job]
        return values[:limit]

    return extract


# Drop the three status columns, then discard every row that is left with
# no values at all.
movie_df = movie_df.drop(columns=['success', 'status_code', 'status_message']).dropna(how='all')

# Flatten the nested list-of-dict columns into plain lists of strings.
movie_df['genres'] = movie_df['genres'].apply(_field_extractor('name'))
movie_df['keywords'] = movie_df['keywords'].apply(_field_extractor('name'))
movie_df['cast_5'] = movie_df['cast'].apply(_field_extractor('name', limit=5))
movie_df['character_5'] = movie_df['cast'].apply(_field_extractor('character', limit=5))
movie_df['director'] = movie_df['crew'].apply(_field_extractor('name', job='Director'))
movie_df['production_company'] = movie_df['production_companies'].apply(_field_extractor('name'))

# Cast the id column to int.
movie_df['id'] = movie_df['id'].astype(int)

# Word2Vec 학습 준비

### 필요 라이브러리 import

In [None]:
import multiprocessing
import os
import re
import warnings
from ast import literal_eval

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize,word_tokenize

warnings.filterwarnings("ignore")

### 학습 데이터 전처리

영화 간의 유사도를 구하기 위해 키워드, 장르, 제목, 감독, 배우, 캐릭터, 제작사를 하나의 문장(corpus)으로 합치기

In [None]:
features = ['keywords', 'genres', 'title', 'director', 'cast_5', 'character_5', 'production_company']


def _squash(text):
    """Remove the spaces inside ``text`` so it reads as a single token."""
    return text.replace(' ', '')


def _join_words(words):
    """Join ``words`` with single spaces, leaving each entry untouched."""
    return " ".join(words)


def _join_squashed(names):
    """Join the string entries of ``names`` with spaces, squashing each one."""
    return " ".join([_squash(name) for name in names if isinstance(name, str)])


# Rendering rule per feature column. Columns not listed here (people,
# characters, companies) fall back to _join_squashed so that a multi-word
# name stays one token.
_renderers = {
    'title': lambda title: _squash(title).lower(),
    'keywords': _join_words,
    'genres': _join_words,
}

# Start from an empty string and append each rendered feature, preceded by a
# single space, in the order given by `features`.
movie_df['corpus'] = ''
for feat in features:
    render = _renderers.get(feat, _join_squashed)
    movie_df['corpus'] += " " + movie_df[feat].apply(render)

In [None]:
# Inspect the assembled corpus string of the row labelled 0.
movie_df['corpus'][0]

' martial arts jealousy friendship bullying elementary school friends rivalry rescue mission buddy walkie talkie boy next door new toy neighborhood toy comes to life resourcefulness Animation Adventure Family Comedy toystory JohnLasseter TomHanks TimAllen DonRickles JimVarney WallaceShawn Woody(voice) BuzzLightyear(voice) Mr.PotatoHead(voice) SlinkyDog(voice) Rex(voice) Pixar'

corpus 전처리

In [None]:
# Lowercase every corpus string, then replace each character that is not an
# ASCII letter (digits, punctuation, symbols) with a space.
movie_df['corpus'] = movie_df['corpus'].apply(
    lambda text: re.sub("[^a-zA-Z]", " ", text.lower())
)

tokenize

In [None]:
def tokenize(data):
    """Lowercase every string in ``data`` and split it on single spaces.

    The split is on the literal ``' '`` character, so consecutive spaces
    produce empty-string tokens.

    Returns a list holding one token list per input string.
    """
    return [str.lower(text).split(' ') for text in data]

# Tokenize the cleaned corpus column: one token list per movie.
tokenized_data = tokenize(movie_df["corpus"])

불용어 제거

In [None]:
# Stop-word removal

# Download the NLTK stop-word list
nltk.download('stopwords')

def remove_stop_words(data, stoplist=None):
    """Drop stop words from every token list and strip '.' from the rest.

    Args:
        data: Iterable of token lists (one list of strings per document).
        stoplist: Optional collection of words to drop. When omitted, the
            English stop-word list from NLTK is used, which requires the
            ``stopwords`` corpus to have been downloaded.

    Returns:
        A new list of token lists. A token is kept when it is not in
        ``stoplist``; every '.' is then removed from each kept token, so a
        kept token may end up as an empty string.
    """
    if stoplist is None:
        # Build the set once so each membership test is O(1).
        stoplist = set(stopwords.words('english'))
    return [
        [word.replace('.', '') for word in tokens if word not in stoplist]
        for tokens in data
    ]

def remove_empty(data):
    """Return a copy of ``data`` in which every empty-string token is removed.

    Each inner token list is rebuilt; a list that only held empty strings
    becomes an empty list and stays in the result.
    """
    return [[token for token in tokens if token != ''] for tokens in data]

# Drop stop words first, then any empty-string tokens.
tokenized_data = remove_stop_words(tokenized_data)
tokenized_data = remove_empty(tokenized_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Word2Vec 학습

In [None]:
def train_model(data):
    """Train a Word2Vec model on ``data`` and return it.

    Settings passed to ``Word2Vec``:
      * ``window=10``  - window size: how many surrounding words are looked at.
      * ``min_count=1`` - frequency cut-off; with 1, no word is discarded.
      * ``sg=1``       - use skip-gram rather than CBOW.
      * ``workers``    - one worker per CPU reported by
        ``multiprocessing.cpu_count()``.
    """
    return Word2Vec(
        data,
        window=10,
        min_count=1,
        sg=1,
        workers=multiprocessing.cpu_count(),
    )

# Train on the cleaned token lists.
w2v_model = train_model(tokenized_data)

# The recommendation cells further down refer to the trained model as
# `model`, a name that is otherwise only bound by the commented-out
# "load model" cell. Bind it here so the notebook runs top to bottom without
# a save/load round trip.
model = w2v_model


모델 저장

In [None]:
# w2v_model.save('word2vec_tmdb.model')

모델 불러오기

In [None]:
# from gensim.models import Word2Vec
# model = Word2Vec.load('word2vec_tmdb.model')

# 추천

각 영화에 해당하는 말뭉치(corpus) 간의 n_similarity 계산

In [None]:
def recommend1(movie_id, model, topn=10):
    """Rank the movies in ``movie_df`` by similarity to one query movie.

    Args:
        movie_id: Index label of the query movie in ``movie_df`` (it is
            looked up with ``.loc``); this is not the ``id`` column.
        model: A trained gensim ``Word2Vec`` model, or its ``KeyedVectors``.
        topn: Number of rows to return.

    Returns:
        A DataFrame with the display columns plus a ``score`` column, sorted
        by ``score`` in descending order and cut to the first ``topn`` rows.
        The query movie is scored too, so it is normally the first row.

    Requires ``movie_df`` to already carry a ``tokenized_data`` column.
    """
    feats = ['title', 'keywords', 'genres', 'director', 'cast_5', 'character_5',
             'poster_path', 'popularity', 'tokenized_data']

    # gensim 4 exposes n_similarity only on the KeyedVectors (`model.wv`),
    # not on the Word2Vec object. Accept either kind of object.
    vectors = getattr(model, 'wv', model)

    similar_df = movie_df[feats].copy()
    query_tokens = movie_df.loc[movie_id, 'tokenized_data']
    similar_df['score'] = similar_df['tokenized_data'].apply(
        lambda tokens: vectors.n_similarity(tokens, query_tokens)
    )

    # Most similar first, truncated to the requested number of rows.
    return similar_df.sort_values(by='score', ascending=False)[:topn]

In [None]:
# Ten movies most similar to the movie with index label 0.
recommend1(0, model)

Unnamed: 0,title,keywords,genres,director,cast_5,character_5,poster_path,popularity,tokenized_data,score
0,Toy Story,"[martial arts, jealousy, friendship, bullying,...","[Animation, Adventure, Family, Comedy]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[Woody (voice), Buzz Lightyear (voice), Mr. Po...",/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,204.942,"[martial, arts, jealousy, friendship, bullying...",1.0
3003,Toy Story 2,"[museum, prosecution, identity crisis, airplan...","[Animation, Comedy, Family]",[John Lasseter],"[Tom Hanks, Tim Allen, Joan Cusack, Kelsey Gra...","[Woody (voice), Buzz Lightyear (voice), Jessie...",/xVhEI1WCgNCCa5I86AqiwuZoog3.jpg,178.279,"[museum, prosecution, identity, crisis, airpla...",0.993318
4761,"Monsters, Inc.","[monster, cheating, kidnapping, infant, villai...","[Animation, Comedy, Family]",[Pete Docter],"[John Goodman, Billy Crystal, Mary Gibbs, Stev...","[James 'Sulley' Sullivan (voice), Mike Wazowsk...",/sgheSKxZkttIe8ONsf2sWXPgip3.jpg,215.682,"[monster, cheating, kidnapping, infant, villai...",0.992313
21195,Mr. Peabody & Sherman,"[parent child relationship, egypt, intelligenc...","[Animation, Adventure, Family]",[Rob Minkoff],"[Ty Burrell, Max Charles, Ariel Winter, Alliso...","[Mr. Peabody (voice), Sherman (voice), Penny P...",/c6kZC5pvwNIRSxiLL2JFGGc46He.jpg,46.865,"[parent, child, relationship, egypt, intellige...",0.989791
37082,Zootopia,"[allegory, lion, hippopotamus, fox, elephant, ...","[Animation, Adventure, Family, Comedy]","[Byron Howard, Rich Moore]","[Jason Bateman, Ginnifer Goodwin, Idris Elba, ...","[Nick Wilde (voice), Judy Hopps (voice), Chief...",/hlK0e0wAQ3VLuJcsfIYPvb4JVud.jpg,110.009,"[allegory, lion, hippopotamus, fox, elephant, ...",0.989486
12725,Madagascar: Escape 2 Africa,"[africa, jealousy, dance, hunger, lion, zoo, h...","[Family, Adventure, Animation, Comedy]","[Eric Darnell, Tom McGrath]","[Ben Stiller, Chris Rock, David Schwimmer, Jad...","[Alex (voice), Marty / Additional Zebras (voic...",/agRbLOHgN46TQO4YdKR462iR7To.jpg,79.366,"[africa, jealousy, dance, hunger, lion, zoo, h...",0.989058
29749,Inside Out,"[san francisco, california, minnesota, dream, ...","[Animation, Family, Adventure, Drama, Comedy]",[Pete Docter],"[Amy Poehler, Phyllis Smith, Bill Hader, Kaitl...","[Joy (voice), Sadness (voice), Fear (voice), R...",/2H1TmgdfNtsKlU9jKdeNyYL5y8T.jpg,115.19,"[san, francisco, california, minnesota, dream,...",0.988289
11219,Charlotte's Web,"[hero, barn, spider, pig, friendship, spring, ...","[Comedy, Family, Fantasy]",[Gary Winick],"[Dakota Fanning, Julia Roberts, Steve Buscemi,...","[Fern Arable, Charlotte the Spider (voice), Te...",/gqg3ruuEDQ6XmXjFGrTDrk58xJg.jpg,18.907,"[hero, barn, spider, pig, friendship, spring, ...",0.988009
18684,Wreck-It Ralph,"[support group, product placement, bullying, j...","[Family, Animation, Comedy, Adventure]",[Rich Moore],"[John C. Reilly, Sarah Silverman, Jack McBraye...","[Wreck-It Ralph (voice), Vanellope von Schweet...",/tlboPAzzBu04D89hJ57CZXmF1fx.jpg,136.867,"[support, group, product, placement, bullying,...",0.987947
39405,The Angry Birds Movie,"[island, pig, rivalry, anthropomorphism, based...","[Animation, Adventure, Comedy]","[Fergal Reilly, Clay Kaytis]","[Jason Sudeikis, Josh Gad, Danny McBride, Maya...","[Red (voice), Chuck (voice), Bomb (voice), Mat...",/nsaaZryqabtrdKwXcNud2Bm39mu.jpg,47.088,"[island, pig, rivalry, anthropomorphism, based...",0.987153


## Similarity Matrix 만들기

- 영화 추천시마다 n_similarity 계산하면 시간이 너무 오래 걸림
- Similarity Matrix를 미리 계산 후 저장해놓고 추천시 사용

### 데이터 전처리

>모든 영화 데이터(62316개) 간의 유사도를 모두 계산하기에는 메모리가 부족함  
 -> popularity가 20을 초과하는 영화만 골라서 계산

In [None]:
# Attach the tokenized and cleaned word lists to the DataFrame.
movie_df['tokenized_data'] = tokenized_data

# Discard the rows whose token list is empty.
movie_df = movie_df.drop(
    index=movie_df.index[(movie_df['tokenized_data'].apply(len) == 0).values]
)

# Keep only movies whose popularity is strictly greater than 20.
movie_df = movie_df.loc[movie_df['popularity'] > 20]

> 각 문장을 벡터로 변환  
`gensim`의 `n_similarity` 계산 방식: 각 문장 내 단어 벡터들의 평균으로 계산

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

def text2vec(word_list, model=None):
    """Average the word vectors of ``word_list`` into one document vector.

    Args:
        word_list: Iterable of tokens.
        model: A trained gensim ``Word2Vec`` model or its ``KeyedVectors``.
            When omitted, the module-level ``model`` is used, falling back
            to ``w2v_model``. The lookup happens at call time, so defining
            this function no longer fails when ``model`` is not bound yet.

    Returns:
        A 1-D NumPy array of length ``vector_size``: the mean of the vectors
        of the words the model knows. Unknown words are skipped. If no word
        is known, the zero vector is returned instead of dividing by zero.

    Raises:
        NameError: If ``model`` is omitted and neither global name exists.
    """
    if model is None:
        scope = globals()
        model = scope.get('model', scope.get('w2v_model'))
        if model is None:
            raise NameError(
                "text2vec needs a model: pass one explicitly or define "
                "`model` / `w2v_model` first"
            )

    # Word2Vec keeps its vectors under `.wv`; KeyedVectors are used directly.
    vectors = getattr(model, 'wv', model)

    # Size the accumulator from the model instead of assuming 100 dimensions.
    total_vec = np.zeros(vectors.vector_size)
    cnt = 0
    for word in word_list:
        try:
            total_vec += vectors[word]
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average.
            continue
        cnt += 1

    if cnt:
        total_vec /= cnt
    return total_vec

# Turn each movie's token list into one averaged vector, with a progress bar.
movie_df['word_vec'] = movie_df['tokenized_data'].progress_apply(text2vec)

  0%|          | 0/22824 [00:00<?, ?it/s]

> 문장 벡터들간의 코사인 유사도 계산

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-movie vectors into one 2-D array (one row per movie) and
# compute the cosine similarity between every pair of rows.
similarity_matrix = cosine_similarity(
    np.array(movie_df['word_vec'].tolist())
)

> 저장

In [None]:
import pickle
# Persist the similarity matrix. The context manager flushes and closes the
# file even if pickling fails part-way.
with open('similarity_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(similarity_matrix, matrix_file)

# Save only the columns that are needed when recommending movies.
features = [
    'id',
    'imdb_id',
    'title',
    'overview',
    'genres',
    'keywords',
    'poster_path',
    'popularity',
    'production_company',
    'director',
    'character_5',
    'cast_5',
    'vote_count',
    'vote_average',
    'runtime',
    'release_date',
    'production_companies',
    'original_language',
]

movie_df[features].to_json(path_or_buf='tmdb_popular.json')

### Similarity matrix 기반으로 영화 추천

In [None]:
def recommend_mat(movie_id, topn=10):
    """Print the ``topn`` movies most similar to one movie, best match first.

    Args:
        movie_id: Row position of the query movie. It indexes
            ``similarity_matrix`` directly and results are resolved with
            ``movie_df.iloc``, so it is a position in the filtered
            ``movie_df``, not an index label or the ``id`` column.
        topn: How many movies to print.

    Prints one line per movie: its title followed by its row position. The
    query movie is ranked as well.
    """
    scores = similarity_matrix[movie_id]
    # sorted() is stable, so equally scored movies keep their row order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    for position, _score in ranked[:topn]:
        print(movie_df.iloc[position].title, position)

In [None]:
# Print the 10 nearest neighbours of the movie at row position 12320.
recommend_mat(12320)

Captain America: The Winter Soldier 12320
Iron Man 2 9717
Avengers: Age of Ultron 13533
Avengers: Infinity War 13541
Captain America: Civil War 13545
Iron Man 3 11675
The Avengers 10610
Iron Man 8518
Man of Steel 11734
Thor: Ragnarok 13543
