## Word2Vec 알고리즘


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim 

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# 경로의 경우 각자의 환경에 맞게 설정해주면 됩니다. 
path = 'C:/Users/User/Desktop/추천시스템 입문하기/05. 추천시스템 실습하기/input/movies/'

In [4]:
movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [5]:
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [6]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [7]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
meta = meta.rename(columns={'id':'movieId'})
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')

In [9]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [10]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [11]:
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

Word2vec 적용

In [12]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [13]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)

In [14]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

[('Snow Cake', 0.8647751212120056),
 ('Sunrise: A Song of Two Humans', 0.7735385894775391),
 ('Face/Off', 0.764630913734436),
 ('Harry Potter and the Prisoner of Azkaban', 0.7302199602127075),
 ('Licence to Kill', 0.7103623747825623),
 ('The Godfather', 0.7101566791534424),
 ('薔薇の葬列', 0.7095184326171875),
 ('Domicile Conjugal', 0.7022347450256348),
 ('Rumor Has It...', 0.6998509168624878),
 ('Forrest Gump', 0.6897859573364258)]

## Doc2Vec 적용

![](https://drive.google.com/uc?export=view&id=1g2ausKfoaAT0jMwSatRUG3fiGWfDuysV
)

In [15]:
from gensim.models import doc2vec

In [16]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [55]:
import re
word_tokens = word_tokenize(words)
content_text = re.sub('[^a-z0-9]+', ' ', str(word_tokens))
content_text = content_text.strip()

In [70]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [71]:
meta['pre_overview'] = overview

In [83]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [84]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [85]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [86]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 302.60893034935


In [87]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('It Stains the Sands Red', 0.7155202627182007),
 ('Spark: A Space Tail', 0.7068586945533752),
 ('Letzte Worte', 0.7039177417755127),
 ('El vendedor de humo', 0.67961585521698),
 ('Skazka o Poteryannom Vremeni', 0.6796030402183533),
 ('エクスマキナ', 0.6743360161781311),
 ('Milk Money', 0.6733628511428833),
 ('La moutarde me monte au nez', 0.6681728363037109),
 ('Children in the Surf at Coney Island', 0.662685751914978),
 ('Kader', 0.6612146496772766),
 ("Independents' Day", 0.6611530780792236),
 ('Meet Me in Venice', 0.6607648134231567),
 ('My Friends Need Killing', 0.6578397154808044),
 ('Live Forever as You Are Now with Alan Resnick', 0.6542983055114746),
 ('Burning Sands', 0.6528704166412354),
 ('Особенности национальной политики', 0.650513768196106),
 ('The Aristocats', 0.6483055949211121),
 ('Begegnung mit Fritz Lang', 0.6453357934951782),
 ('8 Pervykh Svidaniy', 0.644992470741272),
 ('Der Sandmann', 0.6428802013397217)]

In [88]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('Never Let Me Go', 0.7440825700759888),
 ('Cold Weather', 0.7169094085693359),
 ('Who Is Harry Kellerman and Why Is He Saying Those Terrible Things About Me?',
  0.7128375768661499),
 ('Dillinger è morto', 0.7108343839645386),
 ('The Great Ecstasy of Robert Carmichael', 0.6964154839515686),
 ('Emmas Glück', 0.6804828643798828),
 ('밤과 낮', 0.6738446950912476),
 ('No Strings Attached', 0.6700600385665894),
 ('The Bachelor Party', 0.6670905351638794),
 ('Mirrors 2', 0.6669432520866394),
 ("Nora Roberts' Carolina Moon", 0.6662992835044861),
 ('$ Dollars', 0.6662683486938477),
 ('Tomorrow, When the War Began', 0.6573445200920105),
 ('Fantasma', 0.6558449864387512),
 ('Amer', 0.650338888168335),
 ('Der Räuber', 0.6501940488815308),
 ('The Prizefighter and the Lady', 0.6501848697662354),
 ('我知女人心', 0.649250864982605),
 ('Run of the Arrow', 0.6488679647445679),
 ('Handsome Harry', 0.6480263471603394)]