## [Word2vec](https://www.kaggle.com/code/chocozzz/00-word2vec-1)

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim 

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

In [7]:
path = "./data/movies/"

movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [5]:
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [6]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [7]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
meta = meta.rename(columns={'id':'movieId'})
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')

In [None]:
# UserId 별 MovidID 구매 목록을 생성 
# item_agg = movie.groupby(['title'])['userId'].agg({'nunique'}).reset_index() 
# item_agg = item_agg[item_agg['nunique'] >= 5]['title'].values 
# user_agg = movie.groupby(['userId'])['title'].agg({'nunique'}).reset_index() 
# user_agg = user_agg[user_agg['nunique'] >= 5]['userId'].values

In [None]:
# 전처리 수행 
# 성능향상을 위해 5회 미만 구매된 상품, 5회 미만 구매한 고객은 제외하고 분석진행 
# movie['check1'] = 0 
# movie.loc[movie['title'].isin(item_agg), 'check1'] = 1 
# movie['check2'] = 0 
# movie.loc[movie['userId'].isin(user_agg), 'check2'] = 1 
# movie = movie.loc[(movie['check1'] == 1) & (movie['check2'] == 1)].reset_index(drop=True)

In [9]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [10]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [11]:
movie['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

## Word2vec 적용

In [12]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [15]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, vector_size=20, window = 5, 
                          min_count=1, workers=4, epochs=200, sg=1)

In [16]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

[('Snow Cake', 0.855394184589386),
 ("L'Aile ou la Cuisse", 0.8079094886779785),
 ('Domicile Conjugal', 0.7673839926719666),
 ('Conquest of the Planet of the Apes', 0.7452538013458252),
 ('Star Trek: Nemesis', 0.7430431842803955),
 ('Heavenly Creatures', 0.7352131605148315),
 ('Forrest Gump', 0.7338284850120544),
 ('Face/Off', 0.7298555970191956),
 ('Rumor Has It...', 0.7277589440345764),
 ('Mr. Brooks', 0.7254560589790344)]

## Doc2Vec 적용

In [5]:
from gensim.models import doc2vec

In [8]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [11]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

  0%|          | 0/44512 [00:00<?, ?it/s]

In [12]:
meta['pre_overview'] = overview

In [14]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    vector_size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [15]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [16]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec<dbow+w,d100,n10,hs,w10,mc5,s0.001,t4>


In [18]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.epochs)
end = time()
print("During Time: {}".format(end-start))

  0%|          | 0/5 [00:00<?, ?it/s]

During Time: 502.68213391304016


In [19]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('Letzte Worte', 0.7219405174255371),
 ('It Stains the Sands Red', 0.7019785046577454),
 ('El vendedor de humo', 0.6761186122894287),
 ('Особенности национальной политики', 0.6540822386741638),
 ('Children in the Surf at Coney Island', 0.6527005434036255),
 ('Der Sandmann', 0.6481742858886719),
 ('The Aristocats', 0.6475422382354736),
 ('La moutarde me monte au nez', 0.6467899084091187),
 ('По следам бременских музыкантов', 0.6384307146072388),
 ("Independents' Day", 0.6233192682266235),
 ('Kader', 0.6233088970184326),
 ('Meet Me in Venice', 0.6139765977859497),
 ('Begegnung mit Fritz Lang', 0.61335688829422),
 ('Live Forever as You Are Now with Alan Resnick', 0.6125729084014893),
 ('Killer: A Journal of Murder', 0.6118432879447937),
 ('The Unbearable Lightness of Being', 0.6092782616615295),
 ('Skazka o Poteryannom Vremeni', 0.6004007458686829),
 ('Samoubiytsy', 0.5998007655143738),
 ('活着', 0.5949916839599609),
 ('Thunder Road', 0.5943598747253418)]

In [20]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('The Great Ecstasy of Robert Carmichael', 0.669013261795044),
 ('Just Like Us', 0.634733259677887),
 ('Der Räuber', 0.6138728857040405),
 ('我知女人心', 0.6040328145027161),
 ('Se sei vivo spara', 0.6016620397567749),
 ('Tomorrow, When the War Began', 0.5776289701461792),
 ('Blood River', 0.5749691128730774),
 ('Cold Weather', 0.5611975789070129),
 ('Classe tous risques', 0.5587496161460876),
 ('Above and Beyond', 0.5571764707565308),
 ('Fear and Desire', 0.554139256477356),
 ('Emmas Glück', 0.5541350841522217),
 ('The Black Rose', 0.5459890961647034),
 ('Kasaba', 0.5439414381980896),
 ('Torch Song', 0.5344892740249634),
 ('Demonic Toys', 0.5337750911712646),
 ('A londoni férfi', 0.529091477394104),
 ('Zamilované Maso', 0.5248921513557434),
 ('Tribute', 0.5182867050170898),
 ('Ménilmontant', 0.5175023078918457)]