In [72]:
import pandas as pd
from matplotlib import pyplot as plt

from gensim.models.word2vec import Word2Vec
from os import path, mkdir
from multiprocessing import cpu_count

# from gensim.models.doc2vec import Doc2Vec

import re
import pickle
import time
from math import log10

In [None]:
with open('artTokens.dat','rb') as file: # 미리 doc별로 저장한 tokens의 리스트를 load한다
    tokens = pickle.load(file)

In [4]:
#하이퍼파라미터 세팅
num_features = 100
min_word_count = 10
num_workers = cpu_count()
context_size = 10
downsampling = 1e-3
seed=180816

In [5]:
#모델 생성
art2vec =Word2Vec( 
    sg=1, #skip-gram
    workers=cpu_count(),
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample = downsampling,
)

In [44]:
art2vec.build_vocab(tokens)

In [45]:
art2vec.corpus_count

20514

In [46]:
art2vec.epochs

5

In [47]:
art2vec.train(tokens,total_examples=art2vec.corpus_count,epochs=art2vec.epochs) #학습 with tokens

(54894850, 74022550)

In [51]:
##모델을 저장하고 불러와서 다시 training시킬 수 있다
art2vec.save(r'''art2vec.w2v''')

In [8]:
##전에 학습한 결과 불러오기
art2vec = Word2Vec.load(r'''art2vec.w2v''')

In [9]:
art2vec.wv.most_similar('광주')

[('조진호', 0.7715893983840942),
 ('대구', 0.7689499855041504),
 ('부산', 0.7660964727401733),
 ('시립', 0.7494326829910278),
 ('전남', 0.7457484006881714),
 ('광역시', 0.7450919151306152),
 ('센텀시티', 0.7318888306617737),
 ('개점', 0.7264745235443115),
 ('대전', 0.7231205701828003),
 ('환기재단', 0.7210865616798401)]

In [10]:
art2vec.wv.most_similar('희망')

[('희망이', 0.8070540428161621),
 ('소망', 0.7373624444007874),
 ('행복', 0.7227489352226257),
 ('긍정', 0.7062973380088806),
 ('꿈', 0.7053670883178711),
 ('앞날', 0.7005017995834351),
 ('절망', 0.6873255968093872),
 ('염원', 0.6766011714935303),
 ('열망', 0.6744593381881714),
 ('기쁨', 0.6718753576278687)]

In [64]:
art2vec.wv.most_similar('가족','반려동물')

[('기억해야', 0.21745026111602783),
 ('콜렉터', 0.21343661844730377),
 ('생각합', 0.20056943595409393),
 ('블루칩', 0.1907409131526947),
 ('알려지', 0.19049102067947388),
 ('삼류', 0.18937496840953827),
 ('가요', 0.1706419587135315),
 ('박찬호', 0.17036212980747223),
 ('팝아티스트', 0.16764971613883972),
 ('연예인', 0.16761338710784912)]

In [56]:
art2vec.wv.most_similar('재미')

[('즐거움', 0.7283370494842529),
 ('흥미', 0.7058372497558594),
 ('묘미', 0.6446568965911865),
 ('유쾌', 0.6441777348518372),
 ('재미있는', 0.6389391422271729),
 ('즐거운', 0.6228681206703186),
 ('톡톡', 0.6186307668685913),
 ('찾아보는', 0.6171889305114746),
 ('엉뚱함', 0.6071854829788208),
 ('유희', 0.6063868999481201)]

In [63]:
art2vec.wv.most_similar(['최순민','재미'],negative='김섭')

[('흥미', 0.4938831031322479),
 ('즐거움', 0.4859802722930908),
 ('막대사탕', 0.4640687108039856),
 ('디저트', 0.457018107175827),
 ('재밌는', 0.45514976978302),
 ('재미있는', 0.45226866006851196),
 ('힌트', 0.44326353073120117),
 ('아기자기', 0.43784695863723755),
 ('재미있고', 0.4362080693244934),
 ('예쁜', 0.4261799454689026)]

In [60]:
art2vec.wv.most_similar('김혜연')

[('임만혁', 0.7648429870605469),
 ('박형진', 0.7506508827209473),
 ('김덕기', 0.7272018194198608),
 ('이상선', 0.725709080696106),
 ('김태우', 0.7161332964897156),
 ('서은', 0.707787036895752),
 ('박지혜', 0.7039687633514404),
 ('최순민', 0.7037951946258545),
 ('손민광', 0.7032536864280701),
 ('김은기', 0.7019984722137451)]

In [65]:
art2vec.wv.most_similar('임만혁','김혜연')

[('정작', 0.39441823959350586),
 ('어슷비슷', 0.3715914785861969),
 ('만이', 0.36983853578567505),
 ('명백', 0.35792189836502075),
 ('창조할', 0.3490333557128906),
 ('무기', 0.34765177965164185),
 ('생산하지', 0.3454466462135315),
 ('무너짐', 0.34017521142959595),
 ('음에도', 0.33978211879730225),
 ('으론', 0.3387604355812073)]