# word2vec

In [2]:
import datetime
import io
import json

import matplotlib.pylab as plt
import numpy as np

In [10]:
from __future__ import print_function, unicode_literals
from konlpy.utils import pprint
from gensim.models.word2vec import Word2Vec

## 뉴스 기사 로딩

In [5]:
# Load the labelled news articles.
# The file is opened as UTF-8 text explicitly (io.open works on both Python 2
# and 3) so parsing does not depend on the platform's default encoding, and it
# is parsed with the standard-library json module imported in the first cell.
with io.open('news_005930_label.json', encoding='utf-8') as f:
    jsondata_label = json.load(f)

# NOTE: this is an alias, not a copy. Any key added to the items reachable
# through jsondata_token is also visible through jsondata_label.
jsondata_token = jsondata_label

## 형태소 분석

In [18]:
from konlpy.tag import Kkma
# Create the Kkma tagger once at cell level; tokenize() reads this module-level
# `tagger` instead of constructing a new one per call.
tagger = Kkma()

def tokenize(jd):
    """Tag one article and store its tokens on the article dict.

    Runs the module-level ``tagger`` on ``jd['text']`` and keeps element 0 of
    every item it returns, in order. The resulting list is written to
    ``jd['token']``; the dict is modified in place.

    Args:
        jd: a dict-like article with a 'text' entry.

    Returns:
        None. The result is delivered through ``jd['token']``.
    """
    surface_forms = []
    for tagged in tagger.pos(jd['text']):
        # Keep only the first field of each tagged item and drop the rest.
        surface_forms.append(tagged[0])
    jd['token'] = surface_forms

In [28]:
%%time
for i, jd in enumerate(jsondata_token):
    if i % 500 == 0:
        print(datetime.datetime.now(), i)
    tokenize(jd)

2017-01-11 18:09:52.791988 0
2017-01-11 18:11:29.539387 500
2017-01-11 18:12:52.329600 1000
2017-01-11 18:14:24.599239 1500
2017-01-11 18:15:50.733850 2000
2017-01-11 18:17:12.387108 2500
2017-01-11 18:18:30.778236 3000
2017-01-11 18:19:52.284475 3500
2017-01-11 18:21:11.033730 4000
2017-01-11 18:22:39.566415 4500
2017-01-11 18:24:08.937463 5000
2017-01-11 18:25:38.274130 5500
2017-01-11 18:27:03.724278 6000
2017-01-11 18:28:31.626850 6500
2017-01-11 18:29:56.099859 7000
2017-01-11 18:31:24.165973 7500
CPU times: user 22min, sys: 690 ms, total: 22min
Wall time: 22min 2s


In [29]:
# Build the training corpus: one token list per article, in article order.
sentences = []
for jd in jsondata_token:
    sentences.append(jd['token'])

In [30]:
# Sanity check: show the first 10 tokens of the first article.
pprint(sentences[0][:10])

[삼성전자,
 가,
 중국,
 시장,
 을,
 정조준,
 하,
 어,
 개발,
 하]


## word2vec 모형

In [31]:
%%time
# Train a Word2Vec model on the tokenized articles. Only the corpus is passed,
# so every hyperparameter is left at the library default.
# NOTE(review): no seed or worker count is set here; confirm whether training
# is reproducible across runs before relying on exact similarity scores.
model = Word2Vec(sentences)

CPU times: user 16 s, sys: 30 ms, total: 16.1 s
Wall time: 16.1 s


In [32]:
# Persist the trained model to "model.w2v" in the current working directory.
model.save("model.w2v")

## 단어 유사도

In [52]:
# Reload the model from "model.w2v"; this rebinds the name `model`.
model = Word2Vec.load("model.w2v")

In [53]:
# Look up the learned vector for the word '삼성' (Samsung).
# NOTE(review): newer gensim releases expose word vectors through `model.wv`;
# confirm which form the installed version expects.
model['삼성']

array([-0.88298994,  1.26178777, -0.77802747, -1.09302449, -1.26133657,
        2.69204617,  3.04820013,  1.06273222,  1.08390737,  1.36907279,
       -0.0880619 , -0.99113822,  0.34204349,  0.00336172,  3.20068002,
        2.50551748, -0.91180468, -0.64260548, -3.21668768,  0.02881739,
        2.33211613,  2.30430388,  0.60995513,  0.60308403,  0.53285176,
        0.11653692, -1.34046912,  0.10900211, -2.74044943,  0.80897999,
       -1.35799336, -0.15734237,  0.38982439,  1.02167475, -0.36940753,
        2.04397106,  1.59048545, -1.44739306,  1.12625074,  0.28484374,
        2.02303839,  1.24119914,  0.96651441,  0.84442705, -1.19634926,
       -1.79522455, -0.14226824, -0.18730004, -0.74381512, -3.14393783,
       -0.4018904 ,  0.48001659,  2.98944759, -0.13373449,  0.01049897,
       -2.64756203, -2.37049699, -0.3202585 , -1.69861674, -0.26690614,
       -0.38496479,  0.15361649, -2.13534689,  1.71759832, -2.98605919,
       -1.46513045,  1.5292021 , -0.01296279,  0.22405368, -0.91

In [54]:
# Words most similar to '삼성' (Samsung), printed as (word, score) pairs.
pprint(model.most_similar('삼성'))

[(한화, 0.44487911462783813),
 (신세계, 0.4391079246997833),
 (유니온, 0.4338747560977936),
 (루프, 0.428579181432724),
 (삼성전자, 0.42772141098976135),
 (CJ, 0.42073482275009155),
 (씨티, 0.4170035719871521),
 (빅딜, 0.41692766547203064),
 (삼성생명, 0.40312451124191284),
 (제일, 0.40202853083610535)]


In [55]:
# Words most similar to '주가' (stock price), printed as (word, score) pairs.
pprint(model.most_similar('주가'))

[(랠리, 0.5720997452735901),
 (외국인, 0.5668069124221802),
 (강세, 0.5595622062683105),
 (160, 0.5488746762275696),
 (상승세, 0.5474750995635986),
 (연일, 0.5425583124160767),
 (가르, 0.5420811176300049),
 (최고가, 0.5251951217651367),
 (상승, 0.5069561004638672),
 (증시, 0.5032307505607605)]


In [56]:
# Words most similar to '수익' (profit), printed as (word, score) pairs.
pprint(model.most_similar('수익'))

[(흐름, 0.6726746559143066),
 (수급, 0.6484875679016113),
 (수익성, 0.6400747895240784),
 (변동성, 0.6346718072891235),
 (원가, 0.6283073425292969),
 (수익률, 0.6223256587982178),
 (요인, 0.6194822192192078),
 (체질, 0.6164451837539673),
 (재무, 0.6116939187049866),
 (외형, 0.6017174124717712)]


In [57]:
# Analogy-style query: '상승' (rise) and '급등' (surge) are the positive terms,
# '하락' (fall) is the negative term; topn=1 keeps only the single best match.
pprint(model.most_similar(positive=['상승', '급등'], negative=['하락'], topn=1))

[(급락, 0.8104647397994995)]


In [58]:
# Analogy-style query: '수익' (profit) and '상승' (rise) are the positive terms,
# '손실' (loss) is the negative term; topn=1 keeps only the single best match.
pprint(model.most_similar(positive=['수익', '상승'], negative=['손실'], topn=1))

[(하락, 0.673424243927002)]
