# 사전 훈련된 Word2Vec 임베딩
### 1. 영어
- Google News 데이터로 사전 훈련된 300만 개 단어의 300차원 벡터
- 모델 원본 다운로드 경로: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit (아래 셀에서는 같은 이름의 파일을 S3 주소에서 내려받는다)

In [1]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-07-07 00:23:21--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.73.206
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.73.206|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-07-07 00:23:59 (41.5 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [2]:
!ls -l

total 1608452
-rw-r--r-- 1 root root 1647046227 Mar  5  2015 GoogleNews-vectors-negative300.bin.gz
drwxr-xr-x 1 root root       4096 Jun 15 13:37 sample_data


In [3]:
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [4]:
!ls -l

total 3558856
-rw-r--r-- 1 root root 3644258522 Mar  5  2015 GoogleNews-vectors-negative300.bin
drwxr-xr-x 1 root root       4096 Jun 15 13:37 sample_data


In [5]:
import gensim

In [6]:
# Load Google's pre-trained Word2Vec vectors from the binary word2vec file
# that was unpacked into the working directory.
GOOGLE_NEWS_VECTORS = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(GOOGLE_NEWS_VECTORS, binary=True)

In [7]:
# Size of the model: shape of the loaded vector matrix.
# The recorded output is (3000000, 300) -- presumably (number of words, vector dimension).
model.vectors.shape

(3000000, 300)

In [9]:
# Sanity check: compare the expected size of the vector matrix with the size of
# the unpacked .bin file reported by `ls -l`.
# words x dimensions x 4 (presumably 4 bytes per float32 value -- confirm dtype)
expected_matrix_bytes = 3_000_000 * 300 * 4
bin_file_bytes = 3_644_258_522
print(expected_matrix_bytes, bin_file_bytes)

3600000000 3644258522


In [10]:
# Print the embedding vector stored for the word 'king'
print(model['king'])

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [11]:
# Word-to-word similarity (cosine similarity), printed one pair per line
word_pairs = (('king', 'prince'), ('queen', 'princess'))
for first_word, second_word in word_pairs:
    print(model.similarity(first_word, second_word))

0.61599934
0.7070532


In [12]:
# Words most similar to 'book'; the result is a list of (word, similarity) pairs
model.most_similar('book')

[('tome', 0.7485830783843994),
 ('books', 0.7379178404808044),
 ('memoir', 0.7302927374839783),
 ('paperback_edition', 0.6868364810943604),
 ('autobiography', 0.6741527915000916),
 ('memoirs', 0.6505153179168701),
 ('Book', 0.6479282379150391),
 ('paperback', 0.6471226811408997),
 ('novels', 0.6341458559036255),
 ('hardback', 0.6283079385757446)]

In [13]:
# Same query for 'king', limited to the top 3 results via `topn`
model.most_similar('king', topn=3)

[('kings', 0.7138046026229858),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781)]

In [14]:
# Word analogy: king - man + woman
# (`positive` words are added, `negative` words are subtracted; top 3 results)
model.most_similar(positive=['king','woman'], negative=['man'], topn=3)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951)]

### 2. 한글
- 박규병님의 깃허브 저장소(https://github.com/Kyubyong/wordvectors)에 공개된 한국어 사전 훈련 Word2Vec 모델을 사용
- 아래 셀을 실행하기 전에 ko.zip 파일이 작업 디렉터리에 있어야 한다


In [16]:
!unzip ko.zip

Archive:  ko.zip
  inflating: ko.bin                  
  inflating: ko.tsv                  


In [17]:
!ls -l

total 3770444
-rw-r--r-- 1 root root 3644258522 Mar  5  2015 GoogleNews-vectors-negative300.bin
-rw------- 1 root root   50697568 Dec 21  2016 ko.bin
-rw------- 1 root root   85362829 Dec 21  2016 ko.tsv
-rw-r--r-- 1 root root   80596565 Jul  7 01:05 ko.zip
drwxr-xr-x 1 root root       4096 Jun 15 13:37 sample_data


In [18]:
# Load the Korean model extracted from ko.zip. It is loaded as a full Word2Vec
# model, so the word vectors are reached through `.wv`.
# NOTE(review): ko.bin is dated 2016 in the directory listing; models saved by
# old gensim releases may not load under gensim 4.x -- confirm the installed version.
kmodel = gensim.models.Word2Vec.load('ko.bin')
# Shape of the Korean vector matrix (recorded output: (30185, 200))
kmodel.wv.vectors.shape

(30185, 200)

In [19]:
# Words most similar to '강아지' (puppy) in the Korean model
kmodel.wv.most_similar('강아지')

[('고양이', 0.7290452718734741),
 ('거위', 0.7185635566711426),
 ('토끼', 0.7056223154067993),
 ('멧돼지', 0.6950401067733765),
 ('엄마', 0.6934334635734558),
 ('난쟁이', 0.6806551218032837),
 ('한마리', 0.6770296096801758),
 ('아가씨', 0.6750352382659912),
 ('아빠', 0.6729634404182434),
 ('목걸이', 0.6512460708618164)]

In [22]:
# Word analogy: 한국 (Korea) - 서울 (Seoul) + 도쿄 (Tokyo), top 3 results
kmodel.wv.most_similar(positive=['한국','도쿄'], negative=['서울'], topn=3)

[('일본', 0.5835654735565186),
 ('홋카이', 0.502429723739624),
 ('오사카', 0.431940495967865)]