# Word2Vec

## 한국어 데이터 다운로드 및 전처리

In [6]:
import os
import sys
import numpy as np
from tqdm import tqdm
import time
import pickle
import logging

In [7]:
import urllib.request
from konlpy.tag import Okt
from gensim.models.word2vec import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

In [9]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [10]:
logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", 
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler(os.path.join("./subtext_test_result.log")),
                        logging.StreamHandler()
                    ])

In [11]:
# path 추가
sys.path.append('../bertsum')

In [12]:
from models.data_loader import TextLoader, load_dataset
from bertsum import args, ExtTransformerEncoder, ExtSummarizer, WindowEmbedder

2021-05-04 07:38:49,629 [INFO] PyTorch version 1.1.0 available.


In [13]:
# Settings
device = "cpu" if args.visible_gpus == -1 else "cuda"
loader = TextLoader(args, device)

# model setting
ckpt_path = '../bertsum/checkpoint/model_step_24000.pt'
checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
bert_model = ExtSummarizer(args, device, checkpoint)
bert_model.eval()

[██████████████████████████████████████████████████]
using cached model
using cached model
[██████████████████████████████████████████████████]
2021-05-04 07:39:04,028 [INFO] loading configuration file ./tmp/kobert_from_pretrained/config.json
2021-05-04 07:39:04,029 [INFO] Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 2,
  "vocab_size": 8002
}

2021-05-04 07:39:04,030 [INFO] loading weights file ./tmp/kobert_from_pretrained/pytorch_model.bin
using cached model
2021-05-04 07:39:05,498 [INFO] All model checkpoint weights were used when initializing BertModel.

2021-05-04 07:39:05,49

ExtSummarizer(
  (bert): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(8004, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm(torch.Size([768]),

## Word Embedding

### 네이버 서치앤클로바 "이기창"님 pretrained model
https://github.com/ratsgo/embedding

- Data used for training: 사용된 한국어 공개 말뭉치

1. 한국어 위키백과 : [다운로드](https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2)
2. KorQuAD 학습(train) 데이터셋 : [다운로드](https://korquad.github.io/dataset/KorQuAD_v1.0_train.json)
3. KorQuAD 데브(dev) 데이터셋 : [다운로드](https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json)
4. 네이버 영화 리뷰 학습(train) 데이터셋 :  [다운로드](https://github.com/e9t/nsmc/raw/master/ratings_train.txt)
5. 네이버 영화 리뷰 테스트(test) 데이터셋 : [다운로드](https://github.com/e9t/nsmc/raw/master/ratings_test.txt)

- Word Embedding : 모든 임베딩은 한국어 위키백과, KorQuAD, 네이버 영화 말뭉치를 은전한닢(mecab)으로 형태소 분석한 말뭉치로 학습됨.
1. FastText
2. GloVe
3. Swivel
4. **Word2Vec**

우선적으로 Word2Vec을 먼저 수행해볼 계획

### word2vec

In [16]:
import gensim
r_w2v = gensim.models.Word2Vec.load('./pretrained/ratsgo/word2vec/word2vec')

r_w2v

<gensim.models.word2vec.Word2Vec at 0x7f6b9fd65080>

In [18]:
r_w2v.wv.most_similar('강아지')

[('고양이', 0.8470748662948608),
 ('애완견', 0.7790216207504272),
 ('잉꼬', 0.7730404138565063),
 ('엄마', 0.77251797914505),
 ('아기', 0.7709106206893921),
 ('토끼', 0.7708439230918884),
 ('애완', 0.7673067450523376),
 ('바퀴벌레', 0.7650703191757202),
 ('아빠', 0.762525200843811),
 ('애봉이', 0.76053786277771)]

### 카카오 브레인 "박규병"님 Pretrained Model
https://github.com/Kyubyong/wordvectors

- Data used for training (Word2Vec, FastText)

|Vector Size|Corpus Size|Vocabulary Size|
|:---:|:---:|:---:|
|200|339M|30185|

In [5]:
import gensim
k_w2v = gensim.models.Word2Vec.load('./pretrained/kyubyong/word2vec/ko.bin')
k_w2v

In [6]:
test = k_w2v.wv.most_similar('선생님')
print(test)

[('선배', 0.6821573972702026), ('후배', 0.6759200692176819), ('동급생', 0.6664727330207825), ('친구', 0.6449654698371887), ('엄마', 0.6190557479858398), ('여학생', 0.6161683797836304), ('초등학생', 0.6156256794929504), ('남학생', 0.6080973148345947), ('유치원', 0.6021747589111328), ('선생', 0.5940440893173218)]


## 한국어 Word2Vec 훈련시키기
- 일단 `이기창`님의 임배딩 레이어를 바탕으로 Word2Vec모델 구축

In [29]:
import numpy
w2v_weight = numpy.load('./pretrained/ratsgo/word2vec/word2vec.wv.vectors.npy')


In [34]:
from gensim.models import Word2Vec
model = Word2Vec(sentences = tokenized_data, size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [35]:
# 완성된 임베딩 매트릭스의 크기 확인
model.wv.vectors.shape

(16477, 100)

In [36]:
print(model.wv.most_similar("최민식"))

[('김수현', 0.8501417636871338), ('안성기', 0.8398776054382324), ('박중훈', 0.8371318578720093), ('한석규', 0.8280158042907715), ('서영희', 0.8190248012542725), ('채민서', 0.8170990347862244), ('브란도', 0.8161548376083374), ('정재영', 0.8039891719818115), ('이정재', 0.7988710403442383), ('이주승', 0.798769474029541)]


  if np.issubdtype(vec.dtype, np.int):


In [37]:
print(model.wv.most_similar("히어로"))

[('느와르', 0.8681026697158813), ('호러', 0.868053674697876), ('무비', 0.8638497591018677), ('슬래셔', 0.8524353504180908), ('무협', 0.8517643809318542), ('블록버스터', 0.8416270613670349), ('블랙', 0.8224462270736694), ('물', 0.8096286654472351), ('멜로', 0.8025323748588562), ('물의', 0.7977664470672607)]


  if np.issubdtype(vec.dtype, np.int):


## 영어 Word2Vec 시각화하기

eng_w2v라는 Word2Vec 모델이 이미 존재한다는 가정 하에 아래 커맨드를 수행합니다.

In [38]:
!python -m gensim.scripts.word2vec2tensor --input eng_w2v --output eng_w2v

2020-06-27 23:15:20,404 - word2vec2tensor - INFO - running /usr/local/lib/python3.6/dist-packages/gensim/scripts/word2vec2tensor.py --input eng_w2v --output eng_w2v
2020-06-27 23:15:20,405 - utils_any2vec - INFO - loading projection weights from eng_w2v
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-06-27 23:15:22,299 - utils_any2vec - INFO - loaded (21613, 100) matrix from eng_w2v
2020-06-27 23:15:23,963 - word2vec2tensor - INFO - 2D tensor file saved to eng_w2v_tensor.tsv
2020-06-27 23:15:23,963 - word2vec2tensor - INFO - Tensor metadata file saved to eng_w2v_metadata.tsv
2020-06-27 23:15:23,966 - word2vec2tensor - INFO - finished running word2vec2tensor.py


커맨드를 수행하면 경로에 기존에 있던 eng_w2v 외에도 두 개의 파일이 생깁니다.



링크 : https://projector.tensorflow.org/  

이제 해당 링크로 이동!

위에 있는 Choose file 버튼을 누르고 eng_w2v_tensor.tsv 파일을 업로드하고, 아래에 있는 Choose file 버튼을 누르고 eng_w2v_metadata.tsv 파일을 업로드합니다. 두 파일을 업로드하면 임베딩 프로젝터에 학습했던 워드 임베딩 모델이 시각화됩니다.



# FastText

## Word2Vec의 OOV 문제 확인해보기

In [42]:
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # Word2Vec 모델 로드

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [43]:
model_result = loaded_model.most_similar("overacting")
print(model_result)

KeyError: ignored

In [44]:
model_result = loaded_model.most_similar("memorry")
print(model_result)

KeyError: ignored

In [45]:
model_result = loaded_model.most_similar("electrofishing")
print(model_result)

KeyError: ignored

## FastText로 같은 단어에 대해서 테스트해보기

In [46]:
from gensim.models import FastText
fasttext_model = FastText(result, size=100, window=5, min_count=5, workers=4, sg=1)

In [47]:
fasttext_model.most_similar('overacting')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('interacting', 0.8656531572341919),
 ('subtracting', 0.8312532305717468),
 ('distracting', 0.826479434967041),
 ('manipulating', 0.8187980651855469),
 ('extracting', 0.809623122215271),
 ('contracting', 0.8087961077690125),
 ('impacting', 0.7956535816192627),
 ('acting', 0.7873855233192444),
 ('behaving', 0.7625167369842529),
 ('contacting', 0.7553479671478271)]

In [48]:
fasttext_model.most_similar('memoryy')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('memory', 0.9639291763305664),
 ('memo', 0.8072319030761719),
 ('emory', 0.7875003814697266),
 ('memories', 0.7036311626434326),
 ('memorial', 0.6839367151260376),
 ('brain', 0.6632785797119141),
 ('memorize', 0.662663459777832),
 ('memorizing', 0.6536747217178345),
 ('memoir', 0.6399561166763306),
 ('memoirs', 0.6290059089660645)]

In [49]:
fasttext_model.most_similar("electrofishing")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('electrolux', 0.8139298558235168),
 ('electrolyte', 0.80857253074646),
 ('electro', 0.8040822148323059),
 ('electric', 0.7829129099845886),
 ('electroshock', 0.7784780263900757),
 ('electrochemical', 0.7773081660270691),
 ('gastric', 0.7742099761962891),
 ('electron', 0.7597347497940063),
 ('airbus', 0.7592199444770813),
 ('electrogram', 0.7589453458786011)]

# GloVe

In [50]:
!pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |████████████████████████████████| 266kB 4.5MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=700284 sha256=6a6b47f21f798f655cde19e785bf180e6a54d370598b239cdcda6978f683d037
  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


In [51]:
from glove import Corpus, Glove

corpus = Corpus() 
corpus.fit(result, window=5)
# 훈련 데이터로부터 GloVe에서 사용할 동시 등장 행렬 생성

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
# 학습에 이용할 쓰레드의 개수는 4로 설정, 에포크는 20.

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [52]:
model_result1 = glove.most_similar("man")
print(model_result1)

[('woman', 0.9605876132425566), ('guy', 0.8873543995635172), ('girl', 0.851644711353462), ('young', 0.8454644715719105)]


In [53]:
model_result2=glove.most_similar("boy")
print(model_result2)

[('girl', 0.9419230106533926), ('kid', 0.8353257929535401), ('woman', 0.8300046728060503), ('man', 0.8214572429272888)]


In [54]:
model_result3=glove.most_similar("university")
print(model_result3)

[('harvard', 0.8891058026544912), ('mit', 0.8655953916957512), ('stanford', 0.8466465957274574), ('cambridge', 0.8443982653416059)]


In [55]:
model_result4=glove.most_similar("water")
print(model_result4)

[('clean', 0.8497457787246371), ('air', 0.8434838133871023), ('fresh', 0.8294691419399418), ('electricity', 0.8201486509192097)]


In [56]:
model_result5=glove.most_similar("physics")
print(model_result5)

[('chemistry', 0.8879813137524931), ('economics', 0.8671791836455324), ('biology', 0.851880595257216), ('mathematics', 0.8505414029308055)]


In [57]:
model_result6=glove.most_similar("muscle")
print(model_result6)

[('tissue', 0.8330768461898297), ('nerve', 0.8261792360472352), ('skeletal', 0.7678621802032819), ('stem', 0.7529100804581816)]


In [58]:
model_result7=glove.most_similar("clean")
print(model_result7)

[('fresh', 0.8671807133278371), ('water', 0.8497457787246372), ('drinking', 0.7980126180711001), ('heat', 0.7928526111116708)]


# Embedding layer Vs. Pretrained Embedding

## Embedding layer

In [59]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding

Using TensorFlow backend.


In [60]:
# 임의의 문서 3개 생성
sample_text_1="bitty bought a bit of butter"
sample_text_2="but the bit of butter was a bit bitter"
sample_text_3="so she bought some better butter to make the bitter butter better"

corp = [sample_text_1, sample_text_2, sample_text_3]
no_docs=len(corp)

In [61]:
# 단어장의 크기를 50으로 하고 단어에 정수 부여
vocab_size = 50 
encod_corp=[]
for i,doc in enumerate(corp):
    encod_corp.append(one_hot(doc,50))
    print("The encoding for document",i+1," is : ",one_hot(doc,50))

The encoding for document 1  is :  [11, 39, 1, 1, 1, 2]
The encoding for document 2  is :  [39, 45, 1, 1, 2, 42, 1, 1, 43]
The encoding for document 3  is :  [28, 4, 39, 41, 14, 2, 41, 18, 45, 43, 2, 14]


In [62]:
# 임베딩 레이어 생성
model = Sequential()
model.add(Embedding(vocab_size, 128))

In [63]:
# 룩업 테이블 저장
embeddings = model.layers[0].get_weights()[0]

In [64]:
# 룩업 테이블 크기 확인
print(embeddings.shape)

(50, 128)


In [65]:
# 34번째 단어 임베딩 값 확인
embeddings[34]

array([-0.00625784,  0.00304301, -0.02381074, -0.02092072,  0.00296366,
       -0.02906653,  0.02724005, -0.00850149, -0.01932492, -0.04163956,
       -0.03811176, -0.03717542,  0.01938969,  0.04606402,  0.0447088 ,
        0.00108474,  0.02461438, -0.01045949,  0.01511837,  0.02081356,
       -0.03643564, -0.03209408,  0.00912812, -0.02497218, -0.01035582,
       -0.02770036,  0.03799773, -0.04605339,  0.03468381, -0.02862719,
       -0.03075497, -0.03036954,  0.04159946, -0.01430782, -0.0463004 ,
        0.00396238,  0.00204682, -0.01779637,  0.0095073 , -0.00189345,
       -0.03592537,  0.00179736, -0.01474626,  0.04805089,  0.00413129,
       -0.03430102, -0.00667437,  0.01623935,  0.0077672 ,  0.00340446,
       -0.01011674,  0.04452496,  0.04814274, -0.02230606, -0.04745673,
       -0.01787706,  0.04932922,  0.02816181,  0.04477451,  0.03841956,
       -0.03559925, -0.02391123,  0.0148969 ,  0.02126119,  0.01601067,
        0.04231565,  0.04218716,  0.0333457 ,  0.02483978,  0.04

In [66]:
# 첫번째 문서 임베딩화
embeddings[encod_corp[0]].shape

(6, 128)

In [67]:
# 두번째 문서 임베딩화
embeddings[encod_corp[1]].shape

(9, 128)

In [68]:
# 세번째 문서 임베딩화
embeddings[encod_corp[2]].shape

(12, 128)

## 감성 분류하기(Embedding layer)

In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [70]:
sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent work', 'supreme quality', 'bad', 'highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]

In [71]:
t = Tokenizer()
t.fit_on_texts(sentences)
vocab_size = len(t.word_index) + 1

print(vocab_size)

16


In [72]:
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [73]:
max_len = max(len(l) for l in X_encoded)
print(max_len)

4


In [74]:
X_train=pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_train=np.array(y_train)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [75]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
model.add(Embedding(vocab_size, 4, input_length=max_len)) # 모든 임베딩 벡터는 4차원.
model.add(Flatten()) # Dense의 입력으로 넣기위함.
model.add(Dense(1, activation='sigmoid'))

In [76]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6869 - acc: 0.5714
Epoch 2/100
1/1 - 0s - loss: 0.6853 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6837 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6821 - acc: 0.5714
Epoch 5/100
1/1 - 0s - loss: 0.6805 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6789 - acc: 0.7143
Epoch 7/100
1/1 - 0s - loss: 0.6774 - acc: 0.7143
Epoch 8/100
1/1 - 0s - loss: 0.6758 - acc: 0.7143
Epoch 9/100
1/1 - 0s - loss: 0.6742 - acc: 0.7143
Epoch 10/100
1/1 - 0s - loss: 0.6726 - acc: 0.7143
Epoch 11/100
1/1 - 0s - loss: 0.6710 - acc: 0.7143
Epoch 12/100
1/1 - 0s - loss: 0.6695 - acc: 0.7143
Epoch 13/100
1/1 - 0s - loss: 0.6679 - acc: 0.7143
Epoch 14/100
1/1 - 0s - loss: 0.6663 - acc: 0.8571
Epoch 15/100
1/1 - 0s - loss: 0.6647 - acc: 0.8571
Epoch 16/100
1/1 - 0s - loss: 0.6631 - acc: 0.8571
Epoch 17/100
1/1 - 0s - loss: 0.6615 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.6599 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.6583 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.6567 - a

<tensorflow.python.keras.callbacks.History at 0x7f1882f553c8>

## 감성 분류하기(Pre-trained Embedding)

In [77]:
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [78]:
import numpy as np
import gensim

In [79]:
# 현재 위치에 구글의 사전 훈련된 Word2Vec을 다운로드
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-06-27 23:23:15--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.82.155
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.82.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-06-27 23:23:49 (45.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [80]:
# 구글의 사전 훈련된 Word2vec 모델을 로드합니다.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [81]:
print(word2vec_model.vectors.shape) # 모델의 크기 확인

(3000000, 300)


In [82]:
embedding_matrix = np.zeros((vocab_size, 300))
# 단어 집합 크기의 행과 300개의 열을 가지는 행렬 생성. 값은 전부 0으로 채워진다.
np.shape(embedding_matrix)

(16, 300)

In [83]:
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None

In [84]:
for word, i in t.word_index.items(): # 훈련 데이터의 단어 집합에서 단어와 정수 인덱스를 1개씩 꺼내온다.
    temp = get_vector(word) # 단어(key) 해당되는 임베딩 벡터의 300개의 값(value)를 임시 변수에 저장
    if temp is not None: # 만약 None이 아니라면 임베딩 벡터의 값을 리턴받은 것이므로
        embedding_matrix[i] = temp # 해당 단어 위치의 행에 벡터의 값을 저장한다.

In [85]:
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [86]:
print('단어 nice의 정수 인덱스 :', t.word_index['nice'])

단어 nice의 정수 인덱스 : 1


In [87]:
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6709 - acc: 0.7143
Epoch 2/100
1/1 - 0s - loss: 0.6530 - acc: 0.8571
Epoch 3/100
1/1 - 0s - loss: 0.6357 - acc: 0.8571
Epoch 4/100
1/1 - 0s - loss: 0.6189 - acc: 1.0000
Epoch 5/100
1/1 - 0s - loss: 0.6026 - acc: 1.0000
Epoch 6/100
1/1 - 0s - loss: 0.5869 - acc: 1.0000
Epoch 7/100
1/1 - 0s - loss: 0.5717 - acc: 1.0000
Epoch 8/100
1/1 - 0s - loss: 0.5570 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5428 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5291 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5158 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5030 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.4906 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.4787 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.4671 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4560 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4452 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4347 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4246 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4148 - a

<tensorflow.python.keras.callbacks.History at 0x7f16fdbd5160>

# Tensorflow Hub로부터 Pre-trained Embedding 사용하기

In [148]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

In [142]:
(train_data, validation_data, test_data), info = tfds.load('imdb_reviews', split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'], with_info=True, as_supervised=True)

In [143]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [144]:
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>

In [149]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])

<tf.Tensor: shape=(3, 20), dtype=float32, numpy=
array([[ 1.765786  , -3.882232  ,  3.9134233 , -1.5557289 , -3.3362343 ,
        -1.7357955 , -1.9954445 ,  1.2989551 ,  5.081598  , -1.1041286 ,
        -2.0503852 , -0.72675157, -0.65675956,  0.24436149, -3.7208383 ,
         2.0954835 ,  2.2969332 , -2.0689783 , -2.9489717 , -1.1315987 ],
       [ 1.8804485 , -2.5852382 ,  3.4066997 ,  1.0982676 , -4.056685  ,
        -4.891284  , -2.785554  ,  1.3874227 ,  3.8476458 , -0.9256538 ,
        -1.896706  ,  1.2113281 ,  0.11474707,  0.76209456, -4.8791065 ,
         2.906149  ,  4.7087674 , -2.3652055 , -3.5015898 , -1.6390051 ],
       [ 0.71152234, -0.6353217 ,  1.7385626 , -1.1168286 , -0.5451594 ,
        -1.1808156 ,  0.09504455,  1.4653089 ,  0.66059524,  0.79308075,
        -2.2268345 ,  0.07446612, -1.4075904 , -0.70645386, -1.907037  ,
         1.4419787 ,  1.9551861 , -0.42660055, -2.8022065 ,  0.43727064]],
      dtype=float32)>

In [150]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [151]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [152]:
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=validation_data.batch(512),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [153]:
results = model.evaluate(test_data.batch(512), verbose=2)
for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

5/5 - 0s - loss: 0.2748 - accuracy: 0.8896
loss: 0.275
accuracy: 0.890


# SGNS 구현하기

In [92]:
import os
from collections import Counter
from time import time

import numpy as np
import pandas as pd
from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model
from keras.preprocessing.sequence import skipgrams
from nltk.corpus import stopwords

np.random.seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

In [104]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [95]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/eds-uga/csci1360e-su18/master/assignments/A9/abcnews-date-text.csv", filename="abcnews-date-text.csv")

('abcnews-date-text.csv', <http.client.HTTPMessage at 0x7f16fadafba8>)

In [97]:
corpus = pd.read_csv("abcnews-date-text.csv").iloc[:,1] 
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

In [98]:
# 소문자로 변경
corpus = corpus.str.lower()    
# 숫자/알파벳/공백 을 제외하고 전부 제거 (with pandas.core.strings.StringMethods)
corpus = corpus.str.replace('[^a-z0-9]+', ' ', regex=True)
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

In [99]:
corpus_list = corpus.values.tolist()
len(corpus_list)

1103665

In [100]:
corpus_list[0:5]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers']

In [105]:
# corpus에 있는 모든 line에서 word를 추출해서 하나의 list에 모두 집어넣습니다.
words = np.concatenate(np.core.defchararray.split(corpus_list)).tolist()
print(words[0:100])

['aba', 'decides', 'against', 'community', 'broadcasting', 'licence', 'act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation', 'a', 'g', 'calls', 'for', 'infrastructure', 'protection', 'summit', 'air', 'nz', 'staff', 'in', 'aust', 'strike', 'for', 'pay', 'rise', 'air', 'nz', 'strike', 'to', 'affect', 'australian', 'travellers', 'ambitious', 'olsson', 'wins', 'triple', 'jump', 'antic', 'delighted', 'with', 'record', 'breaking', 'barca', 'aussie', 'qualifier', 'stosur', 'wastes', 'four', 'memphis', 'match', 'aust', 'addresses', 'un', 'security', 'council', 'over', 'iraq', 'australia', 'is', 'locked', 'into', 'war', 'timetable', 'opp', 'australia', 'to', 'contribute', '10', 'million', 'in', 'aid', 'to', 'iraq', 'barca', 'take', 'record', 'as', 'robson', 'celebrates', 'birthday', 'in', 'bathhouse', 'plans', 'move', 'ahead', 'big', 'hopes', 'for', 'launceston', 'cycling', 'championship', 'big', 'plan', 'to', 'boost']


In [106]:
# 너무 자주 출현하는 word들은 word vector 구성에 방해가 됩니다. 이런 불용어들을 제거해 줍니다. 
stopWords = set(stopwords.words('english'))
print(stopWords)

{'off', 'other', 'such', 'shouldn', 'what', 'these', 'aren', 'weren', 'themselves', "shouldn't", 'own', 'by', "mightn't", "couldn't", 'them', 'their', 'her', 'in', 'above', 'for', 'that', 'hadn', 'any', "mustn't", 'very', 'needn', 'does', 'after', 'i', "needn't", "weren't", 'those', 'about', 'then', "don't", 'are', 'being', 'but', 'under', 'no', 'as', 'where', 'was', 'now', 'should', 'nor', 'have', "doesn't", 'before', 'hasn', 'same', 'we', "you'd", 'over', 'only', 'm', 'which', "you're", 'from', 'down', 'itself', 'or', 'again', "you'll", 'most', "shan't", 't', 'myself', 'doesn', 'its', 'won', 'wouldn', 'against', 'not', 'just', 'ours', 'this', 'below', 'few', 'hers', "wouldn't", 'ourselves', "you've", 'why', 'isn', 'and', 'theirs', 'having', "didn't", 'doing', 'me', 'on', 'had', "hasn't", 'yourself', 'didn', "won't", 'each', 'until', 'to', 'while', 'some', 'himself', 'll', 'who', 'an', 'don', 'with', 'they', 'how', 'were', 'whom', 'did', 'am', 'is', "that'll", 'him', 'a', "haven't", "

In [107]:
# 단어의 등장 빈도수를 카운트해주는 패키지
counter = Counter(words)
len(counter)

96722

In [108]:
# 등장 빈도 상위 100개
counter.most_common(100)

[('to', 214201),
 ('in', 135982),
 ('for', 130239),
 ('of', 80759),
 ('on', 73037),
 ('over', 50306),
 ('the', 49810),
 ('police', 35986),
 ('at', 31723),
 ('with', 29676),
 ('after', 29661),
 ('new', 29095),
 ('man', 28110),
 ('a', 24885),
 ('and', 22748),
 ('up', 20990),
 ('as', 20481),
 ('says', 20001),
 ('from', 19086),
 ('by', 17727),
 ('us', 17536),
 ('out', 17184),
 ('govt', 16935),
 ('court', 16383),
 ('council', 16363),
 ('be', 15774),
 ('more', 15247),
 ('interview', 15025),
 ('fire', 13910),
 ('not', 13741),
 ('nsw', 12919),
 ('australia', 12680),
 ('plan', 12307),
 ('water', 11877),
 ('qld', 11790),
 ('wa', 11534),
 ('crash', 11208),
 ('death', 11174),
 ('into', 10901),
 ('sydney', 10761),
 ('off', 10731),
 ('against', 10427),
 ('health', 10373),
 ('australian', 10352),
 ('charged', 10148),
 ('back', 10134),
 ('no', 10050),
 ('report', 9589),
 ('down', 9555),
 ('call', 9292),
 ('murder', 9217),
 ('sa', 9121),
 ('an', 9053),
 ('day', 8827),
 ('hospital', 8821),
 ('car', 8690

In [109]:
# 불룡어 제거
words = [word for word in words if word not in stopWords]

In [110]:
print(counter.most_common(100))

[('to', 214201), ('in', 135982), ('for', 130239), ('of', 80759), ('on', 73037), ('over', 50306), ('the', 49810), ('police', 35986), ('at', 31723), ('with', 29676), ('after', 29661), ('new', 29095), ('man', 28110), ('a', 24885), ('and', 22748), ('up', 20990), ('as', 20481), ('says', 20001), ('from', 19086), ('by', 17727), ('us', 17536), ('out', 17184), ('govt', 16935), ('court', 16383), ('council', 16363), ('be', 15774), ('more', 15247), ('interview', 15025), ('fire', 13910), ('not', 13741), ('nsw', 12919), ('australia', 12680), ('plan', 12307), ('water', 11877), ('qld', 11790), ('wa', 11534), ('crash', 11208), ('death', 11174), ('into', 10901), ('sydney', 10761), ('off', 10731), ('against', 10427), ('health', 10373), ('australian', 10352), ('charged', 10148), ('back', 10134), ('no', 10050), ('report', 9589), ('down', 9555), ('call', 9292), ('murder', 9217), ('sa', 9121), ('an', 9053), ('day', 8827), ('hospital', 8821), ('car', 8690), ('may', 8534), ('coast', 8419), ('calls', 8401), ('a

In [111]:
# 상위 80%만 남긴다. 그리고 UNK 토큰을 Vocabulary에 추가
top_n_ratio=0.8

counter = Counter(dict(counter.most_common(int(top_n_ratio * len(counter)))))
vocab = list(counter) + ['UNK']
vocab[-10:]

['haytossing',
 'mitter',
 'flyboss',
 'bigga',
 'vandeleur',
 'accredits',
 'ides',
 'certifi',
 'jackknifes',
 'UNK']

In [112]:
vocab_size = len(vocab)
vocab_size

77378

In [113]:
word2index = {word:index for index, word in enumerate(vocab)}
index2word = {index:word for word, index in word2index.items()}

In [114]:
word2index['iraq']

132

In [115]:
index2word[96]

'china'

In [116]:
# 정수 인코딩
indexed_corpus_list = []
for doc in corpus_list:
    indexed_corpus_list.append([word2index[word] if word in word2index else word2index['UNK'] for word in doc.split()])

In [117]:
corpus_list[0]

'aba decides against community broadcasting licence'

In [119]:
indexed_corpus_list[0]

[11212, 6092, 41, 182, 11058, 1320]

In [120]:
word2index['aba']

11212

In [121]:
word2index['licence']


1320

In [123]:
# 네거티브 샘플링이 섞인 데이터셋 생성
def generating_wordpairs(indexed_corpus, vocab_size, window_size=4):
    X = []
    Y = []
    for row in indexed_corpus:
        x, y = skipgrams(sequence=row, vocabulary_size=vocab_size, window_size=window_size,
                        negative_samples=1.0, shuffle=True, categorical=False, sampling_table=None, seed=None)
        X = X + list(x)
        Y = Y + list(y)
    return X, Y

In [124]:
X, Y = generating_wordpairs(indexed_corpus_list[0:100], vocab_size, window_size=4)

In [125]:
print(X[0])

[6092, 55727]


In [126]:
print(Y[0])

0


In [133]:
#embedding_dim=300
embedding_dim=30

input_target = Input((1,))
input_context = Input((1,))

embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)

target_embedding = embedding_layer(input_target)
target_embedding = Reshape((embedding_dim, 1))(target_embedding)
context_embedding = embedding_layer(input_context)
context_embedding = Reshape((embedding_dim, 1))(context_embedding)

hidden_layer = Dot(axes=1)([target_embedding, context_embedding])
hidden_layer = Reshape((1,))(hidden_layer)

output = Dense(16, activation='sigmoid')(hidden_layer)
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=[input_target, input_context], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='sgd')

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 30)        2321340     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 30, 1)        0           embedding_2[0][0]          

In [134]:
epochs = 100000
batch_size = 512

for i in range(epochs):
    idx_batch = np.random.choice(len(indexed_corpus_list), batch_size)
    X, Y = generating_wordpairs(np.array(indexed_corpus_list)[idx_batch].tolist(), vocab_size)

    word_target, word_context = zip(*X)
    word_target = np.array(word_target, dtype=np.int32)
    word_context = np.array(word_context, dtype=np.int32)

    target = np.zeros((1,))
    context = np.zeros((1,))
    label = np.zeros((1,))
    idx = np.random.randint(0, len(Y)-1)
    target[0,] = word_target[idx]
    context[0,] = word_context[idx]
    label[0,] = Y[idx]
    loss = model.train_on_batch([target, context], label)
    if i % 10 == 0:
        print("Iteration {}, loss={}".format(i, loss))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Iteration 0, loss=0.418701708316803
Iteration 10, loss=1.020002007484436
Iteration 20, loss=0.4909874498844147
Iteration 30, loss=0.9454270601272583
Iteration 40, loss=0.4941861033439636
Iteration 50, loss=0.5181975364685059
Iteration 60, loss=0.5186092853546143
Iteration 70, loss=0.8760280013084412
Iteration 80, loss=0.5335646271705627
Iteration 90, loss=0.9153674840927124
Iteration 100, loss=0.5517467856407166
Iteration 110, loss=0.9519153237342834
Iteration 120, loss=0.5482457876205444
Iteration 130, loss=0.5639049410820007
Iteration 140, loss=0.793644905090332
Iteration 150, loss=0.7296739816665649
Iteration 160, loss=0.6996212601661682
Iteration 170, loss=0.6632164716720581
Iteration 180, loss=0.7446346879005432
Iteration 190, loss=0.6632117629051208
Iteration 200, loss=0.7419507503509521
Iteration 210, loss=0.7237974405288696


KeyboardInterrupt: ignored

In [135]:
word2vec_file_path = 'word2vec.txt'
f = open(word2vec_file_path, 'w')
f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
vectors = model.get_weights()[0]
for word, i in word2index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

In [136]:
from gensim.models.keyedvectors import Word2VecKeyedVectors

word_vectors = Word2VecKeyedVectors.load_word2vec_format(word2vec_file_path, binary=False)
vector = word_vectors['computer']
vector

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


array([-0.02655335,  0.01613258, -0.01884152,  0.00225597,  0.03588798,
        0.01791611, -0.04908825,  0.02676917, -0.00985587, -0.03896325,
        0.03224862, -0.0489888 ,  0.02925624, -0.04400914,  0.02372762,
        0.04993862, -0.04695542, -0.02855517, -0.02023038,  0.00429451,
       -0.04318174, -0.02264958,  0.04702992, -0.04923805, -0.03045248,
       -0.03173252,  0.02526869, -0.00751292, -0.01163135, -0.00431548],
      dtype=float32)

In [137]:
word_vectors.similar_by_word("cat")

  if np.issubdtype(vec.dtype, np.int):


[('cayleb', 0.676974892616272),
 ('omeros', 0.6681385636329651),
 ('certifications', 0.6634451150894165),
 ('smeaton', 0.6378586888313293),
 ('brougham', 0.6364120841026306),
 ('crypt', 0.6362036466598511),
 ('seaports', 0.629311740398407),
 ('gasgoyne', 0.6285969018936157),
 ('transformative', 0.6258813142776489),
 ('hazem', 0.6255269050598145)]