# Pre-trained Word Embeddings

This tutorial is based on the GluonNLP word embedding example: https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [3]:
import warnings
warnings.filterwarnings('ignore')

from mxnet import gluon
from mxnet import nd
import gluonnlp as nlp
import re

In [4]:
# Sample Korean sentence; its whitespace-separated tokens are counted and
# turned into the vocabulary in the following cells.
text = "내년 1월 집권 후반기를 맞는 도널드 트럼프 미국 대통령이 개각을 예고했다."

In [5]:
def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
    """Split ``source_str`` into tokens on two literal delimiters.

    Parameters
    ----------
    source_str : str
        Text to tokenize.
    token_delim : str, default ' '
        Literal string that separates tokens.
    seq_delim : str, default '\\n'
        Literal string that separates sequences (lines).

    Returns
    -------
    iterator of str
        Lazy, single-pass ``filter`` iterator over the non-empty tokens;
        wrap it in ``list(...)`` if it must be consumed more than once.
    """
    # Escape both delimiters so regex metacharacters such as '.' or '|' are
    # matched literally instead of being interpreted as part of a pattern.
    pattern = re.escape(token_delim) + '|' + re.escape(seq_delim)
    # filter(None, ...) drops the empty strings produced by adjacent delimiters.
    return filter(None, re.split(pattern, source_str))
# Count how often each token of the sample sentence occurs.
counter = nlp.data.count_tokens(simple_tokenize(text))

In [6]:
# Display the per-token counts.
counter

Counter({'내년': 1,
         '1월': 1,
         '집권': 1,
         '후반기를': 1,
         '맞는': 1,
         '도널드': 1,
         '트럼프': 1,
         '미국': 1,
         '대통령이': 1,
         '개각을': 1,
         '예고했다.': 1})

In [15]:
# Number of distinct tokens found in the sentence.
len(counter)

11

In [7]:
# Build a vocabulary from the token counts. The listing printed in the next
# cell shows it also holds the special tokens <unk>, <pad>, <bos> and <eos>.
vocab = nlp.Vocab(counter)

In [8]:
# Print every vocabulary entry in index order, one per line.
for word in vocab.idx_to_token:
    print(word)

<unk>
<pad>
<bos>
<eos>
1월
개각을
내년
대통령이
도널드
맞는
미국
예고했다.
집권
트럼프
후반기를


In [9]:
# "world" is not in the vocabulary, so its lookup yields the same index as
# the <unk> token (both print 0).
print(vocab.token_to_idx["<unk>"])
print(vocab.token_to_idx["world"])

0
0


## Attaching word embeddings

List of pre-trained embedding sources provided by GluonNLP:<br>
https://github.com/dmlc/gluon-nlp/blob/d49a7896ae92307cf3c930f2eb2e3d516a278fe7/src/gluonnlp/_constants.py

In [10]:
# Create the fastText embedding for the 'wiki.ko' source. When the embedding
# file is not found locally it is downloaded first (see the log output).
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

Embedding file wiki.ko.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /home/chatbot/.mxnet/embedding/fasttext/wiki.ko.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/fasttext/wiki.ko.npz...


In [11]:
# Attach the fastText embedding to the vocabulary; the vectors are then
# accessed through vocab.embedding[...] in the cells below.
vocab.set_embedding(fasttext_simple)

In [13]:
# Show the first ten source names available for the 'fasttext' embedding.
nlp.embedding.list_sources('fasttext')[:10]

['crawl-300d-2M',
 'wiki.aa',
 'wiki.ab',
 'wiki.ace',
 'wiki.ady',
 'wiki.af',
 'wiki.ak',
 'wiki.als',
 'wiki.am',
 'wiki.ang']

In [16]:
len(vocab)  # 15 = len(counter) (11) + 4 special tokens: <unk>, <pad>, <bos>, <eos>

15

In [20]:
# '없는단어' ("nonexistent word") is not in the vocabulary; the lookup still
# returns a vector with the embedding's dimensionality.
vocab.embedding['없는단어'].shape

(300,)

In [21]:
# The vector returned for the out-of-vocabulary token is all zeros.
vocab.embedding['없는단어']


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 300 @cpu(0)>

In [19]:
# A token that is in the vocabulary yields a non-zero embedding vector.
vocab.embedding['개각을']


[-0.11939    0.14281   -0.4036    -0.09584   -0.24916    0.20381
  0.035114   0.12061    0.40127    0.17465    0.10857    0.0728
 -0.12138    0.33747    0.022977   0.24363   -0.18804   -0.02665
 -0.28662    0.048179  -0.2089    -0.0092405 -0.058002   0.052719
  0.081023   0.14211    0.06015   -0.16239    0.28023    0.055946
 -0.26147   -0.13596    0.17029   -0.016631  -0.11155   -0.13916
 -0.28639    0.33995   -0.029001   0.24099    0.013514  -0.017179
  0.55403   -0.13183    0.29294   -0.14833    0.07291   -0.14361
  0.15363   -0.060235  -0.25805   -0.16631   -0.13454   -0.087062
 -0.1284    -0.17891   -0.029614  -0.24379    0.33243   -0.088118
  0.10899   -0.1572    -0.22881    0.22783   -0.086342   0.29453
  0.012163  -0.19059    0.10463    0.18104    0.2668     0.035839
  0.017008  -0.50608   -0.0053929 -0.4075    -0.16649   -0.059163
 -0.35484   -0.21731   -0.19367    0.24612    0.32474    0.19187
 -0.151     -0.3265     0.087657  -0.3476     0.49131    0.25075
 -0.14623    0.252

See the API reference for `gluonnlp.embedding.TokenEmbedding`:<br>
https://gluon-nlp.mxnet.io/api/embedding.html?highlight=embedding#gluonnlp.embedding.TokenEmbedding

In [32]:
# Inspect the embedding object itself; its repr shows the concrete class.
fasttext_simple

<gluonnlp.embedding.token_embedding.FastText at 0x7f1e0dca83c8>

In [27]:
# Token -> index mapping of the pre-trained embedding itself (not of `vocab`).
# NOTE(review): this displays the whole mapping, which produces a very long
# output; consider showing only a small slice instead.
fasttext_simple.token_to_idx

{'<unk>': 0,
 '</s>': 1,
 '.': 2,
 ',': 3,
 ')': 4,
 '(': 5,
 '년': 6,
 "'": 7,
 '-': 8,
 '분류': 9,
 '월': 10,
 '일': 11,
 '#': 12,
 '}': 13,
 '있다': 14,
 '/': 15,
 '~': 16,
 '이': 17,
 '《': 18,
 '》': 19,
 '는': 20,
 '수': 21,
 '제': 22,
 '의': 23,
 '넘겨주기': 24,
 '은': 25,
 '·': 26,
 '있는': 27,
 '그': 28,
 '역': 29,
 'kst': 30,
 '대한민국의': 31,
 '\\': 32,
 '에': 33,
 '토론': 34,
 '선수': 35,
 '바깥': 36,
 '고리': 37,
 '%': 38,
 '한': 39,
 '및': 40,
 '를': 41,
 '?': 42,
 '축구': 43,
 '한다': 44,
 'the': 45,
 '대한': 46,
 '영화': 47,
 'a': 48,
 '을': 49,
 '주': 50,
 '가': 51,
 '명': 52,
 '년에': 53,
 '다른': 54,
 '같은': 55,
 '로': 56,
 '되었다': 57,
 'm': 58,
 '등': 59,
 '회': 60,
 'of': 61,
 '이후': 62,
 '중': 63,
 '그는': 64,
 '미국의': 65,
 '함께': 66,
 '때': 67,
 '또한': 68,
 '에서': 69,
 '현재': 70,
 '때문에': 71,
 '같이': 72,
 '대': 73,
 '후': 74,
 '!': 75,
 '사람': 76,
 '위해': 77,
 '"': 78,
 '것을': 79,
 '더': 80,
 '배우': 81,
 '시': 82,
 '일본': 83,
 '대한민국': 84,
 '태어남': 85,
 '→': 86,
 '또는': 87,
 '그리고': 88,
 '두': 89,
 '하는': 90,
 'kbs': 91,
 '와': 92,
 '현': 93,
 '미국': 

In [34]:
# NOTE(review): the embedding object has no `CosineSimilarity` attribute, so
# this cell raises AttributeError (see output) and breaks "Run All".
# Presumably cosine similarity has to be computed from the looked-up vectors
# instead - TODO confirm against the GluonNLP docs, then replace or remove.
fasttext_simple.CosineSimilarity

AttributeError: 'FastText' object has no attribute 'CosineSimilarity'