# Pre-trained Word Embeddings

This tutorial is based on https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [2]:
import warnings
warnings.filterwarnings('ignore')

from mxnet import gluon
from mxnet import nd
import gluonnlp as nlp
import re

In [3]:
text = "내년 1월 집권 후반기를 맞는 도널드 트럼프 미국 대통령이 개각을 예고했다."

In [4]:
def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
    """Split ``source_str`` into non-empty tokens.

    The two delimiters are joined with ``|`` and used as a regular-expression
    pattern, so either one ends a token. Empty strings produced by adjacent
    delimiters are dropped.

    Parameters
    ----------
    source_str : str
        Text to tokenize.
    token_delim : str
        Pattern separating tokens (interpreted as a regular expression).
    seq_delim : str
        Pattern separating sequences (interpreted as a regular expression).

    Returns
    -------
    filter
        Lazy iterator over the non-empty tokens.
    """
    split_pattern = '|'.join((token_delim, seq_delim))
    pieces = re.split(split_pattern, source_str)
    # filter(None, ...) discards falsy items, i.e. the empty strings.
    return filter(None, pieces)
# Count the occurrences of each space/newline-delimited token in `text`.
counter = nlp.data.count_tokens(simple_tokenize(text))

In [5]:
counter

Counter({'내년': 1,
         '1월': 1,
         '집권': 1,
         '후반기를': 1,
         '맞는': 1,
         '도널드': 1,
         '트럼프': 1,
         '미국': 1,
         '대통령이': 1,
         '개각을': 1,
         '예고했다.': 1})

In [6]:
len(counter)

11

In [7]:
vocab = nlp.Vocab(counter)

In [8]:
# Print every entry of vocab.idx_to_token, one per line. In the recorded
# output the special tokens <unk>, <pad>, <bos>, <eos> come first.
for word in vocab.idx_to_token:
    print(word)

<unk>
<pad>
<bos>
<eos>
1월
개각을
내년
대통령이
도널드
맞는
미국
예고했다.
집권
트럼프
후반기를


In [9]:
# "world" is not in the vocabulary; the recorded output shows it resolving to
# the same index (0) as the unknown token "<unk>".
print(vocab.token_to_idx["<unk>"])
print(vocab.token_to_idx["world"])

0
0


## Attaching word embeddings

List of pre-trained embedding sources provided by GluonNLP:<br>
https://github.com/dmlc/gluon-nlp/blob/d49a7896ae92307cf3c930f2eb2e3d516a278fe7/src/gluonnlp/_constants.py

In [10]:
# Load the pre-trained fastText embedding from the 'wiki.ko' source
# (the file is downloaded on first use, see the log output below).
# NOTE(review): the variable is called "simple", but the source loaded here is
# 'wiki.ko', not 'wiki.simple'.
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

Embedding file wiki.ko.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /home/chatbot/.mxnet/embedding/fasttext/wiki.ko.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/fasttext/wiki.ko.npz...


To attach the newly loaded word embeddings fasttext_simple to indexed words in vocab, we simply call vocab’s set_embedding method:

In [11]:
vocab.set_embedding(fasttext_simple)

To see other available sources of pretrained word embeddings using the fastText algorithm, we can call `nlp.embedding.list_sources`.

In [12]:
nlp.embedding.list_sources('fasttext')[:10]

['crawl-300d-2M',
 'crawl-300d-2M-subword',
 'wiki.aa',
 'wiki.ab',
 'wiki.ace',
 'wiki.ady',
 'wiki.af',
 'wiki.ak',
 'wiki.als',
 'wiki.am']

In [13]:
len(vocab)  # len(counter) + <unk>, <pad>, <bos>, <eos>

15

By default, the vector of any token that is unknown to vocab is a zero vector.<br> 
Its length is equal to the vector dimension of the fastText word embeddings: 300.

In [14]:
vocab.embedding['없는단어'].shape

(300,)

In [15]:
vocab.embedding['없는단어']


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 300 @cpu(0)>

In [16]:
vocab.embedding['개각을'].shape

(300,)

In [17]:
vocab.embedding['개각을']


[-0.11939    0.14281   -0.4036    -0.09584   -0.24916    0.20381
  0.035114   0.12061    0.40127    0.17465    0.10857    0.0728
 -0.12138    0.33747    0.022977   0.24363   -0.18804   -0.02665
 -0.28662    0.048179  -0.2089    -0.0092405 -0.058002   0.052719
  0.081023   0.14211    0.06015   -0.16239    0.28023    0.055946
 -0.26147   -0.13596    0.17029   -0.016631  -0.11155   -0.13916
 -0.28639    0.33995   -0.029001   0.24099    0.013514  -0.017179
  0.55403   -0.13183    0.29294   -0.14833    0.07291   -0.14361
  0.15363   -0.060235  -0.25805   -0.16631   -0.13454   -0.087062
 -0.1284    -0.17891   -0.029614  -0.24379    0.33243   -0.088118
  0.10899   -0.1572    -0.22881    0.22783   -0.086342   0.29453
  0.012163  -0.19059    0.10463    0.18104    0.2668     0.035839
  0.017008  -0.50608   -0.0053929 -0.4075    -0.16649   -0.059163
 -0.35484   -0.21731   -0.19367    0.24612    0.32474    0.19187
 -0.151     -0.3265     0.087657  -0.3476     0.49131    0.25075
 -0.14623    0.252

See the API reference for `embedding.TokenEmbedding`:<br>
https://gluon-nlp.mxnet.io/api/embedding.html?highlight=embedding#gluonnlp.embedding.TokenEmbedding

here is where to start

In [18]:
fasttext_simple

<gluonnlp.embedding.token_embedding.FastText at 0x7f128be6d128>

In [19]:
len(fasttext_simple.token_to_idx)

879130

In [21]:
fs = fasttext_simple

In [31]:
def pearson_correlation(w2v, word1, word2, scores, ctx=None):
    """Rank-correlate embedding cosine similarities with reference scores.

    NOTE: despite its name, this function computes the *Spearman* rank
    correlation (``scipy.stats.spearmanr``), not the Pearson correlation.
    The name is kept so existing callers keep working.

    Parameters
    ----------
    w2v
        Passed as ``idx_to_vec`` to ``WordEmbeddingSimilarity``
        (presumably the embedding matrix indexed by token id - TODO confirm).
    word1, word2
        Inputs forwarded to the evaluator; presumably NDArrays of token
        indices into ``w2v`` - TODO confirm against the caller.
    scores
        Reference similarity scores; must provide ``asnumpy()``.
    ctx : optional
        Context on which the evaluator is initialized. When omitted, the
        MXNet CPU context is used.

    Returns
    -------
    The result of ``scipy.stats.spearmanr`` for the predicted similarities
    versus ``scores``.
    """
    from scipy import stats

    if ctx is None:
        # The previous body read a global `ctx` that is never defined in this
        # notebook, raising NameError on a fresh kernel. Default to CPU.
        from mxnet import cpu
        ctx = cpu()

    evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
        idx_to_vec=w2v,
        similarity_function="CosineSimilarity")
    evaluator.initialize(ctx=ctx)
    evaluator.hybridize()
    pred = evaluator(word1, word2)
    return stats.spearmanr(pred.asnumpy(), scores.asnumpy())


[1.]
<NDArray 1 @cpu(0)>

[-1.]
<NDArray 1 @cpu(0)>


In [33]:
def cos_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    Computed as the dot product of the two inputs divided by the product of
    their norms (``nd.norm``).
    """
    dot_product = nd.dot(x, y)
    norm_product = nd.norm(x) * nd.norm(y)
    return dot_product / norm_product

# Sanity check for cos_sim: a vector compared with itself scores 1, and
# compared with its negation scores -1 (see the recorded output below).
x = nd.array([1,2])
y = nd.array([-1,-2])
print(cos_sim(x,x))
print(cos_sim(x,y))


[1.]
<NDArray 1 @cpu(0)>

[-1.]
<NDArray 1 @cpu(0)>


In [36]:
cos_sim(vocab.embedding['개각을'], vocab.embedding['대통령이'])


[0.55882585]
<NDArray 1 @cpu(0)>

In [44]:
# Cosine similarity between the full-embedding vectors of '왕' and '남자',
# looked up via token_to_idx -> idx_to_vec.
cos_sim(
    fasttext_simple.idx_to_vec[fasttext_simple.token_to_idx['왕']],
    fasttext_simple.idx_to_vec[fasttext_simple.token_to_idx['남자']],
)


[0.2999336]
<NDArray 1 @cpu(0)>