# Pytorch를 이용한 word embedding 활용!
 - pytorch의 nn.Embedding의 활용
 - pretrained된 embedding의 활용

## 01. pytorch nn.Embedding의 활용

In [48]:
# coding: utf-8
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f0c824c3150>

In [49]:
# Toy vocabulary: maps each word to its row index in the embedding table.
word_to_ix = {"hello": 0, "world": 1}
# Size the table from the vocabulary itself so the two cannot drift apart;
# each word gets a 5-dimensional embedding.
embeds = nn.Embedding(len(word_to_ix), 5)


In [50]:
embeds

Embedding(2, 5)

In [52]:
# Wrap the vocabulary index of "hello" in a 1-element int64 tensor,
# which is the input form nn.Embedding expects.
hello_ix = word_to_ix["hello"]
lookup_tensor = torch.tensor([hello_ix], dtype=torch.long)

In [53]:
lookup_tensor

tensor([0])

In [54]:
# Call the embedding module with the index tensor to fetch the row for "hello".
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)


## 02. External word embedding의 활용

In [55]:
# Load the pre-trained word2vec vectors from disk with gensim.
w2v_path = './word2vec_ko_50.model'
vectors = gensim.models.KeyedVectors.load_word2vec_format(w2v_path)
# Copy the embedding matrix (one row per vocabulary entry) into a float32 tensor.
weights = torch.tensor(vectors.vectors, dtype=torch.float32)

In [57]:
vectors["최민식"]

array([-0.1568402 ,  0.3138746 ,  0.13638571, -0.10927913, -0.32099295,
       -0.3220316 ,  0.26192468,  0.9275467 , -0.17426912, -0.3756095 ,
       -0.21964297, -0.15588401, -0.18579984,  0.22237049,  0.04790824,
       -0.22412781,  0.01293354,  0.21980919, -0.21135098, -0.04107093,
       -0.06391697, -0.01586648,  0.43257886,  0.09648908,  0.07741321,
        0.04191321, -0.28550947, -0.03373787, -0.17413023,  0.19423701,
        0.03852396, -0.23480639, -0.1806618 ,  0.16275446, -0.30010974,
       -0.1389996 ,  0.3291999 ,  0.19219263, -0.01871492, -0.5012303 ,
        0.5684251 ,  0.1994378 ,  0.11553255,  0.1536461 ,  0.13966206,
       -0.36483458,  0.2246087 , -0.4449461 ,  0.20038557,  0.18293789],
      dtype=float32)

In [58]:
vectors.key_to_index["최민식"]

3374

In [None]:
weights[[3374]]

In [66]:
weights[vectors.key_to_index["최민식"]]

tensor([-0.1568,  0.3139,  0.1364, -0.1093, -0.3210, -0.3220,  0.2619,  0.9275,
        -0.1743, -0.3756, -0.2196, -0.1559, -0.1858,  0.2224,  0.0479, -0.2241,
         0.0129,  0.2198, -0.2114, -0.0411, -0.0639, -0.0159,  0.4326,  0.0965,
         0.0774,  0.0419, -0.2855, -0.0337, -0.1741,  0.1942,  0.0385, -0.2348,
        -0.1807,  0.1628, -0.3001, -0.1390,  0.3292,  0.1922, -0.0187, -0.5012,
         0.5684,  0.1994,  0.1155,  0.1536,  0.1397, -0.3648,  0.2246, -0.4449,
         0.2004,  0.1829])

In [67]:
print(vectors.index_to_key[3374])

최민식


In [70]:
weights[vectors.key_to_index["제로투"]]

KeyError: ignored

In [72]:
weights[vectors.key_to_index["<unk>"]]

KeyError: ignored

In [74]:
import numpy as np

In [75]:
len(vectors)

16477

In [76]:
# Register an all-zero vector for unknown words. The dimension and dtype are
# taken from the loaded model instead of being hard-coded to 50 / float64.
vectors.add_vector("<unk>", np.zeros(vectors.vector_size, dtype=np.float32))



16477

In [77]:
len(vectors)

16478

In [78]:
vectors.index_to_key[16477]

'<unk>'

In [79]:
vectors[16477]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [80]:
# Rebuild the weight tensor from the updated matrix so it includes the "<unk>" row.
weights = torch.tensor(vectors.vectors, dtype=torch.float32)

In [81]:
weights[16477]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [82]:
# Build an nn.Embedding layer initialised with the pre-trained weights.
# freeze=True sets embedding.weight.requires_grad to False, so the vectors are
# not updated during training. Assigning `requires_grad` on the module itself
# would only create an unused attribute and freeze nothing.
embedding = nn.Embedding.from_pretrained(weights, freeze=True)

In [92]:
torch.tensor(vectors.key_to_index["전도연"])

tensor(3500)

In [83]:
weights[vectors.key_to_index["최민식"]]

tensor([-0.1568,  0.3139,  0.1364, -0.1093, -0.3210, -0.3220,  0.2619,  0.9275,
        -0.1743, -0.3756, -0.2196, -0.1559, -0.1858,  0.2224,  0.0479, -0.2241,
         0.0129,  0.2198, -0.2114, -0.0411, -0.0639, -0.0159,  0.4326,  0.0965,
         0.0774,  0.0419, -0.2855, -0.0337, -0.1741,  0.1942,  0.0385, -0.2348,
        -0.1807,  0.1628, -0.3001, -0.1390,  0.3292,  0.1922, -0.0187, -0.5012,
         0.5684,  0.1994,  0.1155,  0.1536,  0.1397, -0.3648,  0.2246, -0.4449,
         0.2004,  0.1829])

In [93]:
# Batch of one sequence holding the two vocabulary indices looked up above
# (3374 -> "최민식", 3500 -> "전도연"). The empty list slots in the original
# were a syntax error.
ids = torch.tensor([[3374, 3500]])
# Result has shape (batch, sequence length, embedding dimension).
embedding(ids)

tensor([[[-0.1568,  0.3139,  0.1364, -0.1093, -0.3210, -0.3220,  0.2619,
           0.9275, -0.1743, -0.3756, -0.2196, -0.1559, -0.1858,  0.2224,
           0.0479, -0.2241,  0.0129,  0.2198, -0.2114, -0.0411, -0.0639,
          -0.0159,  0.4326,  0.0965,  0.0774,  0.0419, -0.2855, -0.0337,
          -0.1741,  0.1942,  0.0385, -0.2348, -0.1807,  0.1628, -0.3001,
          -0.1390,  0.3292,  0.1922, -0.0187, -0.5012,  0.5684,  0.1994,
           0.1155,  0.1536,  0.1397, -0.3648,  0.2246, -0.4449,  0.2004,
           0.1829],
         [-0.3159,  0.2604,  0.1791,  0.0529, -0.1593, -0.4048,  0.1770,
           0.8402, -0.1956, -0.2543, -0.1362, -0.2640,  0.2043,  0.2013,
          -0.0790, -0.0886,  0.2914,  0.2092, -0.3059, -0.0808, -0.2045,
           0.0949,  0.0229, -0.1561, -0.0344,  0.0701, -0.2734,  0.1026,
          -0.1711, -0.0489,  0.0578, -0.0643, -0.0910,  0.0727, -0.1821,
           0.0929,  0.2744,  0.0857, -0.0672, -0.7545,  0.5618,  0.0910,
           0.0613, -0.1733,  0.