## Work embedding

sample taken from https://docs.pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

puntos clave:

> Word embedding es la representación semantica de una palabra
en vez de usar one-hot vector


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f1925258e50>

In [2]:
word_to_ix = {"hello": 0, "World":1}
word_to_ix["World"]

# 2 palabras en el vocabulario
# 5 tamaño del embedding de las palabras
embedd = nn.Embedding(2, 5)
lookup_tensor = torch.tensor([word_to_ix["World"]], dtype=torch.long)
worl_embedd = embedd(lookup_tensor)
print(worl_embedd)

tensor([[-0.1661, -1.5228,  0.3817, -1.0276, -0.5631]],
       grad_fn=<EmbeddingBackward0>)


In [3]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
PATH_FILE = '/mnt/c/Users/gustavo.choque/Desktop/incidents.txt'
with open(PATH_FILE, 'r', encoding='utf-8') as file:
    test_sentence = file.read().strip()

test_sentence = test_sentence.split()

# aca se crear una tupla ('dato1', 'dato2')
# la tupla es inmutable y de longitud fija
ngrams = [([test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)], test_sentence[i])
       for i in range(CONTEXT_SIZE, len(test_sentence))]

#print(test_sentence)

vocab = set(test_sentence)

#agrega una enumración a cada elemento de un arreglo
#retorna un type Dict diccionary
word_to_ix = {word: i for i, word in enumerate(vocab)}    
word_to_ix["mismo"]


133

In [6]:
class NGramLanguageModeler(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_props = F.log_softmax(out, dim=1)
        return log_props

losses = []
loss_function = nn.L1Loss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in ngrams:
        context_ids = torch.tensor([word_to_ix[w] for w in context], 
                                   dtype=torch.long)
        model.zero_grad()
        
        #forwad pass
        log_probs = model(context_ids)
        
        #loss function 
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], 
                                                     dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    losses.append(total_loss)

print(losses)
model.embeddings.weight[word_to_ix['mismo']]
        

[188279.96473026276, 188279.3112397194, 188278.68798160553, 188278.09405374527, 188277.52679777145, 188276.98608112335, 188276.46785354614, 188275.9724597931, 188275.49803256989, 188275.0440711975]


tensor([ 0.1714,  1.8895, -0.4044,  1.0543,  1.4037, -0.2143,  1.0520,  0.4427,
         0.3797,  0.4243], grad_fn=<SelectBackward0>)

In [None]:
# ref https://docs.pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear
# 3 cant. de features de entrada
# 5 cant. de features de salida
m = nn.Linear(3, 5)

#input is 81x3
input_sample = torch.randn(81, 3)
input_sample.shape

#output is 81x5
#A transpuesta tiene forma 3x5
#A tiene forma 5x3 , según doc de pytorch
output = m(input_sample)
print(output.shape)

#embedding
#10 es el tamaño del diccionario
#3 es el tamño del embedding
embedd = nn.Embedding(10, 3)
#size is 2x4
#tiene 2 samples de 1 indice(s)
inputEmb = torch.LongTensor([[2], [4]])

outEmb = embedd(inputEmb)
#2 samples con 1 indeice pero que ahora 
#tiene el embedding de 3
outEmb.size()


torch.Size([81, 5])


torch.Size([2, 1, 3])

In [12]:
#El View aplana el tensor
sample = torch.randn(5, 3)
a = sample.view((1,-1))
type(a)


torch.Tensor

list