## Word embedding

sample taken from https://docs.pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

puntos clave:

> Word embedding es la representación semántica de una palabra mediante un vector denso,
en vez de usar un vector one-hot.


In [27]:
import torch
import torch.nn as nn
# The layer functions used in this notebook (relu, log_softmax) live in
# torch.nn.functional; torch.functional does not expose them, so the alias
# F must point at torch.nn.functional.
import torch.nn.functional as F
import torch.optim as optim

# Fix the global RNG seed so randomly initialised weights are repeatable.
torch.manual_seed(1)

<torch._C.Generator at 0x7f7de02c57b0>

In [28]:
# Toy vocabulary: every word is mapped to its row in the embedding table.
word_to_ix = {word: position for position, word in enumerate(("hello", "World"))}

# 2 words in the vocabulary, each represented by a 5-dimensional vector.
embedd = nn.Embedding(num_embeddings=2, embedding_dim=5)

# The row to fetch is passed as a one-element long tensor holding the
# index of "World".
world_position = word_to_ix["World"]
lookup_tensor = torch.tensor([world_position], dtype=torch.long)
worl_embedd = embedd(lookup_tensor)
print(worl_embedd)

tensor([[-0.1661, -1.5228,  0.3817, -1.0276, -0.5631]],
       grad_fn=<EmbeddingBackward0>)


In [29]:
CONTEXT_SIZE = 2  # number of preceding words used as context for each target
EMBEDDING_DIM = 10  # presumably the embedding size given to the model later; not used in this cell
PATH_FILE = '/mnt/c/Users/gustavo.choque/Desktop/incidents.txt'

# Read the whole corpus and tokenise it on whitespace. str.split() with no
# arguments already ignores leading and trailing whitespace.
with open(PATH_FILE, 'r', encoding='utf-8') as file:
    test_sentence = file.read().split()

# Build one (context, target) tuple per position: the context is a list with
# the CONTEXT_SIZE words preceding the target, nearest word first. A tuple is
# used for the pair because it is immutable and of fixed length.
arr = [
    ([test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)], test_sentence[i])
    for i in range(CONTEXT_SIZE, len(test_sentence))
]

vocab = set(test_sentence)

# Give every distinct word an integer index. The iteration order of a set of
# strings can change from one interpreter run to the next (string hash
# randomisation), so enumerate the words in sorted order to keep the
# word -> index mapping identical across runs.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}


In [30]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model built on word embeddings.

    The embeddings of the context words are flattened into a single row,
    passed through a 128-unit hidden layer with ReLU, and projected to one
    score per vocabulary word.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, context_size: int):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores.
            embedding_dim: length of each embedding vector.
            context_size: number of context words per example; together with
                ``embedding_dim`` it fixes the input width of the first
                linear layer.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return log-probabilities over the vocabulary for one context.

        Args:
            inputs: integer tensor holding ``context_size`` word indices.

        Returns:
            Tensor of shape ``(1, vocab_size)`` containing log-probabilities.
        """
        # Flatten the looked-up embeddings into one row so they match the
        # input width of linear1.
        embeds = self.embeddings(inputs).view((1, -1))
        # Call through nn.functional explicitly so the class does not depend
        # on what a module-level ``F`` alias happens to be bound to.
        out = nn.functional.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs
        

In [None]:
# Linear layer taking 3 input features and producing 5 output features.
m = nn.Linear(in_features=3, out_features=5)

# Batch of 81 random samples with 3 features each, i.e. shape 81x3.
input_sample = torch.randn(81, 3)

# According to the PyTorch docs the weight A has shape 5x3, so its transpose
# has shape 3x5 and the output of the layer is 81x5.
output = m(input_sample)
print(output.size())


torch.Size([81, 5])
