In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [65]:
# Build a LSTM model for language modeling with PyTorch.
# There should be an embedding layer. 
# The model should be able to generate new sentences.
# Train the model to predict the next word in a sentence given the previous words.
# The model should output a probability distribution over the vocabulary.


class MyModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, tokens):
        embeds = self.embeddings(tokens)
        print(embeds.shape)
        lstm_out, self.hidden = self.lstm(embeds.view(len(tokens), 1, -1), self.hidden)
        linear_out = self.linear(lstm_out.view(len(tokens), -1))
        return F.log_softmax(linear_out, dim=1)
    

# Alternative model: perform mean pooling over the embeddings of tokens before "[SEP]" 
# (and no pooling in the tokens after "[SEP]")

class MeanPoolModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, vocab):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.vocab = vocab
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, tokens):
        embeds = self.embeddings(tokens)
        sep_idx = (tokens == self.vocab["[SEP]"]).nonzero()[0][0]
        embeds = embeds[:sep_idx]
        embeds = torch.mean(embeds, dim=0, keepdim=True)
        lstm_out, self.hidden = self.lstm(embeds.view(len(tokens), 1, -1), self.hidden)
        linear_out = self.linear(lstm_out.view(len(tokens), -1))
        return F.log_softmax(linear_out, dim=1)



In [62]:
from torchtext.vocab import build_vocab_from_iterator, Vocab


def tokenize_doc(doc: str) -> list:
    return doc.split()

def doc2tensor(doc: str, vocab: Vocab):
    """Convierte documento a flat Tensor de vocab token ids
    """
    tokens = tokenize_doc(doc)
    idxs = vocab(tokens)
    res = torch.tensor(idxs, dtype=torch.long)
    return res

texts_train = [
    "[TAG] food [TAG] drama [SEP] I hate food [EOS]",
    "[TAG] food [SEP] I like food [EOS]",
    "[TAG] music [SEP] I like music [EOS]",
]

vocab = build_vocab_from_iterator(map(tokenize_doc, texts_train), min_freq=1)
vocab["[SEP]"]

4

In [63]:
test_text = "[TAG] drama [SEP] I hate music [EOS]"

# run model with random initialization on test text:
model = MeanPoolModel(len(vocab), 10, 10, 1, vocab)
test_tensor = doc2tensor(test_text, vocab)
print(test_tensor)
with torch.no_grad():
    model.hidden = model.init_hidden()
    model.eval()
    out = model(test_tensor)
    print(out.shape)
    print(out[0])

tensor([0, 7, 4, 2, 8, 6, 3])


RuntimeError: shape '[7, 1, -1]' is invalid for input of size 10

In [67]:
test_text = "[TAG] drama [SEP] I hate music [EOS]"

# run model with random initialization on test text:
model = MyModel(len(vocab), 10, 10, 1)
test_tensor = doc2tensor(test_text, vocab)
# print(test_tensor)
with torch.no_grad():
    model.hidden = model.init_hidden()
    model.eval()
    out = model(test_tensor)
    # print(out.shape)
    # print(out[0])

torch.Size([7, 10])


In [52]:
# generate text with model with random based on prefix:
max_len = 10
model = MyModel(len(vocab), 10, 10, 1)
prefix_tensor = doc2tensor("[TAG] drama [SEP]", vocab)

with torch.no_grad():
    model.hidden = model.init_hidden()
    model.eval()
    for i in range(max_len):
        out = model(prefix_tensor)
        # sample next word
        next_word = torch.multinomial(out[-1].exp(), num_samples=1)
        print(vocab.get_itos()[next_word.item()])
        # append to prefix
        prefix_tensor = torch.cat([prefix_tensor, next_word], dim=0)
        # print(prefix_tensor)
        # stop if EOS
        if next_word == vocab["[EOS]"]:
            break
    # print(prefix_tensor)



hate
food
hate
drama
[SEP]
[SEP]
like
[TAG]
[EOS]


In [8]:
print(vocab.get_stoi())

{'hate': 8, 'music': 6, 'like': 5, '[SEP]': 4, '[EOS]': 3, 'I': 2, 'food': 1, 'drama': 7, '[TAG]': 0}


In [None]:
# prepare data to train Language Model:



In [3]:
# Build a LSTM model for language modeling with PyTorch.
# The first layer sholud be an embedding layer with the size of (vocab_size, embedding_dim). 
# These embeddings should be pooled using a mean pooling operation over the time dimension.
# The model should be able to generate new sentences based on a given set of tags.
# "[TAG] food [TAG] drama [SEP] The deadliest food I've ever tasted! [EOS]"
# Train the model to predict the next word in a sentence given the previous words.


class Modelo(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Modelo, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # apply mean pooling over the time dimension:
        self.pooling = nn.AvgPool1d(1)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        embeds = embeds.view(len(sentence), 1, -1)
        embeds = self.pooling(embeds)
        lstm_out, _ = self.lstm(embeds)
        # 


     

In [None]:
# Create a small set of toy data:

# The vocabulary size is 10.
# The embedding dimension is 6.
# The hidden dimension is 4.
# The tagset size is 3.

# The input is a sequence of 4 words: 1, 2, 3, 4.
# The output is a sequence of 4 tags: 0, 1, 2, 0.

# The model should be able to generate new sentences based on a given set of tags.






