In [1]:
# imports 
import numpy as np

import torch 
import torch.nn as nn
import torchtext
import torchtext.data as data
from torchtext.vocab import GloVe

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Load the pretrained GloVe vectors: the '6B' vector set with 300 dimensions per token.
# The resulting object is read below through its .itos, .stoi and .vectors attributes.
glove = GloVe(name='6B', dim=300)

### Extracting data from GloVe Object
Grab the vocabulary, the word-to-index mapping, and the embedding matrix.

In [4]:
# Pull the three parallel views out of the GloVe object.
embeddings = np.array(glove.vectors)      # one embedding row per token
vocab_emb_mapping_dict = glove.stoi       # token -> row index (same dict object as glove.stoi, not a copy)
vocab = np.array(glove.itos)              # row index -> token

# The vocabulary and the embedding matrix should have one entry per token each.
print("Vocab len:", len(vocab))
print("Embeddings len:", len(embeddings))

Vocab len: 400000
Embeddings len: 400000


#### Add Special Tokens To Vocab
Our model requires four special tokens, each wrapped in angle brackets (for example `<pad>`):
- **sos** - for start of sentence 
- **eos** - end of sentence 
- **unk** - unknown token
- **pad** - padding

Each special token also needs its own row in the embedding matrix, so the embeddings are extended to match.

In [5]:
special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]
num_special = len(special_tokens)

# Build everything from the untouched GloVe object instead of from the names this cell
# assigns. `vocab_emb_mapping_dict` previously aliased `glove.stoi`, so shifting it in
# place rewrote the GloVe lookup table and re-running the cell shifted every index again.
glove_vectors = np.asarray(glove.vectors)

# Shift every GloVe token up to accommodate the special tokens, which take indices 0..3.
vocab_emb_mapping_dict = {
    token: int(mapping_index) + num_special
    for token, mapping_index in glove.stoi.items()
}
for i, token in enumerate(special_tokens):
    vocab_emb_mapping_dict[token] = i

# Prepend the special tokens in one concatenation (repeated np.insert copies the array each time).
vocab = np.concatenate((special_tokens, np.array(glove.itos)))

# '<pad>' is all zeros; '<unk>', '<sos>' and '<eos>' start as the mean of all GloVe vectors.
# The zeros use the GloVe dtype so np.vstack does not upcast the whole matrix to float64.
pad_embedding = np.zeros((1, glove_vectors.shape[1]), dtype=glove_vectors.dtype)
mean_embedding = np.mean(glove_vectors, axis=0, keepdims=True)
unk_embedding, sos_embedding, eos_embedding = (mean_embedding.copy() for _ in range(3))

embeddings = np.vstack((pad_embedding, unk_embedding, sos_embedding, eos_embedding, glove_vectors))

# The vocabulary, the mapping and the embedding rows must stay aligned.
assert len(vocab) == embeddings.shape[0]
assert all(vocab[vocab_emb_mapping_dict[token]] == token for token in special_tokens)

In [6]:
# Sanity check: number of entries in the token -> index mapping once the special tokens are added.
len(vocab_emb_mapping_dict)

400004

#### Create Embedding Layer 

In [7]:
# Wrap the embedding matrix in a lookup layer. The explicit float32 cast keeps the layer
# in PyTorch's default dtype even when the NumPy matrix is float64, which would otherwise
# double the memory footprint and clash with float32 model weights.
glove_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float())

# One weight row per vocabulary entry, one column per embedding dimension.
assert glove_embedding_layer.weight.shape == embeddings.shape
print(glove_embedding_layer.weight.shape)

torch.Size([400004, 300])


## The Model
An unoptimized version of my final model. It contains extra functionality to allow setting the model's embeddings as well as a function to generate a sequence based on a starting sequence.

In [8]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):
    """Transformer-encoder language model built on a pretrained embedding matrix.

    The model is sequence-first: the encoder layers are created without
    ``batch_first`` and the positional encoding is sliced by ``x.size(0)``,
    so ``forward`` expects token ids shaped ``(seq_len, batch)`` and returns
    logits shaped ``(seq_len, batch, vocab_len)``.

    Args:
        embedding_matrix: 2-D tensor of pretrained embeddings, one row per token.
            It is cast to float32 before being loaded into the embedding layer.
        embedding_dim: Width of each embedding row.
        hidden_dim: Encoder model width. Must equal ``embedding_dim`` because the
            embeddings are fed to the encoder without a projection.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per layer.
        vocab_len: Size of the output vocabulary (width of the logits).
        max_len: Number of positions in the positional-encoding table.
        dropout_p: Dropout probability for the positional encoding and encoder layers.

    Raises:
        ValueError: If ``embedding_matrix`` is not ``(n, embedding_dim)`` or if
            ``embedding_dim`` differs from ``hidden_dim``.
    """

    def __init__(self, embedding_matrix, embedding_dim, hidden_dim, num_layers, num_heads, vocab_len, max_len, dropout_p):
        super().__init__()
        if embedding_matrix.dim() != 2 or embedding_matrix.shape[1] != embedding_dim:
            raise ValueError(
                f"embedding_matrix must have shape (n, {embedding_dim}), got {tuple(embedding_matrix.shape)}"
            )
        if embedding_dim != hidden_dim:
            # The encoder is built with d_model=hidden_dim but receives embedding_dim-wide
            # vectors; fail here instead of with a shape error inside forward().
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must equal embedding_dim ({embedding_dim})"
            )

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_len = max_len
        self.dropout_p = dropout_p

        print(embedding_matrix.shape)
        print("embedding_dim", embedding_dim)

        # Cast before wrapping so a float64 matrix is never materialised as a full-size
        # float64 parameter first. from_pretrained keeps the weights frozen by default.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix.to(torch.float))
        self.pos_encoder = PositionalEncoding(embedding_dim, max_len, dropout_p)
        encoder_layers = TransformerEncoderLayer(hidden_dim, num_heads, hidden_dim * 4, dropout_p)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(hidden_dim, vocab_len)

    def forward(self, src, src_mask=None):
        """Compute next-token logits for every position.

        Args:
            src: Token ids shaped ``(seq_len, batch)``.
            src_mask: Optional attention mask forwarded to the encoder
                (for example a causal mask); ``None`` means full attention.

        Returns:
            Logits shaped ``(seq_len, batch, vocab_len)``.
        """
        # Scale the embeddings by sqrt(embedding_dim) before adding positional information.
        src = self.embedding(src) * math.sqrt(self.embedding_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, mask=src_mask)
        output = self.decoder(output)
        return output


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first inputs.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and stored as the
    buffer ``pe``: even feature indices hold sines and odd indices hold cosines.

    Args:
        d_model: Feature width of the inputs (even or odd).
        max_len: Number of positions in the table, i.e. the longest supported sequence.
        dropout_p: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, max_len, dropout_p=0.2):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_p)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Shape (max_len, 1, d_model): broadcasts over the batch axis of sequence-first input.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions, then apply dropout.

        Args:
            x: Tensor whose first axis is the sequence position and whose last
                axis has width ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the table built in ``__init__``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

## Defining the Model

### Hyperparameters

In [9]:
# Model hyperparameters

# Sizes taken from the prepared vocabulary and embedding matrix.
vocab_len = len(vocab)
embedding_dim = embeddings.shape[1]

# Encoder architecture. TransformerModel feeds the embeddings straight into an encoder
# whose width is hidden_dim, so hidden_dim has to match embedding_dim.
hidden_dim = 300
num_heads = 4
num_layers = 2

# Positional-encoding table length and dropout probability.
max_len = 100
dropout_p = 0.2

### Instantiate the Model

In [10]:
# Build the Transformer from the hyperparameters above, then move it to the selected device.
model = TransformerModel(
    vocab_len=vocab_len,
    max_len=max_len,
    embedding_matrix=torch.from_numpy(embeddings),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout_p=dropout_p,
)
model = model.to(device)

torch.Size([400004, 300])
embedding_dim 300


## Generating Text
The `generate_sen` function samples a sequence of token indices from the model rather than words, so the indices are converted back to words before the sequence is returned.

In [18]:
def generate_sen(model, start_seq, vocab_emb_mapping_dict, max_len=20):
    """Sample a continuation of ``start_seq`` from ``model`` and return it as words.

    The model is called sequence-first, matching ``TransformerModel`` in this
    notebook: it receives token ids shaped ``(seq_len, 1)`` and must return
    logits shaped ``(seq_len, 1, vocab)``. The logits of the last position are
    turned into a distribution and one token is sampled per step.

    Args:
        model: Module mapping ``(seq_len, batch)`` token ids to
            ``(seq_len, batch, vocab)`` logits. If it exposes an integer
            ``max_len`` attribute, only the most recent ``max_len`` tokens are
            fed back as context.
        start_seq: Non-empty sequence of prompt words.
        vocab_emb_mapping_dict: Mapping from token string to integer index.
            Words missing from it fall back to ``'<unk>'`` when that entry exists.
        max_len: Maximum number of tokens to sample.

    Returns:
        List of token strings: the prompt followed by the sampled tokens,
        ending with ``'<eos>'`` if it was sampled.

    Raises:
        ValueError: If ``start_seq`` is empty.
        KeyError: If a word is unknown and the mapping has no ``'<unk>'`` entry.
    """
    if len(start_seq) == 0:
        raise ValueError("start_seq must contain at least one word")

    model.eval()

    unk_id = vocab_emb_mapping_dict.get('<unk>')
    eos_id = vocab_emb_mapping_dict.get('<eos>')

    token_ids = []
    for word in start_seq:
        if word in vocab_emb_mapping_dict:
            token_ids.append(vocab_emb_mapping_dict[word])
        elif unk_id is not None:
            token_ids.append(unk_id)
        else:
            raise KeyError(word)

    # Kept batch-first, shape (1, seq_len), for printing and appending;
    # transposed to sequence-first only when handed to the model.
    start_tokens = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    print("Start Tokens", start_tokens)

    # Run on whatever device the model lives on (the prompt tensor is created on the CPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        start_tokens = start_tokens.to(first_param.device)

    context_limit = getattr(model, 'max_len', None)

    with torch.no_grad():
        for _ in range(max_len):
            context = start_tokens[:, -context_limit:] if context_limit else start_tokens
            output = model(context.transpose(0, 1))        # (seq_len, 1, vocab)
            probabilities = nn.functional.softmax(output[-1], dim=-1)   # last position -> (1, vocab)
            next_token = torch.multinomial(probabilities, num_samples=1)
            start_tokens = torch.cat([start_tokens, next_token], dim=-1)
            if next_token.item() == eos_id:
                break

    # Invert the mapping once; setdefault keeps the first token seen for an index.
    index_to_token = {}
    for token, index in vocab_emb_mapping_dict.items():
        index_to_token.setdefault(index, token)

    output_seq = [index_to_token[index] for index in start_tokens.reshape(-1).tolist()]
    return output_seq


In [20]:
# Prompt words; each must be a key in the token -> index mapping.
start_seq = ["the", "quick", "brown"]

# Sample up to 25 more tokens from the model after the prompt.
out_seq = generate_sen(model, start_seq, vocab_emb_mapping_dict, max_len=25)

# Show the prompt, then the generated sequence.
print(start_seq)

print(out_seq)

Start Tokens tensor([[   4, 2586, 1046]])
['the', 'quick', 'brown']
['the', 'quick', 'brown', '32.57', 'swabians', 'hochwald', 'glamorize', 'kronenbourg', 'askar', 'wiślica', 'jjb', 'maartens', 'danic', 'salpointe', 'scrapyards', 'fearnley', 'concessionary', 'rostro', 'zenobia', 'watered-down', 'armyworms', 'coombe', '(760)', 'fyodorova', 'positron', '19.37', 'lehmans', 'she-devil']
28
