In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random

# Load Wiki-Text-2 data

In [2]:
path = '../wikitext-2/'
train_path = path + 'wiki.train.tokens'
valid_path = path + 'wiki.valid.tokens'
test_path = path + 'wiki.test.tokens'

def read_file(file_path):
    # Open the file in read mode
    with open(file_path, 'r') as file:
        # Read the contents of the file
        file_contents = file.read()
    return file_contents

train_string = read_file(train_path)
valid_string = read_file(valid_path)
test_string = read_file(test_path)
print(train_string[:100])

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 ,


# Tokenization
The preprocessing step that 

1. splits a corpus into words or characters (tokens),
2. keeps the track of the words in a vocabulary, and
3. maps each unique token into a unique integer

## You can make your own tokenizer following these steps
1. Go through all the words in your corpus and keep unique words in a dictionary
2. Assign a unique integer to each unique word.
3. Convert your corpus of text to integers representing each word

**Or you can use a python library (e.g. nltk, Scikit Learn, HF Tokenizer) to do this step for you.**

## Advantages of using HF Tokenizer:
1. It removes the boilerplate tokenization code.
2. It is written in Rust, so it is faster than python.

We will use the BPE (Byte-Pair Encoding) algorithm to tokenize our corpus

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, 
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [52]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[PAD]","[UNK]"], vocab_size=3000)

tokenizer.pre_tokenizer = Whitespace()
files = [train_path]
tokenizer.train(files, trainer)






In [53]:
vocab = tokenizer.get_vocab()

len(vocab)

3000

In [54]:
tokenizer.encode('A single sequence').tokens

['A', 'single', 'sequ', 'ence']

In [55]:
tokenizer.encode(train_string[200:250]).tokens

['an',
 ',',
 'is',
 'a',
 't',
 'act',
 'ical',
 'role',
 '@-@',
 'playing',
 'video',
 'game',
 'de',
 'v']

In [58]:
tokens = tokenizer.encode('  HEC  is  ').ids
tokens

[41, 38, 36, 302]

In [63]:
[tokenizer.decode([token]) for token in tokens]

['H', 'E', 'C', 'is']

In [66]:
vocab['[PAD]']

0

In [64]:
vocab['[UNK]']

1

In [69]:
tokenizer.save(path+'tokenizer.json')

# Load our pre-trained tokenizer

In [87]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file=path+"tokenizer.json", 
                                    bos_token='[BOS]', eos_token='[EOS]', 
                                    unk_token='[UNK]', pad_token='[PAD]')

In [88]:
tokenizer

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


PreTrainedTokenizerFast(name_or_path='', vocab_size=3000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3000: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3001: AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [89]:
tokens = tokenizer.encode('  HEC  is  ')
tokens

[41, 38, 36, 302]

In [90]:
tokens = tokenizer.encode('HEC  is a university.')
tokens

[41, 38, 36, 302, 65, 291, 1410, 15]

In [91]:
tokenizer.vocab['[PAD]']

0

In [92]:
len(tokenizer.vocab)

3002

In [93]:
tokenizer.convert_tokens_to_ids(['[PAD]','[BOS]'])

[0, 3000]

In [94]:
tokenizer.convert_tokens_to_ids(['[BOS]','[PAD]','[EOS]'])

[3000, 0, 3001]

## Tokenize the training set

In [99]:
train_tokens = tokenizer(train_string)['input_ids']

In [101]:
len(train_tokens)

3115026

In [108]:
train_tokens[:10]

[30, 2275, 2641, 353, 65, 453, 2685, 662, 2330, 30]

In [110]:
from torch.utils.data import Dataset, DataLoader

In [143]:
class LanguageData(Dataset):
    def __init__(self,token_list, seq_len):
        self.token_list = token_list
        self.seq_len = seq_len
    def __getitem__(self, ind):
        inp = torch.tensor(self.token_list[ind:ind+self.seq_len])
        out = torch.tensor(self.token_list[ind+1:ind+self.seq_len+1])
        return inp, out
        
    def __len__(self):
        return len(self.token_list) - self.seq_len

In [144]:
seq_len = 5

train_set = LanguageData(train_tokens, seq_len)

In [145]:
len(train_set)

3115021

In [146]:
train_set[1000]

(tensor([  15,   38,  475, 1078,  489]),
 tensor([  38,  475, 1078,  489, 2515]))

In [147]:
tokenizer.batch_decode(train_set[1000])

['. E ach character has', 'E ach character has specific']

## Make a batch of data

In [152]:
bs = 8
train_loader = DataLoader(train_set, batch_size=bs)

In [153]:
bx, by = next(iter(train_loader))

In [154]:
bx

tensor([[  30, 2275, 2641,  353,   65],
        [2275, 2641,  353,   65,  453],
        [2641,  353,   65,  453, 2685],
        [ 353,   65,  453, 2685,  662],
        [  65,  453, 2685,  662, 2330],
        [ 453, 2685,  662, 2330,   30],
        [2685,  662, 2330,   30,   52],
        [ 662, 2330,   30,   52, 2736]])

In [155]:
by

tensor([[2275, 2641,  353,   65,  453],
        [2641,  353,   65,  453, 2685],
        [ 353,   65,  453, 2685,  662],
        [  65,  453, 2685,  662, 2330],
        [ 453, 2685,  662, 2330,   30],
        [2685,  662, 2330,   30,   52],
        [ 662, 2330,   30,   52, 2736],
        [2330,   30,   52, 2736,  152]])

In [156]:
tokenizer.batch_decode(bx)

['= Val ky ri a',
 'Val ky ri a Ch',
 'ky ri a Ch ronic',
 'ri a Ch ronic les',
 'a Ch ronic les III',
 'Ch ronic les III =',
 'ronic les III = S',
 'les III = S enj']

In [157]:
tokenizer.batch_decode(by)

['Val ky ri a Ch',
 'ky ri a Ch ronic',
 'ri a Ch ronic les',
 'a Ch ronic les III',
 'Ch ronic les III =',
 'ronic les III = S',
 'les III = S enj',
 'III = S enj ō']

## Put it all together

Subset the training set to faster computation

In [252]:
len(train_string), len(valid_string)

(10780437, 1120192)

In [253]:
train_tokens = tokenizer(train_string[:500_000])['input_ids']
valid_tokens = tokenizer(valid_string)['input_ids']
test_tokens = tokenizer(test_string)['input_ids']

In [254]:
seq_len = 8
bs = 64

train_set = LanguageData(train_tokens, seq_len)
valid_set = LanguageData(valid_tokens, seq_len)
test_set = LanguageData(test_tokens, seq_len)

print(f'Train set has {len(train_set)} sequences')
print(f'Valid set has {len(valid_set)} sequences')
print(f'Test set has {len(test_set)} sequences')

train_loader = DataLoader(train_set, batch_size=bs, drop_last=True)
valid_loader = DataLoader(valid_set, batch_size=bs, drop_last=True)
test_loader = DataLoader(test_set, batch_size=bs, drop_last=True)


Train set has 144558 sequences
Valid set has 323006 sequences
Test set has 366126 sequences


# Embedding layer

In [242]:
vocab_size = len(tokenizer)
embedding_dim = 10
padding_idx = tokenizer.pad_token_id
padding_idx

0

In [174]:
embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=padding_idx)
embed

Embedding(3002, 10, padding_idx=0)

In [175]:
hi = tokenizer('hi')['input_ids']
hi

[72, 73]

In [176]:
embed(torch.tensor(hi))

tensor([[ 0.4664, -0.9007,  1.2894,  0.7496,  0.2736,  0.2147,  0.2893, -0.1120,
         -1.5148, -0.5786],
        [-1.2685,  0.5601, -0.0723, -0.0527, -0.6910, -1.8882, -0.2942,  0.1605,
          0.9129, -1.7694]], grad_fn=<EmbeddingBackward0>)

In [177]:
embed(torch.tensor(hi)).shape

torch.Size([2, 10])

In [180]:
bx, by = next(iter(train_loader))
bx.shape

torch.Size([64, 100])

In [181]:
embed(bx).shape

torch.Size([64, 100, 10])

# Create an RNN to predict the next word

In [327]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, seq_len):
        super(RNN, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.seq_len = seq_len

        self.embed = nn.Embedding(self.vocab_size, self.embedding_dim, 0)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True) # [bs, seq_len, embedding_dim]
        # self.project = nn.Linear(hidden_size*seq_len, seq_len*self.vocab_size)
        self.project = nn.Linear(hidden_size, vocab_size)
    def init_h(self, bs):
        # input shape
        bs, self.seq_len, self.embedding_dim
        # (D * num_layers, bs, H_{out})
        h = torch.zeros(1, bs, self.hidden_size, requires_grad=False)
        return h
    def forward(self, x, h):
        x = self.embed(x)
        x, h = self.rnn(x, h)
        # x.shape -> bs, seq_len, hidden_size
        # x = x.contiguous().view(-1, self.seq_len*self.hidden_size)
        # x = self.project(x) # shape -> bs, seq_len*vocab_size
        x = self.project(x) # shape -> bs, seq_len, vocab_size
        x = x.contiguous().view(-1, self.vocab_size, self.seq_len) # multi-dim cross-entropy loss
        return x, h

In [345]:
embedding_dim = 16
hidden_size = 20
model = RNN(vocab_size, embedding_dim, hidden_size, seq_len)
ce = nn.CrossEntropyLoss()
lr = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr)

In [346]:
sum([p.numel() for p in model.parameters()])

111834

In [347]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 10

h = model.init_h(bs)
for e in range(epochs):
    losses=[]
    model.train()
    for bx, by in train_loader:
        bx, by, model = bx.to(device), by.to(device), model.to(device)
        out, h = model(bx, h)
        loss = ce(out, by)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        h.detach_()
        losses.append(loss.item())
    print(np.mean(losses))

8.052412684561405
8.046844196699698
8.041544349347731
8.036416841250167
8.031389635996653
8.026406132087336
8.021419736112083
8.016390332930694
8.011282468074397
8.006063566680922


# Transformer architecture

In [269]:
embedding_dim = 16

In [276]:
transformer_layer = nn.TransformerEncoderLayer(embedding_dim, 8, 2048, batch_first=True)

In [284]:
transformer_layer

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
  )
  (linear1): Linear(in_features=16, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=16, bias=True)
  (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

In [286]:
transformer_layer.linear1.weight.shape, transformer_layer.linear2.weight.shape

(torch.Size([2048, 16]), torch.Size([16, 2048]))

In [277]:
bx.shape, embedding_dim, seq_len

(torch.Size([64, 8]), 16, 8)

In [278]:
embedd = nn.Embedding(vocab_size, embedding_dim)

In [279]:
out = embedd(bx)
print(out.shape)
out = transformer_layer(out)
print(out.shape)

torch.Size([64, 8, 16])
torch.Size([64, 8, 16])


In [285]:
num_layers = 6
encoder = nn.TransformerEncoder(transformer_layer, num_layers)
encoder

TransformerEncoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)

In [281]:
out = embedd(bx)
print(out.shape)
out = encoder(out)
print(out.shape)

torch.Size([64, 8, 16])
torch.Size([64, 8, 16])


In [338]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, seq_len, embedding_dim, n_layers, n_heads, dim_feedforward):
        super(TransformerEncoder, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        self.n_layers = n_layers
        
        self.relu = nn.ReLU()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_layers = nn.ModuleList([nn.TransformerEncoderLayer(self.embedding_dim, n_heads, dim_feedforward, batch_first=True) for _ in range(n_layers)])
        self.fc = nn.Linear(self.embedding_dim, self.vocab_size)
    def forward(self, x):
        x = self.embed(x)
        for layer in self.transformer_layers:
            x = self.relu(layer(x))
        # x = x.contiguous().view(-1, self.seq_len*self.embedding_dim)
        x = self.fc(x)
        return x.contiguous().view(-1, self.vocab_size, self.seq_len)

In [339]:
embedding_dim = 16
model = TransformerEncoder(vocab_size, seq_len, embedding_dim, 6, 8, hidden_size)
lr = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr)

In [340]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 10

for e in range(epochs):
    losses=[]
    model.train()
    for bx, by in train_loader:
        bx, by, model = bx.to(device), by.to(device), model.to(device)
        out = model(bx)
        loss = ce(out, by)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
    print(np.mean(losses))

8.124975384812739
8.093722384413118
8.046181983500059
8.01690059979475
8.004818373807666
7.998558104618975
7.9944436172133955
7.990597010292344
7.986975481491156
7.983121994430772


In [341]:
sum([p.numel() for p in model.parameters()])

110034