In [36]:
import os
from pathlib import Path
import numpy as np
import time


import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from torchvision import transforms


# import pytorch_lightning as pl


In [3]:
import pytorch_lightning
# for some reason, need to run this twice

In [4]:
import zlib

import pytorch_lightning as pl

# Seed every RNG that Lightning knows about with a value that is identical on every run.
# The previous `hash("setting random seeds") % 2**32 - 1` was not reproducible: Python salts str hashes
# per interpreter process (unless PYTHONHASHSEED is pinned), and operator precedence made the expression
# `(hash(...) % 2**32) - 1`, which can evaluate to -1.
# CRC32 of the same phrase is stable across processes and always lies in [0, 2**32 - 1].
pl.seed_everything(zlib.crc32(b"setting random seeds"))

# 🏋️‍♀️ Weights & Biases
import wandb

# ⚡ 🤝 🏋️‍♀️
from pytorch_lightning.loggers import WandbLogger

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mfeloundou[0m (use `wandb login --relogin` to force relogin)


True

In [48]:
# Test Text
# from torchtext import data, datasets
# import torchtext
from torchtext.data import Field, BucketIterator
# from torchtext.datasets import IMDB
from torchtext.datasets import IWSLT
# from torchtext.datasets import WikiText2 #vocab size of 33,278
import spacy


In [12]:
# python -m spacy download fr

# Sanity-check the English pipeline: run it on one sentence and list (token text, part-of-speech) pairs.
nlp = spacy.load("en")
doc = nlp("This is an English sentence.")
print([(w.text, w.pos_) for w in doc])

# Same check for the French pipeline.
nlpf = spacy.load('fr')
docu = nlpf("Voici une phrase en francais.")
print([(w.text, w.pos_) for w in docu])

# Tokenizer-only pass over a French sentence.
# Uses nlpf (loaded just above) rather than spacy_fr, which is only assigned in a later cell and
# therefore raised NameError when this cell was run on a fresh kernel.
print([tok.text for tok in nlpf.tokenizer("Je ne suis pas une malade.")])

[('This', 'DET'), ('is', 'AUX'), ('an', 'DET'), ('English', 'ADJ'), ('sentence', 'NOUN'), ('.', 'PUNCT')]
[('Voici', 'VERB'), ('une', 'DET'), ('phrase', 'NOUN'), ('en', 'ADP'), ('francais', 'NOUN'), ('.', 'PUNCT')]
['Je', 'ne', 'suis', 'pas', 'une', 'malade', '.']


In [21]:

# Hyperparameters for the Transformer experiment, collected in a single dict.
# NOTE(review): DATA_PATH is assigned in a later cell (In [15]); on a fresh kernel this cell raises
# NameError unless that cell has been run first.
args = {
    "full_data_dir": DATA_PATH,  # pathlib.Path of the data directory
    "model_dimension" : 512,
    "num_layers" : 6,
    "num_heads" : 8,
#     "batch_size" : 4096, # batch size from the paper
    "batch_size" : 8,  # much smaller than the paper's value kept in the commented-out line above
    "dropout" : 0.1,
    "label_smoothing" : 0.1
}



In [55]:
# Special tokens handed to the torchtext Fields.
# In this cell the Fields pad with BLANK_TOKEN; PAD_TOKEN is the pad token used by get_language_datasets.
BOS_TOKEN = '<s>'
EOS_TOKEN = '</s>'
PAD_TOKEN = "<pad>"
BLANK_TOKEN = "<blank>"


# spaCy pipelines; the tokenize_* helpers defined next only use their .tokenizer attribute.
spacy_en = spacy.load('en')
spacy_fr = spacy.load('fr')

def tokenize_french(text):
    """Split ``text`` with the module-level ``spacy_fr`` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_fr.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_english(text):
    """Split ``text`` with the module-level ``spacy_en`` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# Field definitions: French is the source side and English the target side in this cell.
# Only the target Field adds begin/end-of-sentence tokens; both pad with BLANK_TOKEN.
SRC = Field(tokenize=tokenize_french, pad_token=BLANK_TOKEN)
TGT = Field(tokenize=tokenize_english, init_token = BOS_TOKEN, eos_token = EOS_TOKEN, pad_token=BLANK_TOKEN)

# Keep only sentence pairs whose source AND target both have at most MAX_LEN tokens.
MAX_LEN = 100
train, val, test = IWSLT.splits(
    exts=('.fr', '.en'), fields=(SRC, TGT), 
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN)
# Build each vocabulary from the training split only, using MIN_FREQ as the minimum token frequency.
MIN_FREQ = 2
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

# Bare expression so the notebook displays the Field object.
SRC

<torchtext.data.field.Field at 0x7f26b7c7ad00>

In [None]:
# atoi  -> ASCII to integer.
# atol  -> ASCII to long.
# atof  -> ASCII to floating.
# stoi  -> string to integer.
# stol  -> string to long.
# stoll -> string to long long.
# stof  -> string to float. 
# stod  -> string to double.
# stold -> string to long double.
# itos  -> integer to string

In [102]:
# Round-trip a few entries through the vocabularies: stoi maps token string -> index, itos maps index -> string.
# NOTE(review): the literal indices 33130 and 27375 are the values stoi printed in the recorded run for
# 'dog' / 'canine'; they depend on the vocabulary that was built and may change when it is rebuilt.
print(SRC.vocab.stoi['dog'])
print(SRC.vocab.itos[666])
print(SRC.vocab.itos[33130])

print('\n')
print(TGT.vocab.stoi['canine'])
print(TGT.vocab.itos[666])
print(TGT.vocab.itos[27375])



33130
Ou
dog


27375
sitting
canine


In [103]:
# Let's look at a batch of 4 sentences.
# sort_key ranks examples by the number of target tokens; shuffle=True asks the iterator to shuffle the data.
train_iter = BucketIterator(train, batch_size=4, sort_key=lambda x: len(x.trg), shuffle=True)


In [104]:

batch = next(iter(train_iter))
'''In each batch, the sentences have been transposed so they are descending vertically 
(important: we will need to transpose these again to work with the transformer). Each index represents a token (word), 
and each column represents a sentence. We have 4 columns, as 4 was the batch_size we specified.'''

print(batch.src) # source



tensor([[  102,    26,    90,    48],
        [    2,    65,    10,    81],
        [16010,    28,   200,    87],
        [    7,    32,    35,  5896],
        [   31,    78,     5,    89],
        [13999,   769,  1858,   235],
        [   29,    25,     2, 45549],
        [ 3333,     2,   171,     3],
        [    9,    33,    14,  1178],
        [    5,    60,   142,   396],
        [  128,    24,     4,     4],
        [   24,    11, 15765,    13],
        [   59,  1229,     2,  3090],
        [   14,    72,    10,     2],
        [13906,   213,  1905,     4],
        [    6,     3,    20,  1333],
        [   14,     1,   199,     4],
        [ 9806,     1,     9,   724],
        [    3,     1,   304,     2],
        [    1,     1,    49,     4],
        [    1,     1,  1589,  2011],
        [    1,     1,     2,     4],
        [    1,     1,   151,  1262],
        [    1,     1,   171,    50],
        [    1,     1,   123,   238],
        [    1,     1,  7977,   263],
        [   

In [105]:
print(batch.trg) # target

tensor([[    2,     2,     2,     2],
        [   32,    19,    90,    32],
        [12983,    62,    35,    12],
        [   14,    76,    79,    22],
        [  198,   104,    27,   236],
        [   39,    40,    10,    39],
        [  409,   179,  1797, 15149],
        [13523,    67,     4,   237],
        [    9,     4,   190,     4],
        [ 7301,    16,   102,    51],
        [  121,    22,     8,    34],
        [ 3036,     8, 19484,    22],
        [    5,   202,     4,     6],
        [    3,     7,     6,  2870],
        [    1,  1147,   648,     4],
        [    1,    67,   187,     6],
        [    1,     5,     7,  3790],
        [    1,     3,   636,     4],
        [    1,     1,     4,     6],
        [    1,     1,    55,   822],
        [    1,     1,   190,     4],
        [    1,     1,  6325,     6],
        [    1,     1,     5,  1406],
        [    1,     1,     3,  2561],
        [    1,     1,     1,    11],
        [    1,     1,     1,    60],
        [   

In [89]:
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Keep augmenting batch and calculate total number of tokens + padding.

    Args:
        new: the example being added to the batch. Its source/target token lists are read from
            ``new.src`` / ``new.trg`` (the field names used by every dataset in this notebook);
            examples that only carry the older ``English`` / ``French`` attributes are still accepted.
        count: number of examples in the batch including ``new``; ``count == 1`` marks a new batch.
        sofar: current effective batch size (unused here).

    Returns:
        ``count`` times the longest sequence seen so far in this batch, taking the larger of the
        source and target sides — i.e. the batch size in tokens once every sequence is padded.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        # First example of a fresh batch: forget the maxima of the previous batch.
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    # The original read new.English / new.French, attributes that no example built in this notebook has.
    src_tokens = new.src if hasattr(new, "src") else new.English
    trg_tokens = new.trg if hasattr(new, "trg") else new.French
    max_src_in_batch = max(max_src_in_batch, len(src_tokens))
    # +2 on the target side: presumably room for the BOS/EOS tokens the target Field adds — confirm.
    max_tgt_in_batch = max(max_tgt_in_batch, len(trg_tokens) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

In [15]:

# File path for the model checkpoint (not referenced by the cells shown here).
PATH = './transformer_fr_en.pth'

# Local data directory; created if missing (parents are not created).
DATA_PATH=Path('./data/')
DATA_PATH.mkdir(exist_ok=True)


# Special tokens and dataset limits. These repeat the values assigned in the earlier Field-setup cell.
BOS_TOKEN = '<s>'
EOS_TOKEN = '</s>'
PAD_TOKEN = "<pad>"
BLANK_TOKEN = "<blank>"
MAX_LEN = 100  # filter out examples that have more than MAX_LEN tokens
MIN_FREQ = 2   # passed as min_freq when the vocabularies are built


In [44]:
def save_cache(cache_path, dataset):
    """Write the tokenized examples of ``dataset`` to ``cache_path`` as UTF-8 text.

    Each example contributes two lines: its source tokens joined by single spaces, then its
    target tokens joined the same way (source on even lines, target on odd lines, counting from 0).
    """
    with open(cache_path, 'w', encoding='utf-8') as sink:
        for example in dataset.examples:
            print(' '.join(example.src), file=sink)
            print(' '.join(example.trg), file=sink)

In [106]:

def get_language_datasets(data_dir, max_len = MAX_LEN, min_freq = MIN_FREQ):
    """Load the IWSLT English->French splits and build the source/target vocabularies.

    Args:
        data_dir: directory passed to ``IWSLT.splits`` as ``root`` and used for the cache files.
        max_len: pairs whose source or target has more than this many tokens are filtered out.
        min_freq: ``min_freq`` passed to ``build_vocab`` for both fields.

    Returns:
        ``(train_dataset, val_dataset, src_field_processor, trg_field_processor)``. The test split
        is loaded and written to its cache file but is not returned.

    Side effects: writes three ``en_fr_iwslt_*_cache.csv`` files into ``data_dir`` and prints the
    elapsed preparation time.
    """

    spacy_en = spacy.load('en')
    spacy_fr = spacy.load('fr')

    def tokenize_fr(text):
        """Return the French tokenizer's token strings for ``text``."""
        return [tok.text for tok in spacy_fr.tokenizer(text)]

    def tokenize_en(text):
        """Return the English tokenizer's token strings for ``text``."""
        return [tok.text for tok in spacy_en.tokenizer(text)]

    
    # Tokenize for the source and target: English is the source here, French the target
    # (the reverse of the SRC/TGT Fields built in the earlier exploratory cell).
    src_tokenizer = tokenize_en
    trg_tokenizer = tokenize_fr
    
    src_field_processor = Field(tokenize=src_tokenizer, pad_token=PAD_TOKEN, batch_first=True) # Whether to produce tensors with the batch dimension first. Default: False.
    trg_field_processor = Field(tokenize=trg_tokenizer, init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)

    fields = [('src', src_field_processor), ('trg', trg_field_processor)]

    # The splits call is slow because it re-tokenizes the corpus, so call it only once.
    prefix = 'en_fr_iwslt'
    
    train_cache_path = os.path.join(data_dir, f'{prefix}_train_cache.csv')
    val_cache_path = os.path.join(data_dir, f'{prefix}_val_cache.csv')
    test_cache_path = os.path.join(data_dir, f'{prefix}_test_cache.csv')

    # This simple caching mechanism gave me ~30x speedup on my machine! From ~70s -> ~2.5s!
    # NOTE(review): as written, this function only WRITES the cache files below and never reads them
    # back, so every call still pays the full tokenization cost — the speedup above does not apply yet.
    ts = time.time()
 
    src_ext = '.en' 
    trg_ext = '.fr' 
    
    train_dataset, val_dataset, test_dataset = IWSLT.splits(
        exts=(src_ext, trg_ext),
        fields=fields,
        root=data_dir,
        filter_pred=lambda x: len(x.src) <= max_len and len(x.trg) <= max_len
    )
    

    # Persist the tokenized examples (source/target on alternating lines) for each split.
    save_cache(train_cache_path, train_dataset)
    save_cache(val_cache_path, val_dataset)
    save_cache(test_cache_path, test_dataset)

#     print(f'Time it took to prepare the data: {time.time() - ts:3f} seconds.')
    print('It took {} seconds to prepare the data.'.format(time.time()-ts))

    # __getattr__ implementation in the base Dataset class enables us to call .src on Dataset objects even though
    # we only have a list of examples in the Dataset object and the example itself had .src attribute.
    # Implementation will yield examples and call .src/.trg attributes on them (and those contain tokenized lists)
    # Vocabularies are built from the training split only.
    src_field_processor.build_vocab(train_dataset.src, min_freq=min_freq)
    trg_field_processor.build_vocab(train_dataset.trg, min_freq=min_freq)

    return train_dataset, val_dataset, src_field_processor, trg_field_processor


In [95]:
# How to get efficient batching

'''While Torchtext is brilliant, its sort_key-based batching leaves 
a little to be desired. Often the sentences are of different lengths, 
and you end up feeding a lot of padding into your network 
(as you can see with all the 1s in the batch printed earlier).

Additionally, if your RAM can process, say, 1500 tokens each iteration 
and your batch_size is 20, then you only utilise all the memory 
when the sequences in a batch are 75 tokens long (20 * 75 = 1500). 

An efficient batching mechanism would change the batch size 
depending on the sequence length to make sure around 1500 tokens were being processed each iteration.'''

global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Return the padded token count of the batch once ``new`` has been added to it.

    ``count`` is the number of examples in the batch including ``new``; a value of 1 means a new
    batch is starting, so the running per-batch maxima are reset. ``sofar`` is accepted for the
    iterator's calling convention but is not used. The result is ``count`` multiplied by the
    longest sequence seen in this batch, on whichever of the source/target sides is larger.
    """
    global max_src_in_batch, max_tgt_in_batch

    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0

    # Track the longest source sequence of the current batch.
    source_length = len(new.src)
    if source_length > max_src_in_batch:
        max_src_in_batch = source_length

    # Track the longest target sequence; 2 is added to every target length
    # (presumably for the BOS/EOS tokens — confirm).
    target_length = len(new.trg) + 2
    if target_length > max_tgt_in_batch:
        max_tgt_in_batch = target_length

    padded_source_tokens = count * max_src_in_batch
    padded_target_tokens = count * max_tgt_in_batch
    if padded_source_tokens >= padded_target_tokens:
        return padded_source_tokens
    return padded_target_tokens


In [96]:
# How to get the attention masks and count the non-pad tokens in the source and target batches.

def masks_tokens(src_token_ids_batch, trg_token_ids_batch, pad_token_id, device):
    """Build the source/target attention masks and count the non-pad tokens on each side.

    Args:
        src_token_ids_batch: 2-D tensor of source token ids, batch dimension first.
        trg_token_ids_batch: 2-D tensor of target token ids, batch dimension first.
        pad_token_id: id marking padding positions in both batches.
        device: not used; the causal mask is created on the target batch's own device.

    Returns:
        ``(src_mask, trg_mask, num_src_tokens, num_trg_tokens)`` where a True mask entry means
        "may attend". ``src_mask`` is viewed as (B, 1, 1, S) and hides padding only; ``trg_mask``
        broadcasts to (B, 1, T, T) and hides both padding and future positions. The two counts
        are 0-dim integer tensors.
    """
    # ---- source side: only padding is hidden ----
    src_rows = src_token_ids_batch.shape[0]
    src_is_real_token = src_token_ids_batch != pad_token_id
    src_mask = src_is_real_token.view(src_rows, 1, 1, -1)
    num_src_tokens = src_mask.long().sum()

    # ---- target side: padding AND future positions are hidden ----
    trg_rows = trg_token_ids_batch.shape[0]
    trg_device = trg_token_ids_batch.device
    trg_length = trg_token_ids_batch.shape[1]
    trg_padding_mask = (trg_token_ids_batch != pad_token_id).view(trg_rows, 1, 1, -1)

    # Upper-triangular boolean matrix, transposed so that row i is True for columns 0..i.
    all_true = torch.ones((1, 1, trg_length, trg_length), device=trg_device) == 1
    no_look_forward_mask = torch.triu(all_true).transpose(2, 3)

    # A target position is attended to only when both conditions hold.
    trg_mask = trg_padding_mask & no_look_forward_mask
    num_trg_tokens = trg_padding_mask.long().sum()

    return src_mask, trg_mask, num_src_tokens, num_trg_tokens

In [107]:
# Build the train/validation batch iterators and return them with the source/target field processors
def custom_dataloader(data_dir, batch_size, device):
    """Create token-id batch iterators for the training and validation splits.

    Args:
        data_dir: directory forwarded to ``get_language_datasets``. (Previously this argument was
            ignored and the global ``DATA_PATH`` was always used.)
        batch_size: batch size limit handed to ``BucketIterator.splits``. Because ``batch_size_fn``
            is supplied, the limit is measured in the units that function returns (padded token
            count) rather than in sentences — TODO confirm against the torchtext documentation.
        device: device handed to ``BucketIterator.splits`` for the produced batches.

    Returns:
        ``(train_token_ids_loader, val_token_ids_loader, src_field_processor, trg_field_processor)``.
    """
    train_dataset, val_dataset, src_field_processor, trg_field_processor = get_language_datasets(data_dir=data_dir)

    # batch_size_fn:
    # Function of three arguments (new example to add, current count of examples in the batch, and
    # current effective batch size) that returns the new effective batch size resulting from adding
    # that example to a batch. This is useful for dynamic batching, where this function would add to
    # the current effective batch size the number of tokens in the new example.

    # No sort_key is passed here, so sorting relies on the iterator's default behaviour
    # (presumably the sort key defined by the dataset — confirm).
    train_token_ids_loader, val_token_ids_loader = BucketIterator.splits(
        datasets=(train_dataset, val_dataset),
        batch_size=batch_size,
        device=device,
        sort_within_batch=True,
        batch_size_fn=batch_size_fn
    )

    return train_token_ids_loader, val_token_ids_loader, src_field_processor, trg_field_processor



In [98]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [108]:
# Load the data

train_token_ids_loader, val_token_ids_loader, src_field_processor, trg_field_processor = custom_dataloader(data_dir=DATA_PATH, 
                                                                                                           batch_size=8, device=device)

# These are generator objects


It took 68.31505393981934 to prepare the data.


In [111]:
# Index of the padding token in the SOURCE vocabulary; the same id is passed for the target batch below.
# NOTE(review): this presumes PAD_TOKEN has the same index in the target vocabulary — confirm.
pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]
print(pad_token_id)

# Build the masks for the first training batch only (the loop breaks after one iteration).
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = masks_tokens(batch.src, batch.trg, pad_token_id, device)
    break

# Check vocab size
print('The source vocabulary length: {}.'.format(len(src_field_processor.vocab)))
print('The target vocabulary length: {}.'.format(len(trg_field_processor.vocab)))





1
The source vocabulary length: 38627.
The target vocabulary length: 47668.


In [115]:
# helper function to inspect the text
def sample_text_from_loader(src_field_processor, trg_field_processor, token_ids_loader,
                            num_samples=2, sample_src=True, sample_trg=True, show_padded=False):
    """Print the first sentence of the first ``num_samples`` batches as readable text.

    For each inspected batch a dashed separator is printed, followed by the decoded source
    sentence (when ``sample_src``) and the decoded target sentence (when ``sample_trg``).
    Token ids are mapped back to strings through each field processor's ``vocab.itos``.
    Tokens equal to ``PAD_TOKEN`` are omitted unless ``show_padded`` is True.
    At least one of ``sample_src`` / ``sample_trg`` must be enabled.
    """
    assert sample_src or sample_trg, f'Either src or trg or both must be enabled.'

    def print_first_sentence(label, batch, side, field_processor):
        # Decode and print row 0 of ``batch.<side>`` on a single line, prefixed by ``label``.
        print(label, end="\t")
        for token_id in getattr(batch, side)[0]:
            token = field_processor.vocab.itos[token_id]
            if token != PAD_TOKEN or show_padded:
                print(token, end=" ")
        print()

    for batch_index, token_ids_batch in enumerate(token_ids_loader):
        # Stop once the requested number of batches has been shown.
        if batch_index == num_samples:
            break

        print('-' * 10)
        if sample_src:
            print_first_sentence("Source text:", token_ids_batch, "src", src_field_processor)

        if sample_trg:
            print_first_sentence("Target text:", token_ids_batch, "trg", trg_field_processor)

sample_text_from_loader(src_field_processor, trg_field_processor, train_token_ids_loader, num_samples=5)


----------
Source text:	You are going to fake it . 
Target text:	<s> Vous allez faire semblant . </s> 
----------
Source text:	So the thing is , this kind of idea of Chinese - American food does n't exist only in America . 
Target text:	<s> Mais en fait , ce genre d' idée qu' on se fait de la cuisine sino-américaine n' existe pas qu' en Amérique . </s> 
----------
Source text:	He looks like this . 
Target text:	<s> Il ressemble à ceci . </s> 
----------
Source text:	So again , people discover their creative agency in this way . 
Target text:	<s> Une fois de plus , les gens découvrent leur rôle créatif de cette manière . </s> 
----------
Source text:	It was like breathing . 
Target text:	<s> C' était comme respirer . </s> 


In [10]:
#jachiam


# Toy dimensions for a random input tensor (names follow the "B x L x d" shape comments in Attention.forward).
B = 5   # first dimension of x — presumably the batch size
L = 32  # second dimension of x — presumably the sequence length
d = 16  # last dimension of x; equals Attention's default d_in
w = 8   # NOTE(review): not referenced by the cells shown here — purpose unclear

# Random float32 tensor of shape (B, L, d) with values drawn from [0, 1).
x = torch.as_tensor(np.random.rand(B,L,d), dtype=torch.float32)

class Attention(nn.Module):
    """Multi-head dot-product self-attention with an optional causal mask.

    A single bias-free linear layer produces the queries, keys and values for all heads at once.

    Args:
        d_in: size of the last dimension of the input.
        d_proj: per-head size of the query/key/value projections.
        n_heads: number of attention heads.
        causal: when True, position i may only attend to positions <= i, so a position cannot
            "see" the future positions it is meant to predict. When False, every position
            attends to every position.

    NOTE(review): the attention scores are not divided by sqrt(d_proj) as in "Attention Is All You
    Need"; that is left unchanged here so existing outputs stay the same.
    """

    def __init__(self, d_in = 16, d_proj = 8, n_heads = 2, causal = True):
        super().__init__()
        self.d_proj = d_proj
        self.n_heads = n_heads
        self.qkv_lin = nn.Linear(d_in, 3 * d_proj * n_heads, bias=False)
        # Honour the constructor argument (this was previously hard-coded to True).
        self.causal = causal

    def _mask(self, qk):
        """Return ``qk`` with every entry above the diagonal replaced by -1e9.

        ``mask`` is 0 on and below the diagonal and -1e9 above it; adding it to the
        lower-triangular part of ``qk`` drives the softmax weight of future positions to zero.
        """
        mask = 1e9 * (torch.tril(torch.ones_like(qk)) - 1)
        return torch.tril(qk) + mask

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, L, d_in).

        Returns:
            ``(y, weights)`` where ``y`` has shape (B, L, n_heads * d_proj) and ``weights`` holds
            the softmax attention weights with shape (B, n_heads, L, L).
        """
        qkv_lin = self.qkv_lin(x).reshape(*x.shape[:2], self.n_heads, self.d_proj * 3) # B x L x n x 3d
        qkv_lin = qkv_lin.permute(0,2,1,3)                  # B x n x L x 3d
        q, k, v = torch.split(qkv_lin, self.d_proj, dim=-1) # B x n x L x d
        # Fold the head dimension into the batch dimension so torch.bmm can process all heads at once.
        q_pre = q.reshape(-1, *q.shape[2:])                 # Bn x L x d
        k_pre = k.reshape(-1, *k.shape[2:]).permute(0,2,1)  # Bn x d x L
        v_pre = v.reshape(-1, *v.shape[2:])                 # Bn x L x d
        qk = torch.bmm(q_pre,k_pre)                         # Bn x L x L
        qk_ = self._mask(qk) if self.causal else qk
        softmax_qk = torch.softmax(qk_, dim=-1)

        y = torch.bmm(softmax_qk, v_pre)                    # Bn x L x d
        y = y.reshape(x.shape[0], self.n_heads, x.shape[1], self.d_proj)  # B x n x L x d
        y = y.permute(0,2,1,3)                              # B x L x n x d
        y = y.reshape(*x.shape[:2],-1)                      # B x L x nd (heads concatenated)
        return y, softmax_qk.reshape(x.shape[0], self.n_heads, x.shape[1], x.shape[1])

In [12]:
# Small demonstration of nn.Embedding as a lookup table.
embedding = nn.Embedding(10, 3) # an Embedding module containing 10 tensors of size 3
embedding

# Two index tensors of different shapes; every index is below 10, the number of rows in the table.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the session.
input = torch.LongTensor([[1,2,4],[4,3,2], [2,3,1]])
input2 = torch.LongTensor([[0,8], [7,6]])
# Each index is replaced by its size-3 vector, so the printed results have shapes (3, 3, 3) and (2, 2, 3).
print(embedding(input))
print(embedding(input2))


tensor([[[-0.7247, -1.0848, -1.0903],
         [-0.0517,  0.1599, -0.1975],
         [ 0.1476, -0.1203, -1.3196]],

        [[ 0.1476, -0.1203, -1.3196],
         [ 0.1189, -1.7153, -1.1312],
         [-0.0517,  0.1599, -0.1975]],

        [[-0.0517,  0.1599, -0.1975],
         [ 0.1189, -1.7153, -1.1312],
         [-0.7247, -1.0848, -1.0903]]], grad_fn=<EmbeddingBackward>)
tensor([[[ 0.2512, -1.5186,  0.7122],
         [-1.5975, -1.2887, -0.9902]],

        [[ 2.0694,  0.3160, -0.1283],
         [-0.3061, -0.9990, -0.3430]]], grad_fn=<EmbeddingBackward>)


In [116]:
# Input : Embeddings + Positional Encodings
src_vocab_size = len(src_field_processor.vocab)
trg_vocab_size = len(trg_field_processor.vocab)
# Embeddings
class Embedding(nn.Module):
    """Container for the source and target token-embedding tables.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        trg_vocab_size: number of rows in the target embedding table.
        model_dimension: size of each embedding vector. Defaults to 512, the value that was
            previously hard-coded, so existing two-argument calls behave as before.

    Only ``__init__`` is defined here; no ``forward`` method has been written yet.
    """
    def __init__(self, src_vocab_size, trg_vocab_size, model_dimension=512):
        super().__init__()
        self.source_embedding = nn.Embedding(num_embeddings=src_vocab_size, embedding_dim=model_dimension)
        self.target_embedding = nn.Embedding(num_embeddings=trg_vocab_size, embedding_dim=model_dimension)
        
        

# Positional Encoding
class PositionalEncoding(nn.Module):
    

In [None]:
# Dummy data
# Dummy data for experimenting without the real dataset.
# NOTE(review): these assignments overwrite the src_vocab_size / trg_vocab_size computed from the real
# vocabularies in the earlier embeddings cell.
src_vocab_size = 20
trg_vocab_size = 20
# Random integer token ids in [1, 10), shape (3, 2).
src_token_ids_batch = torch.randint(1, 10, size=(3, 2))
trg_token_ids_batch = torch.randint(1, 10, size=(3, 2))



In [None]:
# dropout_rate = 0.1

# Multiheaded attention


In [None]:
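# Encoder Implementation (in the AIAYN paper 6 of these are stacked; we may experiment with more)
# Each encoder layer has an attention mechanism followed by a FFNN, and each of the two sub-layers
# is wrapped in a residual connection + LayerNorm. With X the input to the layer:



# 1. Multihead Attention(X) = Z

# 2. LayerNorm(X + Drop(Z)) = Y

# 3. LayerNorm(Y + Drop(Fat-Relu(Y)))   (Fat-Relu = the wide position-wise feed-forward block with ReLU)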






In [None]:
# Decoder Implementation

In [None]:
def make_clones(module, num_copies):
    """Return an ``nn.ModuleList`` holding ``num_copies`` independent deep copies of ``module``.

    Each clone gets its own parameters, so the copies can be adjusted separately rather than all
    referring to the same underlying module.
    """
    # Imported locally because no cell in this notebook imports `copy`; without it this function
    # raised NameError on first use.
    import copy

    return nn.ModuleList([copy.deepcopy(module) for _ in range(num_copies)])
