In [76]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [77]:
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.optim import AdamW
import math 

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [78]:
SOS_token = 0
EOS_token = 1

MAX_LENGTH = 10
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"PAD"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH 
    # and p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs



input_lang, output_lang, pairs = prepareData('spa', 'eng', False)
print(random.choice(pairs))


Reading lines...
Read 141370 sentence pairs
Trimmed to 119484 sentence pairs
Counting words...
Counted words:
spa 12092
eng 23396
['i m going to tell you what i think', 'te voy a decir lo que pienso', 'cc by france attribution tatoeba org ck hayastan']


In [79]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('spa', 'eng', False)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt,_) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids),
                               torch.LongTensor(target_ids))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

batch_size = 256
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

Reading lines...
Read 141370 sentence pairs
Trimmed to 119484 sentence pairs
Counting words...
Counted words:
spa 12092
eng 23396


In [80]:
class Attention(nn.Module):
    def __init__(self, n_embd, block_size):
        super().__init__()
        self.n_embd = n_embd
        self.wq = nn.Linear(n_embd, n_embd)
        self.wk = nn.Linear(n_embd, n_embd)
        self.wv = nn.Linear(n_embd, n_embd)
        
        causal_mask = torch.ones(block_size,block_size)
        causal_mask = torch.tril(causal_mask).to(bool)
        self.register_buffer('causal_mask', causal_mask)
    
    def forward(self,q,k,v,causal:bool): 
        # q: B,T1,C
        # k: B,T2,C
        # v: B,T2,C
        q = self.wq(q) # B,T1,C
        k = self.wk(k) # B,T2,C
        v = self.wv(v) # B,T2,C
        T1 = q.shape[1]
        T2 = k.shape[1]
        
        weights = q@k.transpose(-1,-2) / math.sqrt(self.n_embd) # B,T1,T2
        if causal:
            weights = weights.masked_fill(~self.causal_mask, -torch.inf)
        attn = torch.softmax(weights, axis=-1) # B,T1,T2
        assert not torch.isnan(attn).any()
        x = attn @ v # B,T1,C
        return x


class Encoder(nn.Module):
    def __init__(self, input_size, n_embd, block_size):
        super().__init__()

        self.embedding = nn.Embedding(input_size, n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = Attention(n_embd, block_size)
        
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, n_embd*4),
            nn.ReLU(),
            nn.Linear(n_embd*4, n_embd)
        )

    def forward(self,x):
        o = self.embedding(x)
        o = self.ln1(o)
        o = self.attn(o,o,o,causal=True)
        assert not torch.isnan(o).any()
        o = self.mlp(self.ln2(o))
        return o


class Decoder(nn.Module):
    def __init__(self, output_vocab_size, n_embd, block_size):
        super().__init__()
        self.embedding = nn.Embedding(output_vocab_size, n_embd)
        self.pos_enc = nn.Embedding(block_size, n_embd)
        self.s_ln1 = nn.LayerNorm(n_embd)
        self.s_attn = Attention(n_embd, block_size)

        self.x_ln1 = nn.LayerNorm(n_embd)
        self.x_attn = Attention(n_embd, block_size)
        
        # self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, n_embd*4),
            nn.ReLU(),
            nn.Linear(n_embd*4, n_embd)
        )

        self.proj = nn.Linear(n_embd, output_vocab_size)

        self.register_buffer('pos_id', torch.arange(0,block_size))

    def forward(self,x,decoder_outputs):
        x = self.embedding(x) + self.pos_enc(self.pos_id)

        # self attention
        o = self.s_attn(x,x,x,causal=True)      
        x = self.s_ln1(x + o)            

        # cross attention
        o = self.x_attn(x,decoder_outputs,decoder_outputs,causal=False)
        o = self.x_ln1(x+o)
        
        o = self.mlp(o)
        o = self.proj(o)
        return o
        

In [82]:
X.shape

torch.Size([256, 10])

In [49]:
step = 0
for epoch in range(10):
    for X,y in train_dataloader:
        step +=1
        X = X.to(device)
        y = y.to(device)
        enc_outputs = encoder(X)
        decoder_outputs = decoder(y, enc_outputs)
        loss = F.cross_entropy(decoder_outputs.view(-1, decoder_outputs.shape[-1]), y.view(-1))
        if step % 100 == 0:
            print(f'e{epoch}:s{step} | loss:{loss:.3f}')
        optim1.zero_grad()
        optim2.zero_grad()
        loss.backward()
        optim1.step()
        optim2.step()
        # break


e0:s100 | loss:0.708
e0:s200 | loss:0.421
e0:s300 | loss:0.325
e0:s400 | loss:0.209
e1:s500 | loss:0.100
e1:s600 | loss:0.129
e1:s700 | loss:0.116
e1:s800 | loss:0.110


KeyboardInterrupt: 

In [84]:
X, y

(tensor([[  71,  286,  138,  ...,    1,    0,    0],
         [ 250,   44,  443,  ...,    0,    0,    0],
         [2652,  548,   65,  ...,    0,    0,    0],
         ...,
         [ 236,  112,  134,  ...,    1,    0,    0],
         [ 285,   79,  366,  ..., 7388,    1,    0],
         [ 806,  366,  827,  ...,    0,    0,    0]], device='cuda:0'),
 tensor([[  154,   164,   670,  ...,     1,     0,     0],
         [  510,   147,  1016,  ...,     0,     0,     0],
         [  705,  1197,    64,  ...,     0,     0,     0],
         ...,
         [  476,    64,   454,  ...,     1,     0,     0],
         [  257,  5644,   135,  ..., 11298,     1,     0],
         [ 3999,   135,   352,  ...,     0,     0,     0]], device='cuda:0'))

In [95]:
y[1]

tensor([ 510,  147, 1016,    1,    0,    0,    0,    0,    0,    0],
       device='cuda:0')

In [86]:
enc_outputs = encoder(X)

In [74]:
F.pad(input_tensor, input_lang.word2index['<PAD>'])

KeyError: '<PAD>'

In [71]:
with torch.no_grad():
    input_tensor = tensorFromSentence(input_lang, sentence)

    enc_outputs = encoder(input_tensor)
    decoder_outputs = decoder(y, enc_outputs)

    _, topi = decoder_outputs[:,0].topk(1)
    decoded_ids = topi.squeeze()

    decoded_words = []
    for idx in decoded_ids:
        if idx.item() == EOS_token:
            decoded_words.append('<EOS>')
            break
        decoded_words.append(output_lang.index2word[idx.item()])

RuntimeError: The size of tensor a (10) must match the size of tensor b (4) at non-singleton dimension 2

In [57]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        enc_outputs = encoder(input_tensor)
        decoder_outputs = decoder(y, enc_outputs)

        _, topi = decoder_outputs[:,0].topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, None#, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')
        
evaluateRandomly(encoder,decoder)

> would you like to talk to the manager ?
= le gustaria hablar con el gerente ?
< a confia sin te fue el si el el como lo no el me intente yo hoy la veo muerde a te puedes vuelvo tu en se apenas voy cuanto imposible esto por hay lo desde tom al tom no mereces se cuando quiero sabes necesitamos dejale me tom puede mi eso me cuales la llama yo de deberias a tom la tom es yo el yo estoy por tom tu quien estaba no por tengo helen me no tom tom gracias se tom mi yo eso quede el estoy estoy no pense nadie me esta es eisenhower debes tu debo no queres no me afortunadamente ha el quedate deja da por alejate esta en soy ayer esto no es que estoy tom el le hay pense su le he tom este quiero ella ella estaba tus soy a este volvere tom el quiero es la solo veinte estas deberias me ella me acepto el no el tom un creo el ellos su donde ella tom me no a me hagales quiero es no tom tom mi debes sugiero tom tom queda tom creo no mi ellos tom estoy hace llego este tom tom por quisiera tom adonde ellas p

RuntimeError: The size of tensor a (10) must match the size of tensor b (7) at non-singleton dimension 2