In [1]:
import pickle
import torch
import torchtext.transforms as T
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def loadFiles(file):
    with open(file, "rb") as file:
        data = pickle.load(file)
    print("The size of the dataset is:", len(data))
    return data


def separateData(data):
    X = data[:, 0]
    y = data[:, 1]
    return X, y

In [2]:
data = loadFiles(r'./english-german-both.pkl')

The size of the dataset is: 10000


In [3]:
eng, germ = separateData(data)

In [4]:
def findLongestSequence(sentList):
    return max(len(seq.split()) for seq in sentList)


In [5]:
enTokenizer = get_tokenizer('spacy', language='en')
deTokenizer = get_tokenizer('spacy', language='de')
def yieldTokensEn(data):
    for text in data:
        yield enTokenizer(text[:-1])

def yieldTokensDe(data):
    
    for text in data:
        yield deTokenizer(text[:-1])



In [6]:
vocabEn = build_vocab_from_iterator(yieldTokensEn(eng), specials=["<pad>", "<sos>", "<eos>", "<unk>"], special_first=True)
vocabDe = build_vocab_from_iterator(yieldTokensDe(germ), specials=["<pad>", "<sos>", "<eos>", "<unk>"], special_first=True)

In [7]:
textPipelineEn = lambda x: vocabEn(enTokenizer(x))
textPipelineDe = lambda x: vocabDe(deTokenizer(x))

In [8]:
from sklearn.model_selection import train_test_split

SEED = 42
trainEn, testEn, trainDe, testDe = train_test_split(eng, germ, test_size=0.1, random_state=SEED)
trainEn, valEn, trainDe, valDe = train_test_split(eng, germ, test_size=0.1, random_state=SEED)

In [29]:
BATCH_SIZE = 4
PAD_IDX = vocabEn(['<pad>'])[0]
SOS_IDX = vocabEn(['<sos>'])
EOS_IDX = vocabEn(['<eos>'])
print(PAD_IDX)
print(SOS_IDX)
print(EOS_IDX)

0
[1]
[2]


In [30]:
from torch.nn.utils.rnn import pad_sequence

def generateData(eng, deu):
    data = []
    for en, de in zip(eng, deu):
        enTensor = torch.tensor(textPipelineEn(en[:-1]), dtype=torch.long)
        deTensor = torch.tensor(textPipelineDe(de[:-1]), dtype=torch.long)
        data.append((enTensor, deTensor))
    return data


trainData = generateData(trainEn, trainDe)
valData = generateData(valEn, valDe)         
testData = generateData(testEn, testDe)        

In [31]:
from torch.utils.data import DataLoader

    
def generateBatch(data_batch):
  de_batch, en_batch = [], []
  # print(data_batch)
  for (de_item, en_item) in data_batch:
    de_batch.append(torch.cat([torch.tensor(SOS_IDX), de_item, torch.tensor(EOS_IDX)], dim=0))
    en_batch.append(torch.cat([torch.tensor(SOS_IDX), en_item, torch.tensor(EOS_IDX)], dim=0))
  deLength = len(de_batch)
  batch = pad_sequence(en_batch + de_batch, padding_value=PAD_IDX, batch_first=True)
  en_batch, de_batch = batch[:deLength], batch[deLength:]
  return de_batch, en_batch


trainIter = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)
valIter =  DataLoader(valData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)
testIter = DataLoader(testData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)

In [32]:
from model.transformer import Transformer
from model.positenc import PositionalEncodingTorch
from torch.nn import Embedding, Module, Linear

EMB_DIM = 64
HEADS = 8
LINEAR_DIM = 2048
DROPOUT = 0.1
LAYERS = 6
BETA_1 = 0.9
BETA_2 = 0.98
EPSILON = 10**-9
ENG_VOCAB_LEN = vocabEn.__len__()
DE_VOCAB_LEN = vocabDe.__len__()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"The english has {ENG_VOCAB_LEN} words.")
print(f"The german has {DE_VOCAB_LEN} words.")

EN_MAX_SEQ_LEN, DE_MAX_SEQ_LEN = findLongestSequence(eng) + 2 , findLongestSequence(germ) + 2

#mask for the decoder
def createMask(x):
    batch, seq_length, _ = x.size()
    mask = torch.ones((batch, seq_length, seq_length)).to(device)
    mask = torch.tril(mask, diagonal=0)
    return mask

def createMaskGen(x):
    x = x.unsqueeze(1)
    batch, _, seq_length = x.size()
    mask = torch.ones((batch, 1, seq_length)).to(device)
    maskR = torch.logical_and(mask, x)
    maskC = torch.logical_and(x.transpose(-1, 1), mask.transpose(-1, 1))
    mask = torch.multiply(maskR, maskC)
    return mask
    


class TransformerModel(Module):
    def __init__(self):
        super().__init__()
        self.embEn = Embedding(ENG_VOCAB_LEN, EMB_DIM)
        self.embDe = Embedding(DE_VOCAB_LEN, EMB_DIM)
        self.positEn = PositionalEncodingTorch(EN_MAX_SEQ_LEN, EMB_DIM)
        self.positDe = PositionalEncodingTorch(DE_MAX_SEQ_LEN, EMB_DIM)
        self.transformer = Transformer(LAYERS, EMB_DIM, EMB_DIM, HEADS, LINEAR_DIM, DROPOUT)
        self.linear = Linear(EMB_DIM, DE_VOCAB_LEN)
        
    def forward(self, eng, de, device=device):
        mask1 = createMaskGen(eng)
        eng = self.embEn(eng)
        eng = self.positEn(eng, device=device)
        mask = createMaskGen(de)
        de = self.embDe(de)
        de = self.positDe(de, device=device)
        mask2 = createMask(de)
        mask2 = torch.logical_and(mask, mask2)
        dec = self.transformer(eng, de, mask1, mask2)
        lin = self.linear(dec)    
        return lin
        






The english has 2594 words.
The german has 4167 words.


In [13]:
class TransformerLRScheduler(object):
    def __init__(self,  warmup_steps:int=10, d_model:int=512):
        self.warmup_steps = warmup_steps
        self.d_model = d_model
    
    
    def __call__(self, epoch):
        epoch = epoch + 1
        minimum = min(epoch**-0.5, epoch * ((self.warmup_steps) ** (-1.5)))
        return (self.d_model**-0.5 ) * minimum
    

In [14]:
def calculateAccuracy(prediction, target):
    padding_mask = torch.logical_not(torch.eq(target, torch.tensor(0)))
    accuracy = torch.eq(target, torch.argmax(prediction, axis=2))
    accuracy = torch.logical_and(padding_mask, accuracy)
    accuracy = accuracy.type(torch.float32)
    padding_mask = padding_mask.type(torch.float32)
    return torch.sum(accuracy) / torch.sum(padding_mask)
    

In [15]:
def validateModel(model, testIter, loss, device):
    lossPerBatch = []
    accPerBatch = []
    with torch.no_grad():
        model.eval()
        for i, (X, y) in enumerate(testIter):
            X, y = X.to(device), y.to(device)
            out = model(X[:, 1:], y[:, :-1])
            l = loss(out.contiguous().view(-1, 4167), y[:, 1:].contiguous().view(-1))
            a = calculateAccuracy(out, y[:, 1:])
            lossPerBatch.append(l.item())
            accPerBatch.append(a.item())
        meanLoss = sum(lossPerBatch)/len(lossPerBatch)
    return lossPerBatch, meanLoss, accPerBatch

In [33]:
from torch import optim
import torch.nn as nn
from torch.optim import lr_scheduler
model = TransformerModel()
model.to(device)
optimizer = optim.Adam(model.parameters(), betas=(BETA_1, BETA_2), eps=EPSILON)
loss = nn.CrossEntropyLoss(ignore_index=0)
scheduler = TransformerLRScheduler(100)

In [17]:
from tqdm import tqdm



def train(model, trainIter, 
          testIter=None, 
          epochs=None, 
          loss=None, 
          optimizer=None, 
          device=device, 
          scheduler=None):
    # model.to(device)
    # pbar = trange(epochs, desc="Epochs ", unit="batches")
    # with tqdm(trainIter, unit="epochs") as tepoch:
    logs_dic = {
        "valildationLoss": [],
        "trainingLoss" : [],
        "validationAccuracy": [],
        "trainingAccuracy": []
    }
    for epoch in range(epochs):
        trainLossPerBatch = []
        trainAccuracyPerBatch = []
        with tqdm(trainIter, unit="batches") as tepoch:
            for i, (X,y) in enumerate(tepoch):
                model.train()
                optimizer.zero_grad()
                X, y = X.to(device), y.to(device)
                out = model(X[:, 1:], y[:, :-1])
                l = loss(out.contiguous().view(-1, 4167), y[:, 1:].contiguous().view(-1))
                acc = calculateAccuracy(out, y[:, 1:])
                trainLossPerBatch.append(l.item())
                trainAccuracyPerBatch.append(acc.item())
                tepoch.set_description(f"Epoch {epoch + 1}")            
                tepoch.set_postfix(loss=l.item(), accuracy=acc.item())
                l.backward()
                optimizer.step()
            valLoss, meanValLoss, valAcc = validateModel(model, testIter, loss=loss, device=device)
            print(f"The validation loss is: {meanValLoss}")
            logs_dic["valildationLoss"].append(valLoss)
            logs_dic["trainingLoss"].append(trainLossPerBatch)
            logs_dic["trainingAccuracy"].append(trainAccuracyPerBatch)
            logs_dic["validationAccuracy"].append(valAcc)
            # print(f"Epoch: {epoch+1}     loss: {l}")
            if scheduler:
                if scheduler.__module__ == lr_scheduler.__name__:
                    scheduler.step()
                else:
                    for param_group in optimizer.param_groups:
                        lr = scheduler(epoch)
                        param_group['lr'] = lr
                
    return logs_dic
# history = train(model, trainIter, testIter=valIter, epochs=1, loss=loss, optimizer=optimizer, device=device, scheduler=scheduler)

In [18]:
def saveHistory(history, filename):
    print("pickling history.")
    with open(filename, 'wb') as fp:
        pickle.dump(history, fp)
    print("successfully pickled") 
    
# saveHistory(history, "./history")   

In [19]:
# %matplotlib inline
# import matplotlib.pyplot as plt

# lr = TransformerLRScheduler(1000)
# plt.plot([i for i in range(5000)], [lr(i) for i in range(5000)])
# plt.grid()
# plt.xlabel('Epochs')
# plt.ylabel("Learning Rate")

In [20]:
# torch.save(model.save_dict(), './')

In [34]:

def test(model, encInput):
    model.eval()
    batch, seq_length = encInput.size()
    decOutput =  torch.tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])  
    decOutput = torch.tile(decOutput, (batch, 1)) 
    for i in range(DE_MAX_SEQ_LEN):
        prediction = model(encInput, decOutput)
        prediction = prediction[:, -1, :]
        predicted_id = torch.argmax(prediction, dim=-1)
        print(predicted_id)
        
        
model.load_state_dict(torch.load('e150emb64b256d15l6.pth', map_location=device))
for i, (X, y) in enumerate(testIter):
    print(i, X.shape)
    test(model, X)        
    break



0 torch.Size([4, 7])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])
prediction is:  torch.Size([4, 4167])
tensor([2, 2, 2, 2])


In [None]:
def test(model, encInput):
    model.eval()
    batch, seq_length = encInput.size()
    out = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    decOutput =  torch.tensor(out)
    decOutput = torch.tile(decOutput, (batch, 1))
    for i in range(DE_MAX_SEQ_LEN-1):
        prediction = model(encInput.to(device)[:, 1:], decOutput.to(device))
        prediction = prediction[:, i, :]
        predicted_id = torch.argmax(prediction, dim=-1)
        decOutput[:, i+1] = predicted_id[ :]
        print(predicted_id.tolist()[0])
        if predicted_id.tolist()[0] == EOS_IDX[0]:
          break
    return decOutput
testIter = DataLoader(testData, batch_size=1, shuffle=True, collate_fn=generateBatch)



for i, (X, y) in enumerate(testIter):
    pred = test(model, X)
    for x, y, z in zip(pred.tolist(), X.tolist(), y.tolist()):
      print(' '.join(vocabDe.lookup_tokens(x)))
      print(' '.join(vocabEn.lookup_tokens(y)))
      print(' '.join(vocabDe.lookup_tokens(z)))
      print("\n")

  
    break