In [2]:
import pickle
import torch
import torchtext.transforms as T
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def loadFiles(file):
    with open(file, "rb") as file:
        data = pickle.load(file)
    print("The size of the dataset is:", len(data))
    return data


def separateData(data):
    X = data[:, 0]
    y = data[:, 1]
    return X, y

In [3]:
data = loadFiles(r'./english-german-both.pkl')

The size of the dataset is: 10000


In [4]:
eng, germ = separateData(data)

In [5]:
def findLongestSequence(sentList):
    return max(len(seq.split()) for seq in sentList)


In [6]:
enTokenizer = get_tokenizer('spacy', language='en')
deTokenizer = get_tokenizer('spacy', language='de')
def yieldTokensEn(data):
    for text in data:
        yield enTokenizer(text[:-1])

def yieldTokensDe(data):
    
    for text in data:
        yield deTokenizer(text[:-1])



In [7]:
vocabEn = build_vocab_from_iterator(yieldTokensEn(eng), specials=["<pad>", "<sos>", "<eos>", "<unk>"], special_first=True)
vocabDe = build_vocab_from_iterator(yieldTokensDe(germ), specials=["<pad>", "<sos>", "<eos>", "<unk>"], special_first=True)

In [8]:
textPipelineEn = lambda x: vocabEn(enTokenizer(x))
textPipelineDe = lambda x: vocabDe(deTokenizer(x))

In [9]:
from sklearn.model_selection import train_test_split

trainEn, testEn, trainDe, testDe = train_test_split(eng, germ, test_size=0.1)
trainEn, valEn, trainDe, valDe = train_test_split(eng, germ, test_size=0.1)

In [10]:
BATCH_SIZE = 64
PAD_IDX = vocabEn(['<pad>'])[0]
SOS_IDX = vocabEn(['<sos>'])
EOS_IDX = vocabEn(['<eos>'])
print(PAD_IDX)
print(SOS_IDX)
print(EOS_IDX)

0
[1]
[2]


In [11]:
from torch.nn.utils.rnn import pad_sequence

def generateData(eng, deu):
    data = []
    for en, de in zip(eng, deu):
        enTensor = torch.tensor(textPipelineEn(en[:-1]), dtype=torch.long)
        deTensor = torch.tensor(textPipelineDe(de[:-1]), dtype=torch.long)
        data.append((enTensor, deTensor))
    return data


trainData = generateData(trainEn, trainDe)
valData = generateData(valEn, valDe)         
testData = generateData(testEn, testDe)        

In [12]:
from torch.utils.data import DataLoader

    
def generateBatch(data_batch):
  de_batch, en_batch = [], []
  # print(data_batch)
  for (de_item, en_item) in data_batch:
    de_batch.append(torch.cat([torch.tensor(SOS_IDX), de_item, torch.tensor(EOS_IDX)], dim=0))
    en_batch.append(torch.cat([torch.tensor(SOS_IDX), en_item, torch.tensor(EOS_IDX)], dim=0))
  deLength = len(de_batch)
  batch = pad_sequence(en_batch + de_batch, padding_value=PAD_IDX, batch_first=True)
  en_batch, de_batch = batch[:deLength], batch[deLength:]
  return de_batch, en_batch


trainIter = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)
valIter =  DataLoader(valData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)
testIter = DataLoader(testData, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generateBatch)

In [20]:
from model.transformer import Transformer
from model.positenc import PositionalEncodingTorch
from torch.nn import Embedding, Module, Linear

EMB_DIM = 128
HEADS = 8
LINEAR_DIM = 2048
DROPOUT = 0.1
LAYERS = 6
BETA_1 = 0.9
BETA_2 = 0.98
EPSILON = 10**-9
ENG_VOCAB_LEN = vocabEn.__len__()
DE_VOCAB_LEN = vocabDe.__len__()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"The english has {ENG_VOCAB_LEN} words.")
print(f"The german has {DE_VOCAB_LEN} words.")

EN_MAX_SEQ_LEN, DE_MAX_SEQ_LEN = findLongestSequence(eng) + 2 , findLongestSequence(germ) + 2

#mask for the decoder
def createMask(x):
    batch, seq_length, _ = x.size()
    mask = torch.ones((batch, seq_length, seq_length)).to(device)
    mask = torch.tril(mask, diagonal=0)
    return mask

def createMaskGen(x):
    x = x.unsqueeze(1)
    batch, _, seq_length = x.size()
    mask = torch.ones((batch, 1, seq_length)).to(device)
    maskR = torch.logical_and(mask, x)
    maskC = torch.logical_and(x.transpose(-1, 1), mask.transpose(-1, 1))
    mask = torch.multiply(maskR, maskC)
    return mask
    


class TransformerModel(Module):
    def __init__(self):
        super().__init__()
        self.embEn = Embedding(ENG_VOCAB_LEN, EMB_DIM)
        self.embDe = Embedding(DE_VOCAB_LEN, EMB_DIM)
        self.positEn = PositionalEncodingTorch(EN_MAX_SEQ_LEN, EMB_DIM)
        self.positDe = PositionalEncodingTorch(DE_MAX_SEQ_LEN, EMB_DIM)
        self.transformer = Transformer(LAYERS, EMB_DIM, EMB_DIM, HEADS, LINEAR_DIM, DROPOUT)
        self.linear = Linear(EMB_DIM, DE_VOCAB_LEN)
        
    def forward(self, eng, de):
        mask1 = createMaskGen(eng)
        eng = self.embEn(eng)
        eng = self.positEn(eng)
        mask = createMaskGen(de)
        de = self.embDe(de)
        de = self.positDe(de)
        mask2 = createMask(de)
        mask2 = torch.logical_and(mask, mask2)
        dec = self.transformer(eng, de, mask1, mask2)
        lin = self.linear(dec)    
        return lin
        






The english has 2594 words.
The german has 4167 words.


In [21]:
class TransformerLRScheduler(object):
    def __init__(self,  warmup_steps:int=10, d_model:int=512):
        self.warmup_steps = warmup_steps
        self.d_model = d_model
    
    
    def __call__(self, epoch):
        epoch = epoch + 1
        minimum = min(epoch**-0.5, epoch * ((self.warmup_steps) ** (-1.5)))
        return (self.d_model**-0.5 ) * minimum
    

In [22]:
def calculateAccuracy(prediction, target):
    padding_mask = torch.logical_not(torch.eq(target, torch.tensor(0)))
    accuracy = torch.eq(target, torch.argmax(prediction, axis=2))
    accuracy = torch.logical_and(padding_mask, accuracy)
    accuracy = accuracy.type(torch.float32)
    padding_mask = padding_mask.type(torch.float32)
    return torch.sum(accuracy) / torch.sum(padding_mask)
    

In [23]:
def validateModel(model, testIter, loss, device):
    lossPerBatch = []
    accPerBatch = []
    with torch.no_grad():
        model.eval()
        for i, (X, y) in enumerate(testIter):
            X, y = X.to(device), y.to(device)
            out = model(X[:, 1:], y[:, :-1])
            l = loss(out.contiguous().view(-1, 4167), y[:, 1:].contiguous().view(-1))
            a = calculateAccuracy(out, y[:, 1:])
            lossPerBatch.append(l.item())
            accPerBatch.append(a.item())
        meanLoss = sum(lossPerBatch)/len(lossPerBatch)
    return lossPerBatch, meanLoss, accPerBatch

In [24]:
from torch import optim
import torch.nn as nn
from torch.optim import lr_scheduler
model = TransformerModel()
model.to(device)
optimizer = optim.Adam(model.parameters(), betas=(BETA_1, BETA_2), eps=EPSILON)
loss = nn.CrossEntropyLoss(ignore_index=0)
scheduler = TransformerLRScheduler(100)

In [25]:
from tqdm import tqdm



def train(model, trainIter, 
          testIter=None, 
          epochs=None, 
          loss=None, 
          optimizer=None, 
          device=device, 
          scheduler=None):
    # model.to(device)
    # pbar = trange(epochs, desc="Epochs ", unit="batches")
    # with tqdm(trainIter, unit="epochs") as tepoch:
    logs_dic = {
        "valildationLoss": [],
        "trainingLoss" : [],
        "validationAccuracy": [],
        "trainingAccuracy": []
    }
    for epoch in range(epochs):
        trainLossPerBatch = []
        trainAccuracyPerBatch = []
        with tqdm(trainIter, unit="batches") as tepoch:
            for i, (X,y) in enumerate(tepoch):
                model.train()
                optimizer.zero_grad()
                X, y = X.to(device), y.to(device)
                out = model(X[:, 1:], y[:, :-1])
                l = loss(out.contiguous().view(-1, 4167), y[:, 1:].contiguous().view(-1))
                acc = calculateAccuracy(out, y[:, 1:])
                trainLossPerBatch.append(l.item())
                trainAccuracyPerBatch.append(acc.item())
                tepoch.set_description(f"Epoch {epoch + 1}")            
                tepoch.set_postfix(loss=l.item(), accuracy=acc.item())
                l.backward()
                optimizer.step()
            valLoss, meanValLoss, valAcc = validateModel(model, testIter, loss=loss, device=device)
            print(f"The validation loss is: {meanValLoss}")
            logs_dic["valildationLoss"].append(valLoss)
            logs_dic["trainingLoss"].append(trainLossPerBatch)
            logs_dic["trainingAccuracy"].append(trainAccuracyPerBatch)
            logs_dic["validationAccuracy"].append(valAcc)
            # print(f"Epoch: {epoch+1}     loss: {l}")
            if scheduler:
                if scheduler.__module__ == lr_scheduler.__name__:
                    scheduler.step()
                else:
                    for param_group in optimizer.param_groups:
                        lr = scheduler(epoch)
                        param_group['lr'] = lr
                
    return logs_dic
history = train(model, trainIter, testIter=valIter, epochs=1000, loss=loss, optimizer=optimizer, device=device, scheduler=scheduler)

Epoch 1: 100%|██████████| 141/141 [00:39<00:00,  3.55batches/s, accuracy=0.37, loss=4.5]  


The validation loss is: 4.494223594665527


Epoch 2: 100%|██████████| 141/141 [00:39<00:00,  3.60batches/s, accuracy=0.35, loss=4.34] 


The validation loss is: 4.386409789323807


Epoch 3: 100%|██████████| 141/141 [00:38<00:00,  3.65batches/s, accuracy=0.376, loss=4.56]


The validation loss is: 4.139525979757309


Epoch 4: 100%|██████████| 141/141 [00:39<00:00,  3.55batches/s, accuracy=0.432, loss=3.78]


The validation loss is: 3.951369196176529


Epoch 5: 100%|██████████| 141/141 [00:40<00:00,  3.52batches/s, accuracy=0.459, loss=3.42]


The validation loss is: 3.7955708354711533


Epoch 6: 100%|██████████| 141/141 [00:38<00:00,  3.62batches/s, accuracy=0.464, loss=3.3] 


The validation loss is: 3.634869337081909


Epoch 7: 100%|██████████| 141/141 [00:39<00:00,  3.61batches/s, accuracy=0.48, loss=3.21] 


The validation loss is: 3.471206620335579


Epoch 8: 100%|██████████| 141/141 [00:37<00:00,  3.72batches/s, accuracy=0.587, loss=2.86]


The validation loss is: 3.3126397281885147


Epoch 9: 100%|██████████| 141/141 [00:36<00:00,  3.91batches/s, accuracy=0.484, loss=3.09]


The validation loss is: 3.1896308958530426


Epoch 10: 100%|██████████| 141/141 [00:36<00:00,  3.82batches/s, accuracy=0.528, loss=2.95]


The validation loss is: 3.0579340159893036


Epoch 11: 100%|██████████| 141/141 [00:39<00:00,  3.55batches/s, accuracy=0.577, loss=2.54]


The validation loss is: 2.900737091898918


Epoch 12: 100%|██████████| 141/141 [00:38<00:00,  3.64batches/s, accuracy=0.609, loss=2.29]


The validation loss is: 2.7769332230091095


Epoch 13: 100%|██████████| 141/141 [00:36<00:00,  3.88batches/s, accuracy=0.661, loss=1.99]


The validation loss is: 2.6011139899492264


Epoch 14: 100%|██████████| 141/141 [00:36<00:00,  3.85batches/s, accuracy=0.708, loss=1.63]


The validation loss is: 2.4490253776311874


Epoch 15: 100%|██████████| 141/141 [00:36<00:00,  3.89batches/s, accuracy=0.685, loss=1.68]


The validation loss is: 2.376742497086525


Epoch 16: 100%|██████████| 141/141 [00:38<00:00,  3.71batches/s, accuracy=0.739, loss=1.32]


The validation loss is: 2.29269852489233


Epoch 17: 100%|██████████| 141/141 [00:38<00:00,  3.69batches/s, accuracy=0.809, loss=1.04] 


The validation loss is: 2.2394701167941093


Epoch 18: 100%|██████████| 141/141 [00:39<00:00,  3.61batches/s, accuracy=0.773, loss=1.07] 


The validation loss is: 2.3154958710074425


Epoch 19: 100%|██████████| 141/141 [00:37<00:00,  3.80batches/s, accuracy=0.812, loss=1.05] 


The validation loss is: 2.3351793065667152


Epoch 20: 100%|██████████| 141/141 [00:38<00:00,  3.68batches/s, accuracy=0.773, loss=1.18] 


The validation loss is: 2.255977638065815


Epoch 21: 100%|██████████| 141/141 [00:35<00:00,  3.93batches/s, accuracy=0.746, loss=1.03] 


The validation loss is: 2.2758749276399612


Epoch 22: 100%|██████████| 141/141 [00:35<00:00,  3.93batches/s, accuracy=0.802, loss=0.72] 


The validation loss is: 2.415500044822693


Epoch 23: 100%|██████████| 141/141 [00:37<00:00,  3.79batches/s, accuracy=0.81, loss=0.792] 


The validation loss is: 2.479667142033577


Epoch 24: 100%|██████████| 141/141 [00:40<00:00,  3.51batches/s, accuracy=0.818, loss=0.637]


The validation loss is: 2.5352026373147964


Epoch 25: 100%|██████████| 141/141 [00:39<00:00,  3.59batches/s, accuracy=0.776, loss=0.791]


The validation loss is: 2.556930035352707


Epoch 26: 100%|██████████| 141/141 [00:39<00:00,  3.53batches/s, accuracy=0.724, loss=1.13] 


The validation loss is: 2.567691072821617


Epoch 27: 100%|██████████| 141/141 [00:39<00:00,  3.58batches/s, accuracy=0.822, loss=0.601]


The validation loss is: 2.6552927643060684


Epoch 28: 100%|██████████| 141/141 [00:39<00:00,  3.57batches/s, accuracy=0.76, loss=0.842] 


The validation loss is: 2.68728506565094


Epoch 29: 100%|██████████| 141/141 [00:39<00:00,  3.60batches/s, accuracy=0.805, loss=0.762]


The validation loss is: 2.6593070924282074


Epoch 30: 100%|██████████| 141/141 [00:40<00:00,  3.47batches/s, accuracy=0.8, loss=0.822]  


The validation loss is: 2.7017649710178375


Epoch 31: 100%|██████████| 141/141 [00:42<00:00,  3.35batches/s, accuracy=0.848, loss=0.594]


The validation loss is: 2.7665096819400787


Epoch 32:  79%|███████▉  | 112/141 [00:31<00:08,  3.57batches/s, accuracy=0.789, loss=0.707]


KeyboardInterrupt: 

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt

# lr = TransformerLRScheduler(1000)
# plt.plot([i for i in range(5000)], [lr(i) for i in range(5000)])
# plt.grid()
# plt.xlabel('Epochs')
# plt.ylabel("Learning Rate")

In [None]:
torch.save(model.save_dict(), './')


