In [1]:
#Imports
import pandas as pd
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.vocab import build_vocab_from_iterator
import torch.optim as optim
from nltk.tokenize import WordPunctTokenizer
import re
import random
import contractions
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
DATA_PATH = 'data/sentences.csv'

unk_token = '<UNK>'
start_token = '<SOS>'
end_token = '<EOS'
pad_token = '<PAD>'

In [3]:
class EnglishDarijaDataset(Dataset):
    def __init__(self, path, tokenizer_en, tokenizer_da, max_len=20):
        self.df = pd.read_csv(path)
        self.df.dropna(inplace=True)
        self.tokenizer_en = tokenizer_en
        self.tokenizer_da = tokenizer_da
        self.en_vocab = None
        self.da_vocab = None
        self.max_len = max_len
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        en_s = self.df.iloc[idx]['eng']
        da_s = self.df.iloc[idx]['darija']
        en_l = tokenizer_en(en_s, self.max_len)
        da_l = tokenizer_da(da_s, self.max_len)
        if self.en_vocab and self.da_vocab:
            en_ids = self.en_vocab.lookup_indices(en_l)
            da_ids = self.da_vocab.lookup_indices(da_l)
            return {
                'en_sentence' : en_s,
                'da_sentence' : da_s,
                'en_tensors' : torch.tensor(en_ids),
                'da_tensors' : torch.tensor(da_ids)
            }
        else:
            return {
                'en_sentence' : en_s,
                'da_sentence' : da_s,
                'en_tokens' : en_l,
                'da_tokens' : da_l
            }
    def set_vocabs(self, en, da):
        self.en_vocab = en
        self.da_vocab = da

In [4]:
def tokenizer_en(s, max_len):
    tokenizer = WordPunctTokenizer()
    tokenized = tokenizer.tokenize(contractions.fix(s.lower()))
    if len(tokenized) > max_len - 2:
        tokenized = tokenized[:max_len-2]
    tokenized.insert(0, start_token)
    tokenized.append(end_token)
    while len(tokenized) < max_len:
        tokenized.append(pad_token)
    return tokenized

In [5]:
def tokenizer_da(s, max_len):
    tokenizer = WordPunctTokenizer()
    tokenized = tokenizer.tokenize(s)
    if len(tokenized) > max_len - 2:
        tokenized = tokenized[:max_len-2]
    tokenized.insert(0, start_token)
    tokenized.append(end_token)
    while len(tokenized) < max_len:
        tokenized.append(pad_token)
    return tokenized

In [6]:
dataset = EnglishDarijaDataset(DATA_PATH, tokenizer_en, tokenizer_da)

In [7]:
train_dataset, val_dataset, test_dataset = random_split(dataset, [0.8, 0.1, 0.1])

In [8]:
en_tokens = (data['en_tokens'] for data in train_dataset)
da_tokens = (data['da_tokens'] for data in train_dataset)

In [9]:
special_tokens = [start_token, end_token, pad_token, unk_token]

en_vocab = build_vocab_from_iterator(en_tokens, min_freq=2, specials=special_tokens)

da_vocab = build_vocab_from_iterator(da_tokens, min_freq=2, specials=special_tokens)

In [10]:
print(en_vocab.get_itos()[:10])
print(da_vocab.get_itos()[:10])

['<SOS>', '<EOS', '<PAD>', '<UNK>', 'i', 'you', 'to', 'the', 'not', 'is']
['<SOS>', '<EOS', '<PAD>', '<UNK>', '?', 'ghadi', 'had', 'ana', 't', 'chi']


In [11]:
assert en_vocab[unk_token] == da_vocab[unk_token]
assert en_vocab[pad_token] == da_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

en_vocab.set_default_index(unk_index)
da_vocab.set_default_index(unk_index)

In [12]:
dataset.set_vocabs(en_vocab, da_vocab)

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [14]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size)
        hidden, cell = self.encoder(src)
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

In [16]:
lr = 0.001
batch_size = 32
num_epochs = 1
input_dim = len(en_vocab)
output_dim = len(da_vocab)
embedding_dim = 256
hidden_dim = 512
n_layers = 2
dropout = 0.1

encoder = Encoder(input_dim, embedding_dim, hidden_dim, n_layers, dropout)
decoder = Decoder(output_dim, embedding_dim, hidden_dim, n_layers, dropout)

model = Seq2Seq(encoder, decoder)

In [17]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [18]:
optimizer = optim.Adam(model.parameters())

In [19]:
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_index)

In [20]:
def train(model, dataloader, optimizer, loss_fn):
    model.train()
    train_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        src = batch['en_tensors']
        trg = batch['da_tensors']
        pred = model(src, trg, 0.75)
        loss = loss_fn(pred.view(-1, pred.shape[-1]), trg.view(-1))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    return train_loss / len(dataloader)

In [21]:
def validate(model, dataloader, optimizer, loss_fn):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            src = batch['en_tensors']
            trg = batch['da_tensors']
            pred = model(src, trg, 0)
            loss = loss_fn(pred.view(-1, pred.shape[-1]), trg.view(-1))
            val_loss += loss.item()
    return val_loss / len(dataloader)

In [22]:
def plot_losses(train_losses, val_losses):
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss', color='blue')
    plt.plot(val_losses, label='Validation Loss', color='red')
    plt.title('Training and Validation Losses')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

In [23]:
def train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, num_epochs, plot=True):
    train_losses = []
    val_losses = []
    for e in tqdm(range(1, num_epochs+1), desc="Training", leave=True):
        train_loss = train(model, train_dataloader, optimizer, loss_fn)
        val_loss = validate(model, val_dataloader, optimizer, loss_fn)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        tqdm.write(f'Epoch {e}: Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
        tqdm.set_postfix({'train_loss': train_loss, 'val_loss': val_loss})
    if plot:
        plot_losses(train_losses, val_losses)

In [24]:
train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, num_epochs)

Training:   0%|                                    | 0/1 [17:45<?, ?it/s]


Epoch 1: Training Loss: 5.2710, Validation Loss: 4.7332


AttributeError: 'dict' object has no attribute 'postfix'

In [31]:
def test(model, dataloader):
    model.eval()
    for batch in dataloader:
        optimizer.zero_grad()
        src = batch['en_tensors']
        trg = batch['da_tensors']
        pred = model(src, trg, 0)
        print(src.size())
        print(pred)

In [32]:
test(model, test_dataloader)

torch.Size([32, 20])
tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.5609e+01,  2.1714e+00, -8.0796e+00,  ..., -7.7806e-01,
          -7.4093e+00, -3.0480e+00],
         [-1.5111e-01, -9.5030e-01, -6.2552e+00,  ..., -3.0094e+00,
          -3.8640e+00, -2.5819e+00],
         [-5.1114e-01,  4.1533e+00, -8.4973e+00,  ..., -3.8553e+00,
          -3.9768e+00, -5.4630e+00],
         ...,
    

torch.Size([32, 20])
tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.5609e+01,  2.1714e+00, -8.0796e+00,  ..., -7.7806e-01,
          -7.4093e+00, -3.0480e+00],
         [-1.1686e-01, -9.3598e-01, -6.0699e+00,  ..., -2.9160e+00,
          -3.7778e+00, -2.5096e+00],
         [-6.0429e-01,  4.4288e+00, -8.3960e+00,  ..., -3.7884e+00,
          -4.0403e+00, -5.5840e+00],
         ...,
    

KeyboardInterrupt: 