In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH = 25
SOS_token = 0

In [2]:
def add_sentence_tokens(s: list):
    return ["<SOS>"] + s + ["<EOS>"]

In [3]:
import pandas as pd
from nltk.tokenize import wordpunct_tokenize

df = pd.read_csv("../data/preprocessed/preprocessed_val.csv")

df["en_processed"] = df["en_processed"].apply(wordpunct_tokenize)
df["hu_processed"] = df["hu_processed"].apply(wordpunct_tokenize)

df["en_processed"] = df["en_processed"].apply(add_sentence_tokens)
df["hu_processed"] = df["hu_processed"].apply(add_sentence_tokens)

In [4]:
print(df.iloc[535]["en_processed"])
print(df.iloc[535]["hu_processed"])

['<SOS>', 'the', 'implementation', 'of', 'this', 'directive', 'will', ',', 'in', 'fact', ',', 'require', 'this', 'enhanced', 'coordination', '.', '<EOS>']
['<SOS>', 'az', 'irányelv', 'végrehajtásához', 'tulajdonképpen', 'szükség', 'van', 'erre', 'a', 'fokozott', 'együttműködésre', '.', '<EOS>']


In [5]:
from gensim.models import Word2Vec

w2c_model_en = Word2Vec(
    sentences=df["en_processed"], vector_size=100, window=5, min_count=1, workers=4
)
w2c_model_en.save("../models/word2vec_en.model")

In [6]:
w2c_model_hu = Word2Vec(
    sentences=df["hu_processed"], vector_size=100, window=5, min_count=1, workers=4
)
w2c_model_hu.save("../models/word2vec_hu.model")

In [45]:
w2c_model_en.wv["word"]
print(w2c_model_en.wv.most_similar(positive=[w2c_model_en.wv["word"]], topn=2))

[('word', 1.0), ('person', 0.9840174913406372)]


In [36]:
def get_vocab(sentences: list):
    text = []
    for sent in sentences:
        text += sent
    return set(text)

def word2idx(vocab: list):
    w2i = {}
    n_words = 0
    for word in vocab:
        w2i[word] = n_words
        n_words += 1
    return w2i

In [40]:
en_vocab = get_vocab(df["en_processed"].values)
en_word2idx = word2idx(en_vocab)

hu_vocab = get_vocab(df["hu_processed"].values)
hu_word2idx = word2idx(hu_vocab)


In [42]:
# extract Word2Vec embeddings for english and hu tokens
en_pretrained_embeddings = torch.zeros(len(en_vocab), w2c_model_en.vector_size)
hu_pretrained_embeddings = torch.zeros(len(hu_vocab), w2c_model_hu.vector_size)

for word, index in en_word2idx.items():
    en_pretrained_embeddings[index] = torch.tensor(w2c_model_en.wv[word])
    
for word, index in hu_word2idx.items():
    hu_pretrained_embeddings[index] = torch.tensor(w2c_model_hu.wv[word])


In [63]:
import numpy as np
from torch.utils.data import Dataset


class LanguageDataset(Dataset):
    def __init__(
        self, input_sentences, output_sentences, input_embedder, output_embedder
    ) -> None:
        self.input_sentences = input_sentences
        self.output_senteces = output_sentences
        self.input_emb = input_embedder
        self.output_emb = output_embedder
        in_lengths = np.array([len(sublist) for sublist in input_sentences]).max()
        out_lengths = np.array([len(sublist) for sublist in output_sentences]).max()
        self.max_len = in_lengths if in_lengths > out_lengths else out_lengths

    def __getitem__(self, index):

        data = [self.input_emb[token] for token in self.input_sentences[index]] + [
            self.input_emb["<EOS>"]
        ] * (self.max_len - len(self.input_sentences[index]))
        target = [self.output_emb[token] for token in self.output_senteces[index]] + [
            self.output_emb["<EOS>"]
        ] * (self.max_len - len(self.output_senteces[index]))

        return data, target

    def __len__(self):
        return len(self.input_sentences)

In [64]:
en_to_hu_dataset = LanguageDataset(
    df["en_processed"], df["hu_processed"], w2c_model_en.wv, w2c_model_hu.wv
)
en_to_hu_loader = DataLoader(en_to_hu_dataset, batch_size=64, shuffle=True)

In [43]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        #self.embedding = nn.Embedding(input_size, hidden_size)
        self.embedding = nn.Embedding.from_pretrained(en_pretrained_embeddings, freeze=False)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_seq):
        embedded = self.dropout(self.embedding(input_seq))
        # embedded = self.dropout(input_seq)
        output, hidden = self.gru(embedded)
        return output, hidden

In [44]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding.from_pretrained(hu_pretrained_embeddings, freeze=False)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=device
        ).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(
                decoder_input, decoder_hidden
            )
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(
                    -1
                ).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return (
            decoder_outputs,
            decoder_hidden,
            None,
        )  # We return `None` for consistency in the training loop

    def forward_step(self, input_seq, hidden):
        output = self.embedding(input_seq)
        # output = input_seq
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [57]:
def train_epoch(
    dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion
):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)), target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [58]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))

In [59]:
import matplotlib.pyplot as plt

plt.switch_backend("agg")
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [60]:
def train(
    train_dataloader,
    encoder,
    decoder,
    n_epochs,
    learning_rate=0.001,
    print_every=100,
    plot_every=100,
):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(
            train_dataloader,
            encoder,
            decoder,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
        )
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(
                "%s (%d %d%%) %.4f"
                % (
                    timeSince(start, epoch / n_epochs),
                    epoch,
                    epoch / n_epochs * 100,
                    print_loss_avg,
                )
            )

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [72]:
hidden_size = 100


encoder = EncoderRNN(100, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, 100).to(device)

train(en_to_hu_loader, encoder, decoder, 80, print_every=5, plot_every=5)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list