In [1]:
import pandas as pd
import string
import numpy as np
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm

import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv('../All_irregular_verb_list.csv', sep=';', encoding='utf-8')
df = df[['First', 'Second', 'Third']]

In [3]:
for column in df.columns[1:]:
    df[column] = df[column].apply(lambda x: re.sub(r'[/.*]', '', x))
    df[column] = df[column].apply(lambda x: re.sub(r'[^a-zA-Z]', '', x))

criteria = df['First'].map(lambda x: len(set(x) & set(string.ascii_uppercase)) == 0)
df = df[criteria]
df = df.reset_index(drop=True)

In [4]:
char_emb = {char: i+1 for i, char in enumerate(string.ascii_lowercase)}
char_emb['0'] = 0

In [5]:
decoder = {val: key for key, val in char_emb.items()}

def decode(idx, decoder):
    t = ''.join([decoder[ix] for ix in idx])
    return re.sub(r'[0.*]', '', t)

In [6]:
max_len = 0
for column_id in range(len(df.columns)):
    for i in range(df.index.size-1):
        if len(df[df.columns[column_id]][i]) > max_len:
            max_len = len(df[df.columns[column_id]][i])

In [7]:
def word2emb(emb, n, word):
    res = np.zeros(len(word)+1, dtype=int)
    for i, char in enumerate(word):
        res[i] = emb[char]
    return res

for column in df.columns:
    print(column)
    df[column] = df[column].apply(lambda word: word2emb(char_emb, max_len, word))

First
Second
Third


In [9]:
data, labels = df['First'], df['Second']
data, labels = data.map(lambda x: torch.from_numpy(x)), labels.map(lambda x: torch.from_numpy(x))

In [10]:
def acc(write_res = False):
    accuracy = []
    for sentence, target in zip(data, labels):
        with torch.no_grad():
            tag_scores = model(sentence)
            _, preds = torch.max(tag_scores, 1)
            first = decode(sentence.tolist(), decoder)
            gt = decode(target.tolist(), decoder)
            pred = decode(preds.tolist(), decoder)
            accuracy.append(int(gt == pred))
            if write_res:
                print('{0} => gt: {1}, pred: {2}'.format(first, 
                                                         gt, 
                                                         pred))
    return sum(accuracy) / len(accuracy)

In [11]:
EMBEDDING_DIM = 8
HIDDEN_DIM = 8

In [12]:
class LSTMVerbFormer(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(LSTMVerbFormer, self).__init__()
        self.hidden_dim = hidden_dim

        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True) # False

        self.hidden2tag = nn.Linear(hidden_dim * 2, vocab_size) # without * 2

    def forward(self, sentence):
        embeds = self.char_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [13]:
model = LSTMVerbFormer(EMBEDDING_DIM, HIDDEN_DIM, len(char_emb))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [15]:
sentence = data[0]

In [17]:
sentence, sentence.shape

(tensor([1, 2, 9, 4, 5, 0]), torch.Size([6]))

In [16]:
emb = model.char_embeddings(sentence)

In [18]:
emb, emb.shape

(tensor([[-0.3672, -0.4354,  0.7451, -0.4180, -0.3946,  0.0936, -2.1803, -0.2202],
         [ 1.4989, -1.5685, -0.1214, -0.1095,  0.8053,  0.9983, -0.3050,  1.8419],
         [-0.3616, -0.2323, -0.2106,  1.1162, -0.4773, -0.8742,  0.0860,  0.8378],
         [-0.0238,  1.3686, -0.4056,  0.1771, -0.4261, -0.1113, -0.7985,  0.8763],
         [ 0.4645, -0.2100,  1.2180,  1.4324,  0.4759, -0.2477, -0.0648, -1.0911],
         [ 2.0399,  0.3315,  0.0111,  0.2345,  0.9206, -1.1231, -1.2478, -0.3788]],
        grad_fn=<EmbeddingBackward>), torch.Size([6, 8]))

In [19]:
lstm_out, hidden = model.lstm(emb.view(len(sentence), 1, -1))

In [20]:
lstm_out, lstm_out.shape

(tensor([[[ 0.0693, -0.0276, -0.0698, -0.0864,  0.0221, -0.1289, -0.0878,
            0.0068, -0.2516, -0.0443, -0.3045, -0.3029,  0.2695,  0.0874,
           -0.0336, -0.1518]],
 
         [[-0.0219, -0.2673, -0.0703, -0.0205, -0.0906, -0.1983, -0.0078,
            0.1578, -0.1488, -0.0532, -0.5636, -0.1503,  0.5515,  0.0276,
           -0.2562, -0.1013]],
 
         [[-0.1151, -0.2135, -0.0831, -0.1586, -0.0902, -0.2658, -0.0446,
            0.1898, -0.1841, -0.1462, -0.2891, -0.1820,  0.2523,  0.1579,
            0.0026, -0.1990]],
 
         [[-0.0638, -0.1291, -0.1815, -0.1643,  0.0434, -0.2744, -0.0081,
            0.0605, -0.3010, -0.1290, -0.1222, -0.2602,  0.2243,  0.0991,
            0.0292, -0.1828]],
 
         [[-0.1015,  0.0278, -0.0571, -0.0067, -0.0856, -0.2445, -0.0079,
            0.1692, -0.0815, -0.3574, -0.0869, -0.3370,  0.1224,  0.1942,
            0.0636, -0.1063]],
 
         [[-0.0658,  0.1261, -0.1430,  0.1295, -0.2216, -0.3495, -0.0187,
            0.1000, -

In [24]:
hidden[0], hidden[0].shape

(tensor([[[-0.0658,  0.1261, -0.1430,  0.1295, -0.2216, -0.3495, -0.0187,
            0.1000]],
 
         [[-0.2516, -0.0443, -0.3045, -0.3029,  0.2695,  0.0874, -0.0336,
           -0.1518]]], grad_fn=<StackBackward>), torch.Size([2, 1, 8]))

In [25]:
hidden[1], hidden[1].shape

(tensor([[[-0.2116,  0.1640, -0.3600,  0.2645, -0.3159, -0.7468, -0.0569,
            0.1549]],
 
         [[-0.3817, -0.1012, -0.7857, -0.5506,  0.4594,  0.1119, -0.0835,
           -0.2977]]], grad_fn=<StackBackward>), torch.Size([2, 1, 8]))

In [38]:
model.lstm.all_weights

[[Parameter containing:
  tensor([[-1.8689e-01, -3.4608e-01,  3.2576e-01,  1.1753e-01, -3.5274e-01,
            4.1254e-02,  4.1443e-02, -7.7751e-02],
          [ 1.8753e-01,  3.0312e-01, -2.9139e-01, -9.4925e-02,  9.1154e-02,
           -1.6511e-01, -3.4213e-01,  3.4725e-01],
          [ 6.6944e-02,  2.2907e-01, -2.6003e-02,  3.4659e-01,  2.9983e-01,
           -2.9050e-01,  3.1816e-01,  2.3159e-01],
          [-2.5763e-01,  8.8137e-02,  2.1695e-01,  2.6021e-01,  5.5331e-02,
           -3.4053e-01,  2.2657e-01, -3.0851e-01],
          [-1.9915e-01, -2.0600e-01, -1.2154e-01,  4.6871e-02,  3.4490e-01,
           -1.5106e-01,  2.5568e-01,  3.1827e-02],
          [ 2.3544e-01,  2.4281e-02,  7.5151e-02,  2.2761e-01, -2.7077e-02,
           -2.0442e-02, -2.3380e-01, -2.2475e-01],
          [-9.7622e-02,  2.0729e-01,  3.1938e-03,  1.6434e-01,  2.5890e-01,
            2.0603e-01, -8.3834e-02,  2.3760e-01],
          [-5.9033e-02,  9.5119e-02,  1.1279e-01, -2.6073e-01, -2.9193e-01,
           

In [69]:
class EncoderRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim

        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

    def forward(self, input, hidden):
        embedded = self.char_embeddings(input)
        output = embedded
        output, hidden = self.lstm(output.view(1, 1, -1), hidden.view(1, 1, 1, -1))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)

In [70]:
class DecoderRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.LSTM(embedding_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.char_embeddings(input)
        output = F.relu(output)
        output, hidden = self.lstm(output.view(1, 1, -1), hidden.view(1, 1, 1, -1))
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)

In [71]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, dropout_p=0.1, max_length=max_len):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.char_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.attn = nn.Linear(self.embedding_dim * 2, self.max_length)
        self.attn_combine = nn.Linear(self.embedding_dim * 2, self.embedding_dim)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.LSTM(self.embedding_dim, self.hidden_dim)
        self.out = nn.Linear(self.hidden_dim, self.vocab_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.char_embeddings(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.lstm(output, hidden.view(1, 1, 1, -1))

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)

In [44]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_len):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_dim, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [45]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [46]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = list(zip(data, labels))[:n_iters]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [48]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_
    plt.plot(points)

In [50]:
# def evaluate(encoder, decoder, sentence, max_length=max_len):
#     with torch.no_grad():
#         input_tensor = tensorFromSentence(input_lang, sentence)
#         input_length = input_tensor.size()[0]
#         encoder_hidden = encoder.initHidden()

#         encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

#         for ei in range(input_length):
#             encoder_output, encoder_hidden = encoder(input_tensor[ei],
#                                                      encoder_hidden)
#             encoder_outputs[ei] += encoder_output[0, 0]

#         decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

#         decoder_hidden = encoder_hidden

#         decoded_words = []
#         decoder_attentions = torch.zeros(max_length, max_length)

#         for di in range(max_length):
#             decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)
#             decoder_attentions[di] = decoder_attention.data
#             topv, topi = decoder_output.data.topk(1)
#             if topi.item() == EOS_token:
#                 decoded_words.append('<EOS>')
#                 break
#             else:
#                 decoded_words.append(output_lang.index2word[topi.item()])

#             decoder_input = topi.squeeze().detach()

#         return decoded_words, decoder_attentions[:di + 1]

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 8
HIDDEN_DIM = 8
encoder1 = EncoderRNN(EMBEDDING_DIM, HIDDEN_DIM, len(char_emb))
attn_decoder1 = AttnDecoderRNN(EMBEDDING_DIM, HIDDEN_DIM, len(char_emb), dropout_p=0.1)

trainIters(encoder1, attn_decoder1, len(data), print_every=5)

IndexError: index 1 is out of bounds for dimension 0 with size 1