In [8]:
import numpy as np

import utils

NUM_EMBEDDING = 2000
def word2numpy(txt):
    return np.array([ord(c) for c in txt])

def wordlist2numpy(lines):
    return utils.pad_sequences([word2numpy(line) for line in lines],
                               maxlen=12, dtype=int, value=0)

ALL_BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
BINYAN_SIZE = len(ALL_BINYAN)

def binyan_to_category(b):
    return ALL_BINYAN.index(b)


def binyan_list_to_category(bs):
    return np.array([binyan_to_category(b) for b in bs])

In [9]:
import torch

import torch.nn as nn
assert torch.cuda.is_available()


class Model(nn.Module):
    def __init__(self, UNITS):
        super().__init__()
        self.units = UNITS

        self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
        self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1, batch_first=True, bidirectional=True)

        self.binyan = nn.Linear(in_features=UNITS, out_features=BINYAN_SIZE)

    def forward(self, x):
        embeds = self.embed(x)

        lstm_out, (h_n, c_n) = self.lstm1(embeds)
        left, right = torch.chunk(h_n, 2, dim=0)
        merge = torch.squeeze(left + right)

        binyan = self.binyan(merge)
        return {'B': binyan }

model = Model(UNITS=125)
criterion = nn.CrossEntropyLoss()  # add ignore_index for root
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

def to_device(d):
    return d
    if hasattr(d, 'cuda'):
        return d.cuda()
    return {k: v.cuda() for k, v in d.items()}

model = to_device(model)

In [10]:

wordlist = ['ידעתי', 'התאפס', 'יאבד']
binyanlist = ['פעל', 'התפעל', 'פיעל']
print(wordlist2numpy(wordlist))
print(binyan_list_to_category(binyanlist))

def sanity():
    with torch.no_grad():
        numpy_inp = wordlist2numpy(wordlist)
        inputs = to_device(torch.from_numpy(numpy_inp).to(torch.int64))
        tag_scores = model(inputs)
        expected = binyan_list_to_category(binyanlist)
        print(f'{tag_scores["B"].shape=}')
        print(f"{np.argmax(tag_scores['B'].cpu(), axis=1).shape=}")
        print(f"{expected.shape=}")

sanity()

[[1497 1491 1506 1514 1497    0    0    0    0    0    0    0]
 [1492 1514 1488 1508 1505    0    0    0    0    0    0    0]
 [1497 1488 1489 1491    0    0    0    0    0    0    0    0]]
[0 6 2]
tag_scores["B"].shape=torch.Size([3, 7])
np.argmax(tag_scores['B'].cpu(), axis=1).shape=torch.Size([3])
expected.shape=(3,)


In [14]:
BATCH_SIZE = 32

def batch(a):
    ub = a.shape[0] // BATCH_SIZE * BATCH_SIZE
    return torch.from_numpy(a[:ub]).to(torch.int64).split(BATCH_SIZE)

def batch_all_y(y):
    return [{'B': b } for b in batch(y)]

def accuracy(output, ybatch):
    n = (ybatch != 0).sum()
    c = np.argmax(output, axis=1)
    return ((c == ybatch) & (c != 0)).sum() / n

def fit(x_train, y_train, epochs=1):
    x_all = batch(x_train)
    y_all = batch_all_y(y_train)

    for epoch in range(epochs):
        total = len(x_all)
        accs = {'B': [] }
        for i, (x, y) in enumerate(zip(x_all, y_all)):

            x = to_device(x)

            optimizer.zero_grad()
            outputs = model(x)

            outputs = to_device(outputs)

            y = to_device(y)
            sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
            loss = sum(sub_losses.values())
            loss.backward()

            optimizer.step()

            outputs = {k: v.cpu().data.numpy() for k, v in outputs.items()}
            y = {k: v.cpu().data.numpy() for k, v in y.items()}

            for k in outputs:
                accs[k].append(accuracy(outputs[k], y[k]))

            if i % 20 == 0:
                print("{} {:4}/{}".format(epoch, i, total), end=' ')
                for k in accs:
                    print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
                print("Loss: {:.4f}".format(loss.item()), end='\r')
                accs = {'B': [] }
        print()

def validate(x_valid, y_valid):
    x_all = batch(x_valid)
    y_all = batch_all_y(y_valid)

    with torch.no_grad():
        accs = {'B': [] }
        losses = []
        for i, (x, y) in enumerate(zip(x_all, y_all)):
            x = to_device(x)

            outputs = to_device(model(x))

            y = to_device(y)

            sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
            loss = sum(sub_losses.values())

            outputs = {k: v.cpu().data.numpy() for k, v in outputs.items()}
            y = {k: v.cpu().data.numpy() for k, v in y.items()}

            for k in outputs:
                accs[k].append(accuracy(outputs[k], y[k]))
            losses.append(loss.item())

        for k in accs:
            print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
        print("Loss: {:.4f}".format(np.mean(losses)))

import table_dataset

verbs, binyans = table_dataset.load_dataset('random.tsv')
train_x, train_y = wordlist2numpy(verbs), binyan_list_to_category(binyans)
fit(train_x, train_y, epochs=30)
validate(train_x, train_y)


0  300/312 B_acc: 0.7342 Loss: 0.9588
1  300/312 B_acc: 0.7851 Loss: 0.7905
2  300/312 B_acc: 0.8150 Loss: 0.6870
3  300/312 B_acc: 0.8308 Loss: 0.5948
4  300/312 B_acc: 0.8341 Loss: 0.5065
5  300/312 B_acc: 0.8433 Loss: 0.4622
6  300/312 B_acc: 0.8504 Loss: 0.4341
7  300/312 B_acc: 0.8575 Loss: 0.4066
8  300/312 B_acc: 0.8645 Loss: 0.3867
9  300/312 B_acc: 0.8679 Loss: 0.3668
10  300/312 B_acc: 0.8766 Loss: 0.3360
11  300/312 B_acc: 0.8822 Loss: 0.3149
12  300/312 B_acc: 0.8895 Loss: 0.2905
13  300/312 B_acc: 0.8895 Loss: 0.2604
14  300/312 B_acc: 0.8877 Loss: 0.2382
15  300/312 B_acc: 0.8966 Loss: 0.2145
16  300/312 B_acc: 0.9021 Loss: 0.1930
17  300/312 B_acc: 0.9093 Loss: 0.1953
18  300/312 B_acc: 0.9182 Loss: 0.1926
19  300/312 B_acc: 0.9128 Loss: 0.2059
20  300/312 B_acc: 0.9094 Loss: 0.2266
21  300/312 B_acc: 0.9141 Loss: 0.1991
22  300/312 B_acc: 0.9233 Loss: 0.2568
23  300/312 B_acc: 0.9285 Loss: 0.2105
24  300/312 B_acc: 0.9428 Loss: 0.2130
25  300/31