In [1]:
import numpy as np

import utils

# Number of rows in the character-embedding table (passed to nn.Embedding as
# num_embeddings). Characters are encoded as their Unicode code points by
# word2numpy, so this must be larger than the highest code point in the input.
NUM_EMBEDDING = 2000
def word2numpy(txt):
    """Encode a string as a 1-D integer array of Unicode code points.

    Parameters
    ----------
    txt : str
        Text to encode; one array element is produced per character.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(txt),)``.
    """
    # Without an explicit dtype an empty string would produce NumPy's default
    # empty float64 array, which is not a valid index array downstream.
    return np.array([ord(c) for c in txt], dtype=int)

def wordlist2numpy(lines):
    """Encode a collection of words as one padded integer array.

    Every word is converted with ``word2numpy``; the resulting sequences are
    handed to ``utils.pad_sequences`` with ``maxlen=12``, an integer dtype and
    0 as the fill value.
    """
    encoded_words = list(map(word2numpy, lines))
    return utils.pad_sequences(encoded_words, maxlen=12, dtype=int, value=0)

# Names of the Hebrew verb patterns (binyanim) the model predicts. A label's
# class id is its position in this list (see binyan_to_category).
ALL_BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
# Number of output classes of the binyan classifier.
BINYAN_SIZE = len(ALL_BINYAN)

def binyan_to_category(b):
    """Map a binyan name to its integer class id.

    The id is the position of ``b`` in ``ALL_BINYAN``. A name that is not in
    that list raises ``ValueError`` (propagated from ``list.index``).
    """
    category = ALL_BINYAN.index(b)
    return category


def binyan_list_to_category(bs):
    """Convert an iterable of binyan names into an integer label array.

    Parameters
    ----------
    bs : iterable of str
        Binyan names; each is looked up with ``binyan_to_category``.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one class id per input name.

    Raises
    ------
    ValueError
        If a name is not a known binyan (raised by ``binyan_to_category``).
    """
    # Pin the dtype so an empty input yields an empty *integer* array instead
    # of NumPy's default empty float64 array.
    return np.array([binyan_to_category(b) for b in bs], dtype=int)

In [8]:
import torch

import torch.nn as nn
# Fail fast: to_device() moves everything with .cuda(), so this notebook cannot run without a GPU.
assert torch.cuda.is_available()


class Model(nn.Module):
    """Character-level bidirectional-LSTM classifier for the binyan of a word.

    Pipeline: code-point embedding -> single-layer bidirectional LSTM -> sum of
    the two directions' final hidden states -> linear layer producing one
    score per binyan class (``BINYAN_SIZE`` outputs).

    Note that id 0 (the padding value) is embedded like any other character:
    no ``padding_idx`` or sequence packing is used.

    Parameters
    ----------
    UNITS : int
        Width of both the embedding vectors and the LSTM hidden state.
    """

    def __init__(self, UNITS):
        super().__init__()
        self.units = UNITS

        self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
        self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1, batch_first=True, bidirectional=True)

        self.binyan = nn.Linear(in_features=UNITS, out_features=BINYAN_SIZE)

    def forward(self, x):
        """Score every binyan class for each word in the batch.

        Parameters
        ----------
        x : torch.Tensor
            Integer tensor of character ids, shape ``(batch, seq_len)``
            (the LSTM is built with ``batch_first=True``).

        Returns
        -------
        dict
            ``{'B': logits}`` where ``logits`` has shape
            ``(batch, BINYAN_SIZE)``, including when ``batch == 1``.
        """
        embeds = self.embed(x)

        # For this single-layer bidirectional LSTM h_n has shape
        # (2, batch, UNITS): index 0 is the forward direction, 1 the backward.
        _, (h_n, _) = self.lstm1(embeds)

        # Indexing removes only the direction axis. A bare torch.squeeze()
        # would also remove a batch axis of size 1, returning logits of shape
        # (BINYAN_SIZE,) instead of (1, BINYAN_SIZE) for a single-word batch.
        merge = h_n[0] + h_n[1]

        binyan = self.binyan(merge)
        return {'B': binyan}

# Shared network, loss and optimizer. The training and validation functions
# in this notebook read these as module-level globals.
model = Model(UNITS=125)
criterion = nn.CrossEntropyLoss()  # add ignore_index for root
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

def to_device(d):
    """Move ``d`` onto the CUDA device.

    An object exposing a ``cuda`` attribute (tensor, module, ...) is moved by
    calling ``d.cuda()``. Anything else is treated as a mapping, and a new
    dict is returned with ``.cuda()`` applied to each of its values.
    """
    missing = object()
    move = getattr(d, 'cuda', missing)
    if move is missing:
        return {key: value.cuda() for key, value in d.items()}
    return move()

# Move the model to the GPU before any batch is fed through it.
model = to_device(model)

In [3]:

# Three sample words with their binyan labels, used to eyeball the encoders
# here and the model's output shapes in sanity().
wordlist = ['ידעתי', 'התאפס', 'יאבד']
binyanlist = ['פעל', 'התפעל', 'פיעל']
print(wordlist2numpy(wordlist))
print(binyan_list_to_category(binyanlist))

def sanity():
    """Print the shapes of the model output, its argmax and the expected labels.

    Runs the module-level ``model`` on the sample ``wordlist`` with gradient
    tracking disabled. Nothing is returned or asserted; the three printed
    shapes are meant to be compared by eye.
    """
    with torch.no_grad():
        numpy_inp = wordlist2numpy(wordlist)
        # Character ids are cast to int64 and moved to the GPU before the forward pass.
        inputs = to_device(torch.from_numpy(numpy_inp).to(torch.int64))
        tag_scores = model(inputs)
        expected = binyan_list_to_category(binyanlist)
        # The `=` f-string specifier echoes the expression text, so the local
        # names used below are part of the printed output.
        print(f'{tag_scores["B"].shape=}')
        print(f"{np.argmax(tag_scores['B'].cpu(), axis=1).shape=}")
        print(f"{expected.shape=}")

sanity()

[[1497 1491 1506 1514 1497    0    0    0    0    0    0    0]
 [1492 1514 1488 1508 1505    0    0    0    0    0    0    0]
 [1497 1488 1489 1491    0    0    0    0    0    0    0    0]]
[0 6 2]
tag_scores["B"].shape=torch.Size([3, 7])
np.argmax(tag_scores['B'].cpu(), axis=1).shape=torch.Size([3])
expected.shape=(3,)


In [12]:
# Number of rows per mini-batch.
BATCH_SIZE = 32

def batch(a):
    """Split a NumPy array into int64 tensors of exactly ``BATCH_SIZE`` rows.

    Rows beyond the last full batch are dropped.

    Parameters
    ----------
    a : numpy.ndarray
        Array whose first axis indexes samples.

    Returns
    -------
    tuple of torch.Tensor
        One tensor per full batch, in input order. Empty when ``a`` has fewer
        than ``BATCH_SIZE`` rows.
    """
    ub = a.shape[0] // BATCH_SIZE * BATCH_SIZE
    if ub == 0:
        # Tensor.split() on an empty tensor still yields one (empty) chunk,
        # which callers would otherwise push through the model as a batch.
        return ()
    return torch.from_numpy(a[:ub]).to(torch.int64).split(BATCH_SIZE)

def batch_all_y(y):
    """Batch the labels and wrap each batch in a dict keyed by head name ``'B'``.

    The label array is split with ``batch``; the result is a list with one
    ``{'B': labels}`` dict per batch, in the same order.
    """
    targets = []
    for labels in batch(y):
        targets.append({'B': labels})
    return targets

def accuracy(output, ybatch, ignore_index=0):
    """Fraction of counted samples whose arg-max prediction equals the label.

    Parameters
    ----------
    output : numpy.ndarray
        Class scores of shape ``(batch, n_classes)``.
    ybatch : numpy.ndarray
        Integer labels of shape ``(batch,)``.
    ignore_index : int or None, optional
        Label value excluded from both the numerator and the denominator.
        The default ``0`` preserves the original behaviour. Pass ``None`` to
        score every sample.

    Returns
    -------
    numpy.float64
        ``correct / counted``, or ``nan`` when no sample is counted.

    Notes
    -----
    NOTE(review): in this notebook label 0 is a real class (the first entry of
    ``ALL_BINYAN``), so with the default those samples are left out of the
    reported accuracy. Confirm that this is intended.
    """
    predicted = np.argmax(output, axis=1)
    hits = predicted == ybatch
    if ignore_index is None:
        counted = hits.size
    else:
        keep = ybatch != ignore_index
        hits = hits & keep
        counted = keep.sum()
    if counted == 0:
        # Avoid the 0/0 division (and its RuntimeWarning) when every label in
        # the batch is ignored; the result is still "not a number".
        return np.float64('nan')
    return hits.sum() / counted

def fit(x_train, y_train, epoch):
    """Run one training pass over the data, updating the global ``model``.

    The inputs are cut into mini-batches with ``batch`` / ``batch_all_y``.
    For every batch the summed per-head loss is back-propagated and the global
    ``optimizer`` takes one step. Every 10th batch a progress line is printed
    with the mean accuracy since the previous report and the current loss.

    Parameters
    ----------
    x_train : numpy.ndarray
        Encoded words, one row per sample.
    y_train : numpy.ndarray
        Integer class labels, one per sample.
    epoch : int
        Epoch number; used only in the progress line.
    """
    x_all = batch(x_train)
    y_all = batch_all_y(y_train)

    total = len(x_all)
    accs = {'B': []}
    for i, (x, y) in enumerate(zip(x_all, y_all)):
        x = to_device(x)
        y = to_device(y)

        optimizer.zero_grad()
        # The outputs are produced on the same device as the model and its
        # input, so no further transfer is needed before computing the loss.
        outputs = model(x)

        sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
        loss = sum(sub_losses.values())
        loss.backward()

        optimizer.step()

        # detach() is the supported replacement for the legacy `.data`
        # attribute when leaving the autograd graph.
        outputs = {k: v.detach().cpu().numpy() for k, v in outputs.items()}
        y = {k: v.cpu().numpy() for k, v in y.items()}

        for k in outputs:
            accs[k].append(accuracy(outputs[k], y[k]))

        if i % 10 == 0:
            print("{} {:4}/{}".format(epoch, i, total), end=' ')
            for k in accs:
                print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
            print("Loss: {:.4f}".format(loss.item()), end='\r')
            # Start a fresh window so the next report covers only new batches.
            accs = {'B': []}
    print()

def validate(x_valid, y_valid):
    """Evaluate the global ``model`` on held-out data and print the averages.

    The data is batched like in training. Accuracy and loss are computed per
    batch with gradient tracking disabled, and their means over all batches
    are printed on one line.

    Parameters
    ----------
    x_valid : numpy.ndarray
        Encoded words, one row per sample.
    y_valid : numpy.ndarray
        Integer class labels, one per sample.
    """
    x_all = batch(x_valid)
    y_all = batch_all_y(y_valid)

    accs = {'B': []}
    losses = []

    # Evaluate in eval mode, then restore whatever mode the model was in so a
    # following training pass is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in zip(x_all, y_all):
                x = to_device(x)
                y = to_device(y)

                # Outputs are already on the model's device.
                outputs = model(x)

                sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
                loss = sum(sub_losses.values())

                outputs = {k: v.cpu().numpy() for k, v in outputs.items()}
                y = {k: v.cpu().numpy() for k, v in y.items()}

                for k in outputs:
                    accs[k].append(accuracy(outputs[k], y[k]))
                losses.append(loss.item())
    finally:
        model.train(was_training)

    print("  ", end="")
    for k in accs:
        print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
    print("Loss: {:.4f}".format(np.mean(losses)))

def run_experiment(train, valid, epochs):
    """Encode both datasets once, then alternate training and validation.

    ``train`` and ``valid`` are each indexed as ``[0]`` for the words and
    ``[1]`` for the binyan names. For every epoch in ``range(epochs)`` one
    ``fit`` pass is followed by one ``validate`` pass.
    """
    def encode(dataset):
        # Words first, labels second: same evaluation order as before.
        return wordlist2numpy(dataset[0]), binyan_list_to_category(dataset[1])

    train_x, train_y = encode(train)
    valid_x, valid_y = encode(valid)

    for epoch_index in range(epochs):
        fit(train_x, train_y, epoch=epoch_index)
        validate(valid_x, valid_y)


In [7]:
import concrete

# Each dataset is later indexed as [0] = words and [1] = binyan labels (see run_experiment).
train = concrete.load_dataset('random_train.tsv')
valid = concrete.load_dataset('random_validate.tsv')

In [None]:
# Train for 10 epochs, validating after each one.
run_experiment(train, valid, 10)



0 3120/3125 B_acc: 0.9262 Loss: 0.3707Loss: 0.1014
  B_acc: 0.8751 Loss: 0.3237
1 3120/3125 B_acc: 0.9247 Loss: 0.3918
  B_acc: 0.8708 Loss: 0.3314
2 3120/3125 B_acc: 0.9374 Loss: 0.3293 B_acc: 0.9107 Loss: 0.1390Loss: 0.1943Loss: 0.2345Loss: 0.3047 B_acc: 0.9254 Loss: 0.1920Loss: 0.1126Loss: 0.4120Loss: 0.1769
  B_acc: 0.8717 Loss: 0.3345
3 3120/3125 B_acc: 0.9478 Loss: 0.3183Loss: 0.2125Loss: 0.2019 B_acc: 0.9186 Loss: 0.3154 B_acc: 0.9001 Loss: 0.1637
  B_acc: 0.8668 Loss: 0.3433
4 1560/3125 B_acc: 0.9092 Loss: 0.0991 B_acc: 0.9178 Loss: 0.1406Loss: 0.2077 B_acc: 0.9206 Loss: 0.2320 B_acc: 0.9006 Loss: 0.1789