In [14]:
import numpy as np

import utils

# Number of rows in the character embedding table (passed as `num_embeddings`
# to nn.Embedding in Model); every encoded character id must be below this.
NUM_EMBEDDING = 100
def word2numpy(txt):
    """Encode a word as a 1-D integer array of per-character ids.

    Each character is mapped to its code-point offset from the Hebrew letter
    alef, plus one, so alef becomes 1 and id 0 stays free for padding.
    Characters are not validated: anything before alef yields a non-positive id.

    Args:
        txt: The word to encode (any iterable of single characters).

    Returns:
        Integer ``np.ndarray`` of shape ``(len(txt),)``.
    """
    base = ord('א')
    # An explicit dtype keeps the empty-word case integer-typed; a bare
    # np.array([]) would silently come back as float64.
    return np.array([ord(c) - base + 1 for c in txt], dtype=int)

def wordlist2numpy(lines):
    """Encode several words and stack them via ``utils.pad_sequences``.

    Every word is converted with ``word2numpy``; the resulting list is handed
    to ``utils.pad_sequences`` with ``maxlen=12``, ``dtype=int`` and fill
    value 0.
    NOTE(review): presumably the result is a (len(lines), 12) zero-padded
    matrix -- confirm against the project's ``utils.pad_sequences``.
    """
    encoded_words = list(map(word2numpy, lines))
    return utils.pad_sequences(encoded_words, maxlen=12, dtype=int, value=0)

# The binyan labels the model predicts; a label's class id is its position in
# this list (see binyan_to_category).
ALL_BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
# Number of output classes of the binyan classification head.
BINYAN_SIZE = len(ALL_BINYAN)

def binyan_to_category(b):
    """Return the integer class id of binyan name ``b``.

    The id is the position of ``b`` in ``ALL_BINYAN``; an unknown name raises
    ``ValueError`` (from ``list.index``).
    """
    category = ALL_BINYAN.index(b)
    return category


def binyan_list_to_category(bs):
    """Convert an iterable of binyan names to a NumPy array of class ids."""
    categories = list(map(binyan_to_category, bs))
    return np.array(categories)

In [21]:
import torch

import torch.nn as nn
# Fail fast without a GPU: the model and every batch are moved with .cuda()
# unconditionally further down.
assert torch.cuda.is_available()


class Model(nn.Module):
    """Character-level bidirectional LSTM that classifies a word's binyan.

    Pipeline: character ids -> embedding -> single-layer bidirectional LSTM ->
    sum of the two directions' final hidden states -> linear layer producing
    ``BINYAN_SIZE`` logits.

    Args:
        UNITS: Width used for both the embedding and the LSTM hidden state.
    """

    def __init__(self, UNITS):
        super().__init__()
        self.units = UNITS

        self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
        self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1,
                             batch_first=True, bidirectional=True)

        # The two directions are summed (not concatenated) in forward(), so the
        # classifier input width is UNITS rather than 2 * UNITS.
        self.binyan = nn.Linear(in_features=UNITS, out_features=BINYAN_SIZE)

    def forward(self, x):
        """Compute binyan logits for a batch of encoded words.

        Args:
            x: Integer tensor of character ids, shape ``(batch, seq_len)``.

        Returns:
            ``{'B': logits}`` where ``logits`` has shape ``(batch, BINYAN_SIZE)``.
        """
        embeds = self.embed(x)

        # Only the final hidden states are needed; h_n stacks the two
        # directions along dim 0.
        _, (h_n, _) = self.lstm1(embeds)
        left, right = torch.chunk(h_n, 2, dim=0)
        # Squeeze only the direction axis. A bare squeeze() would also drop the
        # batch axis when the batch holds a single sample.
        merge = (left + right).squeeze(dim=0)

        binyan = self.binyan(merge)
        return {'B': binyan}

# Move the model to the GPU *before* building the optimizer, as the PyTorch
# documentation for Module.cuda() recommends, so the optimizer is constructed
# from the parameters that will actually be trained.
model = Model(UNITS=125).cuda()
criterion = nn.CrossEntropyLoss()  # add ignore_index for root
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

In [8]:

# Three sample words and their binyan labels; sanity() reads these globals.
wordlist = ['ידעתי', 'התאפס', 'יאבד']
binyanlist = ['פעל', 'התפעל', 'פיעל']
# Show the encoded inputs and the integer class ids of the labels.
print(wordlist2numpy(wordlist))
print(binyan_list_to_category(binyanlist))

def sanity():
    """Smoke-test the model: run one forward pass and print the shapes.

    Uses the module-level ``wordlist``, ``binyanlist`` and ``model``. Prints
    the logits shape, the shape of the arg-max predictions and the shape of
    the expected label array so they can be compared by eye.
    """
    expected = binyan_list_to_category(binyanlist)

    # Inference only: no gradients are needed for a shape check.
    with torch.no_grad():
        inputs = torch.from_numpy(wordlist2numpy(wordlist)).to(torch.int64).cuda()
        tag_scores = model(inputs)

    print(f'{tag_scores["B"].shape=}')
    print(f"{np.argmax(tag_scores['B'].cpu(), axis=1).shape=}")
    print(f"{expected.shape=}")

# Run the shape check once on the sample words defined above.
sanity()

[[1497 1491 1506 1514 1497    0    0    0    0    0    0    0]
 [1492 1514 1488 1508 1505    0    0    0    0    0    0    0]
 [1497 1488 1489 1491    0    0    0    0    0    0    0    0]]
[0 6 2]
tag_scores["B"].shape=torch.Size([3, 7])
np.argmax(tag_scores['B'].cpu(), axis=1).shape=torch.Size([3])
expected.shape=(3,)


In [23]:
# Number of samples per mini-batch used by batch().
BATCH_SIZE = 32

def batch(a, batch_size=None):
    """Split a NumPy array into equally sized int64 tensor mini-batches.

    Only complete batches are produced: trailing rows that do not fill a
    whole batch are dropped.

    Args:
        a: NumPy array whose first axis indexes samples.
        batch_size: Rows per batch; defaults to the module-level ``BATCH_SIZE``.

    Returns:
        Tuple of ``torch.int64`` tensors, each with ``batch_size`` rows. The
        tuple is empty when ``a`` has fewer than ``batch_size`` rows.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    ub = a.shape[0] // batch_size * batch_size
    if ub == 0:
        # Tensor.split() on a zero-length tensor still yields one (empty)
        # chunk, which would then be fed to the model; return no batches.
        return ()
    return torch.from_numpy(a[:ub]).to(torch.int64).split(batch_size)

def batch_all_y(y):
    """Batch the label array and wrap each batch as ``{'B': labels}``.

    The dict key matches the key of the model's output dict so losses and
    accuracies can be computed per output head.
    """
    targets = []
    for label_batch in batch(y):
        targets.append({'B': label_batch})
    return targets

def accuracy(output, ybatch, ignore_index=0):
    """Fraction of counted samples whose arg-max prediction equals the label.

    Args:
        output: Score array of shape ``(batch, classes)``.
        ybatch: Integer label array of shape ``(batch,)``.
        ignore_index: Label value excluded from both numerator and
            denominator. Defaults to 0 for backward compatibility; pass
            ``None`` to count every sample. For heads where class 0 is a real
            class (for binyan, index 0 of ``ALL_BINYAN``), ``None`` is the
            appropriate setting.

    Returns:
        The accuracy as a float, or NaN when no sample is counted.
    """
    predicted = np.argmax(output, axis=1)
    if ignore_index is None:
        counted = np.ones_like(ybatch, dtype=bool)
    else:
        counted = ybatch != ignore_index

    n = counted.sum()
    if n == 0:
        # Nothing to score: return NaN explicitly rather than relying on a
        # 0/0 division that emits a RuntimeWarning.
        return np.float64(np.nan)
    return ((predicted == ybatch) & counted).sum() / n

def fit(x_train, y_train, epochs=1):
    """Train the module-level ``model`` on the given data.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``.
    The data is split into fixed-size batches once (incomplete trailing
    batches are dropped by ``batch``) and the same batches are replayed every
    epoch in the same order.

    Args:
        x_train: NumPy array of encoded words, one row per sample.
        y_train: NumPy array of integer class labels.
        epochs: Number of passes over the batched data.

    Prints a progress line every 20 batches with the mean accuracy since the
    previous report and the loss of the current batch.
    """
    x_all = batch(x_train)
    y_all = batch_all_y(y_train)
    total = len(x_all)

    model.train()
    for epoch in range(epochs):
        # Per-head accuracies collected since the last progress report.
        accs = {}
        for i, (x, y) in enumerate(zip(x_all, y_all)):
            x = x.cuda()
            y = {k: v.cuda() for k, v in y.items()}

            optimizer.zero_grad()
            # The model lives on the GPU, so its outputs are already there.
            outputs = model(x)

            sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
            loss = sum(sub_losses.values())
            loss.backward()

            optimizer.step()

            for k, scores in outputs.items():
                # detach() drops the autograd graph before the NumPy conversion.
                batch_acc = accuracy(scores.detach().cpu().numpy(),
                                     y[k].cpu().numpy())
                accs.setdefault(k, []).append(batch_acc)

            if i % 20 == 0:
                print("{} {:4}/{}".format(epoch, i, total), end=' ')
                for k in accs:
                    print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
                print("Loss: {:.4f}".format(loss.item()), end='\r')
                accs = {}
        print()

def validate(x_valid, y_valid):
    """Evaluate the module-level ``model`` and print mean accuracy and loss.

    Relies on the module-level ``model`` and ``criterion``. The model is put
    in evaluation mode for the duration of the call and its previous mode is
    restored afterwards. Incomplete trailing batches are dropped by ``batch``.

    Args:
        x_valid: NumPy array of encoded words, one row per sample.
        y_valid: NumPy array of integer class labels.
    """
    x_all = batch(x_valid)
    y_all = batch_all_y(y_valid)

    accs = {}
    losses = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in zip(x_all, y_all):
                x = x.cuda()
                y = {k: v.cuda() for k, v in y.items()}

                outputs = model(x)

                sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
                loss = sum(sub_losses.values())
                losses.append(loss.item())

                for k, scores in outputs.items():
                    batch_acc = accuracy(scores.cpu().numpy(), y[k].cpu().numpy())
                    accs.setdefault(k, []).append(batch_acc)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if not losses:
        # Avoid np.mean([]) (NaN plus a RuntimeWarning) when no batch ran.
        print("No complete batches to validate")
        return

    for k in accs:
        print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
    print("Loss: {:.4f}".format(np.mean(losses)))

import random
from functools import lru_cache
# Unbounded cache: with the default 128-entry limit an evicted key would be
# re-drawn and could receive a different label on its next lookup.
@lru_cache(maxsize=None)
def tag(k):
    """Return a random binyan label for ``k``, fixed for the process lifetime.

    The label is drawn from ``ALL_BINYAN`` on the first call for a given key
    and memoized, so repeated calls with the same key return the same label.
    """
    return random.choice(ALL_BINYAN)

# Toy over-fitting experiment: 32 samples drawn from two words, each word
# carrying one fixed (randomly chosen) label, trained and then validated on
# the very same data.
wordlist = [random.choice(['ידעתי', 'התאפס']) for _ in range(32)]
binyanlist = [tag(word) for word in wordlist]

fit(wordlist2numpy(wordlist), binyan_list_to_category(binyanlist), epochs=1000)
validate(wordlist2numpy(wordlist), binyan_list_to_category(binyanlist))


0    0/1 B_acc: 0.5312 Loss: 2.1831
1    0/1 B_acc: 0.5312 Loss: 1.9371
2    0/1 B_acc: 0.5312 Loss: 1.6240
3    0/1 B_acc: 0.5312 Loss: 1.2892
4    0/1 B_acc: 0.5312 Loss: 0.9653
5    0/1 B_acc: 1.0000 Loss: 0.6946
6    0/1 B_acc: 1.0000 Loss: 0.5085
7    0/1 B_acc: 1.0000 Loss: 0.3804
8    0/1 B_acc: 1.0000 Loss: 0.2820
9    0/1 B_acc: 1.0000 Loss: 0.2109
10    0/1 B_acc: 1.0000 Loss: 0.1636
11    0/1 B_acc: 1.0000 Loss: 0.1310
12    0/1 B_acc: 1.0000 Loss: 0.1059
13    0/1 B_acc: 1.0000 Loss: 0.0852
14    0/1 B_acc: 1.0000 Loss: 0.0686
15    0/1 B_acc: 1.0000 Loss: 0.0563
16    0/1 B_acc: 1.0000 Loss: 0.0474
17    0/1 B_acc: 1.0000 Loss: 0.0408
18    0/1 B_acc: 1.0000 Loss: 0.0355
19    0/1 B_acc: 1.0000 Loss: 0.0309
20    0/1 B_acc: 1.0000 Loss: 0.0271
21    0/1 B_acc: 1.0000 Loss: 0.0238
22    0/1 B_acc: 1.0000 Loss: 0.0211
23    0/1 B_acc: 1.0000 Loss: 0.0189
24    0/1 B_acc: 1.0000 Loss: 0.0170
25    0/1 B_acc: 1.0000 Loss: 0.0154
26    0/1 B_acc: 1.000