In [1]:
import numpy as np

import utils

NUM_EMBEDDING = 2000
def word2numpy(txt):
    return np.array([ord(c) for c in txt])

def wordlist2numpy(lines):
    return utils.pad_sequences([word2numpy(line) for line in lines],
                               maxlen=12, dtype=int, value=0)

BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
TENSE = 'עבר הווה עתיד ציווי'.split()
VOICE = 'ראשון שני שלישי'.split()
GENDER = 'זכר נקבה'.split()
PLURAL = 'יחיד רבים'.split()

NAMES = 'BTVGP'
FEATURES = {
    'B': BINYAN,
    'T': TENSE,
    'V': VOICE,
    'G': GENDER,
    'P': PLURAL,
}

def to_category(kind, b):
    return kind.index(b)

def from_category(kind, index):
    return kind[index]

def list_to_category(kind, bs):
    return np.array([to_category(kind, b) for b in bs])

def list_from_category(kind, indexes):
    return [from_category(kind, index) for index in indexes]

def list_of_lists_to_category(items):
    return { name: list_to_category(FEATURES[name], item)
             for name, item in zip(NAMES, items) }

In [2]:
import torch

import torch.nn as nn
assert torch.cuda.is_available()


class Model(nn.Module):
    def __init__(self, UNITS):
        super().__init__()
        self.units = UNITS

        self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
        self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1, batch_first=True, bidirectional=True)

        # {k: nn.Linear(in_features=UNITS, out_features=len(v)) for k, v in features.items()}
        self.binyan = nn.Linear(in_features=UNITS, out_features=len(BINYAN))
        self.tense = nn.Linear(in_features=UNITS, out_features=len(TENSE))
        self.voice = nn.Linear(in_features=UNITS, out_features=len(VOICE))
        self.gender = nn.Linear(in_features=UNITS, out_features=len(GENDER))
        self.plural = nn.Linear(in_features=UNITS, out_features=len(PLURAL))
        
        self.features = {
            'B': self.binyan,
            'T': self.tense,
            'V': self.voice,
            'G': self.gender,
            'P': self.plural,
        }

    def forward(self, x):
        embeds = self.embed(x)

        lstm_out, (h_n, c_n) = self.lstm1(embeds)
        left, right = torch.chunk(h_n, 2, dim=0)
        merge = torch.squeeze(left + right)

        outputs = { k: f(merge) for k, f in self.features.items() }
        return outputs

model = Model(UNITS=300)
criterion = nn.CrossEntropyLoss()  # add ignore_index for root
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def to_device(d):
    if hasattr(d, 'cuda'):
        return d.cuda()
    return {k: v.cuda() for k, v in d.items()}

model = to_device(model)
model

Model(
  (embed): Embedding(2000, 300)
  (lstm1): LSTM(300, 300, batch_first=True, bidirectional=True)
  (binyan): Linear(in_features=300, out_features=7, bias=True)
  (tense): Linear(in_features=300, out_features=4, bias=True)
  (voice): Linear(in_features=300, out_features=3, bias=True)
  (gender): Linear(in_features=300, out_features=2, bias=True)
  (plural): Linear(in_features=300, out_features=2, bias=True)
)

In [None]:

wordlist = ['ידעתי', 'התאפס', 'יאבד']
binyanlist = ['פעל', 'התפעל', 'פיעל']

def sanity():
    with torch.no_grad():
        numpy_inp = wordlist2numpy(wordlist)
        inputs = to_device(torch.from_numpy(numpy_inp).to(torch.int64))
        tag_scores = model(inputs)
        expected = list_to_category(BINYAN, binyanlist)
        print(f'{tag_scores["B"].shape=}')
        print(f"{np.argmax(tag_scores['B'].cpu(), axis=1).shape=}")
        print(f"{expected.shape=}")

sanity()

In [9]:
BATCH_SIZE = 32

def batch(a):
    ub = a.shape[0] // BATCH_SIZE * BATCH_SIZE
    return torch.from_numpy(a[:ub]).to(torch.int64).split(BATCH_SIZE)

def batch_all_ys(ys):
    res = []
    m = {k: batch(ys[k]) for k in NAMES}
    nbatches = len(m['B'])
    for i in range(nbatches):
        res.append({k: m[k][i] for k in NAMES})
    return res

def accuracy(output, ybatch):
    n = (ybatch != 0).sum()
    c = np.argmax(output, axis=1)
    return ((c == ybatch) & (c != 0)).sum() / n

def fit(x_train, y_train, epoch):
    x_all = batch(x_train)
    y_all = batch_all_ys(y_train)

    total = len(x_all)
    accs = {k: [] for k in NAMES}
    for i, (x, y) in enumerate(zip(x_all, y_all)):

        x = to_device(x)

        optimizer.zero_grad()
        outputs = model(x)

        outputs = to_device(outputs)

        y = to_device(y)
        sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
        loss = sum(sub_losses.values())
        loss.backward()

        optimizer.step()

        outputs = {k: v.cpu().data.numpy() for k, v in outputs.items()}
        y = {k: v.cpu().data.numpy() for k, v in y.items()}

        for k in outputs:
            accs[k].append(accuracy(outputs[k], y[k]))

        if i % 10 == 0:
            print("{} {:4}/{}".format(epoch, i, total), end=' ')
            for k in accs:
                print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
            print("Loss: {:.4f}".format(loss.item()), end='\r')
        accs = {k: [] for k in NAMES}
    print()

def validate(x_valid, y_valid):
    x_all = batch(x_valid)
    y_all = batch_all_ys(y_valid)

    with torch.no_grad():
        accs = {k: [] for k in NAMES}
        losses = []
        for i, (x, y) in enumerate(zip(x_all, y_all)):
            x = to_device(x)

            outputs = to_device(model(x))

            y = to_device(y)

            sub_losses = {k: criterion(outputs[k], y[k]) for k in outputs}
            loss = sum(sub_losses.values())

            outputs = {k: v.cpu().data.numpy() for k, v in outputs.items()}
            y = {k: v.cpu().data.numpy() for k, v in y.items()}

            for k in outputs:
                accs[k].append(accuracy(outputs[k], y[k]))
            losses.append(loss.item())
        print("  ", end="")
        for k in accs:
            print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
        print("Loss: {:.4f}".format(np.mean(losses)))

def run_experiment(train, valid, epochs):
    train_x, train_y = train
    valid_x, valid_y = valid
    for e in range(epochs):
        fit(train_x, train_y, epoch=e)
        validate(valid_x, valid_y)

def load_dataset(filename):
    *features, verbs = concrete.load_dataset(filename)
    return wordlist2numpy(verbs), list_of_lists_to_category(features)

In [16]:
import concrete

train = load_dataset('random_train.tsv')
valid = load_dataset('random_validate.tsv')

In [18]:
run_experiment(train, valid, 5)

0 3120/3125 B_acc: 0.8276 T_acc: 0.9000 V_acc: 0.6957 G_acc: 0.5000 P_acc: 1.0000 Loss: 2.0012
  B_acc: 0.8743 T_acc: 0.9329 V_acc: 0.6641 G_acc: 0.7219 P_acc: 0.9866 Loss: 1.5000
1 3120/3125 B_acc: 0.8276 T_acc: 0.8500 V_acc: 0.6957 G_acc: 0.6250 P_acc: 0.9286 Loss: 1.7442
  B_acc: 0.8723 T_acc: 0.9336 V_acc: 0.6701 G_acc: 0.7319 P_acc: 0.9846 Loss: 1.4603
2 3120/3125 B_acc: 0.8276 T_acc: 0.8500 V_acc: 0.7391 G_acc: 0.4375 P_acc: 0.9286 Loss: 1.9011
  B_acc: 0.8714 T_acc: 0.9305 V_acc: 0.6741 G_acc: 0.7551 P_acc: 0.9850 Loss: 1.4413
3 3120/3125 B_acc: 0.8276 T_acc: 0.9000 V_acc: 0.6957 G_acc: 0.4375 P_acc: 1.0000 Loss: 1.8278 B_acc: 0.8929 T_acc: 1.0000 V_acc: 0.6364 G_acc: 0.8125 P_acc: 1.0000 Loss: 1.1379
  B_acc: 0.8768 T_acc: 0.9383 V_acc: 0.6663 G_acc: 0.7501 P_acc: 0.9818 Loss: 1.4170
4 3120/3125 B_acc: 0.8966 T_acc: 0.8500 V_acc: 0.7391 G_acc: 0.6250 P_acc: 1.0000 Loss: 1.5464
  B_acc: 0.8726 T_acc: 0.9359 V_acc: 0.6785 G_acc: 0.7503 P_acc: 0.9846 Loss: 1.4136


In [17]:
len(valid[0])

10000