In [16]:
import numpy as np

import utils

NUM_EMBEDDING = 2000
def word2numpy(txt):
    return np.array([ord(c) for c in txt])

def wordlist2numpy(lines):
    return utils.pad_sequences([word2numpy(line) for line in lines],
                               maxlen=12, dtype=int, value=0)

BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
TENSE = 'עבר הווה עתיד ציווי'.split()
VOICE = 'ראשון שני שלישי'.split()
GENDER = 'זכר נקבה'.split()
PLURAL = 'יחיד רבים'.split()

NAMES = 'BTVGP'
FEATURES = {
    'B': BINYAN,
    'T': TENSE,
    'V': VOICE,
    'G': GENDER,
    'P': PLURAL,
}

def to_category(name, b):
    return FEATURES[name].index(b)

def from_category(name, index):
    return FEATURES[name][index]

def list_to_category(name, bs):
    return np.array([to_category(name, b) for b in bs])

def list_from_category(name, indexes):
    return [from_category(name, index) for index in indexes]

def list_of_lists_to_category(items):
    return { name: list_to_category(name, item)
             for name, item in zip(NAMES, items) }

In [2]:
import torch

import torch.nn as nn
assert torch.cuda.is_available()

def to_device(d):
    if hasattr(d, 'cuda'):
        return d.cuda()
    return {k: v.cuda() for k, v in d.items()}

def create_model():
    class Model(nn.Module):
        def __init__(self, UNITS):
            super().__init__()
            self.units = UNITS

            self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
            self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1, batch_first=True, bidirectional=True)

            # {k: nn.Linear(in_features=UNITS, out_features=len(v)) for k, v in features.items()}
            self.binyan = nn.Linear(in_features=UNITS, out_features=len(BINYAN))
            self.tense = nn.Linear(in_features=UNITS, out_features=len(TENSE))
            self.voice = nn.Linear(in_features=UNITS, out_features=len(VOICE))
            self.gender = nn.Linear(in_features=UNITS, out_features=len(GENDER))
            self.plural = nn.Linear(in_features=UNITS, out_features=len(PLURAL))

            self.features = {
                'B': self.binyan,
                'T': self.tense,
                'V': self.voice,
                'G': self.gender,
                'P': self.plural,
            }

        def forward(self, x):
            embeds = self.embed(x)

            lstm_out, (h_n, c_n) = self.lstm1(embeds)
            left, right = torch.chunk(h_n, 2, dim=0)
            merge = torch.squeeze(left + right)

            outputs = { k: f(merge) for k, f in self.features.items() }
            return outputs

    model = Model(UNITS=300)

    return to_device(model)


In [None]:

wordlist = ['ידעתי', 'התאפס', 'יאבד']
binyanlist = ['פעל', 'התפעל', 'פיעל']

def sanity():
    with torch.no_grad():
        numpy_inp = wordlist2numpy(wordlist)
        inputs = to_device(torch.from_numpy(numpy_inp).to(torch.int64))
        tag_scores = model(inputs)
        expected = list_to_category(BINYAN, binyanlist)
        print(f'{tag_scores["B"].shape=}')
        print(f"{np.argmax(tag_scores['B'].cpu(), axis=1).shape=}")
        print(f"{expected.shape=}")

sanity()

In [3]:
import concrete

def load_dataset(filename):
    *features, verbs = concrete.load_dataset(filename)
    return wordlist2numpy(verbs), list_of_lists_to_category(features)

train = load_dataset('random_train.tsv')
valid = load_dataset('random_validate.tsv')

In [14]:
BATCH_SIZE = 64

def batch(a):
    ub = a.shape[0] // BATCH_SIZE * BATCH_SIZE
    return torch.from_numpy(a[:ub]).to(torch.int64).split(BATCH_SIZE)

def batch_all_ys(ys):
    res = []
    m = {k: batch(ys[k]) for k in NAMES}
    nbatches = len(m['B'])
    for i in range(nbatches):
        res.append({k: m[k][i] for k in NAMES})
    return res

def accuracy(output, ybatch):
    n = (ybatch != 0).sum()
    c = np.argmax(output, axis=1)
    return ((c == ybatch) & (c != 0)).sum() / n

def fit(model, x_train, y_train, x_valid, y_valid, *, epochs, criterion, optimizer):
    data = {
        'train': (batch(x_train), batch_all_ys(y_train)),
        'valid': (batch(x_train), batch_all_ys(y_train))
    }

    for epoch in range(epochs):
        for phase in ['train', 'valid']:
            total = len(data[phase][0])

            accs = {k: [] for k in NAMES}
            losses = []

            for i, (inputs, labels) in enumerate(zip(*data[phase])):
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                inputs = to_device(inputs)
                outputs = model(inputs)
                outputs = to_device(outputs)
                labels = to_device(labels)

                loss = sum(criterion(outputs[k], labels[k]) for k in outputs)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                outputs = {k: v.cpu().data.numpy() for k, v in outputs.items()}
                labels = {k: v.cpu().data.numpy() for k, v in labels.items()}

                for k in outputs:
                    accs[k].append(accuracy(outputs[k], labels[k]))

                losses.append(loss.item())
                RUNSIZE = 100
                if phase == 'valid' or i % RUNSIZE == RUNSIZE - 1:
                    print("{} {:4}/{}".format(epoch, i, total), end=' ')
                    for k in accs:
                        print("{}_acc: {:.4f}".format(k, np.mean(accs[k])), end=' ')
                    print("Loss: {:.4f}".format(np.mean(losses)), end='\r')
                    accs = {k: [] for k in NAMES}
                    losses = []
            print()

def predict(model, *verbs):
    verbs = wordlist2numpy(verbs)
    verbs = to_device(torch.from_numpy(verbs).to(torch.int64))
    outputs = model(verbs)
    return {k: from_category(k, torch.argmax(v))
            for k, v in outputs.items()}
    

In [22]:
predict(model, 'השתרמט')

{'B': 'התפעל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'יחיד'}

In [7]:
model = create_model()

print(model)
fit(model,
    *train,
    *valid,
    epochs=5,
    criterion=nn.CrossEntropyLoss(),  # add ignore_index for root
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3)
)

Model(
  (embed): Embedding(2000, 300)
  (lstm1): LSTM(300, 300, batch_first=True, bidirectional=True)
  (binyan): Linear(in_features=300, out_features=7, bias=True)
  (tense): Linear(in_features=300, out_features=4, bias=True)
  (voice): Linear(in_features=300, out_features=3, bias=True)
  (gender): Linear(in_features=300, out_features=2, bias=True)
  (plural): Linear(in_features=300, out_features=2, bias=True)
)
0 1499/1562 B_acc: 0.8709 T_acc: 0.9308 V_acc: 0.6596 G_acc: 0.7993 P_acc: 0.9833 Loss: 1.4894
0 1561/1562 B_acc: 0.8929 T_acc: 0.9556 V_acc: 0.6889 G_acc: 0.6400 P_acc: 1.0000 Loss: 1.6556T_acc: 0.9412 V_acc: 0.6889 G_acc: 0.8636 P_acc: 0.9677 Loss: 1.4120 B_acc: 0.8929 T_acc: 0.8696 V_acc: 0.5600 G_acc: 0.5600 P_acc: 1.0000 Loss: 1.8911 B_acc: 0.8571 T_acc: 0.9545 V_acc: 0.7907 G_acc: 0.6667 P_acc: 1.0000 Loss: 1.5347 P_acc: 1.0000 Loss: 1.2433 T_acc: 0.9167 V_acc: 0.6905 G_acc: 0.7586 P_acc: 0.9615 Loss: 1.4136
1 1499/1562 B_acc: 0.8725 T_acc: 0.9328 V_acc: 0.6676 G_acc: 0

In [None]:
len(train)