In [1]:
import numpy as np

import utils

NUM_EMBEDDING = 2000
def word2numpy(txt):
    return np.array([ord(c) for c in txt])

def wordlist2numpy(lines):
    return utils.pad_sequences([word2numpy(line) for line in lines],
                               maxlen=12, dtype=int, value=0)

RADICALS = ['.'] + list('אבגדהוזחטיכלמנסעפצקרשת') + ["ג'", "ז'", "צ'", 'שׂ']

BINYAN = 'פעל נפעל פיעל פועל הפעיל הופעל התפעל'.split()
TENSE = 'עבר הווה עתיד ציווי'.split()
VOICE = 'ראשון שני שלישי'.split()
GENDER = 'זכר נקבה'.split()
PLURAL = 'יחיד רבים'.split()

NAMES = ['B', 'T', 'V', 'G', 'P', 'R1', 'R2', 'R3', 'R4']
FEATURES = {
    'B': BINYAN,
    'T': TENSE,
    'V': VOICE,
    'G': GENDER,
    'P': PLURAL,
    'R1': RADICALS,
    'R2': RADICALS,
    'R3': RADICALS,
    'R4': RADICALS,
}

def to_category(name, b):
    return FEATURES[name].index(b)

def from_category(name, index):
    return FEATURES[name][index]

def list_to_category(name, bs):
    return np.array([to_category(name, b) for b in bs])

def list_from_category(name, indexes):
    return [from_category(name, index) for index in indexes]

def list_of_lists_to_category(items):
    return { name: list_to_category(name, item)
             for name, item in zip(NAMES, items) }

In [257]:
import torch

import torch.nn as nn
assert torch.cuda.is_available()

def to_device(d):
    if hasattr(d, 'cuda'):
        return d.cuda()
    return {k: v.cuda() for k, v in d.items()}

def create_model(UNITS):
    class Model(nn.Module):
        def __init__(self, UNITS):
            super().__init__()
            self.units = UNITS

            self.embed = nn.Embedding(num_embeddings=NUM_EMBEDDING, embedding_dim=UNITS)
            self.lstm1 = nn.LSTM(input_size=UNITS, hidden_size=UNITS, num_layers=1, batch_first=True, bias=True, bidirectional=True)
            
            self.binyan = nn.Linear(in_features=UNITS, out_features=len(BINYAN))
            self.tense = nn.Linear(in_features=UNITS, out_features=len(TENSE))
            self.voice = nn.Linear(in_features=UNITS, out_features=len(VOICE))
            self.gender = nn.Linear(in_features=UNITS, out_features=len(GENDER))
            self.plural = nn.Linear(in_features=UNITS, out_features=len(PLURAL))
            
            self.r1 = nn.Linear(in_features=UNITS, out_features=len(RADICALS))
            self.r2 = nn.Linear(in_features=UNITS, out_features=len(RADICALS))
            self.r3 = nn.Linear(in_features=UNITS, out_features=len(RADICALS))
            self.r4 = nn.Linear(in_features=UNITS, out_features=len(RADICALS))

            self.features = {
                'B': self.binyan,
                'T': self.tense,
                'V': self.voice,
                'G': self.gender,
                'P': self.plural,
                
                'R1': self.r1,
                'R2': self.r2,
                'R3': self.r3,
                'R4': self.r4,
            }

        def forward(self, x):
            embeds = self.embed(x)

            lstm_out, (h_n, c_n) = self.lstm1(embeds)
            left, right = torch.chunk(h_n, 2, dim=0)
            merge = torch.squeeze(left + right)

            outputs = { k: f(merge) for k, f in self.features.items() }
            return outputs

    model = Model(UNITS=UNITS)

    return to_device(model)


In [76]:

def sanity():
    model = create_model(100)
    with torch.no_grad():
        verbs = wordlist2numpy(["כשאתאקלם"])
        verbs = to_device(torch.from_numpy(verbs).to(torch.int64))
        tag_scores = model(verbs)
        for k in NAMES:
            print(k)
            v = nn.Softmax()(tag_scores[k]).cpu().detach().numpy()
            print(v)
            print(f'{np.mean(v)=}')
            print(f'{-np.log(1/len(v))=}')
            print()

# sanity()

In [226]:
import concrete

def load_dataset(filename):
    *features, verbs = concrete.load_dataset(filename)
    return wordlist2numpy(verbs), list_of_lists_to_category(features)

train = load_dataset('random_train_100K.tsv')
valid = load_dataset('random_validate.tsv')

In [252]:
BATCH_SIZE = 64

def batch(a):
    ub = a.shape[0] // BATCH_SIZE * BATCH_SIZE
    return to_device(torch.from_numpy(a[:ub]).to(torch.int64)).split(BATCH_SIZE)

def batch_all_ys(ys):
    res = []
    m = {k: batch(ys[k]) for k in NAMES}
    nbatches = len(m['B'])
    for i in range(nbatches):
        res.append({k: m[k][i] for k in NAMES})
    return res

def fit(model, x_train, y_train, x_valid, y_valid, *, epochs, criterion, optimizer, valid=True):
    data = {
        'train': (batch(x_train), batch_all_ys(y_train)),
        'valid': (batch(x_valid), batch_all_ys(y_valid))
    }
    
    for epoch in range(epochs):
        for phase in ['train', 'valid']:
            if not valid and phase == 'valid':
                continue

            if phase == 'train':
                model.train()
            else:
                model.eval()

            total = len(data[phase][0])

            running_corrects = {k: 0.0 for k in NAMES}
            running_divisor = 0
            running_loss = []

            for i, (inputs, labels) in enumerate(zip(*data[phase])):
                
                if phase == 'train':
                    outputs = model(inputs)
                else:
                    with torch.no_grad():
                        outputs = model(inputs)

                loss = sum(criterion(outputs[k], labels[k]) for k in outputs)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    
                running_loss.append(loss.item())
                
                running_divisor += inputs.size(0)
                for k in outputs:
                    running_corrects[k] += torch.sum(torch.argmax(outputs[k], dim=1) == labels[k].detach())
                
                RUNSIZE = 100
                
                if phase == 'valid' and i >= total-1 or i % RUNSIZE == RUNSIZE - 1:
                    print("{:2} {:5}/{:5}".format(epoch, i, total), end=' ')
                    for k in running_corrects:
                        print("{}_acc: {:.3f}".format(k, running_corrects[k] / running_divisor), end=' ')
                    print("Loss: {:.4f}".format(np.mean(running_loss)), end='\r')
                    if phase != 'valid':
                        running_corrects = {k: 0.0 for k in NAMES}
                        running_divisor = 0.0
                        losses = []
            print()

@torch.no_grad()
def predict(model, *verbs):
    model.eval()
    verbs = wordlist2numpy(verbs)
    verbs = to_device(torch.from_numpy(verbs).to(torch.int64))
    outputs = model(verbs)
    res = {k: from_category(k, torch.argmax(v))
              for k, v in outputs.items()}
    res['R'] = ''.join(res[k] for k in ['R1', 'R2', 'R3', 'R4']).replace('.', '')
    return res
    

In [258]:
model = create_model(UNITS=300)

print(model)
fit(model,
    *train,
    *valid,
    epochs=6,
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=3e-4),
    valid=True
)

Model(
  (embed): Embedding(2000, 300)
  (lstm1): LSTM(300, 300, batch_first=True, bidirectional=True)
  (dense): Linear(in_features=300, out_features=300, bias=True)
  (act): ReLU()
  (binyan): Linear(in_features=300, out_features=7, bias=True)
  (tense): Linear(in_features=300, out_features=4, bias=True)
  (voice): Linear(in_features=300, out_features=3, bias=True)
  (gender): Linear(in_features=300, out_features=2, bias=True)
  (plural): Linear(in_features=300, out_features=2, bias=True)
  (r1): Linear(in_features=300, out_features=27, bias=True)
  (r2): Linear(in_features=300, out_features=27, bias=True)
  (r3): Linear(in_features=300, out_features=27, bias=True)
  (r4): Linear(in_features=300, out_features=27, bias=True)
)
 0  1499/ 1562 B_acc: 0.820 T_acc: 0.907 V_acc: 0.677 G_acc: 0.779 P_acc: 0.975 R1_acc: 0.929 R2_acc: 0.814 R3_acc: 0.962 R4_acc: 0.953 Loss: 4.94900
 0   155/  156 B_acc: 0.829 T_acc: 0.914 V_acc: 0.692 G_acc: 0.798 P_acc: 0.981 R1_acc: 0.931 R2_acc: 0.827 R3_a

In [251]:
print(predict(model, 'סבסו'))
print(predict(model, 'מקדו'))
print(predict(model, 'נמזר'))
print(predict(model, 'כרדו'))

{'B': 'פעל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'רבים', 'R1': 'ס', 'R2': 'ב', 'R3': '.', 'R4': 'ס', 'R': 'סבס'}
{'B': 'פעל', 'T': 'ציווי', 'V': 'שני', 'G': 'זכר', 'P': 'רבים', 'R1': 'מ', 'R2': 'ק', 'R3': '.', 'R4': 'ד', 'R': 'מקד'}
{'B': 'נפעל', 'T': 'עתיד', 'V': 'ראשון', 'G': 'זכר', 'P': 'רבים', 'R1': 'מ', 'R2': 'ז', 'R3': '.', 'R4': 'ר', 'R': 'מזר'}
{'B': 'פעל', 'T': 'ציווי', 'V': 'שלישי', 'G': 'זכר', 'P': 'רבים', 'R1': 'כ', 'R2': 'ר', 'R3': '.', 'R4': 'ד', 'R': 'כרד'}


In [99]:
print(predict(model, 'הבריל'))
print(predict(model, 'חגוו'))
print(predict(model, 'עגו'))
print(predict(model, 'צירלל'))

{'B': 'הפעיל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'יחיד', 'R1': 'ב', 'R2': 'ר', 'R3': '.', 'R4': 'ל', 'R': 'ברל'}
{'B': 'פעל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'רבים', 'R1': 'ח', 'R2': 'ג', 'R3': '.', 'R4': 'י', 'R': 'חגי'}
{'B': 'פעל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'רבים', 'R1': 'ע', 'R2': 'ג', 'R3': '.', 'R4': 'י', 'R': 'עגי'}
{'B': 'פיעל', 'T': 'עבר', 'V': 'שלישי', 'G': 'זכר', 'P': 'יחיד', 'R1': 'צ', 'R2': 'ר', 'R3': '.', 'R4': 'ל', 'R': 'צרל'}


In [118]:
print(predict(model, "הזדבנתי"))

{'B': 'התפעל', 'T': 'עבר', 'V': 'ראשון', 'G': 'זכר', 'P': 'יחיד', 'R1': 'ז', 'R2': 'ב', 'R3': '.', 'R4': 'נ', 'R': 'זבנ'}


In [109]:
print(predict(model, "ישצו"))

{'B': 'הפעיל', 'T': 'עתיד', 'V': 'שלישי', 'G': 'זכר', 'P': 'רבים', 'R1': 'ש', 'R2': 'י', 'R3': '.', 'R4': 'צ', 'R': 'שיצ'}
