In [1]:
import os
import numpy as np

import dataset
import schedulers

import torch

import torch.nn as nn
# Fail fast when no GPU is visible: the model is moved to a CUDA device later in the notebook.
assert torch.cuda.is_available()

In [2]:
# Default `maxlen` that load_data() forwards to dataset.load_data
# (presumably the per-sample sequence length -- confirm against the dataset module).
MAXLEN = 64
def load_data(source, maxlen=MAXLEN, validation=0.1):
    """Load the corpora named in `source` from the ``texts`` directory.

    Every entry of `source` is joined onto ``texts`` and the resulting paths
    are handed to ``dataset.load_data`` together with `validation` and
    `maxlen`.

    Returns:
        The ``(train, valid)`` pair produced by ``dataset.load_data``.
    """
    paths = []
    for name in source:
        paths.append(os.path.join('texts', name))
    train_part, valid_part = dataset.load_data(paths, validation, maxlen=maxlen)
    return train_part, valid_part

# Combined corpora, loaded with load_data's default `validation` value.
data_mix = load_data(source=['poetry', 'rabanit', 'pre_modern'])
# Modern corpus, loaded with a larger `validation` value of 0.2.
data_modern = load_data(source=['modern'], validation=0.2)

In [6]:
# Width shared by the embedding vectors and each LSTM direction's hidden state in Model.
UNITS = 300

# Vocabulary sizes taken from the lookup tables exposed by the `dataset` module.
# LETTERS_SIZE sizes the embedding input and NIQQUD_SIZE sizes the output layer of Model.
LETTERS_SIZE = len(dataset.letters_table)
NIQQUD_SIZE = len(dataset.niqqud_table)
# DAGESH_SIZE and SIN_SIZE are not referenced by any other cell visible in this notebook.
DAGESH_SIZE = len(dataset.dagesh_table)
SIN_SIZE = len(dataset.sin_table)

class Model(nn.Module):
    """Two-layer bidirectional LSTM tagger that scores every output class per position.

    Pipeline: embedding -> BiLSTM -> sum of both directions -> BiLSTM ->
    sum of both directions plus a residual connection -> linear projection.

    Args:
        letters_size: Number of rows in the embedding table (input vocabulary
            size). Defaults to the module-level ``LETTERS_SIZE``.
        niqqud_size: Number of output classes. Defaults to the module-level
            ``NIQQUD_SIZE``.
        units: Embedding width and per-direction LSTM hidden size. Defaults to
            the module-level ``UNITS``.
    """

    def __init__(self, letters_size=None, niqqud_size=None, units=None):
        super().__init__()
        # Defaults are resolved at call time so ``Model()`` keeps using the
        # notebook-level constants while explicit sizes remain possible.
        if letters_size is None:
            letters_size = LETTERS_SIZE
        if niqqud_size is None:
            niqqud_size = NIQQUD_SIZE
        if units is None:
            units = UNITS

        self.embed = nn.Embedding(num_embeddings=letters_size, embedding_dim=units)
        self.lstm1 = nn.LSTM(input_size=units, hidden_size=units, num_layers=1, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=units, hidden_size=units, num_layers=1, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(in_features=units, out_features=niqqud_size)

    def forward(self, x):
        """Compute unnormalised class scores for a batch of index sequences.

        Args:
            x: Integer tensor of embedding indices shaped ``(batch, seq)``.

        Returns:
            Tensor shaped ``(batch, niqqud_size, seq)`` holding raw logits.
        """
        embeds = self.embed(x)

        # A bidirectional LSTM concatenates forward and backward states on the
        # last axis; summing the two halves brings the width back to `units`.
        lstm_out, _ = self.lstm1(embeds)
        left, right = torch.chunk(lstm_out, 2, dim=-1)
        merge = left + right

        # Second layer, with a residual connection from the first layer's output.
        lstm_out, _ = self.lstm2(merge)
        left, right = torch.chunk(lstm_out, 2, dim=-1)
        merge = left + right + merge

        tag_space = self.fc(merge)
        # No softmax here: the notebook trains with nn.CrossEntropyLoss, which
        # takes raw logits with the class axis at dim 1, hence the permute.
        return tag_space.permute([0, 2, 1])

# Target device; the import cell already asserted that CUDA is available.
device = torch.device("cuda")

model = Model()
# Targets equal to 0 are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-3)

# Kept as the final expression so the notebook displays the module summary.
model.to(device)

Model(
  (embed): Embedding(44, 300)
  (lstm1): LSTM(300, 300, batch_first=True, bidirectional=True)
  (lstm2): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=300, out_features=16, bias=True)
)

In [59]:
# Smoke test: push one training sample through the model and compare the
# output shape with the shape of that sample's labels.
train, valid = data_modern
# Follow the model's own device so the forward pass works whether or not the
# model has already been moved to the GPU (avoids a CPU/CUDA mismatch error).
model_device = next(model.parameters()).device
with torch.no_grad():
    x = train.normalized[:1]
    inputs = torch.from_numpy(x).to(torch.int64).to(model_device)
    tag_scores = model(inputs)
    print(tag_scores.shape)
    # Labels of the same sample that was fed in (taken from `train`, not `valid`).
    print(train.niqqud[:1].shape)
    print(tag_scores)

torch.Size([1, 64, 16])
(1, 64)
tensor([[[ 0.0443,  0.1131,  0.1651,  ...,  0.1137,  0.0319,  0.1902],
         [ 0.1396, -0.1169,  0.2076,  ...,  0.0352,  0.2090,  0.2759],
         [ 0.1007, -0.1327,  0.1329,  ..., -0.0743,  0.1096, -0.0350],
         ...,
         [ 0.3901,  0.0842,  0.2003,  ...,  0.3017,  0.1579, -0.1293],
         [ 0.3616,  0.0776,  0.2187,  ...,  0.2765,  0.1573, -0.1133],
         [ 0.2803,  0.0605,  0.2528,  ...,  0.2360,  0.1469, -0.0808]]])


In [4]:
# Number of rows per mini-batch; batch() splits arrays into chunks of this size.
BATCH_SIZE = 32

def batch(a, batch_size=None):
    """Split a NumPy array into int64 mini-batches along its first axis.

    Trailing rows that do not fill a complete batch are dropped.

    Args:
        a: NumPy array whose first axis indexes samples.
        batch_size: Rows per batch. Defaults to the module-level ``BATCH_SIZE``.

    Returns:
        Tuple of ``torch.int64`` tensors produced by ``Tensor.split``; every
        full batch holds ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size is None:
        # Resolved at call time so the notebook-level constant stays the default.
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
    # Largest multiple of batch_size that fits into the number of rows.
    usable = a.shape[0] // batch_size * batch_size
    return torch.from_numpy(a[:usable]).to(torch.int64).split(batch_size)

def accuracy(output, ybatch):
    """Return the share of non-zero targets whose class was predicted correctly.

    Args:
        output: Array of class scores with the class axis at index 1.
        ybatch: Array of integer targets; entries equal to 0 are not counted.

    Returns:
        Number of positions where the arg-max class equals a non-zero target,
        divided by the number of non-zero targets.
    """
    predicted = np.argmax(output, axis=1)
    labelled = ybatch != 0
    # A match on a non-zero target is the same as a non-zero matching prediction.
    correct = np.logical_and(predicted == ybatch, labelled)
    return correct.sum() / labelled.sum()

def fit(data, epochs=1):
    """Train the notebook-level ``model`` on the training half of ``data``.

    Uses the notebook-level ``criterion``, ``optimizer`` and ``device``.
    Progress (running accuracy and current loss) is printed every 20 batches,
    and a summary line is printed after each epoch.

    Args:
        data: ``(train, valid)`` pair; only ``train.normalized`` (inputs) and
            ``train.niqqud`` (targets) are read.
        epochs: Number of passes over the training batches. Defaults to 1.
    """
    train, _ = data
    x = batch(train.normalized)
    y = batch(train.niqqud)
    total = len(x)
    # Make sure the model is in training mode even if another cell changed it.
    model.train()
    for epoch in range(epochs):
        acc = []
        for i, (xbatch, ybatch) in enumerate(zip(x, y)):
            xbatch = xbatch.to(device)
            ybatch = ybatch.to(device)
            optimizer.zero_grad()
            output = model(xbatch)
            loss = criterion(output, ybatch)
            loss.backward()
            optimizer.step()
            # Detach from the graph before converting to NumPy for the metric.
            output = output.detach().cpu().numpy()
            ybatch = ybatch.cpu().numpy()

            acc.append(accuracy(output, ybatch))
            if i % 20 == 0:
                print("{:4}/{}".format(i, total), end=' ')
                print("Accuracy: {:.4f}".format(np.mean(acc)), end=' ')
                print("Loss: {:.4f}".format(loss.item()), end='\r')
                # Restart the running window so each report covers only recent batches.
                acc = []

        # The epoch summary reports the accuracy and loss of the last batch only.
        print('Epoch: {}/{}.............'.format(epoch+1, epochs), end=' ')
        print("Accuracy: {:.4f}".format(accuracy(output, ybatch)), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

def validate(data):
    """Print mean accuracy and loss of the notebook-level ``model`` on the validation half.

    Uses the notebook-level ``criterion`` and ``device``. No gradients are
    computed and no parameters are updated.

    Args:
        data: ``(train, valid)`` pair; only ``valid.normalized`` (inputs) and
            ``valid.niqqud`` (targets) are read.
    """
    _, valid = data
    x = batch(valid.normalized)
    y = batch(valid.niqqud)
    acc = []
    losses = []
    # Evaluate in eval mode, then restore whatever mode the model was in so a
    # following training call is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for xbatch, ybatch in zip(x, y):
                xbatch = xbatch.to(device)
                ybatch = ybatch.to(device)
                output = model(xbatch)
                loss = criterion(output, ybatch)

                output = output.cpu().numpy()
                ybatch = ybatch.cpu().numpy()

                acc.append(accuracy(output, ybatch))
                losses.append(loss.item())
    finally:
        model.train(was_training)

    print("val_Accuracy: {:.4f}".format(np.mean(acc)), end=' ')
    print("val_Loss: {:.4f}".format(np.mean(losses)))

In [7]:
# Train and validate on the mixed corpora first, then on the modern corpus.
for corpus in (data_mix, data_modern):
    fit(corpus)
    validate(corpus)

Epoch: 1/1............. Accuracy: 0.9591 Loss: 0.1236
val_Accuracy: 0.9299 val_Loss: 0.2237
Epoch: 1/1............. Accuracy: 0.9702 Loss: 0.0930
val_Accuracy: 0.9608 val_Loss: 0.1228
