# Dependencies

In [None]:
!pip install datasets PyGithub

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting PyGithub
  Downloading PyGithub-2.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.4.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.6/362.6 

In [None]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import urllib.request as request
import requests
import os
import re
#from github import Github

# Drive Mounting

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; force_remount re-mounts it even if
# it is already attached.
drive.mount('/content/gdrive', force_remount=True)

# Project root on Drive, and the sub-folder that all data and models live under.
root_dir = "/content/gdrive/My Drive/CS584_project/"
base_dir = f"{root_dir}inflection_st/"

Mounted at /content/gdrive


# Download Data from Source

In [None]:
# Imported in this cell because the top-level `from github import Github` line
# is commented out, which would otherwise make this cell fail on a fresh kernel.
from getpass import getpass

from github import Auth, Github

# Never hard-code the access token: take it from the GITHUB_TOKEN environment
# variable, or prompt for it without echoing when the variable is not set.
github_token = os.environ.get("GITHUB_TOKEN") or getpass("GitHub access token: ")
g = Github(auth=Auth.Token(github_token))

# Source repositories for the 2023 and 2017 shared-task data.
repo = g.get_repo("sigmorphon/2023InflectionST")
repo2 = g.get_repo("sigmorphon/conll2017")

In [None]:
# Language codes and file suffixes for the 2023 data: every `{lang}{dset_type}`
# combination names one file that the download loop fetches.
languages = ["eng", "heb", "spa", "rus", "hun", "tur"]
dset_types = [".dev", ".trn", ".tst", ".covered.dev", ".covered.tst"]

In [None]:
# Download every 2023 file into `{base_dir}{lang}/`.
for lang in languages:
  # One directory per language, created once; exist_ok keeps re-runs safe.
  lang_dir = f"{base_dir}{lang}"
  os.makedirs(lang_dir, exist_ok=True)
  for dset_type in dset_types:
    content = repo.get_contents(f"part1/data/{lang}{dset_type}")
    # Write the downloaded bytes unchanged so the result does not depend on the
    # platform's default text encoding (the data contains non-ASCII scripts).
    with open(f"{lang_dir}/{lang}{dset_type}", "wb") as f:
      f.write(content.decoded_content)

In [None]:
# Language names and file suffixes for the 2017 data; `languages_2017` is
# index-aligned with `languages` (the loops below zip the two lists together).
languages_2017 = ["english", "hebrew", "spanish", "russian", "hungarian", "turkish"]
dset_types_2017 = ["-dev", "-train-medium", "-covered-test"]

In [None]:
# Download every 2017 file into the directory of the matching 2023 language code.
# The loop variable is `lang_code` rather than `dir` so the builtin is not shadowed.
for lang, lang_code in zip(languages_2017, languages):
  lang_dir = f"{base_dir}{lang_code}"
  os.makedirs(lang_dir, exist_ok=True)
  for dset_type in dset_types_2017:
    content = repo2.get_contents(f"all/task1/{lang}{dset_type}")
    # Write the downloaded bytes unchanged so the result does not depend on the
    # platform's default text encoding (the data contains non-ASCII scripts).
    with open(f"{lang_dir}/{lang}{dset_type}", "wb") as f:
      f.write(content.decoded_content)

# Merge Datasets

In [None]:
# Names of the per-language sub-directories that the downloaded files are sorted into.
splits = ["dev", "train", "test", "covered_test", "covered_dev"]

In [None]:
# Ensure every language directory contains one sub-directory per split.
for split_dir in (f"{base_dir}{lang}/{split}" for lang in languages for split in splits):
  if os.path.exists(split_dir):
    continue
  os.makedirs(split_dir)

In [None]:
# File suffix -> split directory, for the 2023 and the 2017 naming schemes.
split_for_2023 = {
  ".dev": "dev",
  ".tst": "test",
  ".trn": "train",
  ".covered.dev": "covered_dev",
  ".covered.tst": "covered_test",
}
split_for_2017 = {
  "-dev": "dev",
  "-train-medium": "train",
  "-covered-test": "covered_test",
}

# Move each downloaded file from `{lang}/` into `{lang}/{split}/`:
# the 2023 files first, then the 2017 files of the same language.
for lang, lang17 in zip(languages, languages_2017):
  lang_dir = f"{base_dir}{lang}"
  naming_schemes = (
    (lang, dset_types, split_for_2023),
    (lang17, dset_types_2017, split_for_2017),
  )
  for prefix, suffixes, split_for in naming_schemes:
    for dset_type in suffixes:
      split = split_for[dset_type]
      filename = f"{prefix}{dset_type}"
      os.rename(f"{lang_dir}/{filename}", f"{lang_dir}/{split}/{filename}")

In [None]:
def delete_all_tsvs():
  """Delete every existing `{lang}_{split}_all.tsv` file under `base_dir`.

  Walks all language/split directories; combinations without such a file are skipped.
  """
  for lang in languages:
    for split in splits:
      merged_path = f"{base_dir}{lang}/{split}/{lang}_{split}_all.tsv"
      if not os.path.exists(merged_path):
        continue
      os.remove(merged_path)

In [None]:
# Merge all source files of one language/split into `{lang}_{split}_all.tsv`.
for lang in languages:
  for split in splits:
    dirpath = f"{base_dir}{lang}/{split}"
    merged_name = f"{lang}_{split}_all.tsv"
    # os.listdir returns entries in arbitrary order, so choose the inputs
    # explicitly instead of slicing the listing: skip the merged file from an
    # earlier run, hidden entries and sub-directories. Sorting keeps runs reproducible.
    sources = sorted(
      name for name in os.listdir(dirpath)
      if name != merged_name
      and not name.startswith(".")
      and os.path.isfile(f"{dirpath}/{name}")
    )
    # Only directories with more than one source file need merging.
    if len(sources) > 1:
      frames = []
      for f in sources:
        df_file = pl.read_csv(f"{dirpath}/{f}", separator="\t", has_header=False)
        # Three-column files in the covered splits: drop the middle column so
        # they line up with the two-column covered files.
        if "covered" in split and df_file.shape[1] == 3:
          df_file = df_file.drop("column_2")
          df_file = df_file.rename({"column_3": "column_2"})
        # Files with "-" in their name (the 2017 naming) keep columns 2 and 3
        # in the opposite order; swap them so all sources share one layout.
        if "-" in f and df_file.shape[1] == 3:
          df_file = df_file.select([pl.col("column_1"), pl.col("column_3"), pl.col("column_2")])
          df_file = df_file.rename({"column_2": "column_3", "column_3": "column_2"})
        frames.append(df_file)
      # Concatenate once and drop duplicate rows; maintain_order keeps the row
      # order deterministic between runs.
      df = pl.concat(frames).unique(maintain_order=True)
      print(df.head())
      df.write_csv(f"{dirpath}/{merged_name}", separator="\t")


shape: (5, 3)
┌─────────────┬──────────┬───────────────┐
│ column_1    ┆ column_2 ┆ column_3      │
│ ---         ┆ ---      ┆ ---           │
│ str         ┆ str      ┆ str           │
╞═════════════╪══════════╪═══════════════╡
│ foreappoint ┆ V;PST    ┆ foreappointed │
│ oversalt    ┆ V;NFIN   ┆ oversalt      │
│ fearmonger  ┆ V;PST    ┆ fearmongered  │
│ french      ┆ V;PST    ┆ frenched      │
│ halt        ┆ V;NFIN   ┆ halt          │
└─────────────┴──────────┴───────────────┘
shape: (5, 3)
┌───────────┬─────────────────┬────────────┐
│ column_1  ┆ column_2        ┆ column_3   │
│ ---       ┆ ---             ┆ ---        │
│ str       ┆ str             ┆ str        │
╞═══════════╪═════════════════╪════════════╡
│ clue      ┆ V;V.PTCP;PST    ┆ clued      │
│ infect    ┆ V;3;SG;PRS      ┆ infects    │
│ deliquate ┆ V;PRS;NOM(3,SG) ┆ deliquates │
│ outzany   ┆ V;PRS;NOM(3,SG) ┆ outzanies  │
│ beclout   ┆ V;PST           ┆ beclouted  │
└───────────┴─────────────────┴────────────┘
shap

# Model Development

Qualities we would like for our Inflection model:


*   Sequence-to-sequence (seq2seq) architecture
*   Multi-head attention (preferably 2 heads)
*   Sparse attention
*   Hard attention?
*   BiLSTM encoder/decoder structure?




In [None]:
! pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import os
import io
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from torcheval.metrics import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score, BLEUScore, Perplexity
import math

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Attention Mechanisms

In [None]:
class HardAttention(nn.Module):
    """Attention that picks exactly one encoder position per batch element.

    Each encoder position is scored by a linear layer applied to the
    concatenation of its encoder output and the decoder state; the
    highest-scoring position gets weight 1 and all others weight 0.
    """

    def __init__(self, encoder_hidden_size, decoder_hidden_size):
        super().__init__()
        # Width of the concatenated [encoder output, decoder state] vector.
        self.hidden_size = encoder_hidden_size + decoder_hidden_size
        self.attention_weights = nn.Linear(self.hidden_size, 1)

    def forward(self, encoder_outputs, decoder_hidden):
        """Return `(context, hard_attention)`.

        Args:
            encoder_outputs: tensor indexed as (batch, seq_len, features).
            decoder_hidden: tensor indexed as (batch, features).

        Returns:
            context: the encoder output at the selected position, (batch, features).
            hard_attention: one-hot selection over positions, (batch, seq_len).
        """
        seq_len = encoder_outputs.size(1)

        # Pair the decoder state with every encoder position and score each pair.
        tiled_hidden = decoder_hidden.unsqueeze(1).expand(-1, seq_len, -1)
        paired = torch.cat((encoder_outputs, tiled_hidden), dim=2)
        scores = self.attention_weights(paired).squeeze(2)

        # One-hot vector marking the best-scoring position of each row.
        best_position = scores.max(dim=1).indices
        hard_attention = torch.zeros_like(scores).scatter_(1, best_position.unsqueeze(1), 1)

        # Multiplying by the one-hot row selects the chosen encoder output.
        context = torch.bmm(hard_attention.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, hard_attention

In [None]:
class MultiHeadSparseAttention(nn.Module):
    """Multi-head scaled dot-product attention restricted to the top-k keys.

    For every query only the `sparsity` fraction of highest-scoring keys takes
    part in the softmax; all other keys receive zero attention weight.
    """

    def __init__(self, hidden_size, num_heads=2, dropout=0.1):
        super(MultiHeadSparseAttention, self).__init__()
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_dim = hidden_size // num_heads

        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, head_dim)."""
        return projected.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None, sparsity=0.5):
        """Attend from `query` over `key`/`value`.

        Args:
            query, key, value: tensors indexed as (batch, seq, hidden_size).
            mask: optional tensor broadcastable to the score tensor; positions
                where it equals 0 are excluded from attention.
            sparsity: fraction of keys kept per query (at least one key is kept).

        Returns:
            Tensor of shape (batch, query_len, hidden_size).
        """
        batch_size = query.size(0)

        # Project the inputs and split them into heads.
        q = self._split_heads(self.q_linear(query), batch_size)
        k = self._split_heads(self.k_linear(key), batch_size)
        v = self._split_heads(self.v_linear(value), batch_size)

        # Scaled dot-product scores: (batch, heads, query_len, key_len).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Exclude masked keys first so they cannot occupy one of the top-k slots.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Keep the top-k scores per query. k is clamped to [1, key_len]: a zero k
        # (e.g. a single key with sparsity 0.5) would leave nothing to attend to.
        num_keys = scores.size(-1)
        top_k = min(max(1, int(num_keys * sparsity)), num_keys)
        threshold = torch.topk(scores, top_k, dim=-1).values[..., -1:]
        # Pruned keys are set to -inf so softmax gives them exactly zero weight;
        # zeroing the score instead would still leave them a share of attention.
        scores = scores.masked_fill(scores < threshold, float('-inf'))

        attention = F.softmax(scores, dim=-1)
        attention = self.dropout(attention)

        # Weighted sum of the values, then merge the heads back together.
        out = torch.matmul(attention, v)
        out = out.transpose(1, 2).contiguous()
        out = out.view(batch_size, -1, self.hidden_size)
        return self.out(out)

## Tailored Encoder-Decoder BiLSTM

In [None]:
class MorphologicalDataset(Dataset):
    """Dataset of (word, inflection tags, inflected result) index tensors.

    `data` must support `data[column][idx]` for the columns 'word',
    'inflection' and 'result'. Both vocabularies map symbols to integer ids
    and must contain '*' (padding) and '?' (unknown).
    """

    def __init__(self, data, word_vocab, inflection_vocab, pad_char_len, pad_inf_len):
        self.data = data
        self.word_vocab = word_vocab
        self.inflection_vocab = inflection_vocab
        self.pad_char_len = pad_char_len
        self.pad_inf_len = pad_inf_len

    def __len__(self):
        return len(self.data)

    def _encode(self, symbols, vocab, pad_len):
        """Map symbols to ids (unknown -> '?') and right-pad with '*' up to `pad_len`."""
        unk_id, pad_id = vocab['?'], vocab['*']
        ids = [vocab.get(symbol, unk_id) for symbol in symbols]
        # A non-positive difference adds no padding; longer sequences are not truncated.
        ids += [pad_id] * (pad_len - len(ids))
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return `(word, inflection, result, pad_char_len, pad_inf_len)` for row `idx`."""
        word = self._encode(self.data['word'][idx], self.word_vocab, self.pad_char_len)
        # Tags unseen when the vocabulary was built map to '?' (as characters do)
        # instead of raising KeyError.
        inflection = self._encode(
            self.data['inflection'][idx].split(';'), self.inflection_vocab, self.pad_inf_len
        )
        result = self._encode(self.data['result'][idx], self.word_vocab, self.pad_char_len)

        return word, inflection, result, self.pad_char_len, self.pad_inf_len

In [None]:
class MorphologicalEncoder(nn.Module):
    """Encode a character sequence together with its inflection tags.

    Characters pass through a bidirectional LSTM; the tag embeddings are
    averaged, projected to `hidden_size` and added to every character state.
    """

    def __init__(self, char_vocab_size, inflection_vocab_size, hidden_size, embedding_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Separate embedding tables for characters and inflection tags.
        self.char_embedding = nn.Embedding(char_vocab_size, embedding_size)
        self.inflection_embedding = nn.Embedding(inflection_vocab_size, embedding_size)

        # Each direction gets half of hidden_size, so the concatenated output is hidden_size wide.
        self.char_bilstm = nn.LSTM(
            embedding_size,
            hidden_size // 2,
            num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Projects the averaged tag embedding to the width of the LSTM output.
        self.inflection_linear = nn.Linear(embedding_size, hidden_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src_chars, inflections):
        """Return `(encoder_outputs, hidden, cell)`.

        `encoder_outputs` holds the per-character LSTM outputs with the tag
        summary added at every position; `hidden` and `cell` are the final LSTM states.
        """
        # Character branch.
        char_states, (hidden, cell) = self.char_bilstm(
            self.dropout(self.char_embedding(src_chars))
        )

        # Tag branch: average over the tag axis (padding entries included), then project.
        tag_vectors = self.dropout(self.inflection_embedding(inflections))
        tag_summary = self.inflection_linear(tag_vectors.mean(dim=1))

        # Broadcasting adds the same tag summary to every character position.
        encoder_outputs = char_states + tag_summary.unsqueeze(1)

        return encoder_outputs, hidden, cell

In [None]:
class MorphologicalDecoder(nn.Module):
    """Single-step character decoder with hard and multi-head sparse attention.

    Each call consumes one input character per batch element and returns the
    scores for the next character together with the updated LSTM state.
    """

    def __init__(self, char_vocab_size, hidden_size, embedding_size, num_layers=1, dropout=0.1):
        super(MorphologicalDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.char_vocab_size = char_vocab_size

        self.char_embedding = nn.Embedding(char_vocab_size, embedding_size)
        # Input is [character embedding, attention context]; each direction gets
        # half of hidden_size, so the LSTM output is hidden_size wide.
        self.bilstm = nn.LSTM(embedding_size + hidden_size, hidden_size // 2, num_layers,
                             batch_first=True, bidirectional=True)

        # Hard attention scores [encoder output, hidden[-1]]; hidden[-1] is one
        # direction's state and therefore hidden_size // 2 wide. Sizing it with
        # embedding_size only worked while embedding_size == hidden_size // 2.
        self.hard_attention = HardAttention(hidden_size, hidden_size // 2)
        self.multi_head_attention = MultiHeadSparseAttention(hidden_size)

        # Predicts from [attended LSTM output, hard-attention context].
        self.char_predictor = nn.Linear(hidden_size*2, char_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one step.

        Args:
            input: character ids for this step, one per batch element.
            hidden, cell: LSTM state from the encoder or the previous step.
            encoder_outputs: encoder states indexed as (batch, seq_len, hidden_size).

        Returns:
            `(prediction, hidden, cell, attention)`: next-character scores, the
            updated LSTM state, and the one-hot hard-attention weights.
        """
        input = input.unsqueeze(1)
        embedded = self.dropout(self.char_embedding(input))

        # Apply hard attention, conditioned on the last entry of the hidden state
        context, attention = self.hard_attention(encoder_outputs, hidden[-1])

        # Concatenate embedding and context
        rnn_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)

        # Pass through BiLSTM
        output, (hidden, cell) = self.bilstm(rnn_input, (hidden, cell))

        # Apply multi-head attention: the LSTM output queries the encoder states
        output = self.multi_head_attention(output, encoder_outputs, encoder_outputs)

        # Make predictions
        prediction = self.char_predictor(torch.cat((output.squeeze(1), context), dim=1))
        return prediction, hidden, cell, attention

In [None]:
class MorphologicalInflectionModel(nn.Module):
    """Encoder-decoder wrapper that generates the inflected form character by character."""

    def __init__(self, encoder, decoder, device):
        super(MorphologicalInflectionModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def from_pretrained(self, pth_file):
        """Load a saved state dict into this model, move it to `self.device`, set eval mode.

        Returns `self` so the call can be chained.
        """
        # map_location lets a checkpoint saved on a GPU load on a CPU-only
        # runtime; weights_only restricts unpickling to tensors and containers.
        state_dict = torch.load(pth_file, map_location=self.device, weights_only=True)
        self.load_state_dict(state_dict)
        self.to(self.device)
        self.eval()
        return self

    def forward(self, src_chars, inflections, trg, teacher_forcing_ratio=0.5):
        """Decode `trg.size(1) - 1` steps and return per-step character scores.

        Args:
            src_chars: source character ids, batch first.
            inflections: inflection tag ids, batch first.
            trg: target character ids; column 0 is used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the whole
                batch, of feeding the gold character instead of the prediction.

        Returns:
            Tensor of shape (batch, trg_len, char_vocab_size); position 0 is
            never written and stays all zeros.
        """
        batch_size = src_chars.size(0)
        trg_len = trg.size(1)
        char_vocab_size = self.decoder.char_vocab_size

        outputs = torch.zeros(batch_size, trg_len, char_vocab_size).to(self.device)

        # Encoder
        encoder_outputs, hidden, cell = self.encoder(src_chars, inflections)

        # First input to decoder is start token
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell, attention = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output

            # Teacher forcing
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs


In [None]:
def train_model(model, train_loader, criterion, optimizer, device, vocab):
    """Train `model` for one epoch and collect metrics.

    Args:
        model: called as `model(src_chars, inflections, trg)`.
        train_loader: yields (src_chars, inflections, trg, src_len, inf_len) batches.
        criterion: loss applied to the flattened scores and targets.
        optimizer: optimizer stepped once per batch.
        device: device the batches and metrics are moved to.
        vocab: character vocabulary (symbol -> id) in id order; '*' is padding.

    Returns:
        (mean loss per batch, accuracy, precision, recall, F1, BLEU).
    """
    model.train()
    total_loss = 0
    num_classes = len(vocab)
    accuracy = MulticlassAccuracy(num_classes=num_classes).to(device)
    precision = MulticlassPrecision(num_classes=num_classes).to(device)
    recall = MulticlassRecall(num_classes=num_classes).to(device)
    f1 = MulticlassF1Score(num_classes=num_classes).to(device)
    bleu = BLEUScore(n_gram=1)

    # Build the id -> symbol table once instead of once per decoded character.
    idx_to_char = list(vocab.keys())
    # Padding id; -1 matches no target, so nothing is filtered if '*' is absent.
    pad_idx = vocab.get('*', -1)

    # The two length entries of each batch are not needed for training.
    for src_chars, inflections, trg, _src_len, _inf_len in tqdm(train_loader):
        src_chars = src_chars.to(device)
        inflections = inflections.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src_chars, inflections, trg)

        # Decode predictions and targets back into strings for BLEU.
        # NOTE(review): torcheval's BLEUScore appears to tokenise on whitespace,
        # so each string is scored as a single token -- confirm this is intended.
        predicted_ids = output.argmax(dim=-1).tolist()
        for predicted_row, target_row in zip(predicted_ids, trg.tolist()):
            output_decoded = ''.join(idx_to_char[j] for j in predicted_row)
            trg_decoded = ''.join(idx_to_char[j] for j in target_row)
            bleu.update(output_decoded, [trg_decoded])

        # Drop position 0 (never predicted) and flatten to (positions, classes).
        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Score only real characters; counting padding positions would distort
        # the classification metrics.
        keep = trg != pad_idx
        scores = output.detach()[keep]
        labels = trg[keep]
        accuracy.update(scores, labels)
        precision.update(scores, labels)
        recall.update(scores, labels)
        f1.update(scores, labels)

    acc_comp = accuracy.compute()
    prec_comp = precision.compute()
    rec_comp = recall.compute()
    f1_comp = f1.compute()
    bleu_comp = bleu.compute()

    return total_loss / len(train_loader), acc_comp, prec_comp, rec_comp, f1_comp, bleu_comp

In [None]:
def create_char_vocab(data):
    """Build the character vocabulary (symbol -> id) from `data`.

    Ids 0-3 are reserved for padding, unknown, start-of-word and end-of-word.
    Characters are then numbered in order of first appearance, scanning the
    'word' column first and the 'result' column (when present) second, so
    characters that occur only in inflected forms are not mapped to unknown.

    Note: the vocabulary can be larger than one built from 'word' alone, which
    changes the model's embedding/output size for checkpoints trained that way.
    """
    char_vocab = {}
    char_vocab['*'] = 0 # pad
    char_vocab['?'] = 1 # unk
    char_vocab['^'] = 2 # start word
    char_vocab['$'] = 3 # end word

    # DataFrames expose `.columns`; plain mappings are checked by key.
    available = data.columns if hasattr(data, 'columns') else data
    for column in ('word', 'result'):
        if column not in available:
            continue
        for item in data[column]:
            for char in item:
                if char not in char_vocab:
                    char_vocab[char] = len(char_vocab)
    return char_vocab

In [None]:
def create_inflection_vocab(data):
    """Build the inflection-tag vocabulary (tag -> id) from `data['inflection']`.

    Ids 0-3 are reserved for '*' (pad), '?' (unk), '^' (start) and '$' (end).
    Each entry is split on ';' and new tags are numbered in order of first appearance.
    """
    inflection_vocab = {'*': 0, '?': 1, '^': 2, '$': 3}
    for tag_bundle in data['inflection']:
        for tag in tag_bundle.split(';'):
            # The next free id equals the current size; known tags are left untouched.
            inflection_vocab.setdefault(tag, len(inflection_vocab))
    return inflection_vocab

In [None]:
def custom_collate_fn(batch):
    """Collate dataset items into batched tensors.

    Each item is `(word, inflection, result, char_len, inf_len)`. The three
    tensors are stacked along a new first axis and the two length values are
    gathered into 1-D tensors.

    Returns:
        (src_chars, inflections, trg, src_chars_lengths, inflections_lengths)
    """
    # Transpose the list of items into one tuple per field.
    fields = list(zip(*batch))
    src_chars = torch.stack(fields[0])
    inflections = torch.stack(fields[1])
    trg = torch.stack(fields[2])
    src_chars_lengths = torch.tensor(fields[3])
    inflections_lengths = torch.tensor(fields[4])
    return src_chars, inflections, trg, src_chars_lengths, inflections_lengths

In [None]:
def trainModelSetup(trainData, pad_char_len, pad_inflection_len, hidden_size=256, embedding_size=128, batch_size=256, num_epochs=10, num_layers=1):
    """Build, train and save an inflection model.

    Builds both vocabularies from `trainData`, trains for `num_epochs` epochs
    with Adam and cross-entropy (padding ignored), prints the metrics of each
    epoch, then prompts for a model name and saves the weights under
    `{base_dir}models/`.

    Returns:
        (model, char_vocab, inflection_vocab, history, model_name), where
        `history` holds one metrics dict per epoch.
    """
    # Vocabularies determine the embedding and output sizes.
    char_vocab = create_char_vocab(trainData)
    inflection_vocab = create_inflection_vocab(trainData)
    n_chars, n_tags = len(char_vocab), len(inflection_vocab)

    # Model: the encoder is constructed before the decoder.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MorphologicalInflectionModel(
        MorphologicalEncoder(
            char_vocab_size=n_chars,
            inflection_vocab_size=n_tags,
            hidden_size=hidden_size,
            embedding_size=embedding_size,
            num_layers=num_layers,
        ),
        MorphologicalDecoder(
            char_vocab_size=n_chars,
            hidden_size=hidden_size,
            embedding_size=embedding_size,
            num_layers=num_layers,
        ),
        device,
    ).to(device)

    # Data pipeline.
    train_loader = DataLoader(
        MorphologicalDataset(trainData, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=custom_collate_fn,
    )

    # Padding targets do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=char_vocab['*'])
    optimizer = torch.optim.Adam(model.parameters())

    # One metrics record per epoch.
    metric_names = ('loss', 'acc', 'prec', 'rec', 'f1', 'bleu')
    history = []
    for epoch in tqdm(range(num_epochs)):
        loss, acc, prec, rec, f1, bleu = train_model(model, train_loader, criterion, optimizer, device, char_vocab)
        print(f'Epoch: {epoch+1}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, BLEU: {bleu:.4f}')
        history.append(dict(zip(metric_names, (loss, acc, prec, rec, f1, bleu))))

    # Persist the trained weights under a name chosen interactively.
    models_dir = f'{base_dir}models'
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    model_name = input("Enter the model name: ")
    torch.save(model.state_dict(), f'{models_dir}/{model_name}.pth')
    return model, char_vocab, inflection_vocab, history, model_name

## Data Loading and Preparation

In [None]:
# Combine the per-language merged files of each split into one TSV
# (`{split}_all/{split}.tsv`); an existing combined file is left untouched.
for split in ["covered_test", "dev", "train"]:
  combined_dir = f"{base_dir}{split}_all"
  combined_path = f"{combined_dir}/{split}.tsv"
  if not os.path.exists(combined_dir):
      os.makedirs(combined_dir)
  if os.path.exists(combined_path):
    continue
  per_language = [
    pl.read_csv(f"{base_dir}{lang}/{split}/{lang}_{split}_all.tsv", separator='\t')
    for lang in languages
  ]
  df = pl.concat(per_language)
  df.write_csv(combined_path, separator='\t')

In [None]:
# Load the combined TSV of each split (covered test, dev, train) from under base_dir.
df_test = pl.read_csv(f"{base_dir}covered_test_all/covered_test.tsv", separator='\t')
df_dev = pl.read_csv(f"{base_dir}dev_all/dev.tsv", separator='\t')
df_train = pl.read_csv(f"{base_dir}train_all/train.tsv", separator='\t')

In [None]:
# Preview the covered test frame as loaded (columns still have positional names).
df_test.head()

column_1,column_2
str,str
"""nymshift""","""V;V.PTCP;PST"""
"""ordinate""","""V;3;SG;PRS"""
"""misvalue""","""V;V.PTCP;PST"""
"""transpierce""","""V;PST"""
"""diffuse""","""V;PST"""


In [None]:
# Preview the training frame as loaded (columns still have positional names).
df_train.head()

column_1,column_2,column_3
str,str,str
"""clue""","""V;V.PTCP;PST""","""clued"""
"""infect""","""V;3;SG;PRS""","""infects"""
"""deliquate""","""V;PRS;NOM(3,SG)""","""deliquates"""
"""outzany""","""V;PRS;NOM(3,SG)""","""outzanies"""
"""beclout""","""V;PST""","""beclouted"""


In [None]:
# Preview the dev frame as loaded (columns still have positional names).
df_dev.head()

column_1,column_2,column_3
str,str,str
"""foreappoint""","""V;PST""","""foreappointed"""
"""oversalt""","""V;NFIN""","""oversalt"""
"""fearmonger""","""V;PST""","""fearmongered"""
"""french""","""V;PST""","""frenched"""
"""halt""","""V;NFIN""","""halt"""


In [None]:
# Replace the positional column names with descriptive ones. The covered test
# frame is renamed with the two-column mapping only (no 'result').
covered_names = {"column_1": "word", "column_2": "inflection"}
full_names = {**covered_names, "column_3": "result"}

df_test = df_test.rename(covered_names)
df_dev = df_dev.rename(full_names)
df_train = df_train.rename(full_names)

In [None]:
# Check the renamed columns of the covered test frame.
df_test.head()

word,inflection
str,str
"""nymshift""","""V;V.PTCP;PST"""
"""ordinate""","""V;3;SG;PRS"""
"""misvalue""","""V;V.PTCP;PST"""
"""transpierce""","""V;PST"""
"""diffuse""","""V;PST"""


In [None]:
# Check the renamed columns of the training frame.
df_train.head()

word,inflection,result
str,str,str
"""clue""","""V;V.PTCP;PST""","""clued"""
"""infect""","""V;3;SG;PRS""","""infects"""
"""deliquate""","""V;PRS;NOM(3,SG)""","""deliquates"""
"""outzany""","""V;PRS;NOM(3,SG)""","""outzanies"""
"""beclout""","""V;PST""","""beclouted"""


In [None]:
# Check the renamed columns of the dev frame.
df_dev.head()

word,inflection,result
str,str,str
"""foreappoint""","""V;PST""","""foreappointed"""
"""oversalt""","""V;NFIN""","""oversalt"""
"""fearmonger""","""V;PST""","""fearmongered"""
"""french""","""V;PST""","""frenched"""
"""halt""","""V;NFIN""","""halt"""


In [None]:
# (rows, columns) of the covered test frame.
df_test.shape

(11989, 2)

In [None]:
# (rows, columns) of the training frame.
df_train.shape

(65846, 3)

In [None]:
# (rows, columns) of the dev frame.
df_dev.shape

(11994, 3)

In [None]:
# Wrap the source word and the target form in the '^' (start) and '$' (end) markers.
df_train = df_train.with_columns(
    [('^' + pl.col(column) + '$').alias(column) for column in ("word", "result")]
)

In [None]:
# Confirm that 'word' and 'result' now carry the '^' / '$' markers.
df_train.head()

word,inflection,result
str,str,str
"""^clue$""","""V;V.PTCP;PST""","""^clued$"""
"""^infect$""","""V;3;SG;PRS""","""^infects$"""
"""^deliquate$""","""V;PRS;NOM(3,SG)""","""^deliquates$"""
"""^outzany$""","""V;PRS;NOM(3,SG)""","""^outzanies$"""
"""^beclout$""","""V;PST""","""^beclouted$"""


In [None]:
# Wrap the source word and the target form in the '^' (start) and '$' (end) markers.
df_dev = df_dev.with_columns(
    [('^' + pl.col(column) + '$').alias(column) for column in ("word", "result")]
)

In [None]:
# The covered test frame has only the source word to wrap in '^' / '$'.
wrapped_word = '^' + pl.col("word") + '$'
df_test = df_test.with_columns(wrapped_word.alias("word"))

In [None]:
# Longest string, in characters, across every word/result column of the three
# frames; passed on as the padded character length.
length_sources = (
    (df_test, 'word'),
    (df_train, 'word'),
    (df_dev, 'word'),
    (df_train, 'result'),
    (df_dev, 'result'),
)
pad_char_len = max(max(frame[column].str.len_chars()) for frame, column in length_sources)

In [None]:
# Display the computed maximum character length.
pad_char_len

43

In [None]:
# Largest number of ';'-separated tags in any inflection entry of the three
# frames; passed on as the padded tag length.
pad_inflection_len = max(
    max(frame['inflection'].str.split(';').map_elements(len, return_dtype=int))
    for frame in (df_test, df_train, df_dev)
)

# 1-Layer Model Training

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Train with trainModelSetup's default hyperparameters (1 layer, 10 epochs).
# The call prompts for a model name before saving the weights.
model, char_vocab, inflection_vocab, history, model_name = trainModelSetup(df_train, pad_char_len=pad_char_len, pad_inflection_len=pad_inflection_len)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.5424, Accuracy: 0.0881, Precision: 0.0881, Recall: 0.0881, F1: 0.0881, BLEU: 0.0000


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.0803, Accuracy: 0.1982, Precision: 0.1982, Recall: 0.1982, F1: 0.1982, BLEU: 0.0001


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.7990, Accuracy: 0.2195, Precision: 0.2195, Recall: 0.2195, F1: 0.2195, BLEU: 0.0010


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.6442, Accuracy: 0.2322, Precision: 0.2322, Recall: 0.2322, F1: 0.2322, BLEU: 0.0044


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.5425, Accuracy: 0.2411, Precision: 0.2411, Recall: 0.2411, F1: 0.2411, BLEU: 0.0092


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.4648, Accuracy: 0.2477, Precision: 0.2477, Recall: 0.2477, F1: 0.2477, BLEU: 0.0118


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.4147, Accuracy: 0.2521, Precision: 0.2521, Recall: 0.2521, F1: 0.2521, BLEU: 0.0135


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.3644, Accuracy: 0.2563, Precision: 0.2563, Recall: 0.2563, F1: 0.2563, BLEU: 0.0150


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.3214, Accuracy: 0.2598, Precision: 0.2598, Recall: 0.2598, F1: 0.2598, BLEU: 0.0163


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.2935, Accuracy: 0.2621, Precision: 0.2621, Recall: 0.2621, F1: 0.2621, BLEU: 0.0167
Enter the model name: e10_bs256_lr001_layers1


ValueError: too many values to unpack (expected 3)

# 1-Layer Model Eval

In [None]:
def load_MI_model(pth_file, layers=1, hidden_size=256, embedding_size=128):
    """Rebuild the inflection model and load saved weights into it.

    Both vocabularies are rebuilt from the global `df_train`, so they (and the
    size arguments) must match what the checkpoint was trained with.

    Args:
        pth_file: path to a state dict saved with `torch.save`.
        layers: number of LSTM layers in the encoder and decoder.
        hidden_size: hidden size used at training time (default 256).
        embedding_size: embedding size used at training time (default 128).

    Returns:
        (model, char_vocab, inflection_vocab); the model is on the global
        `device` and in eval mode.
    """
    char_vocab = create_char_vocab(df_train)
    inflection_vocab = create_inflection_vocab(df_train)
    encoder = MorphologicalEncoder(
        char_vocab_size=len(char_vocab),
        inflection_vocab_size=len(inflection_vocab),
        hidden_size=hidden_size,
        embedding_size=embedding_size,
        num_layers=layers
    )
    decoder = MorphologicalDecoder(
        char_vocab_size=len(char_vocab),
        hidden_size=hidden_size,
        embedding_size=embedding_size,
        num_layers=layers
    )
    model = MorphologicalInflectionModel(encoder, decoder, device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
    # weights_only restricts unpickling to tensors and plain containers.
    state_dict = torch.load(pth_file, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, char_vocab, inflection_vocab

In [None]:
# Rebuild the 1-layer model and load the saved weights; this also rebuilds
# (and overwrites) both vocabularies from df_train.
saved_model, char_vocab, inflection_vocab = load_MI_model(f'{base_dir}models/e10_bs256_lr001_layers1.pth')

  state_dict = torch.load(pth_file)


In [None]:
def evaluate_model(model, test_loader, vocab, device):
    """Evaluate an inflection model on a data loader without updating weights.

    Args:
        model: Model called as ``model(src_chars, inflections, trg)``.
        test_loader: Iterable of ``(src_chars, inflections, trg, src_len, inf_len)``
            batches.
        vocab: Character vocabulary mapping character -> index. It must contain
            the padding symbol ``'*'``; its insertion order is used to map
            indices back to characters.
        device: Device the batches and the classification metrics are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1, bleu)`` where
        ``mean_loss`` is the average of the per-batch losses and the other
        entries are the computed metric values.
    """
    model.eval()
    # Use the vocabulary that was passed in rather than the notebook-global
    # ``char_vocab``, so the ignored index always matches ``vocab``.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['*'])
    total_loss = 0
    accuracy = MulticlassAccuracy(num_classes=len(vocab)).to(device)
    precision = MulticlassPrecision(num_classes=len(vocab)).to(device)
    recall = MulticlassRecall(num_classes=len(vocab)).to(device)
    f1 = MulticlassF1Score(num_classes=len(vocab)).to(device)
    bleu = BLEUScore(n_gram=1)

    # Index -> character table, built once instead of once per decoded character.
    idx_to_char = list(vocab.keys())

    with torch.no_grad():
        for src_chars, inflections, trg, _src_len, _inf_len in tqdm(test_loader):
            src_chars = src_chars.to(device)
            inflections = inflections.to(device)
            trg = trg.to(device)

            # NOTE(review): the gold target is handed to the model during
            # evaluation; if the model applies teacher forcing by default the
            # scores are optimistic -- confirm against the model definition.
            output = model(src_chars, inflections, trg)

            # Decode predictions and targets to strings for BLEU (all timesteps,
            # padding included).
            batch_out_decoded = [
                ''.join(idx_to_char[j] for j in seq_logits.argmax(1).tolist())
                for seq_logits in output
            ]
            batch_trg_decoded = [
                ''.join(idx_to_char[j] for j in seq.tolist())
                for seq in trg
            ]

            for output_decoded, trg_decoded in zip(batch_out_decoded, batch_trg_decoded):
                bleu.update(output_decoded, [trg_decoded])

            # Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
            output_dim = output.shape[-1]
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            # NOTE(review): padded positions are excluded from the loss through
            # ignore_index but are not filtered out of these metric updates.
            accuracy.update(output, trg)
            precision.update(output, trg)
            recall.update(output, trg)
            f1.update(output, trg)

            loss = criterion(output, trg)
            total_loss += loss.item()
    accuracy = accuracy.compute()
    precision = precision.compute()
    recall = recall.compute()
    f1 = f1.compute()
    bleu = bleu.compute()

    return (total_loss / len(test_loader), accuracy, precision, recall, f1, bleu)

In [None]:
# Development set wrapped with the current character / inflection vocabularies.
dev_dataset = MorphologicalDataset(df_dev, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len)
# Evaluation does not need shuffling; a fixed batch order keeps the reported
# mean-of-batch losses identical from run to run.
dev_loader = DataLoader(dev_dataset, batch_size=256, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
metrics = evaluate_model(saved_model, dev_loader, char_vocab, device=device)

  0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
metrics

(0.30269236291976687,
 tensor(0.2534, device='cuda:0'),
 tensor(0.2534, device='cuda:0'),
 tensor(0.2534, device='cuda:0'),
 tensor(0.2534, device='cuda:0'),
 tensor(0.0104, dtype=torch.float64))

In [None]:
sample = df_dev.sample(1)

In [None]:
sample

word,inflection,result
str,str,str
"""^завезти$""","""V;FUT;1;PL""","""^завезем$"""


In [None]:
sample_set = MorphologicalDataset(sample, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len)

In [None]:
sample_loader = DataLoader(sample_set, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
batch_index, (src_chars, inflections, trg, src_len, inf_len) = next(enumerate(sample_loader))

In [None]:
src_chars

tensor([[  2,  26,   7,  17,   7,   4,  14,   9,   5,  14,   9,  12, 152,  21,
          27,  14,  24,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0]])

In [None]:
inflections

tensor([[  4, 189,  17, 188,  59, 184,  42,   0]])

In [None]:
trg

tensor([[  2,  26,   7,  17,   7,   4,  14,   9,   5,  14,   9,  12, 152,  21,
         152,  21, 122,  15,   5,  14,   4,  14,  24,  27, 152, 153, 152,  27,
           3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0]])

In [None]:
src_chars = src_chars.to(device)
inflections = inflections.to(device)
trg = trg.to(device)
src_len = src_len.to(device)
inf_len = inf_len.to(device)

In [None]:
# Run the loaded 1-layer model on the single sampled example. This is pure
# inference, so no autograd graph is needed.
with torch.no_grad():
    output = saved_model(src_chars, inflections, trg)
output_dim = output.shape[-1]
# Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
output = output[:, 1:].contiguous().view(-1, output_dim)
# NOTE: `trg` is overwritten with its flattened form, so re-run the cells that
# load the batch before re-running this one.
trg = trg[:, 1:].contiguous().view(-1)

NameError: name 'saved_model' is not defined

In [None]:
output

tensor([[-13.1458,   1.2188, -11.0248,  ..., -19.1590,  -8.7312,  -9.7911],
        [-11.0097,  -2.4627, -11.5518,  ...,  -7.8461, -11.9451,  -5.9273],
        [-11.3596,   7.1703, -10.5617,  ...,  -9.9154,  -1.9361,  -3.0348],
        ...,
        [-11.9426,   7.9529, -11.9244,  ..., -22.4644, -14.0809, -13.5733],
        [-11.8796,   7.8761, -11.8704,  ..., -22.2445, -14.0312, -13.4868],
        [-11.7131,   7.5764, -11.6340,  ..., -21.9099, -13.2301, -13.2041]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
output.shape

torch.Size([42, 159])

In [None]:
argmax_out = output.argmax(1)

In [None]:
argmax_out

tensor([11, 15,  9, 22,  6, 21, 14, 21, 14,  3,  7,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3], device='cuda:0')

In [None]:
trg

tensor([11, 15,  9, 22,  6, 21, 14, 21, 14,  3,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0], device='cuda:0')

In [None]:
argmax_out_decoded = [list(char_vocab.keys())[i.item()] for i in argmax_out if i > 3]

In [None]:
argmax_out_decoded = ''.join(argmax_out_decoded)

In [None]:
argmax_out_decoded

'tonsurarae'

In [None]:
trg_decoded = ''.join([list(char_vocab.keys())[i.item()] for i in trg if i > 3])

In [None]:
trg_decoded

'tonsurara'

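With relatively small depth and across multiple languages, we can see that the model has this conjugation basically entirely correct. In fact, it doesn't really make a mistake on the inflection itself: every character up to the end-of-word (EOW) token matches the target, and the extra trailing `e` is a stray token predicted after the EOW token, where the target contains only padding.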

# 2-Layer Model

In [None]:
model2layer, char_vocab, inflection_vocab, history, model_name = trainModelSetup(df_train, pad_char_len=pad_char_len, pad_inflection_len=pad_inflection_len, num_layers=2)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.5757, Accuracy: 0.0856, Precision: 0.0856, Recall: 0.0856, F1: 0.0856, BLEU: 0.0000


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.3007, Accuracy: 0.1783, Precision: 0.1783, Recall: 0.1783, F1: 0.1783, BLEU: 0.0000


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.9212, Accuracy: 0.2073, Precision: 0.2073, Recall: 0.2073, F1: 0.2073, BLEU: 0.0001


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.8574, Accuracy: 0.2130, Precision: 0.2130, Recall: 0.2130, F1: 0.2130, BLEU: 0.0002


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.6707, Accuracy: 0.2282, Precision: 0.2282, Recall: 0.2282, F1: 0.2282, BLEU: 0.0006


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.5771, Accuracy: 0.2373, Precision: 0.2373, Recall: 0.2373, F1: 0.2373, BLEU: 0.0026


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.5000, Accuracy: 0.2447, Precision: 0.2447, Recall: 0.2447, F1: 0.2447, BLEU: 0.0072


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.4233, Accuracy: 0.2511, Precision: 0.2511, Recall: 0.2511, F1: 0.2511, BLEU: 0.0123


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.3485, Accuracy: 0.2574, Precision: 0.2574, Recall: 0.2574, F1: 0.2574, BLEU: 0.0149


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.3083, Accuracy: 0.2607, Precision: 0.2607, Recall: 0.2607, F1: 0.2607, BLEU: 0.0165
Enter the model name: e10_bs256_lr001_layers2


# 2-Layer Model Evaluation

In [None]:
model2layer, char_vocab, inflection_vocab = load_MI_model(f'{base_dir}models/e10_bs256_lr001_layers2.pth', layers=2)

  state_dict = torch.load(pth_file)


In [None]:
metrics = evaluate_model(model2layer, dev_loader, char_vocab, device=device)

  0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
metrics

(0.3497601779217416,
 tensor(0.2517, device='cuda:0'),
 tensor(0.2517, device='cuda:0'),
 tensor(0.2517, device='cuda:0'),
 tensor(0.2517, device='cuda:0'),
 tensor(0.0103, dtype=torch.float64))

In [None]:
# Re-draw the example from `sample_loader`: the 1-layer walkthrough earlier in
# the notebook overwrote `trg` with a flattened 1-D tensor, which cannot be fed
# to the model or sliced with `[:, 1:]` again.
batch_index, (src_chars, inflections, trg, src_len, inf_len) = next(enumerate(sample_loader))
src_chars = src_chars.to(device)
inflections = inflections.to(device)
trg = trg.to(device)

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    output = model2layer(src_chars, inflections, trg)
output_dim = output.shape[-1]
# Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
output = output[:, 1:].contiguous().view(-1, output_dim)
trg = trg[:, 1:].contiguous().view(-1)

In [None]:
argmax_out = output.argmax(1)

In [None]:
argmax_out

tensor([ 26,   7,  17,   7,   4,  14,   9,   5,  14,   9,  12, 152,  21, 152,
         21, 122,  15,   5,  14,   4,  14,  24,  27, 152, 153, 152,  27,   3,
        152,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3],
       device='cuda:0')

In [None]:
trg

tensor([ 26,   7,  17,   7,   4,  14,   9,   5,  14,   9,  12, 152,  21, 152,
         21, 122,  15,   5,  14,   4,  14,  24,  27, 152, 153, 152,  27,   3,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       device='cuda:0')

In [None]:
# Index -> character table, built once instead of once per decoded token.
idx_to_char = list(char_vocab.keys())
argmax_out_decoded = ''.join(idx_to_char[i] for i in argmax_out.tolist())
trg_decoded = ''.join(idx_to_char[i] for i in trg.tolist())
print(f"Predicted: {argmax_out_decoded}")
print(f"Actual:    {trg_decoded}")

Predicted: heyecanlandırır olacakmışım$ı$$$$$$$$$$$$$
Actual:    heyecanlandırır olacakmışım$**************


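It seems that we again have the issue where the conjugation is basically 100% correct, but the model begins predicting additional non-pad tokens after the first EOW token. One way to stop this from showing up in the predictions is to "normalize" the output: edit the predicted token sequence so that it contains only pad tokens after the first EOW token is seen. (In the code below, the normalizer is applied to the decoded predictions that feed the BLEU score; it is not fed back into the model, and the loss and the other metrics are still computed on the raw output.)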

# Prediction Normalizer

In [None]:
def normalize_output(output, eow_idx=3, pad_idx=0):
    """Blank out everything a prediction contains after its first end-of-word token.

    Args:
        output: 1-D sequence of predicted token indices (tensor or list).
        eow_idx: Index of the end-of-word token (``'$'`` is 3 in the notebook's
            character vocabularies).
        pad_idx: Index written to every position after the first ``eow_idx``
            (``'*'`` is 0 in the notebook's character vocabularies).

    Returns:
        A new CPU tensor with the same length as ``output``. Tokens up to and
        including the first ``eow_idx`` are kept and the rest are ``pad_idx``.
        If no ``eow_idx`` occurs the sequence is returned unchanged. The input
        is never modified.
    """
    # One device->host transfer up front instead of one sync per element; clone()
    # guarantees the caller's tensor is not modified when it already lives on CPU.
    tokens = torch.as_tensor(output).detach().cpu().clone()
    eow_positions = (tokens == eow_idx).nonzero()
    if eow_positions.numel() > 0:
        first_eow = eow_positions[0, 0].item()
        tokens[first_eow + 1:] = pad_idx
    return tokens

# Reworked Training

In [None]:
def train_model_normalized(model, train_loader, criterion, optimizer, device, vocab):
    """Run one training epoch and collect loss and metrics.

    BLEU is computed on predictions passed through ``normalize_output``; the
    loss and the classification metrics are computed on the raw model output.

    Args:
        model: Model called as ``model(src_chars, inflections, trg)``.
        train_loader: Iterable of ``(src_chars, inflections, trg, src_len, inf_len)``
            batches.
        criterion: Loss applied to the flattened output and target.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches and the classification metrics are moved to.
        vocab: Character vocabulary mapping character -> index; its insertion
            order is used to map indices back to characters.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1, bleu)`` for the epoch,
        where ``mean_loss`` is the average of the per-batch losses.
    """
    model.train()
    total_loss = 0
    accuracy = MulticlassAccuracy(num_classes=len(vocab)).to(device)
    precision = MulticlassPrecision(num_classes=len(vocab)).to(device)
    recall = MulticlassRecall(num_classes=len(vocab)).to(device)
    f1 = MulticlassF1Score(num_classes=len(vocab)).to(device)
    bleu = BLEUScore(n_gram=1)

    # Index -> character table, built once instead of once per decoded character.
    idx_to_char = list(vocab.keys())

    for src_chars, inflections, trg, _src_len, _inf_len in tqdm(train_loader):
        src_chars = src_chars.to(device)
        inflections = inflections.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src_chars, inflections, trg)

        # Decode normalized predictions and raw targets to strings for BLEU.
        batch_out_decoded = []
        for seq_logits in output:
            normalized = normalize_output(seq_logits.argmax(1))  # pad after first EOW
            batch_out_decoded.append(''.join(idx_to_char[j] for j in normalized.tolist()))

        batch_trg_decoded = [
            ''.join(idx_to_char[j] for j in seq.tolist())
            for seq in trg
        ]

        for output_decoded, trg_decoded in zip(batch_out_decoded, batch_trg_decoded):
            bleu.update(output_decoded, [trg_decoded])

        # NOTE(review): every timestep is kept here, whereas
        # evaluate_model_normalized drops the first one with `[:, 1:]` --
        # confirm which alignment is intended.
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg.contiguous().view(-1)

        accuracy.update(output, trg)
        precision.update(output, trg)
        recall.update(output, trg)
        f1.update(output, trg)

        loss = criterion(output, trg)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    acc_comp = accuracy.compute()
    prec_comp = precision.compute()
    rec_comp = recall.compute()
    f1_comp = f1.compute()
    bleu_comp = bleu.compute()

    return total_loss / len(train_loader), acc_comp, prec_comp, rec_comp, f1_comp, bleu_comp

In [None]:
def trainModelSetup_normalized(trainData, pad_char_len, pad_inflection_len, hidden_size=256, embedding_size=128, batch_size=256, num_epochs=10, num_layers=1, model_name=None):
    """Build, train and save an inflection model using ``train_model_normalized``.

    Args:
        trainData: Training data used to create the vocabularies and the dataset.
        pad_char_len: Padding length passed to ``MorphologicalDataset``.
        pad_inflection_len: Padding length passed to ``MorphologicalDataset``.
        hidden_size: Hidden size of the encoder and decoder.
        embedding_size: Embedding size of the encoder and decoder.
        batch_size: Batch size of the training loader.
        num_epochs: Number of passes over the training data.
        num_layers: Number of layers of the encoder and decoder.
        model_name: File stem of the saved checkpoint. When ``None`` (the
            default) the name is asked for interactively after training, as
            before; passing it makes the cell runnable unattended.

    Returns:
        Tuple ``(model, char_vocab, inflection_vocab, history, model_name)`` where
        ``history`` holds one dict of loss/metrics per epoch.
    """
    # Create vocabularies
    char_vocab = create_char_vocab(trainData)
    inflection_vocab = create_inflection_vocab(trainData)

    history = []

    # Create model components
    encoder = MorphologicalEncoder(
        char_vocab_size=len(char_vocab),
        inflection_vocab_size=len(inflection_vocab),
        hidden_size=hidden_size,
        embedding_size=embedding_size,
        num_layers=num_layers
    )

    decoder = MorphologicalDecoder(
        char_vocab_size=len(char_vocab),
        hidden_size=hidden_size,
        embedding_size=embedding_size,
        num_layers=num_layers
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MorphologicalInflectionModel(encoder, decoder, device).to(device)

    # Create dataset and dataloader
    dataset = MorphologicalDataset(trainData, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

    # Training setup: padding is ignored by the loss; the learning rate is
    # halved when the training loss stops improving for more than 2 epochs.
    criterion = nn.CrossEntropyLoss(ignore_index=char_vocab['*'])
    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    # Training loop
    for epoch in tqdm(range(num_epochs)):
        loss, acc, prec, rec, f1, bleu = train_model_normalized(model, train_loader, criterion, optimizer, device, char_vocab)
        print(f'Epoch: {epoch+1}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, BLEU: {bleu:.4f}')
        history.append({'loss': loss, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'bleu': bleu})
        scheduler.step(loss)

    # Save the model
    models_dir = f'{base_dir}models'
    os.makedirs(models_dir, exist_ok=True)
    if model_name is None:
        model_name = input("Enter the model name: ")
    torch.save(model.state_dict(), f'{models_dir}/{model_name}.pth')
    return model, char_vocab, inflection_vocab, history, model_name

In [None]:
torch.cuda.empty_cache()

# Retraining 1-Layer Model

In [None]:
model1layer_normalized, char_vocab, inflection_vocab, history, model_name = trainModelSetup_normalized(df_train, pad_char_len=pad_char_len, pad_inflection_len=pad_inflection_len)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.5577, Accuracy: 0.0809, Precision: 0.0809, Recall: 0.0809, F1: 0.0809, BLEU: 0.0000


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.1692, Accuracy: 0.1875, Precision: 0.1875, Recall: 0.1875, F1: 0.1875, BLEU: 0.0001


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.8235, Accuracy: 0.2169, Precision: 0.2169, Recall: 0.2169, F1: 0.2169, BLEU: 0.0025


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.6617, Accuracy: 0.2309, Precision: 0.2309, Recall: 0.2309, F1: 0.2309, BLEU: 0.0108


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.5516, Accuracy: 0.2405, Precision: 0.2405, Recall: 0.2405, F1: 0.2405, BLEU: 0.0206


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.4642, Accuracy: 0.2479, Precision: 0.2479, Recall: 0.2479, F1: 0.2479, BLEU: 0.0314


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.4130, Accuracy: 0.2526, Precision: 0.2526, Recall: 0.2526, F1: 0.2526, BLEU: 0.0400


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.3681, Accuracy: 0.2562, Precision: 0.2562, Recall: 0.2562, F1: 0.2562, BLEU: 0.0474


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.3236, Accuracy: 0.2599, Precision: 0.2599, Recall: 0.2599, F1: 0.2599, BLEU: 0.0546


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.2883, Accuracy: 0.2629, Precision: 0.2629, Recall: 0.2629, F1: 0.2629, BLEU: 0.0597
Enter the model name: e10_bs256_lr001_layers2_normalized


# 1-Layer Model Normed Eval

In [None]:
def evaluate_model_normalized(model, test_loader, vocab, device):
    """Evaluate an inflection model, normalizing predictions before BLEU.

    Identical in structure to ``evaluate_model`` except that each predicted
    sequence is passed through ``normalize_output`` before it is decoded for
    BLEU. The loss and the classification metrics use the raw model output.

    Args:
        model: Model called as ``model(src_chars, inflections, trg)``.
        test_loader: Iterable of ``(src_chars, inflections, trg, src_len, inf_len)``
            batches.
        vocab: Character vocabulary mapping character -> index. It must contain
            the padding symbol ``'*'``; its insertion order is used to map
            indices back to characters.
        device: Device the batches and the classification metrics are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1, bleu)`` where
        ``mean_loss`` is the average of the per-batch losses.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['*'])
    total_loss = 0
    accuracy = MulticlassAccuracy(num_classes=len(vocab)).to(device)
    precision = MulticlassPrecision(num_classes=len(vocab)).to(device)
    recall = MulticlassRecall(num_classes=len(vocab)).to(device)
    f1 = MulticlassF1Score(num_classes=len(vocab)).to(device)
    bleu = BLEUScore(n_gram=1)

    # Index -> character table, built once instead of once per decoded character.
    idx_to_char = list(vocab.keys())

    with torch.no_grad():
        for src_chars, inflections, trg, _src_len, _inf_len in tqdm(test_loader):
            src_chars = src_chars.to(device)
            inflections = inflections.to(device)
            trg = trg.to(device)

            # NOTE(review): the gold target is handed to the model during
            # evaluation; if the model applies teacher forcing by default the
            # scores are optimistic -- confirm against the model definition.
            output = model(src_chars, inflections, trg)

            # Decode normalized predictions and raw targets to strings for BLEU.
            batch_out_decoded = []
            for seq_logits in output:
                normalized = normalize_output(seq_logits.argmax(1))  # pad after first EOW
                batch_out_decoded.append(''.join(idx_to_char[j] for j in normalized.tolist()))

            batch_trg_decoded = [
                ''.join(idx_to_char[j] for j in seq.tolist())
                for seq in trg
            ]

            for output_decoded, trg_decoded in zip(batch_out_decoded, batch_trg_decoded):
                bleu.update(output_decoded, [trg_decoded])

            # Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
            output_dim = output.shape[-1]
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            # NOTE(review): padded positions are excluded from the loss through
            # ignore_index but are not filtered out of these metric updates.
            accuracy.update(output, trg)
            precision.update(output, trg)
            recall.update(output, trg)
            f1.update(output, trg)

            loss = criterion(output, trg)
            total_loss += loss.item()
    accuracy = accuracy.compute()
    precision = precision.compute()
    recall = recall.compute()
    f1 = f1.compute()
    bleu = bleu.compute()

    return (total_loss / len(test_loader), accuracy, precision, recall, f1, bleu)

In [None]:
metrics = evaluate_model_normalized(model1layer_normalized, dev_loader, char_vocab, device=device)

  0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
metrics

(0.32878925952505555,
 tensor(0.2523, device='cuda:0'),
 tensor(0.2523, device='cuda:0'),
 tensor(0.2523, device='cuda:0'),
 tensor(0.2523, device='cuda:0'),
 tensor(0.0448, dtype=torch.float64))

In [None]:
# Draw one random dev row and wrap it in a single-item loader so it goes
# through the same dataset / collate code as a full batch.
sample = df_dev.sample(1)
sample_set = MorphologicalDataset(
    sample, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len
)
sample_loader = DataLoader(
    sample_set, batch_size=1, shuffle=False, collate_fn=custom_collate_fn
)
# The loader holds exactly one batch; enumerate yields (0, batch).
batch_index, (src_chars, inflections, trg, src_len, inf_len) = next(enumerate(sample_loader))
# Move every tensor of the batch to the model's device.
src_chars, inflections, trg, src_len, inf_len = (
    tensor.to(device) for tensor in (src_chars, inflections, trg, src_len, inf_len)
)

In [None]:
sample

word,inflection,result
str,str,str
"""^lefordíthatatlan$""","""ADJ;TERM(PL)""","""^lefordíthatatlanokig$"""


In [None]:
# Inference on the sampled example; no autograd graph is needed here.
with torch.no_grad():
    output = model1layer_normalized(src_chars, inflections, trg)
output_dim = output.shape[-1]
# Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
output = output[:, 1:].contiguous().view(-1, output_dim)
# NOTE: `trg` is overwritten with its flattened form, so re-run the sampling
# cell before re-running this one.
trg = trg[:, 1:].contiguous().view(-1)

In [None]:
# Index -> character table, built once instead of once per decoded token.
idx_to_char = list(char_vocab.keys())

# Greedy prediction, with everything after the first EOW replaced by padding.
output_decoded = normalize_output(output.argmax(1))
output_decoded = ''.join(idx_to_char[j] for j in output_decoded.tolist())

trg_decoded = ''.join(idx_to_char[j] for j in trg.tolist())

print(f"Predicted: {output_decoded}")
print(f"Actual:    {trg_decoded}")

Predicted: lefordtthatatlanokig$*********************
Actual:    lefordíthatatlanokig$*********************


For this one, the predicted conjugation is one letter off (`t` instead of `í`). It is still able to capture the majority of the meaning. Perhaps if we increase the number of epochs, the model will learn the data even better.

In [None]:
torch.cuda.empty_cache()

# 1-Layer 20 Epochs

In [None]:
model1layer_e20_norm, char_vocab, inflection_vocab, history, model_name = trainModelSetup_normalized(df_train, pad_char_len=pad_char_len, pad_inflection_len=pad_inflection_len, num_epochs=20)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.4250, Accuracy: 0.0931, Precision: 0.0931, Recall: 0.0931, F1: 0.0931, BLEU: 0.0000


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.1031, Accuracy: 0.1939, Precision: 0.1939, Recall: 0.1939, F1: 0.1939, BLEU: 0.0002


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.8030, Accuracy: 0.2183, Precision: 0.2183, Recall: 0.2183, F1: 0.2183, BLEU: 0.0026


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.6462, Accuracy: 0.2316, Precision: 0.2316, Recall: 0.2316, F1: 0.2316, BLEU: 0.0095


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.5403, Accuracy: 0.2409, Precision: 0.2409, Recall: 0.2409, F1: 0.2409, BLEU: 0.0198


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.4500, Accuracy: 0.2487, Precision: 0.2487, Recall: 0.2487, F1: 0.2487, BLEU: 0.0307


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.3923, Accuracy: 0.2537, Precision: 0.2537, Recall: 0.2537, F1: 0.2537, BLEU: 0.0404


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.3547, Accuracy: 0.2570, Precision: 0.2570, Recall: 0.2570, F1: 0.2570, BLEU: 0.0474


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.3129, Accuracy: 0.2606, Precision: 0.2606, Recall: 0.2606, F1: 0.2606, BLEU: 0.0540


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.2795, Accuracy: 0.2633, Precision: 0.2633, Recall: 0.2633, F1: 0.2633, BLEU: 0.0576


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 11, Loss: 0.2532, Accuracy: 0.2654, Precision: 0.2654, Recall: 0.2654, F1: 0.2654, BLEU: 0.0618


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 12, Loss: 0.2303, Accuracy: 0.2673, Precision: 0.2673, Recall: 0.2673, F1: 0.2673, BLEU: 0.0662


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 13, Loss: 0.2163, Accuracy: 0.2685, Precision: 0.2685, Recall: 0.2685, F1: 0.2685, BLEU: 0.0686


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 14, Loss: 0.2023, Accuracy: 0.2696, Precision: 0.2696, Recall: 0.2696, F1: 0.2696, BLEU: 0.0699


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 15, Loss: 0.1864, Accuracy: 0.2709, Precision: 0.2709, Recall: 0.2709, F1: 0.2709, BLEU: 0.0727


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 16, Loss: 0.1788, Accuracy: 0.2715, Precision: 0.2715, Recall: 0.2715, F1: 0.2715, BLEU: 0.0738


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 17, Loss: 0.1709, Accuracy: 0.2721, Precision: 0.2721, Recall: 0.2721, F1: 0.2721, BLEU: 0.0751


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 18, Loss: 0.1588, Accuracy: 0.2730, Precision: 0.2730, Recall: 0.2730, F1: 0.2730, BLEU: 0.0760


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 19, Loss: 0.1475, Accuracy: 0.2737, Precision: 0.2737, Recall: 0.2737, F1: 0.2737, BLEU: 0.0778


  0%|          | 0/258 [00:00<?, ?it/s]

Epoch: 20, Loss: 0.1481, Accuracy: 0.2739, Precision: 0.2739, Recall: 0.2739, F1: 0.2739, BLEU: 0.0769
Enter the model name: e20_bs256_lr001_layers1_normalized


# 1-Layer 20 Epochs Evaluation

In [None]:
metrics = evaluate_model_normalized(model1layer_e20_norm, dev_loader, char_vocab, device=device)

  0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
metrics

(0.22123740676869738,
 tensor(0.2605, device='cuda:0'),
 tensor(0.2605, device='cuda:0'),
 tensor(0.2605, device='cuda:0'),
 tensor(0.2605, device='cuda:0'),
 tensor(0.0581, dtype=torch.float64))

In [None]:
# Draw one random dev row and wrap it in a single-item loader so it goes
# through the same dataset / collate code as a full batch.
sample = df_dev.sample(1)
sample_set = MorphologicalDataset(
    sample, char_vocab, inflection_vocab, pad_char_len, pad_inflection_len
)
sample_loader = DataLoader(
    sample_set, batch_size=1, shuffle=False, collate_fn=custom_collate_fn
)
# The loader holds exactly one batch; enumerate yields (0, batch).
batch_index, (src_chars, inflections, trg, src_len, inf_len) = next(enumerate(sample_loader))
# Move every tensor of the batch to the model's device.
src_chars, inflections, trg, src_len, inf_len = (
    tensor.to(device) for tensor in (src_chars, inflections, trg, src_len, inf_len)
)

In [None]:
sample

word,inflection,result
str,str,str
"""^bileşik$""","""N;ACC;SG;PSS3S""","""^bileşiğini$"""


In [None]:
# Inference on the sampled example; no autograd graph is needed here.
with torch.no_grad():
    output = model1layer_e20_norm(src_chars, inflections, trg)
output_dim = output.shape[-1]
# Drop the first timestep and flatten to (tokens, vocab) / (tokens,).
output = output[:, 1:].contiguous().view(-1, output_dim)
# NOTE: `trg` is overwritten with its flattened form, so re-run the sampling
# cell before re-running this one.
trg = trg[:, 1:].contiguous().view(-1)

In [None]:
# Index -> character table, built once instead of once per decoded token.
idx_to_char = list(char_vocab.keys())

# Greedy prediction, with everything after the first EOW replaced by padding.
output_decoded = normalize_output(output.argmax(1))
output_decoded = ''.join(idx_to_char[j] for j in output_decoded.tolist())

trg_decoded = ''.join(idx_to_char[j] for j in trg.tolist())

print(f"Predicted: {output_decoded}")
print(f"Actual:    {trg_decoded}")

Predicted: bileşiğini$*******************************
Actual:    bileşiğini$*******************************


# New Model Architecture
Let's try a model architecture with 2 different models:
- A classifier model that distinguishes between input languages
- A pretrained model that evaluates the input based on the detected input language

## Creating New Dataset for Language Classification

In [None]:
languages = ["eng", "heb", "spa", "rus", "hun", "tur"] # 0 = english, 1 = hebrew, etc

In [None]:
splits = ["covered_test", "dev", "train"]

In [None]:
# For every language and split: read the tab-separated files found in the split
# directory, bring them to a common column layout, drop duplicate rows, tag each
# row with the language's position in `languages`, and write the result to
# `<lang>_<split>_all_langclass.tsv` inside that same directory.
for index, lang in enumerate(languages):
  for split in splits:
    dirpath = f"{base_dir}{lang}/{split}"
    df = pl.DataFrame()
    # NOTE(review): os.listdir returns entries in arbitrary order, so `[1::]`
    # skips an unspecified entry -- confirm which file is meant to be excluded.
    # NOTE(review): the output file below is written into this same directory;
    # unless it happens to be the skipped entry, a re-run would read it back as
    # input (with its header row treated as data because has_header=False).
    listdir = os.listdir(dirpath)[1::]
    # Only directories with more than one remaining file are processed.
    if len(listdir) > 1:
      for f in listdir:
        df_file = pl.read_csv(f"{dirpath}/{f}", separator="\t", has_header=False)
        # Covered splits that still carry three columns: drop the second column
        # and move the third into its place, leaving two columns.
        if "covered" in split and df_file.shape[1] == 3:
          df_file = df_file.drop("column_2")
          df_file = df_file.rename({"column_3": "column_2"})
        # Three-column files with "-" in their name get columns 2 and 3 swapped
        # (presumably these files store the columns in the opposite order --
        # TODO confirm against the data source).
        if "-" in f and df_file.shape[1] == 3:
          df_file = df_file.select([pl.col("column_1"), pl.col("column_3"), pl.col("column_2")])
          df_file = df_file.rename({"column_2": "column_3", "column_3": "column_2"})
        df = pl.concat([df, df_file])
      df = df.unique()
      # The language label is the index of `lang` in `languages`.
      df = df.with_columns(pl.lit(index).alias("language"))
      print(df.head())
      df.write_csv(f"{dirpath}/{lang}_{split}_all_langclass.tsv", separator="\t")

shape: (5, 3)
┌───────────┬─────────────────┬──────────┐
│ column_1  ┆ column_2        ┆ language │
│ ---       ┆ ---             ┆ ---      │
│ str       ┆ str             ┆ i32      │
╞═══════════╪═════════════════╪══════════╡
│ co-chair  ┆ V;V.PTCP;PRS    ┆ 0        │
│ overgloom ┆ V;PRS;NOM(3,SG) ┆ 0        │
│ newname   ┆ V;PRS;NOM(3,SG) ┆ 0        │
│ imbower   ┆ V;NFIN          ┆ 0        │
│ bivy      ┆ V;PST           ┆ 0        │
└───────────┴─────────────────┴──────────┘
shape: (5, 4)
┌──────────────┬─────────────────┬──────────────┬──────────┐
│ column_1     ┆ column_2        ┆ column_3     ┆ language │
│ ---          ┆ ---             ┆ ---          ┆ ---      │
│ str          ┆ str             ┆ str          ┆ i32      │
╞══════════════╪═════════════════╪══════════════╪══════════╡
│ underfinance ┆ V;NFIN          ┆ underfinance ┆ 0        │
│ deracemize   ┆ V;V.PTCP;PST    ┆ deracemized  ┆ 0        │
│ latibulize   ┆ V;V.PTCP;PRS    ┆ latibulizing ┆ 0        │
│ interwrea

In [None]:
# Merge the per-language files of each split into one file per split. An
# existing merged file is kept as-is, so delete it to force a rebuild.
for split in ["covered_test", "dev", "train"]:
    out_dir = f"{base_dir}{split}_all_langclass"
    out_path = f"{out_dir}/{split}.tsv"
    os.makedirs(out_dir, exist_ok=True)
    if not os.path.exists(out_path):
        # Read every language first and concatenate once, instead of growing a
        # frame from an empty DataFrame one concat at a time.
        frames = [
            pl.read_csv(f"{base_dir}{lang}/{split}/{lang}_{split}_all_langclass.tsv", separator='\t')
            for lang in languages
        ]
        df = pl.concat(frames)
        df.write_csv(out_path, separator='\t')

In [None]:
df_langclass_test = pl.read_csv(f"{base_dir}covered_test_all_langclass/covered_test.tsv", separator='\t')
df_langclass_dev = pl.read_csv(f"{base_dir}dev_all_langclass/dev.tsv", separator='\t')
df_langclass_train = pl.read_csv(f"{base_dir}train_all_langclass/train.tsv", separator='\t')

In [None]:
# Give the positional CSV columns meaningful names. The language id column was
# already written under the name "language", so it needs no entry here (the
# frames have no "label" column to rename).
df_langclass_test = df_langclass_test.rename({"column_1": "word", "column_2": "inflection"})
df_langclass_dev = df_langclass_dev.rename({"column_1": "word", "column_2": "inflection", "column_3": "result"})
df_langclass_train = df_langclass_train.rename({"column_1": "word", "column_2": "inflection", "column_3": "result"})

In [None]:
df_langclass_test.head()

word,inflection,language
str,str,i64
"""co-chair""","""V;V.PTCP;PRS""",0
"""overgloom""","""V;PRS;NOM(3,SG)""",0
"""newname""","""V;PRS;NOM(3,SG)""",0
"""imbower""","""V;NFIN""",0
"""bivy""","""V;PST""",0


## Creating New Model for Language Classification

In [None]:
class LanguageDetectionDataset(Dataset):
    """Dataset yielding fixed-length word / inflection index tensors and a language label.

    Args:
        data: Column-indexable table with ``'word'``, ``'inflection'`` and
            ``'language'`` columns (``data[col][idx]`` must work).
        word_vocab: Character -> index mapping containing ``'*'`` (pad) and
            ``'?'`` (unknown).
        inflection_vocab: Tag -> index mapping containing ``'*'`` (pad).
        pad_char_len: Length every word AND inflection sequence is padded or
            truncated to.
        pad_inf_len: Returned unchanged with every item; not used for padding.
    """

    def __init__(self, data, word_vocab, inflection_vocab, pad_char_len, pad_inf_len):
        self.data = data
        self.word_vocab = word_vocab
        self.inflection_vocab = inflection_vocab
        self.pad_char_len = pad_char_len
        self.pad_inf_len = pad_inf_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word, inflection, label, pad_char_len, pad_inf_len)`` for row ``idx``."""
        # Characters missing from the vocabulary fall back to the unknown symbol.
        word = [self.word_vocab[c] if c in self.word_vocab else self.word_vocab['?'] for c in self.data['word'][idx]]
        # Truncate over-long words so every item has exactly pad_char_len
        # entries; otherwise stacking items into a batch fails on ragged lengths.
        word = word[:self.pad_char_len]
        word = word + [self.word_vocab['*']] * (self.pad_char_len - len(word))
        word = torch.tensor(word, dtype=torch.long)

        # Inflection tags are the ';'-separated parts of the tag string.
        inflection = [self.inflection_vocab[tag] for tag in self.data['inflection'][idx].split(';')]
        # Padded to pad_char_len (not pad_inf_len) on purpose: LanguageDetectionModel
        # concatenates the word and inflection embeddings position by position,
        # so both sequences must have the same length.
        inflection = inflection[:self.pad_char_len]
        inflection = inflection + [self.inflection_vocab['*']] * (self.pad_char_len - len(inflection))
        inflection = torch.tensor(inflection, dtype=torch.long)

        label = self.data['language'][idx]

        return word, inflection, label, self.pad_char_len, self.pad_inf_len

In [None]:
class LanguageDetectionModel(nn.Module):
    """LSTM classifier predicting the language of a (word, inflection) pair.

    Word and inflection indices are embedded separately, concatenated along the
    feature axis at every position, run through an LSTM, and the LSTM output at
    the last position is mapped to ``num_classes`` probabilities.

    Args:
        word_vocab_size: Number of entries in the word (character) embedding.
        inflection_vocab_size: Number of entries in the inflection embedding.
        hidden_size: LSTM hidden size.
        embedding_size: Size of each of the two embeddings.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of languages to distinguish (6 in this notebook).
    """

    def __init__(self, word_vocab_size, inflection_vocab_size, hidden_size, embedding_size, num_layers, num_classes=6):
        super(LanguageDetectionModel, self).__init__()
        self.word_embedding = nn.Embedding(word_vocab_size, embedding_size)
        self.inflection_embedding = nn.Embedding(inflection_vocab_size, embedding_size)
        # Input size is doubled because the two embeddings are concatenated.
        self.lstm = nn.LSTM(embedding_size * 2, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, word, inflection):
        """Return class probabilities of shape ``(batch, num_classes)``.

        ``word`` and ``inflection`` are index tensors of shape ``(batch, seq)``
        and must share the same ``seq`` length, since their embeddings are
        concatenated position by position.
        """
        word_embedded = self.word_embedding(word)
        inflection_embedded = self.inflection_embedding(inflection)
        x = torch.cat((word_embedded, inflection_embedded), dim=2)
        out, _ = self.lstm(x)
        # NOTE(review): the last position of a padded sequence is usually a pad
        # step, so the classifier reads the state after all padding -- confirm
        # this is intended.
        out = self.fc(out[:, -1, :])
        # NOTE(review): this returns probabilities; if training uses
        # nn.CrossEntropyLoss (which applies log-softmax itself) the outputs are
        # normalised twice -- confirm which criterion is passed in.
        out = self.softmax(out)
        return out

In [None]:
def train_language_detection_model(model, train_loader, criterion, optimizer, device, pad_char_len, pad_inf_len):
    """Run one training epoch of the language classifier.

    Each batch from ``train_loader`` is a 5-tuple of tensors
    ``(word, inflection, one_hot_label, src_len, inf_len)``. The one-hot labels
    are converted to class indices with ``argmax`` before the loss is computed.
    ``pad_char_len`` and ``pad_inf_len`` are accepted but not used here.

    Returns:
        ``(mean_loss_per_batch, accuracy, precision, recall)`` where the three
        metrics are whatever the six-class metric objects' ``compute()`` returns.
    """
    model.train()
    running_loss = 0
    acc_metric = MulticlassAccuracy(num_classes=6).to(device)
    prec_metric = MulticlassPrecision(num_classes=6).to(device)
    rec_metric = MulticlassRecall(num_classes=6).to(device)

    for batch in tqdm(train_loader):
        # Move every element of the batch onto the training device.
        word, inflection, label, src_len, inf_len = (t.to(device) for t in batch)
        # One-hot rows -> integer class ids expected by the loss and metrics.
        target = torch.argmax(label, dim=1)

        optimizer.zero_grad()
        scores = model(word, inflection)
        batch_loss = criterion(scores, target)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        for metric in (acc_metric, prec_metric, rec_metric):
            metric.update(scores, target)

    epoch_acc = acc_metric.compute()
    epoch_prec = prec_metric.compute()
    epoch_rec = rec_metric.compute()
    mean_loss = running_loss / len(train_loader)

    return mean_loss, epoch_acc, epoch_prec, epoch_rec


In [None]:
def create_word_vocab(data):
    """Build a character -> id mapping from the strings in ``data['word']``.

    Ids 0-3 are reserved for the special symbols ``'*'`` (pad), ``'?'``
    (unknown), ``'^'`` (start of word) and ``'$'`` (end of word). Every other
    character receives the next free id in order of first appearance.
    """
    vocab = {'*': 0, '?': 1, '^': 2, '$': 3}
    for word in data['word']:
        for ch in word:
            # setdefault leaves already-seen characters (and the specials) untouched.
            vocab.setdefault(ch, len(vocab))
    return vocab

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
def custom_collate_fn_langclass(batch, num_classes=6):
    """Collate ``LanguageDetectionDataset`` items into batched tensors.

    Args:
        batch: List of ``(word_ids, inflection_ids, label, char_len, inf_len)``
            tuples, where the first two entries are equally sized 1-D tensors
            and ``label`` is an integer class id in ``[0, num_classes)``.
        num_classes: Width of the one-hot label matrix.

    Returns:
        ``(src_chars, inflections, labels, src_chars_lengths, inflections_lengths)``
        with ``labels`` a float32 one-hot matrix of shape ``(batch, num_classes)``.

    The one-hot width is fixed rather than learned from the batch: fitting an
    encoder per batch produces fewer columns whenever a class is absent, which
    silently remaps class ids once the caller takes ``argmax`` over the rows.
    """
    src_chars = torch.stack([item[0] for item in batch])
    inflections = torch.stack([item[1] for item in batch])

    label_ids = torch.tensor([item[2] for item in batch], dtype=torch.long)
    labels = torch.nn.functional.one_hot(label_ids, num_classes=num_classes).to(torch.float32)

    src_chars_lengths = torch.tensor([item[3] for item in batch])
    inflections_lengths = torch.tensor([item[4] for item in batch])

    return src_chars, inflections, labels, src_chars_lengths, inflections_lengths

In [None]:
def trainModelSetup_language_detection(trainData, pad_char_len, pad_inflection_len, hidden_size=128, embedding_size=256, batch_size=128, num_epochs=10, num_layers=1, model_name=None):
    """Build, train and save the language-detection classifier.

    Args:
        trainData: Training table passed to the vocabulary builders and to
            ``LanguageDetectionDataset``.
        pad_char_len: Padding length for character sequences.
        pad_inflection_len: Padding length for inflection sequences.
        hidden_size: LSTM hidden size.
        embedding_size: Size of each embedding.
        batch_size: Training batch size.
        num_epochs: Number of passes over the training data.
        num_layers: Number of stacked LSTM layers.
        model_name: File stem for the saved weights. When ``None`` (the
            default) the name is requested interactively, as before; pass a
            string to run unattended (e.g. "Restart & Run All").

    Returns:
        ``(model, word_vocab, inflection_vocab, history, model_name)`` where
        ``history`` holds one ``{'loss', 'acc', 'prec', 'rec'}`` dict per epoch.

    Side effects:
        Writes the model ``state_dict`` to ``{base_dir}models/{model_name}.pth``
        (``base_dir`` is a notebook-level global).
    """
    # Create vocabularies
    word_vocab = create_word_vocab(trainData)
    inflection_vocab = create_inflection_vocab(trainData)

    history = []

    # Create model components
    model = LanguageDetectionModel(
        word_vocab_size=len(word_vocab),
        inflection_vocab_size=len(inflection_vocab),
        hidden_size=hidden_size,
        embedding_size=embedding_size,
        num_layers=num_layers
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    dataset = LanguageDetectionDataset(trainData, word_vocab, inflection_vocab, pad_char_len, pad_inflection_len)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn_langclass)

    # Training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    for epoch in tqdm(range(num_epochs)):
        loss, acc, prec, rec = train_language_detection_model(model, train_loader, criterion, optimizer, device, pad_char_len, pad_inflection_len)
        print(f'Epoch: {epoch+1}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}')
        history.append({'loss': loss, 'acc': acc, 'prec': prec, 'rec': rec})
        # The scheduler lowers the learning rate when the training loss plateaus.
        scheduler.step(loss)

    # Save the model (once, after the final epoch).
    models_dir = f'{base_dir}models'
    os.makedirs(models_dir, exist_ok=True)
    if model_name is None:
        model_name = input("Enter the model name: ")
    torch.save(model.state_dict(), f'{models_dir}/{model_name}.pth')
    return model, word_vocab, inflection_vocab, history, model_name

In [None]:
def evaluate_language_detection_model(model, test_loader, device, word_vocab, pad_char_len, pad_inf_len):
  """Evaluate the language classifier on ``test_loader`` without updating weights.

  Args:
      model: Classifier called as ``model(word, inflection)``.
      test_loader: Iterable of 5-tuples of tensors
          ``(word, inflection, one_hot_label, src_len, inf_len)``.
      device: Device the batches and metric objects are placed on. It must be
          the device ``model`` lives on; the argument is now honoured as given
          instead of being replaced by an auto-detected device part-way
          through the function.
      word_vocab, pad_char_len, pad_inf_len: Accepted for signature
          compatibility; not used here.

  Returns:
      ``(mean_loss_per_batch, accuracy, precision, recall)`` using a freshly
      created ``nn.CrossEntropyLoss`` and six-class metric objects.
  """
  model.eval()
  total_loss = 0
  accuracy = MulticlassAccuracy(num_classes=6).to(device)
  precision = MulticlassPrecision(num_classes=6).to(device)
  recall = MulticlassRecall(num_classes=6).to(device)
  criterion = nn.CrossEntropyLoss()

  with torch.no_grad():
    for batch in tqdm(test_loader):
      word, inflection, label, src_len, inf_len = batch
      word = word.to(device)
      inflection = inflection.to(device)
      label = label.to(device)

      output = model(word, inflection)
      # One-hot rows -> integer class ids expected by the loss and metrics.
      label = torch.argmax(label, dim=1)
      loss = criterion(output, label)

      total_loss += loss.item()
      accuracy.update(output, label)
      precision.update(output, label)
      recall.update(output, label)

  acc_comp = accuracy.compute()
  prec_comp = precision.compute()
  rec_comp = recall.compute()

  return total_loss / len(test_loader), acc_comp, prec_comp, rec_comp

In [None]:
# Longest string (in characters) over every word/result column of the three
# splits; used as the common padding length for character sequences.
pad_char_len = max(
    max(frame[column].str.len_chars())
    for frame, column in (
        (df_langclass_test, 'word'),
        (df_langclass_train, 'word'),
        (df_langclass_dev, 'word'),
        (df_langclass_train, 'result'),
        (df_langclass_dev, 'result'),
    )
)

In [None]:
# Largest number of ';'-separated inflection tags on any row of the three splits.
pad_inflection_len = max(
    max(frame['inflection'].str.split(';').map_elements(lambda tags: len(tags), return_dtype=int))
    for frame in (df_langclass_test, df_langclass_train, df_langclass_dev)
)

In [None]:
# Train the language classifier on the training split with default hyperparameters.
(model, word_vocab, inflection_vocab, history, model_name) = trainModelSetup_language_detection(
    df_langclass_train,
    pad_char_len=pad_char_len,
    pad_inflection_len=pad_inflection_len,
)



  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 1, Loss: 1.3176, Accuracy: 0.7288, Precision: 0.7288, Recall: 0.7288


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.0519, Accuracy: 0.9931, Precision: 0.9931, Recall: 0.9931


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.0503, Accuracy: 0.9940, Precision: 0.9940, Recall: 0.9940


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.0481, Accuracy: 0.9960, Precision: 0.9960, Recall: 0.9960


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 5, Loss: 1.0478, Accuracy: 0.9962, Precision: 0.9962, Recall: 0.9962


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 6, Loss: 1.0474, Accuracy: 0.9964, Precision: 0.9964, Recall: 0.9964


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 7, Loss: 1.0466, Accuracy: 0.9972, Precision: 0.9972, Recall: 0.9972


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 8, Loss: 1.0462, Accuracy: 0.9977, Precision: 0.9977, Recall: 0.9977


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 9, Loss: 1.0459, Accuracy: 0.9979, Precision: 0.9979, Recall: 0.9979


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch: 10, Loss: 1.0461, Accuracy: 0.9977, Precision: 0.9977, Recall: 0.9977
Enter the model name: langclass


## Eval of Langclass

In [None]:
# Encode the dev split with the vocabularies learned during training.
dev_dataset = LanguageDetectionDataset(
    data=df_langclass_dev,
    word_vocab=word_vocab,
    inflection_vocab=inflection_vocab,
    pad_char_len=pad_char_len,
    pad_inf_len=pad_inflection_len,
)

In [None]:
# Batch the dev examples with the language-classifier collate function.
dev_loader = DataLoader(
    dev_dataset,
    batch_size=256,
    shuffle=True,
    collate_fn=custom_collate_fn_langclass,
)

In [None]:
# metrics = (mean loss, accuracy, precision, recall) on the dev split.
metrics = evaluate_language_detection_model(
    model=model,
    test_loader=dev_loader,
    device=device,
    word_vocab=word_vocab,
    pad_char_len=pad_char_len,
    pad_inf_len=pad_inflection_len,
)

  0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
# Report the dev-set metrics in the order returned by the evaluation function.
print(
    f"Loss: {metrics[0]}, Accuracy: {metrics[1]}, "
    f"Precision: {metrics[2]}, Recall: {metrics[3]}"
)

Loss: 1.0574422643539754, Accuracy: 0.9863333106040955, Precision: 0.9863333106040955, Recall: 0.9863333106040955


The classifier reaches 98.6% accuracy on the dev set when identifying the input language, which should be good enough for our purposes.

## Train Models By Language

In [None]:
# Baseline hyperparameters: every language starts from the same configuration.
model_params = {
    lang: {
        "num_epochs": 10,
        "num_layers": 1,
        "batch_size": 128,
    }
    for lang in ("eng", "spa", "rus", "heb", "hun", "tur")
}

In [None]:
def trainModelSetup_language_detection_by_lang(langs, model_params):
  """Train and evaluate one inflection model per language.

  For each language (identified by its position in ``langs``, which is matched
  against the integer ``language`` column) this function:

  1. filters the notebook-level ``df_langclass_train`` / ``_dev`` / ``_test``
     frames and wraps words and results in ``'^'`` ... ``'$'`` markers,
  2. derives the padding lengths from those filtered frames,
  3. trains a model with ``trainModelSetup_normalized`` and prints its dev-set
     metrics from ``evaluate_model_normalized``,
  4. prints the exact-match accuracy on up to 100 randomly sampled dev rows.

  Args:
      langs: Sequence of language codes; each must be a key of ``model_params``.
      model_params: Mapping ``lang -> {"num_epochs", "num_layers", "batch_size"}``.

  Returns:
      Dict ``lang -> [model, word_vocab, inflection_vocab, history, model_name]``.

  Relies on notebook globals: the three ``df_langclass_*`` frames, ``device``,
  and the helpers ``trainModelSetup_normalized``, ``MorphologicalDataset``,
  ``custom_collate_fn``, ``evaluate_model_normalized`` and ``normalize_output``.
  """
  models = {}

  for index, lang in enumerate(langs):

    # Get data splits
    df_train_lang = df_langclass_train.filter(df_langclass_train['language'] == index)
    df_dev_lang = df_langclass_dev.filter(df_langclass_dev['language'] == index)
    df_test_lang = df_langclass_test.filter(df_langclass_test['language'] == index)
    df_train_lang = df_train_lang.select([pl.col("word"), pl.col("inflection"), pl.col("result")])
    df_dev_lang = df_dev_lang.select([pl.col("word"), pl.col("inflection"), pl.col("result")])
    df_test_lang = df_test_lang.select([pl.col("word"), pl.col("inflection")])
    df_train_lang = df_train_lang.with_columns([(('^' + pl.col("word")) + '$').alias("word"), (('^' + pl.col("result")) + '$').alias("result")])
    df_dev_lang = df_dev_lang.with_columns([(('^' + pl.col("word")) + '$').alias("word"), (('^' + pl.col("result")) + '$').alias("result")])
    df_test_lang = df_test_lang.with_columns([(('^' + pl.col("word")) + '$').alias("word")])

    # get pad lengths for chars and inflections
    pad_char_len = max([max(df_train_lang['word'].str.len_chars()), max(df_dev_lang['word'].str.len_chars()), max(df_test_lang['word'].str.len_chars()), max(df_train_lang['result'].str.len_chars()), max(df_dev_lang['result'].str.len_chars())])
    pad_inflection_len = max([max(df_train_lang['inflection'].str.split(';').map_elements(lambda x: len(x), return_dtype=int)),
     max(df_dev_lang['inflection'].str.split(';').map_elements(lambda x: len(x), return_dtype=int)),
     max(df_test_lang['inflection'].str.split(';').map_elements(lambda x: len(x), return_dtype=int))])

    # train the model
    model, word_vocab, inflection_vocab, history, model_name = trainModelSetup_normalized(df_train_lang, pad_char_len=pad_char_len, pad_inflection_len=pad_inflection_len, num_epochs=model_params[lang]["num_epochs"], num_layers=model_params[lang]["num_layers"], batch_size=model_params[lang]["batch_size"])
    dev_dataset = MorphologicalDataset(df_dev_lang, word_vocab, inflection_vocab, pad_char_len, pad_inflection_len)
    dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_fn)
    metrics = evaluate_model_normalized(model, dev_loader, word_vocab, device=device)
    print(f"Language: {lang}")
    print(f"Loss: {metrics[0]}, Accuracy: {metrics[1]}, Precision: {metrics[2]}, Recall: {metrics[3]}, F1: {metrics[4]}, Bleu: {metrics[5]}")

    models[lang] = [model, word_vocab, inflection_vocab, history, model_name]

    # evaluate on samples
    # Cap the sample size so a dev split with fewer than 100 rows does not fail.
    sample = df_dev_lang.sample(min(100, df_dev_lang.height))
    sample_set = MorphologicalDataset(sample, word_vocab, inflection_vocab, pad_char_len=pad_char_len, pad_inf_len=pad_inflection_len)
    sample_loader = DataLoader(sample_set, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)
    # Build the id -> character lookup once instead of once per decoded character.
    index_to_char = list(word_vocab.keys())
    matches = []
    # Inference only: no autograd graph is needed for the sample predictions.
    with torch.no_grad():
      for src_chars, inflections, trg, src_len, inf_len in sample_loader:
        src_chars = src_chars.to(device)
        inflections = inflections.to(device)
        trg = trg.to(device)
        output = model(src_chars, inflections, trg)
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg.contiguous().view(-1)
        predicted_ids = normalize_output(output.argmax(1))
        # Ids 0-3 are skipped when decoding (presumably the pad/unk/start/end
        # symbols of the word vocabulary; confirm against its builder).
        output_decoded = ''.join(index_to_char[i.item()] for i in predicted_ids if i > 3)
        trg_decoded = ''.join(index_to_char[i.item()] for i in trg if i > 3)

        # check predictions
        matches.append(output_decoded == trg_decoded)
    # Exact-match accuracy over the sample, computed once after the loop.
    accuracy = np.average(matches) if matches else float('nan')
    print(f"Accuracy: {accuracy}")
  return models

In [None]:
# Train one model per language with the baseline hyperparameters.
models = trainModelSetup_language_detection_by_lang(
    langs=languages,
    model_params=model_params,
)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.6249, Accuracy: 0.1117, Precision: 0.1117, Recall: 0.1117, F1: 0.1117, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.0733, Accuracy: 0.3368, Precision: 0.3368, Recall: 0.3368, F1: 0.3368, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.6595, Accuracy: 0.4116, Precision: 0.4116, Recall: 0.4116, F1: 0.4116, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.5328, Accuracy: 0.4305, Precision: 0.4305, Recall: 0.4305, F1: 0.4305, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.5022, Accuracy: 0.4345, Precision: 0.4345, Recall: 0.4345, F1: 0.4345, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.4944, Accuracy: 0.4354, Precision: 0.4354, Recall: 0.4354, F1: 0.4354, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.4909, Accuracy: 0.4357, Precision: 0.4357, Recall: 0.4357, F1: 0.4357, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.4789, Accuracy: 0.4371, Precision: 0.4371, Recall: 0.4371, F1: 0.4371, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4769, Accuracy: 0.4376, Precision: 0.4376, Recall: 0.4376, F1: 0.4376, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.4896, Accuracy: 0.4357, Precision: 0.4357, Recall: 0.4357, F1: 0.4357, BLEU: 0.0000
Enter the model name: eng_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: eng
Loss: 0.09075779374688864, Accuracy: 0.4713689088821411, Precision: 0.4713689088821411, Recall: 0.4713689088821411, F1: 0.4713689088821411, Bleu: 0.0
Accuracy: 0.86


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 3.0044, Accuracy: 0.0773, Precision: 0.0773, Recall: 0.0773, F1: 0.0773, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 2.0343, Accuracy: 0.1798, Precision: 0.1798, Recall: 0.1798, F1: 0.1798, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.3617, Accuracy: 0.2870, Precision: 0.2870, Recall: 0.2870, F1: 0.2870, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.0755, Accuracy: 0.3332, Precision: 0.3332, Recall: 0.3332, F1: 0.3332, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.9068, Accuracy: 0.3609, Precision: 0.3609, Recall: 0.3609, F1: 0.3609, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.7764, Accuracy: 0.3804, Precision: 0.3804, Recall: 0.3804, F1: 0.3804, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.7035, Accuracy: 0.3916, Precision: 0.3916, Recall: 0.3916, F1: 0.3916, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.6401, Accuracy: 0.4003, Precision: 0.4003, Recall: 0.4003, F1: 0.4003, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.5951, Accuracy: 0.4073, Precision: 0.4073, Recall: 0.4073, F1: 0.4073, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.5707, Accuracy: 0.4107, Precision: 0.4107, Recall: 0.4107, F1: 0.4107, BLEU: 0.0000
Enter the model name: heb_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: heb
Loss: 0.3663374036550522, Accuracy: 0.33628836274147034, Precision: 0.33628836274147034, Recall: 0.33628836274147034, F1: 0.33628836274147034, Bleu: 0.0
Accuracy: 0.57


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.6132, Accuracy: 0.0925, Precision: 0.0925, Recall: 0.0925, F1: 0.0925, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.3603, Accuracy: 0.2620, Precision: 0.2620, Recall: 0.2620, F1: 0.2620, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.8455, Accuracy: 0.3412, Precision: 0.3412, Recall: 0.3412, F1: 0.3412, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.7100, Accuracy: 0.3610, Precision: 0.3610, Recall: 0.3610, F1: 0.3610, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.6030, Accuracy: 0.3764, Precision: 0.3764, Recall: 0.3764, F1: 0.3764, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.5373, Accuracy: 0.3858, Precision: 0.3858, Recall: 0.3858, F1: 0.3858, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.5108, Accuracy: 0.3887, Precision: 0.3887, Recall: 0.3887, F1: 0.3887, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.5903, Accuracy: 0.3803, Precision: 0.3803, Recall: 0.3803, F1: 0.3803, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4600, Accuracy: 0.3953, Precision: 0.3953, Recall: 0.3953, F1: 0.3953, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.4434, Accuracy: 0.3966, Precision: 0.3966, Recall: 0.3966, F1: 0.3966, BLEU: 0.0000
Enter the model name: spa_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: spa
Loss: 0.22644745651632547, Accuracy: 0.3974309265613556, Precision: 0.3974309265613556, Recall: 0.3974309265613556, F1: 0.3974309265613556, Bleu: 0.0
Accuracy: 0.72


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.9345, Accuracy: 0.0520, Precision: 0.0520, Recall: 0.0520, F1: 0.0520, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.5905, Accuracy: 0.1579, Precision: 0.1579, Recall: 0.1579, F1: 0.1579, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.9411, Accuracy: 0.2158, Precision: 0.2158, Recall: 0.2158, F1: 0.2158, BLEU: 0.0004


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.8187, Accuracy: 0.2266, Precision: 0.2266, Recall: 0.2266, F1: 0.2266, BLEU: 0.0015


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.7090, Accuracy: 0.2355, Precision: 0.2355, Recall: 0.2355, F1: 0.2355, BLEU: 0.0052


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.6521, Accuracy: 0.2409, Precision: 0.2409, Recall: 0.2409, F1: 0.2409, BLEU: 0.0106


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.6180, Accuracy: 0.2436, Precision: 0.2436, Recall: 0.2436, F1: 0.2436, BLEU: 0.0104


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.5993, Accuracy: 0.2450, Precision: 0.2450, Recall: 0.2450, F1: 0.2450, BLEU: 0.0137


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.5663, Accuracy: 0.2476, Precision: 0.2476, Recall: 0.2476, F1: 0.2476, BLEU: 0.0184


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.5548, Accuracy: 0.2485, Precision: 0.2485, Recall: 0.2485, F1: 0.2485, BLEU: 0.0197
Enter the model name: rus_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: rus
Loss: 0.33296933211386204, Accuracy: 0.24994820356369019, Precision: 0.24994820356369019, Recall: 0.24994820356369019, F1: 0.24994820356369019, Bleu: 0.02599814298978644
Accuracy: 0.59


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.9058, Accuracy: 0.0935, Precision: 0.0935, Recall: 0.0935, F1: 0.0935, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.4739, Accuracy: 0.2811, Precision: 0.2811, Recall: 0.2811, F1: 0.2811, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.9801, Accuracy: 0.3513, Precision: 0.3513, Recall: 0.3513, F1: 0.3513, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.8718, Accuracy: 0.3685, Precision: 0.3685, Recall: 0.3685, F1: 0.3685, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.7979, Accuracy: 0.3810, Precision: 0.3810, Recall: 0.3810, F1: 0.3810, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.7531, Accuracy: 0.3901, Precision: 0.3901, Recall: 0.3901, F1: 0.3901, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.7243, Accuracy: 0.3951, Precision: 0.3951, Recall: 0.3951, F1: 0.3951, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.7804, Accuracy: 0.3909, Precision: 0.3909, Recall: 0.3909, F1: 0.3909, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.6525, Accuracy: 0.4086, Precision: 0.4086, Recall: 0.4086, F1: 0.4086, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.6125, Accuracy: 0.4159, Precision: 0.4159, Recall: 0.4159, F1: 0.4159, BLEU: 0.0000
Enter the model name: hun_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: hun
Loss: 0.4656764008104801, Accuracy: 0.4434782564640045, Precision: 0.4434782564640045, Recall: 0.4434782564640045, F1: 0.4434782564640045, Bleu: 0.0
Accuracy: 0.21


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.7984, Accuracy: 0.0698, Precision: 0.0698, Recall: 0.0698, F1: 0.0698, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 2.0862, Accuracy: 0.1344, Precision: 0.1344, Recall: 0.1344, F1: 0.1344, BLEU: 0.0001


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.2897, Accuracy: 0.2440, Precision: 0.2440, Recall: 0.2440, F1: 0.2440, BLEU: 0.0086


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.9319, Accuracy: 0.2961, Precision: 0.2961, Recall: 0.2961, F1: 0.2961, BLEU: 0.0557


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.7449, Accuracy: 0.3214, Precision: 0.3214, Recall: 0.3214, F1: 0.3214, BLEU: 0.1206


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.6589, Accuracy: 0.3333, Precision: 0.3333, Recall: 0.3333, F1: 0.3333, BLEU: 0.1647


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.5321, Accuracy: 0.3484, Precision: 0.3484, Recall: 0.3484, F1: 0.3484, BLEU: 0.2230


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.4812, Accuracy: 0.3541, Precision: 0.3541, Recall: 0.3541, F1: 0.3541, BLEU: 0.2520


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4526, Accuracy: 0.3577, Precision: 0.3577, Recall: 0.3577, F1: 0.3577, BLEU: 0.2668


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.4192, Accuracy: 0.3610, Precision: 0.3610, Recall: 0.3610, F1: 0.3610, BLEU: 0.2812
Enter the model name: tur_pt_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: tur
Loss: 0.28037508204579353, Accuracy: 0.34251922369003296, Precision: 0.34251922369003296, Recall: 0.34251922369003296, F1: 0.34251922369003296, Bleu: 0.19595517865559733
Accuracy: 0.46


In [None]:
# new model params
# Only the LSTM depth varies per language; epochs and batch size are shared.
model_params_new = {
    lang: {
        "num_epochs": 10,
        "num_layers": layers,
        "batch_size": 128,
    }
    for lang, layers in (
        ("eng", 1),  # eng ok 0.92
        ("spa", 1),  # spa ok 0.72
        ("rus", 3),  # rus ok 0.92
        ("heb", 2),  # heb ok 0.93
        ("hun", 4),  # hun ok 0.78
        ("tur", 3),  # tur ok 0.90
    )
}

In [None]:
# Retrain every language with the per-language layer counts.
models = trainModelSetup_language_detection_by_lang(
    langs=languages,
    model_params=model_params_new,
)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.7239, Accuracy: 0.1052, Precision: 0.1052, Recall: 0.1052, F1: 0.1052, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.0241, Accuracy: 0.3502, Precision: 0.3502, Recall: 0.3502, F1: 0.3502, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.7234, Accuracy: 0.4014, Precision: 0.4014, Recall: 0.4014, F1: 0.4014, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.5283, Accuracy: 0.4318, Precision: 0.4318, Recall: 0.4318, F1: 0.4318, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.5093, Accuracy: 0.4340, Precision: 0.4340, Recall: 0.4340, F1: 0.4340, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.4943, Accuracy: 0.4357, Precision: 0.4357, Recall: 0.4357, F1: 0.4357, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.4821, Accuracy: 0.4370, Precision: 0.4370, Recall: 0.4370, F1: 0.4370, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.4797, Accuracy: 0.4373, Precision: 0.4373, Recall: 0.4373, F1: 0.4373, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4681, Accuracy: 0.4385, Precision: 0.4385, Recall: 0.4385, F1: 0.4385, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.4671, Accuracy: 0.4385, Precision: 0.4385, Recall: 0.4385, F1: 0.4385, BLEU: 0.0000
Enter the model name: eng_pt_l1_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: eng
Loss: 0.10868765646591783, Accuracy: 0.469507098197937, Precision: 0.469507098197937, Recall: 0.469507098197937, F1: 0.469507098197937, Bleu: 0.0
Accuracy: 0.83


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 3.0482, Accuracy: 0.0730, Precision: 0.0730, Recall: 0.0730, F1: 0.0730, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 2.1758, Accuracy: 0.1527, Precision: 0.1527, Recall: 0.1527, F1: 0.1527, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.5574, Accuracy: 0.2552, Precision: 0.2552, Recall: 0.2552, F1: 0.2552, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.1986, Accuracy: 0.3131, Precision: 0.3131, Recall: 0.3131, F1: 0.3131, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 1.0150, Accuracy: 0.3406, Precision: 0.3406, Recall: 0.3406, F1: 0.3406, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.8791, Accuracy: 0.3614, Precision: 0.3614, Recall: 0.3614, F1: 0.3614, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.7932, Accuracy: 0.3742, Precision: 0.3742, Recall: 0.3742, F1: 0.3742, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.7215, Accuracy: 0.3852, Precision: 0.3852, Recall: 0.3852, F1: 0.3852, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.6680, Accuracy: 0.3931, Precision: 0.3931, Recall: 0.3931, F1: 0.3931, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.6409, Accuracy: 0.3974, Precision: 0.3974, Recall: 0.3974, F1: 0.3974, BLEU: 0.0000
Enter the model name: heb_pt_l2_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: heb
Loss: 0.4477317240089178, Accuracy: 0.3277057111263275, Precision: 0.3277057111263275, Recall: 0.3277057111263275, F1: 0.3277057111263275, Bleu: 0.0
Accuracy: 0.44


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.5770, Accuracy: 0.0969, Precision: 0.0969, Recall: 0.0969, F1: 0.0969, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.2907, Accuracy: 0.2731, Precision: 0.2731, Recall: 0.2731, F1: 0.2731, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.8544, Accuracy: 0.3401, Precision: 0.3401, Recall: 0.3401, F1: 0.3401, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.6875, Accuracy: 0.3644, Precision: 0.3644, Recall: 0.3644, F1: 0.3644, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.6203, Accuracy: 0.3742, Precision: 0.3742, Recall: 0.3742, F1: 0.3742, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.5556, Accuracy: 0.3835, Precision: 0.3835, Recall: 0.3835, F1: 0.3835, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.5044, Accuracy: 0.3900, Precision: 0.3900, Recall: 0.3900, F1: 0.3900, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.4761, Accuracy: 0.3934, Precision: 0.3934, Recall: 0.3934, F1: 0.3934, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4758, Accuracy: 0.3935, Precision: 0.3935, Recall: 0.3935, F1: 0.3935, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.4485, Accuracy: 0.3964, Precision: 0.3964, Recall: 0.3964, F1: 0.3964, BLEU: 0.0000
Enter the model name: spa_pt_l1_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: spa
Loss: 0.24609193950891495, Accuracy: 0.3987821042537689, Precision: 0.3987821042537689, Recall: 0.3987821042537689, F1: 0.3987821340560913, Bleu: 0.0
Accuracy: 0.8


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 3.0637, Accuracy: 0.0425, Precision: 0.0425, Recall: 0.0425, F1: 0.0425, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 2.3201, Accuracy: 0.0878, Precision: 0.0878, Recall: 0.0878, F1: 0.0878, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.3924, Accuracy: 0.1723, Precision: 0.1723, Recall: 0.1723, F1: 0.1723, BLEU: 0.0001


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.2208, Accuracy: 0.1936, Precision: 0.1936, Recall: 0.1936, F1: 0.1936, BLEU: 0.0002


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.9613, Accuracy: 0.2113, Precision: 0.2113, Recall: 0.2113, F1: 0.2113, BLEU: 0.0003


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.7670, Accuracy: 0.2281, Precision: 0.2281, Recall: 0.2281, F1: 0.2281, BLEU: 0.0016


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 1.3145, Accuracy: 0.1878, Precision: 0.1878, Recall: 0.1878, F1: 0.1878, BLEU: 0.0002


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.9432, Accuracy: 0.2142, Precision: 0.2142, Recall: 0.2142, F1: 0.2142, BLEU: 0.0005


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.6719, Accuracy: 0.2374, Precision: 0.2374, Recall: 0.2374, F1: 0.2374, BLEU: 0.0031


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.6167, Accuracy: 0.2422, Precision: 0.2422, Recall: 0.2422, F1: 0.2422, BLEU: 0.0086
Enter the model name: rus_pt_l3_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: rus
Loss: 0.4321311730891466, Accuracy: 0.2383076697587967, Precision: 0.2383076697587967, Recall: 0.2383076697587967, F1: 0.2383076697587967, Bleu: 0.0027855153203342627
Accuracy: 0.48


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 3.0600, Accuracy: 0.0809, Precision: 0.0809, Recall: 0.0809, F1: 0.0809, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.9649, Accuracy: 0.2136, Precision: 0.2136, Recall: 0.2136, F1: 0.2136, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 2.1120, Accuracy: 0.2097, Precision: 0.2097, Recall: 0.2097, F1: 0.2097, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 0.9489, Accuracy: 0.3597, Precision: 0.3597, Recall: 0.3597, F1: 0.3597, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 1.7744, Accuracy: 0.2610, Precision: 0.2610, Recall: 0.2610, F1: 0.2610, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 1.0039, Accuracy: 0.3490, Precision: 0.3490, Recall: 0.3490, F1: 0.3490, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.9016, Accuracy: 0.3687, Precision: 0.3687, Recall: 0.3687, F1: 0.3687, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.8381, Accuracy: 0.3735, Precision: 0.3735, Recall: 0.3735, F1: 0.3735, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.7236, Accuracy: 0.3922, Precision: 0.3922, Recall: 0.3922, F1: 0.3922, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.6813, Accuracy: 0.4004, Precision: 0.4004, Recall: 0.4004, F1: 0.4004, BLEU: 0.0000
Enter the model name: hun_pt_l4_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: hun
Loss: 0.44265686348080635, Accuracy: 0.44031983613967896, Precision: 0.44031983613967896, Recall: 0.44031983613967896, F1: 0.44031983613967896, Bleu: 0.0
Accuracy: 0.25


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.9560, Accuracy: 0.0552, Precision: 0.0552, Recall: 0.0552, F1: 0.0552, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 2, Loss: 2.4553, Accuracy: 0.0957, Precision: 0.0957, Recall: 0.0957, F1: 0.0957, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.7899, Accuracy: 0.1680, Precision: 0.1680, Recall: 0.1680, F1: 0.1680, BLEU: 0.0000


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.1927, Accuracy: 0.2507, Precision: 0.2507, Recall: 0.2507, F1: 0.2507, BLEU: 0.0013


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.8970, Accuracy: 0.2955, Precision: 0.2955, Recall: 0.2955, F1: 0.2955, BLEU: 0.0385


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.8987, Accuracy: 0.3046, Precision: 0.3046, Recall: 0.3046, F1: 0.3046, BLEU: 0.1068


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.6484, Accuracy: 0.3320, Precision: 0.3320, Recall: 0.3320, F1: 0.3320, BLEU: 0.1733


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.5002, Accuracy: 0.3501, Precision: 0.3501, Recall: 0.3501, F1: 0.3501, BLEU: 0.2333


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4248, Accuracy: 0.3586, Precision: 0.3586, Recall: 0.3586, F1: 0.3586, BLEU: 0.2713


  0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.3921, Accuracy: 0.3626, Precision: 0.3626, Recall: 0.3626, F1: 0.3626, BLEU: 0.2905
Enter the model name: tur_pt_l3_e10


  0%|          | 0/16 [00:00<?, ?it/s]

Language: tur
Loss: 0.2078070230782032, Accuracy: 0.3488612771034241, Precision: 0.3488612771034241, Recall: 0.3488612771034241, F1: 0.3488612771034241, Bleu: 0.21978404737025428
Accuracy: 0.59


In [None]:
# Map each language code to its vocabulary pair, taken from positions 1 and 2 of
# every entry in `models`. langclass_w_MI later reads the pair back as
# [0] = character vocab and [1] = inflection vocab.
# NOTE(review): pairing is positional -- assumes `models` iterates in the same
# order as `languages`; confirm.
tokenizers = dict(zip(languages, [(entry[1], entry[2]) for entry in models.values()]))

In [None]:
# Display the character vocabulary (character -> integer index).
word_vocab

{'*': 0,
 '?': 1,
 '^': 2,
 '$': 3,
 'b': 4,
 'e': 5,
 's': 6,
 'i': 7,
 'n': 8,
 'm': 9,
 'o': 10,
 'v': 11,
 'd': 12,
 'r': 13,
 'a': 14,
 'y': 15,
 'h': 16,
 'u': 17,
 't': 18,
 'c': 19,
 'p': 20,
 'w': 21,
 'l': 22,
 'f': 23,
 'x': 24,
 'k': 25,
 'g': 26,
 '-': 27,
 'z': 28,
 'H': 29,
 'q': 30,
 'j': 31,
 "'": 32,
 'A': 33,
 'Y': 34,
 'T': 35,
 'M': 36,
 'W': 37,
 'G': 38,
 'æ': 39,
 'E': 40,
 'ö': 41,
 'B': 42,
 'I': 43,
 'D': 44,
 'L': 45,
 'V': 46,
 'S': 47,
 'C': 48,
 'œ': 49,
 'F': 50,
 'ë': 51,
 'é': 52,
 'X': 53,
 'O': 54,
 'R': 55,
 '_': 56,
 '1': 57,
 'ה': 58,
 'ִ': 59,
 'ש': 60,
 'ְ': 61,
 'ׁ': 62,
 'ת': 63,
 'ַ': 64,
 'ּ': 65,
 'ד': 66,
 'ֵ': 67,
 'ל': 68,
 'ע': 69,
 'ו': 70,
 'ר': 71,
 'צ': 72,
 'ט': 73,
 'ב': 74,
 'ח': 75,
 'נ': 76,
 'ף': 77,
 'א': 78,
 'ָ': 79,
 'פ': 80,
 'ֹ': 81,
 'ק': 82,
 'ן': 83,
 'מ': 84,
 'ם': 85,
 'ג': 86,
 'ֶ': 87,
 'ך': 88,
 'ז': 89,
 'כ': 90,
 'י': 91,
 'ׂ': 92,
 'ֻ': 93,
 'ס': 94,
 'ֱ': 95,
 'ץ': 96,
 'ֲ': 97,
 'ñ': 98,
 'í': 99,
 'ü': 100,

In [None]:
# Display the inflection-tag vocabulary (tag -> integer index).
# NOTE(review): the printed mapping contains the entry 'column_2', which looks
# like a column-header value that leaked into the tag set -- confirm and filter
# it out where the vocabulary is built.
inflection_vocab

{'*': 0,
 '?': 1,
 '^': 2,
 '$': 3,
 'V': 4,
 'NFIN': 5,
 'PRS': 6,
 'NOM(3,SG)': 7,
 'V.PTCP': 8,
 'PST': 9,
 '3': 10,
 'SG': 11,
 'column_2': 12,
 '2': 13,
 'PL': 14,
 'FEM': 15,
 'IMP': 16,
 'MASC': 17,
 '1': 18,
 'FUT': 19,
 'N': 20,
 'PSSD': 21,
 'PSS(1,PL)': 22,
 'PSS(2,SG,FEM)': 23,
 'SGDEF': 24,
 'PSS1P': 25,
 'PSS(3,PL,FEM)': 26,
 'NDEF': 27,
 'PSS(3,PL,MASC)': 28,
 'PSS(3,SG,FEM)': 29,
 'V.MSDR': 30,
 'PSS(3,SG,MASC)': 31,
 'PSS(2,PL,FEM)': 32,
 'DEF': 33,
 'PSS2P': 34,
 'PSS2S': 35,
 'PSS(1,SG)': 36,
 'PSS(2,SG,MASC)': 37,
 'PSS(2,PL,MASC)': 38,
 'FUT:MASC': 39,
 'PSS1S': 40,
 'PLDEF': 41,
 'IND': 42,
 'PFV': 43,
 'NOM(1,SG)': 44,
 'NOM(3,PL)': 45,
 'DAT(3,SG)': 46,
 'NOM(FORM,2,SG)': 47,
 'SBJV': 48,
 'ACC(1,PL)': 49,
 'ACC(3,SG)': 50,
 'NOM(INFM,2,SG)': 51,
 'DAT(2,PL)': 52,
 'NOM(INFM,2,PL)': 53,
 'LGSPEC2': 54,
 'V.CVB': 55,
 'DAT(2,SG)': 56,
 'ACC(2,PL)': 57,
 'NOM(2,SG,INFM)': 58,
 'ACC(3,PL)': 59,
 'DAT(1,SG)': 60,
 'ACC(2,SG)': 61,
 'DAT(1,PL)': 62,
 'COND': 63,
 'NO

In [None]:
import json

In [None]:
# Persist the per-language vocabularies and model hyper-parameters as JSON so the
# combined pipeline (langclass_w_MI) can reload them later.
os.makedirs(f"{base_dir}tokenizers", exist_ok=True)
# The classifier vocab dump below is disabled; langclass_w_MI still reads
# class_tokenizer.json, so that file must already exist from an earlier run.
#json.dump({"word": word_vocab, "inflection": inflection_vocab}, open(f"{base_dir}/tokenizers/class_tokenizer.json", "w"))
# `with` guarantees the handles are flushed and closed even if serialisation fails.
with open(f"{base_dir}/tokenizers/lang_tokenizers.json", "w") as fh:
  json.dump(tokenizers, fh)
with open(f"{base_dir}/tokenizers/model_params.json", "w") as fh:
  json.dump(model_params_new, fh)

## Combine Models

In [None]:
class langclassDatasetForEval(Dataset):
  """Encode (word, inflection) rows as fixed-length index tensors for the language classifier.

  Args:
      data: table supporting ``len(data)``, ``data['word'][idx]`` and
          ``data['inflection'][idx]``; inflection tags are ';'-separated.
      word_vocab: dict mapping character -> index; '*' is used as padding and
          '?' as the fallback for unseen characters.
      inflection_vocab: dict mapping tag -> index; '*' is used as padding and
          '?' (when present, otherwise '*') as the fallback for unseen tags.
  """

  def __init__(self, data, word_vocab, inflection_vocab):
    self.data = data
    self.word_vocab = word_vocab
    self.inflection_vocab = inflection_vocab
    # Both the character and the tag sequence are padded to this length.
    self.pad_char_len = 41

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(word, inflection)`` as two ``torch.long`` tensors.

    Sequences shorter than ``pad_char_len`` are right-padded; longer ones are
    returned unpadded and untruncated (a negative repeat count yields no padding).
    """
    word = [self.word_vocab[c] if c in self.word_vocab else self.word_vocab['?'] for c in self.data['word'][idx]]
    word = word + [self.word_vocab['*']] * (self.pad_char_len - len(word))
    word = torch.tensor(word, dtype=torch.long)

    pad_tag = self.inflection_vocab['*']
    # Unseen tags used to raise KeyError; treat them like unseen characters instead.
    unk_tag = self.inflection_vocab.get('?', pad_tag)
    inflection = [self.inflection_vocab.get(tag, unk_tag) for tag in self.data['inflection'][idx].split(';')]
    inflection = inflection + [pad_tag] * (self.pad_char_len - len(inflection))
    inflection = torch.tensor(inflection, dtype=torch.long)

    return word, inflection

In [None]:
def custom_collate_fn_langclass_eval(batch):
    """Stack a list of ``(word, inflection)`` tensor pairs into two batch tensors.

    Args:
        batch: sequence of samples; element 0 is the word tensor and element 1
            the inflection tensor of each sample.

    Returns:
        Tuple ``(words, inflections)``, each stacked along a new leading dimension.
    """
    word_rows = []
    tag_rows = []
    for sample in batch:
        word_rows.append(sample[0])
        tag_rows.append(sample[1])
    return torch.stack(word_rows), torch.stack(tag_rows)

In [None]:
class MIDatasetForEval(Dataset):
  """Encode rows for a per-language inflection model, keeping each row's original index.

  Args:
      data: table supporting ``len(data)``, ``data['word'][idx]``,
          ``data['inflection'][idx]`` (';'-separated tags) and ``data['index'][idx]``.
      word_vocab: dict mapping character -> index; '*' is used as padding and
          '?' as the fallback for unseen characters.
      inf_vocab: dict mapping tag -> index; '*' is used as padding and
          '?' (when present, otherwise '*') as the fallback for unseen tags.
  """

  def __init__(self, data, word_vocab, inf_vocab):
    self.data = data
    self.word_vocab = word_vocab
    self.inflection_vocab = inf_vocab
    self.pad_char_len = 41        # fixed length of the character sequence
    self.pad_inflection_len = 8   # fixed length of the tag sequence

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(word, inflection, df_index)``.

    ``word`` and ``inflection`` are right-padded ``torch.long`` tensors;
    ``df_index`` is the value of the "index" column for this row. Sequences
    longer than their pad length are returned unpadded and untruncated.
    """
    word = [self.word_vocab[c] if c in self.word_vocab else self.word_vocab['?'] for c in self.data['word'][idx]]
    word = word + [self.word_vocab['*']] * (self.pad_char_len - len(word))
    word = torch.tensor(word, dtype=torch.long)

    pad_tag = self.inflection_vocab['*']
    # A row routed to the wrong language can carry tags this vocab has never
    # seen; fall back instead of raising KeyError, mirroring the character path.
    unk_tag = self.inflection_vocab.get('?', pad_tag)
    inflection = [self.inflection_vocab.get(tag, unk_tag) for tag in self.data['inflection'][idx].split(';')]
    inflection = inflection + [pad_tag] * (self.pad_inflection_len - len(inflection))
    inflection = torch.tensor(inflection, dtype=torch.long)

    df_index = self.data["index"][idx]

    return word, inflection, df_index

In [None]:
class MorphologicalInflectionModelForEval(nn.Module):
    """Encoder/decoder wrapper that decodes greedily, without teacher forcing.

    Args:
        encoder: module called as ``encoder(src_chars, inflections)`` and
            returning ``(encoder_outputs, hidden, cell)``.
        decoder: module exposing ``char_vocab_size`` and called as
            ``decoder(input, hidden, cell, encoder_outputs)``, returning
            ``(output, hidden, cell, attention)``.
        device: device on which the output buffer and first decoder input are created.
    """

    def __init__(self, encoder, decoder, device):
        super(MorphologicalInflectionModelForEval, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def from_pretrained(self, pth_file):
        """Load a state dict from ``pth_file``, move to ``self.device``, set eval mode, return ``self``."""
        # map_location lets a checkpoint saved on another device load here;
        # weights_only avoids unpickling arbitrary objects from the file.
        state_dict = torch.load(pth_file, map_location=self.device, weights_only=True)
        self.load_state_dict(state_dict)
        self.to(self.device)
        self.eval()
        return self

    @torch.no_grad()  # evaluation only: do not build an autograd graph over the decode loop
    def forward(self, src_chars, inflections, trg_len=41):
        """Greedily decode ``trg_len`` positions for every sequence in the batch.

        Args:
            src_chars: batch of encoded source words (first dimension is the batch).
            inflections: batch of encoded inflection tags.
            trg_len: number of output positions; defaults to the previously
                hard-coded 41.

        Returns:
            Tensor of shape ``(batch, trg_len, char_vocab_size)``. Position 0 is
            left as zeros; positions ``1..trg_len-1`` hold the decoder outputs.
        """
        batch_size = src_chars.size(0)
        char_vocab_size = self.decoder.char_vocab_size

        outputs = torch.zeros(batch_size, trg_len, char_vocab_size, device=self.device)

        # Encoder
        encoder_outputs, hidden, cell = self.encoder(src_chars, inflections)

        # The decoder is seeded with index 0 for every sequence.
        # NOTE(review): the original comment called this the start token, but in
        # the vocab printed in this notebook index 0 is '*' and '^' is 2 --
        # confirm this matches what the decoder saw first during training.
        decoder_input = torch.zeros(batch_size, dtype=torch.long, device=self.device)

        for t in range(1, trg_len):
            output, hidden, cell, attention = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            # Feed the most likely character back in as the next input.
            decoder_input = output.argmax(1)

        return outputs

In [None]:
def custom_collate_fn_MI_eval(batch):
    """Stack ``(word, inflection, row_index)`` samples into three batch tensors.

    Args:
        batch: sequence of samples; element 0 is the word tensor, element 1 the
            inflection tensor and element 2 the row index of each sample.

    Returns:
        Tuple ``(words, inflections, row_indices)``; the first two are stacked
        along a new leading dimension, the last is built with ``torch.tensor``.
    """
    word_rows = []
    tag_rows = []
    row_ids = []
    for sample in batch:
        word_rows.append(sample[0])
        tag_rows.append(sample[1])
        row_ids.append(sample[2])
    return torch.stack(word_rows), torch.stack(tag_rows), torch.tensor(row_ids)

In [None]:
def load_langclass(pth_file, word, inf, device):
  """Build the language classifier, load its weights, and return it in eval mode.

  Args:
      pth_file: path to a saved state dict for ``LanguageDetectionModel``.
      word: character vocabulary; its length sets ``word_vocab_size``.
      inf: inflection-tag vocabulary; its length sets ``inflection_vocab_size``.
      device: accepted for signature compatibility but not used -- the model is
          left on the CPU, which is where langclass_w_MI feeds it batches.

  Returns:
      Tuple ``(model, char_vocab, inflection_vocab)``; the vocabularies are the
      ``word`` and ``inf`` arguments passed through unchanged.
  """
  char_vocab = word
  inflection_vocab = inf
  model = LanguageDetectionModel(
      word_vocab_size=len(char_vocab),
      inflection_vocab_size=len(inflection_vocab),
      hidden_size=128,
      embedding_size=256,
      num_layers=1
  )
  # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
  # machine; weights_only avoids unpickling arbitrary objects from the file.
  state_dict = torch.load(pth_file, map_location="cpu", weights_only=True)
  model.load_state_dict(state_dict)
  model.eval()
  return model, char_vocab, inflection_vocab

In [None]:
def load_MI_model(pth_file, char, inf, params, device):
    """Build a per-language inflection model, load its weights, and return it in eval mode on ``device``.

    Args:
        pth_file: path to a saved state dict for ``MorphologicalInflectionModelForEval``.
        char: character vocabulary; its length sets ``char_vocab_size``.
        inf: inflection-tag vocabulary; its length sets ``inflection_vocab_size``.
        params: mapping that provides ``"num_layers"`` for both encoder and decoder.
        device: device the weights are mapped to and the model is moved to.

    Returns:
        Tuple ``(model, char_vocab, inflection_vocab)``; the vocabularies are the
        ``char`` and ``inf`` arguments passed through unchanged.
    """
    char_vocab = char
    inflection_vocab = inf
    num_layers = params["num_layers"]
    encoder = MorphologicalEncoder(
        char_vocab_size=len(char_vocab),
        inflection_vocab_size=len(inflection_vocab),
        hidden_size=256,
        embedding_size=128,
        num_layers=num_layers
    )
    decoder = MorphologicalDecoder(
        char_vocab_size=len(char_vocab),
        hidden_size=256,
        embedding_size=128,
        num_layers=num_layers
    )
    model = MorphologicalInflectionModelForEval(encoder, decoder, device)
    # map_location lets a checkpoint saved on another device load here;
    # weights_only avoids unpickling arbitrary objects from the file.
    state_dict = torch.load(pth_file, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, char_vocab, inflection_vocab

In [None]:
# Checkpoint file name for each language's inflection model; langclass_w_MI
# resolves these under "{base_dir}models/".
model_pths = dict(
    eng="eng_pt_l1_e10.pth",
    heb="heb_pt_l2_e10.pth",
    spa="spa_pt_l1_e10.pth",
    rus="rus_pt_l3_e10.pth",
    hun="hun_pt_l4_e10.pth",
    tur="tur_pt_l3_e10.pth",
)

In [None]:
# Show which torch device the notebook is currently using.
device

device(type='cuda', index=0)

In [None]:
# Release unused cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [None]:
def langclass_w_MI(data, model_pths, device):
  """Predict inflected forms by classifying each row's language, then running that language's model.

  Steps: wrap every word in '^'...'$', predict a language index per row with the
  language classifier, group rows by predicted language, decode each group with
  the matching inflection model, and join the decoded strings back onto the rows.

  Args:
      data: polars DataFrame with string columns "word" and "inflection"
          (';'-separated tags).
      model_pths: dict mapping language code -> checkpoint file name, resolved
          under ``{base_dir}models/``.
      device: torch device used for the per-language inflection models. The
          language classifier is used as returned by ``load_langclass`` and is
          fed batches that are not moved to ``device``.

  Returns:
      ``data`` in its original row order with two added columns: "language"
      (index into the internal language list) and "result" (decoded prediction).
      The '^'/'$' markers are stripped from "word" again.
  """
  languages = ["eng", "heb", "spa", "rus", "hun", "tur"]

  # --- Stage 1: language classification -------------------------------------
  with open(f"{base_dir}/tokenizers/class_tokenizer.json", "r") as fh:
    class_tokenizer = json.load(fh)
  word_vocab = class_tokenizer["word"]
  inflection_vocab = class_tokenizer["inflection"]
  langclass, _, _ = load_langclass(f"{base_dir}/models/langclass.pth", word_vocab, inflection_vocab, device)

  data = data.with_columns((('^' + pl.col("word")) + '$').alias("word"))
  dataset = langclassDatasetForEval(data, word_vocab, inflection_vocab)
  loader = DataLoader(dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn_langclass_eval)
  predictions = []
  with torch.no_grad():  # inference only; no autograd graph needed
    for src_chars, inflections in loader:
      logits = langclass(src_chars, inflections)
      predictions.extend(logits.argmax(1).cpu().tolist())
  data = data.with_columns(pl.Series("language", predictions, dtype=pl.Int64))
  # Remember the original row position so the output can be restored to input order.
  data = data.with_row_index()
  data = data.sort("language")
  print(data.head())
  available_langs = data.select("language").unique()
  available_langs = available_langs.to_numpy()
  available_langs = available_langs.flatten()
  print(available_langs)

  # --- Stage 2: per-language inflection --------------------------------------
  with open(f"{base_dir}/tokenizers/lang_tokenizers.json", "r") as fh:
    tokenizers = json.load(fh)
  with open(f"{base_dir}/tokenizers/model_params.json", "r") as fh:
    model_params = json.load(fh)
  predictions_MI = []
  for lang in available_langs:
    data_lang = data.filter(pl.col("language") == lang)
    name_lang = languages[lang]
    model_pth = f"{base_dir}models/{model_pths[name_lang]}"
    char_vocab = tokenizers[name_lang][0]
    inflection_vocab = tokenizers[name_lang][1]
    lang_dataset = MIDatasetForEval(data_lang, char_vocab, inflection_vocab)
    lang_loader = DataLoader(lang_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn_MI_eval)
    model, _, _ = load_MI_model(model_pth, char_vocab, inflection_vocab, model_params[name_lang], device)
    # Build the index -> character table once per language instead of once per
    # decoded character; relies on the vocab's insertion order matching its indices.
    index_to_char = list(char_vocab.keys())
    with torch.no_grad():
      for src_chars, inflections, df_index in lang_loader:
        src_chars = src_chars.to(device)
        inflections = inflections.to(device)
        output = model(src_chars, inflections)
        # .tolist() yields plain ints so the result frame gets an integer column.
        for out, idx in zip(output, df_index.tolist()):
          token_ids = normalize_output(out.argmax(1))
          # Indices 0-3 are skipped so they never reach the decoded string.
          decoded = ''.join(index_to_char[i.item()] for i in token_ids if i > 3)
          predictions_MI.append((idx, decoded))

  # --- Stage 3: merge predictions back --------------------------------------
  # orient="row": each tuple is one record; stating it avoids polars having to infer it.
  predictions_MI = pl.DataFrame(predictions_MI, schema=["index", "result"], orient="row")
  # Match the unsigned row-index dtype so the join keys agree.
  predictions_MI = predictions_MI.with_columns(pl.col("index").cast(pl.UInt32).alias("index"))
  data = data.join(predictions_MI, on="index")
  data = data.sort("index")
  data = data.drop("index")
  # Remove only the boundary markers added above, using native string ops
  # rather than per-row Python lambdas.
  data = data.with_columns(pl.col("word").str.strip_prefix("^").str.strip_suffix("$").alias("word"))
  return data

In [None]:
# Draw a small sample of the test set and give the columns the names that
# langclass_w_MI expects ("word" and "inflection").
df_test_sampled = (
    df_test
    .sample(10, seed=42)  # fixed seed so a re-run evaluates the same rows
    .rename({"column_1": "word", "column_2": "inflection"})
)

In [None]:
# Run the combined language-classifier + inflection pipeline on the sampled rows.
results = langclass_w_MI(df_test_sampled, model_pths, device)

  state_dict = torch.load(pth_file)
  state_dict = torch.load(pth_file)


shape: (5, 4)
┌───────┬─────────────┬─────────────────┬──────────┐
│ index ┆ word        ┆ inflection      ┆ language │
│ ---   ┆ ---         ┆ ---             ┆ ---      │
│ u32   ┆ str         ┆ str             ┆ i64      │
╞═══════╪═════════════╪═════════════════╪══════════╡
│ 0     ┆ ^gowl$      ┆ V;V.PTCP;PRS    ┆ 0        │
│ 1     ┆ ^fustigate$ ┆ V;PST           ┆ 0        │
│ 2     ┆ ^פָּשַׁע$       ┆ V;NFIN          ┆ 1        │
│ 4     ┆ ^עָבַר$       ┆ V;2;SG;IMP;MASC ┆ 1        │
│ 7     ┆ ^הפליץ$     ┆ V;SG;PRS;FEM    ┆ 1        │
└───────┴─────────────┴─────────────────┴──────────┘
[0 1 3 4 5]


  return dispatch(args[0].__class__)(*args, **kw)
  data = data.with_columns(pl.col("word").map_elements(lambda x: x.replace('^', '')).map_elements(lambda x: x.replace('$', '')).alias("word"))
  data = data.with_columns(pl.col("word").map_elements(lambda x: x.replace('^', '')).map_elements(lambda x: x.replace('$', '')).alias("word"))


In [None]:
# Display the predictions alongside the input word, tags and predicted language index.
results

word,inflection,language,result
str,str,i64,str
"""gowl""","""V;V.PTCP;PRS""",0,"""gowing"""
"""fustigate""","""V;PST""",0,"""fustgated"""
"""פָּשַׁע""","""V;NFIN""",1,"""לִפְשְׁע"""
"""leszármazástan""","""N;IN+ESS;PL""",4,"""leszármazástanbaa"""
"""עָבַר""","""V;2;SG;IMP;MASC""",1,"""עֲבַר"""
"""solmak""","""V;INFR;PST;PERF;POS;DECL;NOM(1…",5,"""solmuştuk"""
"""terület""","""N;ON+ALL(SG)""",4,"""területre"""
"""הפליץ""","""V;SG;PRS;FEM""",1,"""הפליץ"""
"""пропускной""","""ADJ;DAT;FEM;SG""",3,"""пропускнум"""
"""néptánc""","""N;TERM;SG""",4,"""néptáncok"""


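The combined pipeline runs end to end, but the results are not great: several of the sampled predictions above are misspelled or incorrectly inflected (e.g. `gowing` for *gowl*, `fustgated` for *fustigate*).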