# Old Ukrainian Morphological Parser

This IPython notebook demonstrates inference with a morphological parser for Old Ukrainian.
It combines a rule-based morphological parser with contextual disambiguation by a fine-tuned `XLM-RoBERTa` model with a custom classifier.


In [10]:
!pip install git+https://github.com/dashaignatenko/old-uk-rule-based.git

Collecting git+https://github.com/dashaignatenko/old-uk-rule-based.git
  Cloning https://github.com/dashaignatenko/old-uk-rule-based.git to /tmp/pip-req-build-jnb5hris
  Running command git clone --filter=blob:none --quiet https://github.com/dashaignatenko/old-uk-rule-based.git /tmp/pip-req-build-jnb5hris
  Resolved https://github.com/dashaignatenko/old-uk-rule-based.git to commit 8f4f8b6522537d5114b44984467cfeb9d433a325
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [11]:
import re
from collections import defaultdict
import pyconll
import torch
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from old_uk_parser import analyse_token, normalize_chars

from huggingface_hub import hf_hub_download

import torch.nn.functional as F
import sentencepiece

Define possible morphological feature values and POS-to-feature mappings

In [12]:
# Closed inventory of values per morphological feature. Some verbal features
# also list the empty string; presumably it stands for "feature not set" --
# TODO confirm against the training setup.
feature_vocab = dict(
    Case=["Nom", "Acc", "Gen", "Dat", "Ins", "Loc", "Voc"],
    Gender=["Masc", "Fem", "Neut"],
    Number=["Sing", "Plur", "Dual", "Count"],
    Tense=["Pres", "Past", "Fut", ""],
    Mood=["Ind", "Imp"],
    Person=["1", "2", "3", ""],
    Voice=["Act", "Mid", "Pass", ""],
    VerbForm=["Fin", "Inf", "Part", "PartRes", "Conv", ""],
    Degree=["Pos", "Cmp", "Sup"],
    Variant=["Full", "Short"],
)

# Features handled for each part of speech. The list order fixes the layout
# of the concatenated one-hot vector built during inference.
pos_features = dict(
    NOUN=["Case", "Gender", "Number"],
    PROPN=["Case", "Gender", "Number"],
    VERB=["Tense", "Mood", "Person", "Voice", "VerbForm"],
    ADJ=["Case", "Gender", "Number", "Degree", "Variant"],
)

# One fitted encoder per feature, translating value strings to class indices
# and back.
label_encoders = dict(
    (name, LabelEncoder().fit(vocab)) for name, vocab in feature_vocab.items()
)

# Tokenizer matching the pretrained backbone used by the disambiguator.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [13]:
# Disambiguation model: XLM-RoBERTa context encoder plus per-POS feature heads
class MorphologicalDisambiguator(nn.Module):
    """Predict morphological feature values for a word in context.

    The model joins two vectors: the transformer's hidden state at the first
    token position of the context, and a linear projection of the averaged
    candidate-analysis vectors. A separate linear head per (POS, feature)
    pair turns the joined vector into logits over that feature's values.
    """

    def __init__(self, feature_vocab, pos_features):
        """Build the backbone, the per-POS analysis encoders and the heads.

        Args:
            feature_vocab: mapping feature name -> list of its values.
            pos_features: mapping POS tag -> list of feature names for it.
        """
        super().__init__()
        self.transformer = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.hidden_size = self.transformer.config.hidden_size
        self.pos_features = pos_features

        # One projection per POS: concatenated one-hot width -> hidden size.
        encoders = {}
        for pos, feats in pos_features.items():
            input_width = sum(len(feature_vocab[name]) for name in feats)
            encoders[pos] = nn.Linear(input_width, self.hidden_size)
        self.analysis_encoders = nn.ModuleDict(encoders)

        # One classification head per (POS, feature); the input is the
        # context vector concatenated with the analysis vector.
        heads = {}
        for pos, feats in pos_features.items():
            heads[pos] = nn.ModuleDict({
                name: nn.Linear(self.hidden_size * 2, len(feature_vocab[name]))
                for name in feats
            })
        self.classifiers = nn.ModuleDict(heads)

    def forward(self, input_ids, attention_mask, analysis_features, pos):
        """Return a dict mapping each feature of ``pos`` to its logits.

        ``analysis_features`` is averaged over dimension 1 before being
        projected, so it is expected to carry the candidate analyses along
        that dimension.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        context_vector = encoded.last_hidden_state[:, 0, :]
        pooled_analyses = analysis_features.mean(dim=1)
        analysis_vector = self.analysis_encoders[pos](pooled_analyses)
        joint = torch.cat((context_vector, analysis_vector), dim=-1)
        heads = self.classifiers[pos]
        return {name: heads[name](joint) for name in self.pos_features[pos]}


# Pipeline inference function:
# - Analyze each token with rule-based parser
# - Generate feature vectors
# - Use transformer to predict the most likely feature combination
# - Match to the best rule-based analysis

def _first_feature_value(feats, feature):
    """Return one value recorded for ``feature`` in ``feats``, or "" if none.

    Handles a missing key as well as an empty (or None) value container, so
    callers never hit StopIteration.
    """
    values = feats.get(feature)
    if not values:
        return ""
    return next(iter(values), "")


def _encode_analysis(feats, upos):
    """Turn one analysis' features into a concatenated one-hot list.

    Raises ValueError (from the label encoder) when a value is not part of
    the known vocabulary for its feature.
    """
    one_hot = []
    for feature in pos_features[upos]:
        value = _first_feature_value(feats, feature)
        if not value:
            # Defaults applied when the rule-based parser leaves these unset.
            if upos == "ADJ" and feature == "Degree":
                value = "Pos"
            elif upos == "ADJ" and feature == "Variant":
                value = "Full"
            elif upos == "VERB" and feature == "Mood":
                value = "Ind"
        encoded = np.zeros(len(feature_vocab[feature]))
        if value:
            encoded[label_encoders[feature].transform([value])[0]] = 1
        one_hot.extend(encoded.tolist())
    return one_hot


def morphological_inference(text_with_pos, model, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Analyse and disambiguate every token of a POS-tagged text.

    For each token the rule-based parser proposes analyses. For the parts of
    speech listed in ``pos_features`` the candidate analyses are one-hot
    encoded, passed to ``model`` together with a window of surrounding words,
    and the analysis agreeing with the most predicted feature values is
    selected.

    Args:
        text_with_pos: sequence of ``(word, upos)`` pairs.
        model: disambiguation model called as
            ``model(input_ids, attention_mask, analysis_features, upos)`` and
            returning a dict of per-feature logits.
        device: torch device the model and input tensors are moved to.

    Returns:
        One dict per token with the keys ``"word"``, ``"pos"``, ``"analysis"``
        (all parser analyses) and ``"disambiguated"`` (a dict with the chosen
        analysis' word form, root, suffix, lemma and reflex plus the features
        predicted by the model, or ``None`` when no disambiguation was done).
    """
    model.to(device)
    model.eval()

    context_window = 7   # words taken on each side of the target word
    max_analyses = 10    # candidate analyses are padded / truncated to this
    results = []

    for i, (word, upos) in enumerate(text_with_pos):
        if upos not in pos_features:
            # NOTE(review): this branch reports the normalized word while the
            # branches below report the raw one -- confirm this is intended.
            results.append({
                "word": normalize_chars(word),
                "pos": upos,
                "analysis": analyse_token(word, upos),
                "disambiguated": None
            })
            continue

        # Ask the parser first: without analyses there is nothing to
        # disambiguate, so tokenizing the context would be wasted work.
        analyses = analyse_token(word, upos)
        if not analyses:
            results.append({
                "word": word,
                "pos": upos,
                "analysis": [],
                "disambiguated": None
            })
            continue

        start_idx = max(0, i - context_window)
        end_idx = min(len(text_with_pos), i + context_window + 1)
        context_text = " ".join(text_with_pos[j][0] for j in range(start_idx, end_idx))
        inputs = tokenizer(
            context_text,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        analysis_features = []
        for analysis in analyses:
            try:
                analysis_features.append(_encode_analysis(analysis.get("features", {}), upos))
            except ValueError as e:
                # Best effort: an analysis with an unknown value is left out
                # of the model input but stays a candidate for matching.
                print(f"Skipping word '{word}' due to unseen label: {str(e)}")

        # Bring the candidate list to a fixed length with all-zero rows.
        feature_dim = sum(len(feature_vocab[f]) for f in pos_features[upos])
        analysis_features = analysis_features[:max_analyses]
        analysis_features.extend(
            [0] * feature_dim for _ in range(max_analyses - len(analysis_features))
        )
        analysis_tensor = torch.tensor(analysis_features, dtype=torch.float).to(device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask, analysis_tensor.unsqueeze(0), upos)

        predicted_features = {}
        for feature in pos_features[upos]:
            pred_idx = torch.argmax(logits[feature], dim=1).item()
            predicted_features[feature] = label_encoders[feature].inverse_transform([pred_idx])[0]

        # Pick the parser analysis agreeing with the most predicted values;
        # ties keep the earliest analysis. `analyses` is non-empty here, so a
        # best analysis always exists.
        best_analysis = None
        best_match_score = -1
        for analysis in analyses:
            analysis_feats = analysis.get("features", {})
            match_score = sum(
                1 for k, v in predicted_features.items()
                if _first_feature_value(analysis_feats, k) == v
            )
            if match_score > best_match_score:
                best_match_score = match_score
                best_analysis = analysis

        results.append({
            "word": word,
            "pos": upos,
            "analysis": analyses,
            "disambiguated": {
                "word_form": best_analysis["word_form"],
                "root": best_analysis["root"],
                "suffix": best_analysis["suffix"],
                "lemma": best_analysis["lemma"],
                # The features predicted by the transformer are reported.
                # Alternatively, the features of the best rule-based analysis
                # could be reported instead:
                # "features": best_analysis['features'],
                "features": predicted_features,
                "reflex": best_analysis.get("reflex", "")
            }
        })

    return results

In [14]:
# Fetch the fine-tuned weights (and the accompanying config file) from the
# Hugging Face Hub; files are cached locally after the first download.
model_path = hf_hub_download(repo_id="dasha-ign/old_uk-disambiguation", filename="old_uk_model-3105.bin")
config_path = hf_hub_download(repo_id="dasha-ign/old_uk-disambiguation", filename="config.json")

model = MorphologicalDisambiguator(feature_vocab=feature_vocab, pos_features=pos_features)
# map_location="cpu" lets a checkpoint saved from GPU tensors load on a
# machine without CUDA; morphological_inference moves the model to the
# target device afterwards.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (or pass weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()

MorphologicalDisambiguator(
  (transformer): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, 

In [15]:
# Example run: a short POS-tagged phrase passed through the full pipeline
text_with_pos = [
    ['во', 'ADP'], ['имя', 'NOUN'], ['отца', 'NOUN'], ['и', 'CCONJ'],
    ['сына', 'NOUN'], ['и', 'CCONJ'], ['святого', 'ADJ'], ['духа', 'NOUN'],
    ['.', 'PUNCT'], ['аминь', 'INTJ'], ['.', 'PUNCT'],
]

results = morphological_inference(text_with_pos, model)

for result in results:
    print(f"\nWord: {result['word']}")
    print(f"POS: {result['pos']}")
    # Punctuation gets only the two header lines above.
    if result['pos'] != 'PUNCT':
        print("Analyses from parser:")
        for analysis in result['analysis']:
            print(f"  {analysis}")
        if not result['disambiguated']:
            print("No disambiguated analysis available.")
        else:
            print("Disambiguated analysis:")
            print(f"  {result['disambiguated']}")


Word: во
POS: ADP
Analyses from parser:
  {'word_form': 'во', 'root': 'во', 'suffix': '', 'lemma': 'во', 'features': {}}
  {'word_form': 'во', 'root': 'во', 'suffix': '', 'lemma': 'въ', 'features': {}}
No disambiguated analysis available.

Word: имя
POS: NOUN
Analyses from parser:
  {'word_form': 'имя', 'root': 'им', 'suffix': 'я', 'lemma': ['имя'], 'features': {'Case': {'Acc'}, 'Gender': {'Neut'}, 'Number': {'Sing'}}}
  {'word_form': 'имя', 'root': 'им', 'suffix': 'я', 'lemma': ['имя'], 'features': {'Case': {'Gen'}, 'Gender': {'Neut'}, 'Number': {'Sing'}}}
  {'word_form': 'имя', 'root': 'им', 'suffix': 'я', 'lemma': ['имя'], 'features': {'Case': {'Nom'}, 'Gender': {'Masc'}, 'Number': {'Sing'}}}
  {'word_form': 'имя', 'root': 'им', 'suffix': 'я', 'lemma': ['имя'], 'features': {'Case': {'Gen'}, 'Gender': {'Masc'}, 'Number': {'Sing'}}}
  {'word_form': 'имя', 'root': 'им', 'suffix': 'я', 'lemma': ['имя'], 'features': {'Case': {'Nom'}, 'Gender': {'Neut'}, 'Number': {'Sing'}}}
  {'word_for