In [24]:
import json
import pickle

import matplotlib.pyplot as plt
import regex as re  # https://pypi.org/project/regex/
import stanza
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display  # noqa: A004
from ipywidgets import IntProgress
from nltk import RegexpParser, Tree
from pyjarowinkler.distance import get_jaro_distance

# Do not auto-display the value of bare expressions; cells only show what they print() or display().
InteractiveShell.ast_node_interactivity = 'none'
# Register an IPython alias named `open_html` that runs the system `open` command.
%alias open_html open
# Render any matplotlib figures with the dark theme.
plt.style.use(['dark_background'])

# If necessary, uncomment these lines to download the Latin language data.
# stanza.download('la', package='ittb')
# stanza.download('la', package='proiel')
# stanza.download('la', package='perseus')


In [25]:
# define ancillary functions and the module-level counters they share
highlights = 0  # track highlights: reset by render_dataset(), incremented by element_to_html() for 'HI' labels
parse_complete = 0  # track fully parsed sentences of the previous visualisation run (re-running this cell resets it)


def p_bar(mx, desc):
    """Create a progress bar widget and publish it as the global ``p``.

    ``mx`` is the value at which the bar is full and ``desc`` is the label
    shown next to it. The widget is stored in the module-level name ``p`` and
    also returned, so it can be handed directly to ``display``.
    """
    global p  # noqa: PLW0603
    bar = IntProgress(min=0, max=mx, description=desc)
    p = bar
    return bar


def is_pn(token, threshold=0.8):
    """Return True if the token fuzzily matches a known personal name.

    The token is stripped and lower-cased, then scored against every entry of
    the module-level ``proper_nouns`` list with ``get_jaro_distance``
    (Winkler adjustment enabled, scaling 0.1). A higher score means a closer
    match.

    Args:
        token: The candidate word.
        threshold: Score that must be strictly exceeded for a match.

    Returns:
        True if at least one name scores above ``threshold``; False otherwise,
        including when the token is empty or contains only whitespace.
    """
    token = token.strip().lower()
    if not token:
        # An empty string cannot be a name; returning early also avoids handing
        # empty input to the scoring function.
        # NOTE(review): pyjarowinkler appears to raise on empty strings - confirm
        # against the installed version.
        return False
    return any(
        get_jaro_distance(
            token,
            pn,
            winkler=True,
            scaling=0.1,
        )
        > threshold
        for pn in proper_nouns
        if pn  # skip empty entries for the same reason as above
    )


def element_to_html(element):
    """Convert an NLTK element (chunk tree or tagged token) to an HTML fragment.

    A ``Tree`` becomes a ``unit`` div holding a label and a content box whose
    children are rendered recursively. Anything else is treated as a
    ``(word, tag)`` pair and rendered as a single ``token`` div with the tag
    as its tooltip.

    The part of the tree label before the first ``.`` is the displayed tag.
    Its tooltip is looked up in the module-level ``key`` mapping and its
    colour in ``colours`` (by the part of the tag before the first ``-``).
    Labels containing ``HI`` get the ``hi`` CSS class and increment the global
    ``highlights`` counter.

    All text inserted into the markup is HTML-escaped. Labels missing from
    ``key`` or ``colours`` fall back to the bare tag and to no inline colour,
    instead of leaking the string ``None`` into the page.

    Args:
        element: An ``nltk.Tree`` or an indexable ``(word, tag)`` pair.

    Returns:
        The HTML fragment as a string.
    """
    from html import escape  # function-scope import keeps this block self-contained

    global highlights  # noqa: PLW0603

    if type(element) is not Tree:
        # leaf token: element[0] is the word, element[1] its tag string
        return f'<div class="token" title="{escape(str(element[1]))}">{escape(str(element[0]))}</div>'

    label = element.label()
    tag = label.split('.')[0]  # split() returns the whole label when there is no '.'
    colour = colours.get(tag.split('-')[0])
    description = key.get(tag, tag)  # unknown tags show themselves as tooltip

    highlight = ''
    if 'HI' in label:
        highlight = 'hi'
        highlights += 1

    # An unknown colour yields an empty style instead of the invalid "color: None;".
    label_style = f'color: {colour};' if colour else ''
    border_style = f'border-color: {colour};' if colour else ''

    # The backslash continuations (and the whitespace they embed) are kept so the
    # generated markup stays the same as before for known labels.
    parts = [
        f'<div class="unit"> \
            <div class="unit_label" style="{label_style}" title="{escape(description)}"> \
            {escape(tag)}</div><div class="unit_content {highlight}" style="{border_style}">'
    ]
    # Children may be subtrees or leaf tokens; the recursive call handles both.
    parts.extend(element_to_html(child) for child in element)
    parts.append('</div></div>')

    return ''.join(parts)


def render_dataset(tree, start=0, end=100):
    """Generate an annotated HTML page from a sequence of chunked sentences.

    Args:
        tree: Sliceable sequence of sentences; each sentence is an iterable of
            NLTK ``Tree`` chunks and/or ``(word, tag)`` tokens.
        start: Index of the first sentence to render.
        end: Maximum number of sentences to render, counted from ``start``.
            A falsy value (``0`` or ``None``) renders every sentence from
            ``start`` onwards.

    Returns:
        A complete HTML document as a string. Displayed line numbers begin at
        1 for the first rendered sentence. The number is marked ``green`` when
        every top-level element of the sentence is a chunk, ``red`` when only
        some are, and left unmarked when none are.

    Side effects:
        Resets the global ``highlights`` counter (incremented again by
        ``element_to_html``) and prints its final value when non-zero.
    """
    global highlights  # noqa: PLW0603
    highlights = 0

    parts = [
        '<!DOCTYPE html><html><head><meta charset="utf-8">\
                <link rel="stylesheet" href="viz_styles.css"/></head><body>'
    ]

    for i, sentence in enumerate(tree[start:], start=1):
        # Work out the status afresh for every sentence, so one without any
        # chunk does not inherit the colour of the sentence before it.
        if not any(type(e) is Tree for e in sentence):
            status = ''
        elif all(type(e) is Tree for e in sentence):
            status = 'green'
        else:
            status = 'red'

        parts.append(f'<div class="sentence"><div class="line_no {status}">{i}</div><div class="container">')
        parts.extend(element_to_html(el) for el in sentence)
        # Close the sentence before checking the limit; otherwise the last
        # rendered sentence would be left with two unclosed divs.
        parts.append('</div></div>')

        if end and i == end:
            break

    parts.append('</body></html>')

    if highlights:
        print(f'{highlights} highlighted elements.')

    return ''.join(parts)


def compile_statistics(tree):
    """Count fully chunked sentences and object phrases in a parsed corpus.

    Returns a dict with two counters: ``parsed`` is the number of sentences
    whose top-level elements are all ``Tree`` objects, and ``ops`` is the
    number of ``Tree`` nodes, at any depth, whose label contains ``'OP'``.
    """
    fully_parsed = 0
    op_total = 0

    for sentence in tree:
        if all(type(node) is Tree for node in sentence):
            fully_parsed += 1

        # Walk every chunk below this sentence with an explicit stack.
        pending = [node for node in sentence if type(node) is Tree]
        while pending:
            chunk = pending.pop()
            if 'OP' in chunk.label():
                op_total += 1
            pending.extend(child for child in chunk if type(child) is Tree)

    return {'parsed': fully_parsed, 'ops': op_total}


In [26]:
# All input files are opened with an explicit UTF-8 encoding so that decoding
# does not depend on the locale of the machine running the notebook.

# load clean dataset
with open('data/lines_clean_dev.txt', encoding='utf-8') as file:  # dev lines
    corpus = file.read()

# load glosses
with open('data/gloss_dictionary.json', encoding='utf-8') as file:
    lemma_dict = json.load(file)

# load noun list
with open('data/nouns.json', encoding='utf-8') as file:
    nouns = json.load(file)

# load proper noun list (lower-cased once here so later comparisons can be case-insensitive)
with open('data/proper_nouns.json', encoding='utf-8') as file:
    proper_nouns = [pn.lower() for pn in json.load(file)]


# load unit list
with open('data/measurement_units.json', encoding='utf-8') as file:
    units = json.load(file)


In [None]:
# Surface forms (special_words) and lemmata (special_lemmata) whose lemma is
# appended to the POS tag string as "|Lemma=<lemma>" during extraction.
# fmt: off
special_words = [
    'item', 'et', 'cum', 'in', 'de', 'primo',
    'ad', 'sive', 'seu', 'an', 'lo', 'aliquos',
]

special_lemmata = [
    'domina', 'dominus', 'sanctus', 'dictus', 'dico', 'alius',
    'subdo', 'uxor', 'dimidium', 'circa', 'sine', 'medius',
    'idem', 'supradico', 'habeo', 'mulier', 'homo', 'muliebris',
    'situs', 'scio', 'quidam', 'sistere', 'pro', 'pignus',
    'folrare', 'camera', 'appello', 'vocare', 'iuxta', 'isque',
    'aliqui', 'sanus', 'sequor', 'clausa', 'garnire', 'frezatus',
    'munire', 'sum', 'scilicet', 'modus', 'alias', 'qui',
    'praedictus', 'ponere', 'atque', 'meianus', 'confrontare', 'apud',
    'folratura', 'impignorare', 'invenio', 'hic', 'bonum', 'prout',
    'infra', 'ecce', 'parafernalia', 'dos', 'jocalis', 'reperire',
    'mobilis', 'peto', 'condam', 'omnis', 'annus', 'servire',
    'moneta', 'currere', 'valeo', 'ascendere', 'ultra', 'avi',
    'maternus', 'ipse', 'frater', 'frodium', 'dicere', 'videlicet',
    'terra', 'depingere', 'mutuare', 'tenere', 'causa', 'mutuum',
    'manus', 'per', 'armigeratus', 'munio', 'oratio', 'describere',
]
# fmt: on

# add units extracted manually to the list loaded from data/measurement_units.json;
# only entries that are not already present are appended, so re-running this
# cell without reloading the JSON no longer piles up duplicates
manual_units = [
    'solidus',
    'florenus',
    'denarius',
    'currus',
    'grossus',
    'scutum',
    'censualis',
]
units = [*units, *(unit for unit in manual_units if unit not in units)]

# lemmata that force the NUM part-of-speech during extraction
numerals = ['unus', 'duo', 'duodena', 'duodenus', 'sex']

# lemma -> location type; extraction appends it to the token features as
# "|Function=Location|Type=<type>"
# fmt: off
locations = {
    'camera': 'room',     'aula': 'room',       'coquina': 'room',
    'cellarium': 'room',  'transversia': 'lm',  'hospicium': 'imm',
    'socolus': 'room',    'capella': 'room',    'bastida': 'lm',
    'domus': 'lm',        'molendinum': 'imm',  'carreria': 'lm',
    'portus': 'lm',       'porticus': 'room',   'operatorium': 'imm',
    'lignum': 'imm',      'viridarium': 'imm',  'canton': 'lm',
    'canto': 'lm',        'monialis': 'lm',     'vinea': 'imm',
    'macellarius': 'imm', 'saux': 'lm',
}
# fmt: on

# misspelt surface form -> (corrected word, lemma to use for it)
typos = {
    'itum': ('item', 'item'),
    'quoddum': ('quoddam', 'quidam'),
    'bassinteum': ('bassinetum', 'bacinus'),
}

# define key and colour concordance (used to visualise results)
# key: chunk tag -> human-readable description, shown as the tooltip of a chunk label
key = {
    'AGT': 'Agent',
    'AP-Cu': 'Asset phrase - currency',
    'AP-Loan': 'Asset phrase - loan related',
    'AP-Oth': 'Asset phrase - other',
    'AP-Rs': 'Asset phrase - real estate',
    'ATT': 'Attribute',
    'ATT-Abl': 'Attribute - ablative',
    'ATT-Add': 'Attribute - additive',
    'ATT-Adj': 'Attribute - adjectival',
    'ATT-Cont': 'Attribute - containment',
    'ATT-Deco': 'Attribute - decorative',
    'ATT-Gen': 'Attribute - genitive',
    'ATT-Gend': 'Attribute - gender',
    'ATT-Ger': 'Attribute - gerundive',
    'ATT-Mag': 'Attribute - magnitude',
    'ATT-Part': 'Attribute - participial',
    'ATT-Pos': 'Attribute - possession',
    'ATT-Ref': 'Attribute - reference',
    'ATT-Stat': 'Attribute - state',
    'ATT-Sub': 'Attribute - subtractive',
    'LOC-lm': 'Location - landmark',
    'LOC-rm': 'Location - room',
    'LOC-rs': 'Location - immobilia',
    'NAME': 'Name',
    'OBJ': 'Object',
    'OBJ-Alt': 'Object - alternates',
    'OBJ-c': 'Object - compound',
    'OBJ-x': 'Object - estimated/contextual',
    'OP': 'Object phrase',
    'OP-Cont': 'Object phrase - container',
    'pAbl': 'Particle - ablative',
    'pAdd': 'Particle - additive',
    'pAlt': 'Particle - alternative',
    'pComp': 'Particle - comparative',
    'pCont': 'Particle - containment',
    'pCoor': 'Particle - coordinating conjunction',
    'pCtx': 'Particle - context',
    'pDeco': 'Particle - decorative',
    'pList': 'Particle - list marker',
    'pLoc': 'Particle - locative',
    'pPurp': 'Particle - purpose/function',
    'pQual': 'Particle - qualification',
    'pRef': 'Particle - reference',
    'pRnd': 'Particle - rounding',
    'pStat': 'Particle - state',
    'pSub': 'Particle - subtractive',
    'QT': 'Quantity',
    'QT-Gen': 'Quantity - genitive',
    'RP': 'Room phrase',
    'UNIT': 'Unit of measurement',
}

# colours: tag family (the part of a tag before the first '-') -> CSS colour of the chunk label and border
colours = {
    '1': '#ffffff',  # white
    'NAME': '#ead03a',  # yellow
    'LOC': '#a27230',  # brown
    '4': '#ea8f3a',  # orange
    'AP': '#ff6e6e',  # red
    'UNIT': '#ffa7b5',  # pink
    'RP': '#db5ce0',  # magenta
    'AGT': '#8c5ce0',  # purple
    'QT': '#006cff',  # blue
    'NP': '#3a9eea',  # light blue
    'OP': '#3aea74',  # light green
    'ALT': '#30a250',  # green
    'OBJ': '#b2d29f',  # mint green
    'ATT': '#2ebdad',  # cyan
}

In [28]:
# settings for the Stanza pipeline: Latin models, limited to the three
# processors used below, with the input flagged as pre-tokenized
config = {
    'lang': 'la',
    'processors': 'tokenize,pos,lemma',
    'tokenize_pretokenized': True,
}

# build the pipeline, then run it over the whole corpus in a single call
nlp = stanza.Pipeline(**config)
doc = nlp(corpus)
print('Corpus analysis completed.')


2025-06-02 16:21:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-02 16:21:18 INFO: Downloaded file to /Users/gabep/stanza_resources/resources.json
2025-06-02 16:21:18 INFO: Loading these models for language: la (Latin):
| Processor | Package       |
-----------------------------
| tokenize  | ittb          |
| pos       | ittb_nocharlm |
| lemma     | ittb_nocharlm |

2025-06-02 16:21:18 INFO: Using device: cpu
2025-06-02 16:21:18 INFO: Loading: tokenize
2025-06-02 16:21:18 INFO: Loading: pos
2025-06-02 16:21:19 INFO: Loading: lemma
2025-06-02 16:21:19 INFO: Done loading processors!


Corpus analysis completed.


In [29]:
# extract tokens with POS-tags and apply overrides
# Each token becomes a (word, tag) tuple whose tag string is
# "<POS>-<features>", optionally followed by "|Lemma=<lemma>".
tagged_sentences = []  # store results: one list of tuples per sentence
lemmata = {}  # also generate a word -> lemma concordance
total_sentences = len(doc.sentences)
progress = p_bar(total_sentences, 'Compiling tree')  # keep a handle rather than relying on the global `p`
display(progress)  # show progress bar
print(f'Processing {total_sentences} sentences...')

for sentence in doc.sentences:
    sent = []
    for word in sentence.words:
        _pos = word.pos
        _feats = word.feats if word.feats else 'Feats=0'
        _lemma = word.lemma
        _word = word.text

        # replace known misspellings by the corrected word and its lemma
        if typos.get(word.text):
            _word, _lemma = typos[word.text]

        # apply overrides
        if _lemma in nouns:
            _feats += '|Object=1'
            if _pos == 'PUNCT':
                _pos = 'NOUN'

        # anything still tagged as punctuation is re-tagged as a feature-less adjective
        if _pos == 'PUNCT':
            _pos = 'ADJ'
            _feats = 'Feats=0'

        # NOTE(review): the roman numeral pattern has no 'l' or 'd' - presumably to
        # avoid matching ordinary words; confirm this is intended.
        if _lemma in numerals or _word.isdigit() or re.fullmatch(r'[ivxcm]+', _word):
            _pos = 'NUM'

        if _lemma in units:
            _feats += '|Function=Unit'
            # _pos = 'UNIT'

        # proper_nouns holds lower-cased names, so compare the lower-cased word;
        # otherwise a capitalised name in the corpus could never match
        if _word.lower() in proper_nouns:
            # if is_pn(word.text.lower(), 0.95):
            # _pos = 'PNAME'
            _feats += '|Capital=1'

        if _lemma in locations:
            _feats += f'|Function=Location|Type={locations[_lemma]}'
            # _pos = 'LOC'

        # add lemma for special cases
        if _word in special_words or _lemma in special_lemmata:
            token = (_word, f'{_pos}-{_feats}|Lemma={_lemma}')
        else:
            token = (_word, f'{_pos}-{_feats}')

        sent.append(token)
        lemmata[_word] = _lemma

    tagged_sentences.append(sent)
    progress.value += 1

print(f'Extraction completed on {len(tagged_sentences)} sentences.')

# persist the lemmata to disk
with open('results/lemmata.pickle', 'wb') as file:
    pickle.dump(lemmata, file)


IntProgress(value=0, description='Compiling tree', max=204)

Processing 204 sentences...
Extraction completed on 204 sentences.


In [30]:
# open file with chunking rules (explicit encoding so decoding does not depend on the locale)
with open('rules/chunk_rules.chk', encoding='utf-8') as file:
    rules = file.readlines()

# parse and generate grammar: blank lines and lines starting with '#' are
# dropped, and everything from the first '#' onwards is cut from the other lines
rg_grammar = ''

for rule in rules:
    if rule.strip() and rule[0] != '#':
        clean_rule = f'{rule[: rule.index("#")]}\n' if '#' in rule else rule
        rg_grammar += clean_rule

# chunk the dataset based on the rules we defined
rg_parser = RegexpParser(rg_grammar)  # define a chunker with our grammar
parsed_sentences = []  # define list to store parsed data
progress = p_bar(total_sentences, 'Chunking corpus')  # keep a handle rather than relying on the global `p`
display(progress)  # show progress bar

for sentence in tagged_sentences:
    parsed_sentences.append(rg_parser.parse(sentence))
    progress.value += 1

print(f'Chunking completed. {len(parsed_sentences)} sentences processed.')

# persist the parsed sentences
with open('results/parsed_sentences.pickle', 'wb') as file:
    pickle.dump(parsed_sentences, file)


IntProgress(value=0, description='Chunking corpus', max=204)

Chunking completed. 204 sentences processed.


In [34]:
# visualize the results (start=0, end=0: render every sentence)
viz_data = render_dataset(parsed_sentences, 0, 0)

stats = compile_statistics(parsed_sentences)
print(
    f'{stats["parsed"]} out of {total_sentences} sentences fully parsed',
    f'({round(stats["parsed"] * 100 / total_sentences, 2)}%)',
)

# parse_complete holds the count from the previous execution of this cell
# (0 right after the definitions cell has been run)
if parse_complete:
    print(f'Previous run: {parse_complete} sentences ({round(parse_complete * 100 / total_sentences, 2)}%)')

parse_complete = stats['parsed']

# save results to a local file; the page declares charset="utf-8", so it is
# written as UTF-8 instead of the platform's default encoding
with open('results-viz/chunks.html', 'w', encoding='utf-8') as file:
    file.write(viz_data)


4 out of 204 sentences fully parsed (1.96%)
Previous run: 4 sentences (1.96%)


In [33]:
# open visualization in new tab
# (shell escape: hands the file to the system `open` command, so this needs a platform that provides one, e.g. macOS)
!open 'results-viz/chunks.html'


In [35]:
# clean the tree to keep only level 1 tags
# (chunk label -> name of the flat, top-level chunk that replaces it)
l1_tags = {
    'OP': 'OBJECT',
    'OP-Cont': 'CONTAINER',
    'RP': 'ROOM',
    'AP-Cu': 'CURRENCY',
    'AP-Rs': 'PROPERTY',
    'AP-Loan': 'LOAN',
    'AP-Oth': 'ASSET',
}

shallow_tree_sentences = []

for parsed in parsed_sentences:
    children = []
    for node in parsed:
        # plain tokens are carried over untouched
        if type(node) is not Tree:
            children.append(node)
            continue

        leaves = node.flatten().leaves()
        level1 = l1_tags.get(node.label())
        if level1 is None:
            # chunk without a level 1 equivalent: dissolve it into its tokens
            children.extend(leaves)
        else:
            # level 1 chunk: keep it as a single flat tree under its new name
            children.append(Tree(level1, leaves))

    shallow_tree_sentences.append(Tree('S', children))

print(f'{len(shallow_tree_sentences)} shallow sentences generated.')
# shallow_tree_sentences[0].draw()


204 shallow sentences generated.


In [38]:
# persist the parsed sentences
with open('results/parsed_sentences_shallow.pickle', 'wb') as file:
    pickle.dump(shallow_tree_sentences, file)

# visualize the results (start=0, end=0: render every sentence)
viz_shallow_data = render_dataset(shallow_tree_sentences, 0, 0)

# save results to a local file; the page declares charset="utf-8", so it is
# written as UTF-8 instead of the platform's default encoding
with open('results-viz/chunks_shallow.html', 'w', encoding='utf-8') as file:
    file.write(viz_shallow_data)


In [39]:
# open visualization in new tab
# (shell escape: hands the file to the system `open` command, so this needs a platform that provides one, e.g. macOS)
!open 'results-viz/chunks_shallow.html'