In [1]:
# Useful functions
import regex

def roman_number(inp: str) -> int:
    """
    Source: https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
    Author: https://stackoverflow.com/users/1201737/r366y
    :param num:
    :return:

    >>> roman_number("XXIV")
    24
    """
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    result = 0
    for i, c in enumerate(inp.upper()):
        if (i+1) == len(inp) or roman_numerals[c] >= roman_numerals[inp[i+1]]:
            result += roman_numerals[c]
        else:
            result -= roman_numerals[c]
    return result

def uvji(val):
    return val.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i")

def get_tasks(morph):
    return dict([
        m.split("=") 
        for m in morph.split("|")
    ])

# Useful constants

WRONG_CLITICS = {"quis1"}
DOTS_EXCEPT_APOSTROPHES = r".?!\"“”\"«»…\[\]\(\)„“"
TASKS = "form,lemma,Deg,Numb,Person,Mood_Tense_Voice,Case,Gend,Dis,pos".split(",")

In [3]:


clitics = []
files = {
    
}

with open("latin-chretien-v2.tsv") as f:
    header = []
    cur_text = None
    previous_anno = None
    anno = None
    for lineno, line in enumerate(f):
        line = line.strip().split("\t")
        if lineno == 0:
            header = line
            continue
            
        previous_anno = anno
        anno = dict(zip(header, line))
        if anno["form"].startswith("urn:"):
            cur_text = anno["form"]
            files[anno["form"]] = []
            continue
            
        
        if anno["form"] in DOTS_EXCEPT_APOSTROPHES:
            if files[cur_text][-1] != {}:
                files[cur_text].append({})
            continue
            
        anno["Dis"] = "_"
        
        anno.update(get_tasks(anno["morph"]))
        anno["Mood_Tense_Voice"] = "|".join([
            anno.get(part, "_")
            for part in "Mood_Tense_Voice".split("_")
        ]).replace("_|_|_", "_")
        
        
        if anno["lemma"].isnumeric():
            if int(anno["lemma"]) > 3:
                anno["lemma"] = anno["form"] = "3"
                
        if anno["lemma"][-1].isnumeric() and len(anno["lemma"]) > 1:
            anno["lemma"], anno["Dis"] = anno["lemma"][:-1], anno["lemma"][-1]
            
        if anno["lemma"] == "[ROMAN_NUMBER]":
            anno["lemma"] = anno["form"] = roman_number(anno["form"])
            if anno["lemma"] > 3:
                anno["lemma"] = anno["form"] = "3"
        
        if anno["lemma"] == "[Greek]":
            continue
            
        if anno["POS"] == "OUT":
            print(anno)
            
        if anno["POS"] == "PUNC":
            continue
            
        if anno["POS"] == "VERaux":
            anno["POS"] = "VER"
            
        if len(files[cur_text]) and files[cur_text][-1] and files[cur_text][-1] == previous_anno \
            and files[cur_text][-1]["form"] == anno["form"]:
            if anno["lemma"] not in WRONG_CLITICS:
                clitics.append(anno["lemma"])
                files[cur_text][-1]["lemma"] = files[cur_text][-1]["lemma"]+"界"+anno["lemma"]
                continue
        
        anno["lemma"] = uvji(anno["lemma"])
        anno["form"] = uvji(anno["form"])
        
        if "." in anno["form"]:
            print(anno)
        
        if True and anno["POS"].startswith("NOM"):
            anno["POS"] = "NOM"
        anno["pos"] = anno["POS"]
        
        files[cur_text].append(anno)

In [4]:
# Write
        
with open("lasla-model-ready.tsv", "w") as full:
    full.write("\t".join(TASKS)+"\n")
    for file in files:
        print(f"{file} has {len([an for an in files[file] if an])} tokens "
              f"and {len([an for an in files[file] if not an])} sentences")
        with open(f"lasla-model-ready/{file}.tsv", "w") as f:
            f.write("\t".join(TASKS)+"\n")
            for annot in files[file]:
                if not annot:
                    f.write("\n")
                    continue
                f.write("\t".join([annot.get(h, "_") for h in TASKS])+"\n")
                full.write("\t".join([annot.get(h, "_") for h in TASKS])+"\n")
    full.write("\n")
print(clitics)

urn:cts:latinLit:stoa0275.stoa022.opp-lat1:3 has 558 tokens and 36 sentences
urn:cts:latinLit:stoa0275.stoa027.opp-lat2:9-10 has 621 tokens and 36 sentences
urn:cts:latinLit:stoa0040.stoa003.opp-lat4:17.4 has 2467 tokens and 128 sentences
urn:cts:latinLit:stoa0040.stoa011.opp-lat4:262.1-262.4 has 571 tokens and 17 sentences
urn:cts:latinLit:stoa0270.stoa002.opp-lat2:9-10 has 503 tokens and 27 sentences
urn:cts:latinLit:stoa0238.stoa002.perseus-lat2:pr.1-1.20 has 464 tokens and 15 sentences
urn:cts:latinLit:stoa0096.stoa003.opp-lat2:1.35-1.37 has 411 tokens and 42 sentences
urn:cts:latinLit:stoa0104a.stoa010.opp-lat1:6-8 has 654 tokens and 31 sentences
urn:cts:latinLit:stoa0249a.stoa002.opp-lat1:6.53-6.60 has 527 tokens and 25 sentences
urn:cts:latinLit:stoa0076c.stoa002.opp-lat2:8.8-8.10 has 540 tokens and 17 sentences
urn:cts:latinLit:stoa0022.stoa044.opp-lat1:1-8 has 555 tokens and 35 sentences
urn:cts:latinLit:stoa0054.stoa001a.opp-lat1:1-2 has 928 tokens and 40 sentences
urn:cts:la