In [25]:
# Useful functions
import regex

def roman_number(inp: str) -> int:
    """
    Source: https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
    Author: https://stackoverflow.com/users/1201737/r366y
    :param num:
    :return:

    >>> roman_number("XXIV")
    24
    """
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    result = 0
    for i, c in enumerate(inp.upper()):
        if (i+1) == len(inp) or roman_numerals[c] >= roman_numerals[inp[i+1].upper()]:
            result += roman_numerals[c]
        else:
            result -= roman_numerals[c]
    print(inp, result)
    return result

def uvji(val):
    return val.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i")

def get_tasks(morph):
    return dict([
        m.split("=") 
        for m in morph.split("|")
    ])

# Useful constants

WRONG_CLITICS = {"quis1"}
DOTS_EXCEPT_APOSTROPHES = r".?!\"“”\"«»…\[\]\(\)„“"
TASKS = "form,lemma,Deg,Numb,Person,Mood_Tense_Voice,Case,Gend,Dis,pos".split(",")

In [38]:
def read_input(tsv_file):
    clitics = []
    files = {}

    with open(tsv_file) as f:
        header = []
        cur_text = None
        previous_anno = None
        anno = None
        for lineno, line in enumerate(f):
            line = line.strip().split("\t")
            if lineno == 0:
                header = line
                continue

            previous_anno = anno
            anno = dict(zip(header, line))
            if anno["lemma"] == "[METADATA]":
                cur_text = anno["form"]
                files[anno["form"]] = []
                continue


            if anno["form"] in DOTS_EXCEPT_APOSTROPHES:
                if files[cur_text][-1] != {}:
                    files[cur_text].append({})
                continue

            anno["Dis"] = "_"

            anno.update(get_tasks(anno["morph"]))
            anno["Mood_Tense_Voice"] = "|".join([
                anno.get(part, "_")
                for part in "Mood_Tense_Voice".split("_")
            ]).replace("_|_|_", "_")


            if anno["lemma"].isnumeric():
                if int(anno["lemma"]) > 3:
                    anno["lemma"] = anno["form"] = "3"

            if anno["lemma"][-1].isnumeric() and len(anno["lemma"]) > 1:
                anno["lemma"], anno["Dis"] = anno["lemma"][:-1], anno["lemma"][-1]

            if anno["lemma"] == "[ROMAN_NUMBER]":
                anno["lemma"] = anno["form"] = roman_number(anno["form"])
                if anno["lemma"] > 3:
                    anno["lemma"] = anno["form"] = "3"
                else:
                    anno["lemma"] = anno["form"] = str(anno["form"])

            if anno["lemma"] == "[Greek]":
                continue

            if anno["POS"] == "OUT":
                print(anno)

            if anno["POS"] == "PUNC":
                if anno["form"] in "?.;!)(][":
                    files[cur_text].append(None)
                continue

            if anno["POS"] == "VERaux":
                anno["POS"] = "VER"

            if len(files[cur_text]) and files[cur_text][-1] and files[cur_text][-1] == previous_anno \
                and files[cur_text][-1]["form"] == anno["form"]:
                if anno["lemma"] not in WRONG_CLITICS:
                    clitics.append(anno["lemma"])
                    files[cur_text][-1]["lemma"] = files[cur_text][-1]["lemma"]+"界"+anno["lemma"]
                    continue
            elif len(files[cur_text]) and files[cur_text][-1] and files[cur_text][-1]["form"] == anno["form"][1:-1]:
                    clitics.append(anno["lemma"])
                    files[cur_text][-1]["lemma"] = files[cur_text][-1]["lemma"]+"界"+anno["lemma"]
                    continue

            anno["lemma"] = uvji(anno["lemma"])
            anno["form"] = uvji(anno["form"])

            if "." in anno["form"]:
                print(anno)

            if False and anno["POS"].startswith("NOM"):
                anno["POS"] = "NOM"
            anno["pos"] = anno["POS"]

            files[cur_text].append(anno)
    return files, clitics

In [39]:
Titles = {
    "urn:cts:latinLit:stoa0275.stoa022.opp-lat1:3" : "Tertullien, *De pallio*",
    "urn:cts:latinLit:stoa0275.stoa027.opp-lat2:9-10" : "Tertullien, *De spectaculis*",
    "urn:cts:latinLit:stoa0040.stoa003.opp-lat4:17.4" : "Augustin, *De civitate Dei*",
    "urn:cts:latinLit:stoa0040.stoa011.opp-lat4:262.1-262.4" : "Augustin, *Lettre CCLXII*",
    "urn:cts:latinLit:stoa0270.stoa002.opp-lat2:9-10" : "Sulpice Sévère, *Vita Martini*",
    "urn:cts:latinLit:stoa0238.stoa002.perseus-lat2:pr.1-1.20" : "Prudence, *Psychomachie*",
    "urn:cts:latinLit:stoa0096.stoa003.opp-lat2:1.35-1.37" : "Commodien, *Instructiones*",
    "urn:cts:latinLit:stoa0104a.stoa010.opp-lat1:6-8" : "Cyprien de Carthage, *De unitate Ecclesiae*",
    "urn:cts:latinLit:stoa0249a.stoa002.opp-lat1:6.53-6.60" : "Salvien de Marseille, *De gubernatione Dei*",
    "urn:cts:latinLit:stoa0076c.stoa002.opp-lat2:8.8-8.10" : "Jean Cassien, *Institutiones*",
    "urn:cts:latinLit:stoa0022.stoa044.opp-lat1:1-8" : "Ambroise de Milan, *De Tobia*",
    "urn:cts:latinLit:stoa0054.stoa001a.opp-lat1:1-2" : "Bède le Vénérable, *De locis sanctis*",
    "urn:cts:latinLit:stoa0149b.stoa001.opp-lat1:2" : "Hilaire de Poitiers, *Tractatus super psalmos*",
    "urn:cts:latinLit:stoa0171.stoa002.opp-lat1:26-27" : "Lactance, *De mortibus persecutoruma*",
    "urn:cts:latinLit:stoa0162.stoa024.opp-lat1:1.1.1-1.2.3" : "Jérôme, *Commentaire sur Jérémie*",
    "urn:cts:latinLit:stoa0058.stoa023.perseus-lat1:3" : "Boèce, *Contra Eutychen et Nestorium*",
    "urn:cts:latinLit:stoa0143.stoa001:@.30-2.31" : "Grégoire de Tours, *Historia Francorum*",
    "urn:cts:latinLit:stoa0261.stoa002:4.3" : "Sidoine Apollinaire, *Lettres*",
    "urn:cts:latinLit:stoa0112.stoa001:1-3" : "Eginhard, *Vie de Charlemagne*",
    # Phase 2
    "[REF:urn:cts:latinLit:stoa0187b.stoa002:1-4]": "Mamertin, *Panegyricus dicto Juliano imperatori*",
    "[REF:urn:cts:latinLit:stoa0287.stoa001:1.20]": "Végèce, *Epitome de re militari*",
    "[REF:urn:cts:latinLit:stoa0186-stoa001:3.14]": "Macrobe, *Saturnales*",
    "[REF:urn:cts:latinLit:phi2331.phi004.perseus-lat1:25-27]": "*Histoire Auguste*, Marc Aurèle",
    "[REF:urn:cts:latinLit:stoa0041a.stoa005:1.18-1.28]": "Caelius Aurelianus, *Gynaeciorum Sorani*",
    "[REF:urn:cts:latinLit:stoa0110.stoa002a:47-63]": "Donat, *In Bucolicis Vergilii commentarium, praefatio*",
    "[REF:urn:cts:latinLit:stoa0285c.stoa001]": "Vacca, *Vita M. Annaei Lucani*",
    "[REF:urn:cts:latinLit:stoa0116c.stoa001:1-2]": "Euantius, *De comoedia uel de fabula*",
    "[REF:urn:cts:latinLit:stoa0107.stoa001:38-40]": "Darès de Phrygie, *De excidio Troiae historia*",
    "[REF:urn:cts:latinLit:stoa0023.stoa001.perseus-lat2:20.3.1-20.3.12]": "Ammien Marcellin, *Res gestae*",
    "[REF:urn:cts:latinLit:stoa0146b.stoa001:5.2]": "Hégésippe, *Histoires*",
    # Phase 3
    "[REF:More]": "Thomas More, *Utopia*",
    "[REF:LegendeAndre]": "Jacques de Voragine, *Saint André* (Legende Dorée)",
    "[REF:LegendeAntoine]": "Jacques de Voragine, *Saint Antoine* (Legende Dorée)",
    "[REF:LegendeAlexis]": "Jacques de Voragine, *Saint Alexis* (Legende Dorée)",
    "[REF:LegendeSylvestre]": "Jacques de Voragine, *Saint Sylvestre* (Legende Dorée)",
    "[REF:LegendeLucie]": "Jacques de Voragine, *Saint Lucie* (Legende Dorée)",
    "[REF:LegendeMarieMadeleine]": "Jacques de Voragine, *Saint Marie-Madeleine* (Legende Dorée)",
    "[REF:LegendeThomas]": "Jacques de Voragine, *Saint Thomas* (Legende Dorée)",
    "[REF:LegendeFrancois]": "Jacques de Voragine, *Saint François* (Legende Dorée)",
}

Titles = {
    key: value.replace("*", "").replace(",", " -- ").replace("(", "").replace(")", "")
    for key, value in Titles.items()
}

In [40]:
import glob
files, all_clits = {}, []
for file in glob.glob("./raw/*.tsv"):
    content, clits = read_input(file)
    files.update(content)
    all_clits.extend(clits)

{'form': 'Tib.', 'lemma': 'Tiberius', 'POS': 'NOMpro', 'morph': 'Case=Gen|Numb=Sing', 'Dis': '_', 'Case': 'Gen', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
{'form': 'M.', 'lemma': 'Marcus', 'POS': 'NOMpro', 'morph': 'Case=Nom|Numb=Sing', 'Dis': '_', 'Case': 'Nom', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
{'form': 'M.', 'lemma': 'Marcus', 'POS': 'NOMpro', 'morph': 'Case=Dat|Numb=Sing', 'Dis': '_', 'Case': 'Dat', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
XL 40
L 50
XX 20
XXX 30
III 3
{'form': 'C.', 'lemma': 'Caius', 'POS': 'NOMpro', 'morph': 'Case=Abl|Numb=Sing', 'Dis': '_', 'Case': 'Abl', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
{'form': 'M.', 'lemma': 'Marcus', 'POS': 'NOMpro', 'morph': 'Case=Nom|Numb=Sing', 'Dis': '_', 'Case': 'Nom', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
{'form': 'M.', 'lemma': 'Marcus', 'POS': 'NOMpro', 'morph': 'Case=Acc|Numb=Sing', 'Dis': '_', 'Case': 'Acc', 'Numb': 'Sing', 'Mood_Tense_Voice': '_'}
III 3
{'form': 'C.', 'lemma': 'Caius', 'POS': 'NOMpro', 'morph': 'C

In [41]:
# Write
        
with open("lasla-model-ready.tsv", "w") as full:
    full.write("\t".join(TASKS).replace("form", "token")+"\n")
    for file in files:
        print(f"{file} has {len([an for an in files[file] if an])} tokens "
              f"and {len([an for an in files[file] if not an])} sentences")
        with open(f"lasla-model-ready/{Titles.get(file, file)}.tsv", "w") as f:
            f.write("\t".join(TASKS).replace("form", "token")+"\n")
            for annot in files[file]:
                if not annot:
                    f.write("\n")
                    full.write("\n")
                    continue
                f.write("\t".join([annot.get(h, "_") for h in TASKS])+"\n")
                full.write("\t".join([annot.get(h, "_") for h in TASKS])+"\n")
    full.write("\n")
print(all_clits)

[REF:urn:cts:latinLit:stoa0187b.stoa002:1-4] has 695 tokens and 33 sentences
[REF:urn:cts:latinLit:stoa0287.stoa001:1.20] has 555 tokens and 31 sentences
[REF:urn:cts:latinLit:stoa0186-stoa001:3.14] has 715 tokens and 44 sentences
[REF:urn:cts:latinLit:phi2331.phi004.perseus-lat1:25-27] has 555 tokens and 44 sentences
[REF:urn:cts:latinLit:stoa0041a.stoa005:1.18-1.28] has 646 tokens and 50 sentences
[REF:urn:cts:latinLit:stoa0110.stoa002a:47-63] has 688 tokens and 29 sentences
[REF:urn:cts:latinLit:stoa0285c.stoa001] has 481 tokens and 22 sentences
[REF:urn:cts:latinLit:stoa0116c.stoa001:1-2] has 725 tokens and 26 sentences
[REF:urn:cts:latinLit:stoa0107.stoa001:38-40] has 474 tokens and 20 sentences
[REF:urn:cts:latinLit:stoa0023.stoa001.perseus-lat2:20.3.1-20.3.12] has 762 tokens and 26 sentences
[REF:urn:cts:latinLit:stoa0146b.stoa001:5.2] has 377 tokens and 19 sentences
urn:cts:latinLit:stoa0275.stoa022.opp-lat1:3 has 558 tokens and 42 sentences
urn:cts:latinLit:stoa0275.stoa027.op