In [12]:
def read_tsv(file, encoding="utf-8"):
    """Lazily read a tab-separated file and yield one dict per data row.

    The first non-blank line is used as the header; every later non-blank
    line is split on tabs and paired with the header names.

    Lines are split on tabs *before* any trimming, so empty cells at the
    start or end of a row keep their column. Surrounding whitespace is
    removed only from the left of the first cell and the right of the last
    cell (which also removes the line terminator).

    :param file: Path of the TSV file to read.
    :param encoding: Text encoding used to open the file. Given explicitly so
        the result does not depend on the platform's default locale.
    :return: A generator of ``dict`` objects mapping column name to cell
        text. Cells are paired with ``zip``: surplus cells are dropped and
        columns missing from a short row are absent from its dict.
    """
    with open(file, encoding=encoding) as f:
        header = None
        for line in f:
            cells = line.split("\t")
            # Trim only the outer edges of the row; interior cells are kept
            # verbatim and empty leading/trailing cells are preserved.
            cells[0] = cells[0].lstrip()
            cells[-1] = cells[-1].rstrip()
            if cells == [""]:
                # Blank line: neither a header nor a data row.
                continue
            if header is None:
                header = cells
                continue
            yield dict(zip(header, cells))

In [28]:
import glob
from tabulate import tabulate
import os.path
from collections import Counter
# Token counts per file and per text: {file name: {reference: token count}}.
wordCount = {}
# Number of non-metadata rows seen across all files.
total = 0
# Number of non-metadata rows per value of the "POS" column.
pos = Counter()

# glob.glob() returns matches in arbitrary order; sorting makes the order of
# the per-file sections identical on every run.
for file in sorted(glob.glob("*.tsv")):
    cur_file = os.path.basename(file)
    wordCount[cur_file] = {}
    # Reference of the text currently being read; set by each [METADATA] row.
    cur_ref = None
    for line in read_tsv(file):
        if line["lemma"] == "[METADATA]":
            cur_ref = line["form"]
            # setdefault (rather than "= 0") keeps the running count when the
            # same reference shows up again in the file, so the per-text
            # counts always add up to `total`.
            wordCount[cur_file].setdefault(cur_ref, 0)
            continue
        if cur_ref is None:
            # Without this guard the increment below fails with an opaque
            # "KeyError: None".
            raise ValueError(
                f"{cur_file}: token {line.get('form')!r} appears before any [METADATA] row"
            )
        wordCount[cur_file][cur_ref] += 1
        total += 1
        pos[line["POS"]] += 1

In [29]:
# Display titles (Markdown, work titles in *italics*) keyed by the reference
# string under which a text is counted. Lookups use Titles.get(ref, ""), so a
# key that does not match the reference exactly yields an empty title rather
# than an error.
Titles = {
    # Phase 1 keys are bare CTS URNs (no "[REF:...]" wrapper).
    "urn:cts:latinLit:stoa0275.stoa022.opp-lat1:3" : "Tertullien, *De pallio*",
    "urn:cts:latinLit:stoa0275.stoa027.opp-lat2:9-10" : "Tertullien, *De spectaculis*",
    "urn:cts:latinLit:stoa0040.stoa003.opp-lat4:17.4" : "Augustin, *De civitate Dei*",
    "urn:cts:latinLit:stoa0040.stoa011.opp-lat4:262.1-262.4" : "Augustin, *Lettre CCLXII*",
    "urn:cts:latinLit:stoa0270.stoa002.opp-lat2:9-10" : "Sulpice Sévère, *Vita Martini*",
    "urn:cts:latinLit:stoa0238.stoa002.perseus-lat2:pr.1-1.20" : "Prudence, *Psychomachie*",
    "urn:cts:latinLit:stoa0096.stoa003.opp-lat2:1.35-1.37" : "Commodien, *Instructiones*",
    "urn:cts:latinLit:stoa0104a.stoa010.opp-lat1:6-8" : "Cyprien de Carthage, *De unitate Ecclesiae*",
    "urn:cts:latinLit:stoa0249a.stoa002.opp-lat1:6.53-6.60" : "Salvien de Marseille, *De gubernatione Dei*",
    "urn:cts:latinLit:stoa0076c.stoa002.opp-lat2:8.8-8.10" : "Jean Cassien, *Institutiones*",
    "urn:cts:latinLit:stoa0022.stoa044.opp-lat1:1-8" : "Ambroise de Milan, *De Tobia*",
    "urn:cts:latinLit:stoa0054.stoa001a.opp-lat1:1-2" : "Bède le Vénérable, *De locis sanctis*",
    "urn:cts:latinLit:stoa0149b.stoa001.opp-lat1:2" : "Hilaire de Poitiers, *Tractatus super psalmos*",
    "urn:cts:latinLit:stoa0171.stoa002.opp-lat1:26-27" : "Lactance, *De mortibus persecutorum*",
    "urn:cts:latinLit:stoa0162.stoa024.opp-lat1:1.1.1-1.2.3" : "Jérôme, *Commentaire sur Jérémie*",
    "urn:cts:latinLit:stoa0058.stoa023.perseus-lat1:3" : "Boèce, *Contra Eutychen et Nestorium*",
    # NOTE(review): "@.30" looks like a typo for "2.30"; left unchanged because
    # the key must match the reference stored in the data files — verify there.
    "urn:cts:latinLit:stoa0143.stoa001:@.30-2.31" : "Grégoire de Tours, *Historia Francorum*",
    "urn:cts:latinLit:stoa0261.stoa002:4.3" : "Sidoine Apollinaire, *Lettres*",
    "urn:cts:latinLit:stoa0112.stoa001:1-3" : "Eginhard, *Vie de Charlemagne*",
    # Phase 2 — keys are wrapped in "[REF:...]".
    "[REF:urn:cts:latinLit:stoa0187b.stoa002:1-4]": "Mamertin, *Panegyricus dicto Juliano imperatori*",
    "[REF:urn:cts:latinLit:stoa0287.stoa001:1.20]": "Végèce, *Epitome de re militari*",
    # NOTE(review): "stoa0186-stoa001" uses "-" where every other URN uses ".";
    # left unchanged because the key must match the data files — verify there.
    "[REF:urn:cts:latinLit:stoa0186-stoa001:3.14]": "Macrobe, *Saturnales*",
    "[REF:urn:cts:latinLit:phi2331.phi004.perseus-lat1:25-27]": "*Histoire Auguste*, Marc Aurèle",
    "[REF:urn:cts:latinLit:stoa0041a.stoa005:1.18-1.28]": "Caelius Aurelianus, *Gynaeciorum Sorani*",
    "[REF:urn:cts:latinLit:stoa0110.stoa002a:47-63]": "Donat, *In Bucolicis Vergilii commentarium, praefatio*",
    "[REF:urn:cts:latinLit:stoa0285c.stoa001]": "Vacca, *Vita M. Annaei Lucani*",
    "[REF:urn:cts:latinLit:stoa0116c.stoa001:1-2]": "Euantius, *De comoedia uel de fabula*",
    "[REF:urn:cts:latinLit:stoa0107.stoa001:38-40]": "Darès de Phrygie, *De excidio Troiae historia*",
    "[REF:urn:cts:latinLit:stoa0023.stoa001.perseus-lat2:20.3.1-20.3.12]": "Ammien Marcellin, *Res gestae*",
    "[REF:urn:cts:latinLit:stoa0146b.stoa001:5.2]": "Hégésippe, *Histoires*",
    # Phase 3 — keys are short "[REF:<name>]" labels instead of URNs.
    "[REF:More]": "Thomas More, *Utopia*",
    "[REF:LegendeAndre]": "Jacques de Voragine, *Saint André* (Legende Dorée)",
    "[REF:LegendeAntoine]": "Jacques de Voragine, *Saint Antoine* (Legende Dorée)",
    "[REF:LegendeAlexis]": "Jacques de Voragine, *Saint Alexis* (Legende Dorée)",
    "[REF:LegendeSylvestre]": "Jacques de Voragine, *Saint Sylvestre* (Legende Dorée)",
    "[REF:LegendeLucie]": "Jacques de Voragine, *Sainte Lucie* (Legende Dorée)",
    "[REF:LegendeMarieMadeleine]": "Jacques de Voragine, *Sainte Marie-Madeleine* (Legende Dorée)",
    "[REF:LegendeThomas]": "Jacques de Voragine, *Saint Thomas* (Legende Dorée)",
    "[REF:LegendeFrancois]": "Jacques de Voragine, *Saint François* (Legende Dorée)",
}

In [35]:
def _markdown_table(rows, headers):
    """Render rows as a GitHub-style Markdown table followed by two newlines."""
    return tabulate(rows, headers, tablefmt="github") + "\n\n"


# Corpus-wide summary: total token count, then POS tags by decreasing count.
print("## Total\n")
print(f"There are {total} tokens registered in this corpus.\n")

# most_common() sorts by decreasing count and keeps first-seen order on ties.
pos_rows = [[tag, str(count)] for tag, count in pos.most_common()]
print(_markdown_table(pos_rows, ["POS", "Number of tokens"]))

# One section per input file, with one table row per reference.
for file, ref_counts in wordCount.items():
    print(f"## File {file}\n")
    text_rows = [
        [ref, Titles.get(ref, ""), str(cnt)]
        for ref, cnt in ref_counts.items()
    ]
    print(_markdown_table(text_rows, ["First Token", "Text title", "Number of tokens"]))

## Total

There are 57123 tokens registered in this corpus.

| POS        |   Number of tokens |
|------------|--------------------|
| NOMcom     |              11310 |
| VER        |              10641 |
| PUNC       |               9292 |
| CONcoo     |               3903 |
| PRE        |               3458 |
| ADJqua     |               3296 |
| ADV        |               3094 |
| PROdem     |               2305 |
| CONsub     |               1775 |
| PROrel     |               1478 |
| NOMpro     |               1396 |
| PROind     |               1036 |
| PROper     |                674 |
| ADVneg     |                665 |
| ADVrel     |                430 |
| VERaux     |                407 |
| PROpos     |                393 |
| PROref     |                338 |
| PROpos.ref |                316 |
| ADJcar     |                285 |
| PROint     |                147 |
| ADJord     |                121 |
| ADVint     |                110 |
| OUT        |                110 |
| I