# Parlor Game, Revisited

This notebook accompanies a Disiecta Membra blog post about the DCC Core Latin Vocabulary and the language of Virgil's *Aeneid*. (Link to the post: to be added.)

Source for the .csv file in `data/`:
Francese, Christopher. Latin Core Vocabulary. Dickinson College Commentaries (2014). http://dcc.dickinson.edu/latin-vocabulary-list

## Preprocessing

In [None]:
# Imports & setup

import os
import csv
import string
import re
from collections import defaultdict

from cltk.utils.file_operations import open_pickle
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.corpus.latin import latinlibrary

# Directory and file name of the DCC Core Vocabulary csv (relative to the notebook)
datapath, datafile = 'data/', 'latin_vocabulary_list.csv'

In [None]:
# Set up training sentences

# Directory (under the user's home) where the backoff lemmatizer models are looked up
rel_path = '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff'
path = os.path.expanduser(rel_path)

# Pickled training sentences expected inside that directory
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)

# Fall back to an empty training set (and say so) when the pickle is absent
if not os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
else:
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)

In [None]:
# Set up NLP tools

# j/v -> i/u normalizer, applied to both the DCC headwords and the texts
replacer = JVReplacer()

# Latin word tokenizer
tokenizer = WordTokenizer('latin')

# Backoff lemmatizer built from the training sentences loaded above
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

### Preprocessing: DCC Core Vocabulary

In [None]:
# Load first column of DCC Core Vocabulary csv file

# Column index -> list of that column's values, in file order
columns = defaultdict(list)

# Decode explicitly instead of relying on the platform default: the headwords
# contain macron vowels (they are stripped again further down).
# NOTE(review): assumes the export is UTF-8 -- confirm against the data file.
# newline='' is what the csv module asks for when it is handed a file object.
with open(os.path.join(datapath, datafile), encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; an empty file simply yields no rows
    for row in reader:
        for i, v in enumerate(row):
            columns[i].append(v)

# Only the first column (the headwords) is used by the rest of the notebook
dcc_lemmas = columns[0]


In [None]:
# Split headword column by whitespace and keep only first word

# '/' is treated as a word separator as well. Entries that contain no word at
# all (empty or whitespace-only cells) are skipped rather than raising an
# IndexError on the [0] lookup.
dcc_lemmas_simple = [
    words[0]
    for words in (lemma.replace('/', ' ').split() for lemma in dcc_lemmas)
    if words
]

In [None]:
# Preprocess DCC lemmas

# Normalize u/v
# Run every headword through the shared JVReplacer instance
dcc_lemmas_simple = list(map(replacer.replace, dcc_lemmas_simple))

# remove macrons
# Translation table built once at definition time instead of on every call.
# Covers lowercase and uppercase precomposed macron vowels (including y) and
# deletes the combining macron (U+0304) used by decomposed text.
_MACRON_TABLE = str.maketrans({
    '\u0101': 'a',  # a with macron
    '\u0113': 'e',  # e with macron
    '\u012b': 'i',  # i with macron
    '\u014d': 'o',  # o with macron
    '\u016b': 'u',  # u with macron
    '\u0233': 'y',  # y with macron
    '\u0100': 'A',  # A with macron
    '\u0112': 'E',  # E with macron
    '\u012a': 'I',  # I with macron
    '\u014c': 'O',  # O with macron
    '\u016a': 'U',  # U with macron
    '\u0232': 'Y',  # Y with macron
    '\u0304': None,  # combining macron: drop, keep the base letter
})


def remove_macrons(text):
    """Return ``text`` with macron vowels replaced by their plain letters.

    Parameters
    ----------
    text : str
        Text that may contain precomposed macron vowels (either case) or
        base letters followed by a combining macron.

    Returns
    -------
    str
        The same text without macrons; all other characters are unchanged.
    """
    return text.translate(_MACRON_TABLE)

# Strip macrons from every headword
dcc_lemmas_simple = list(map(remove_macrons, dcc_lemmas_simple))

# Remove punctuation
# (translation table that deletes every character in string.punctuation)
translator = str.maketrans('', '', string.punctuation)

# Apply the table and put the cleaned headwords in alphabetical order
dcc_lemmas_simple = sorted(lemma.translate(translator) for lemma in dcc_lemmas_simple)

# Quick sanity check: size of the list and its first few entries
print(len(dcc_lemmas_simple))
print(dcc_lemmas_simple[:10])

### Preprocessing: Aeneid

In [None]:
# Setup Aeneid text from Latin Library corpus

# All file ids known to the corpus reader
files = latinlibrary.fileids()

# Keep the ids whose path contains 'vergil/aen'
aeneid_files = [fileid for fileid in files if 'vergil/aen' in fileid]

# Raw text of the selected files, as returned by the corpus reader
aeneid_raw = latinlibrary.raw(aeneid_files)

In [247]:
# Preprocess Aeneid text

# Remove English paratexts from Latin Library texts
# The page title is removed together with its *whole* Roman book numeral.
# (Matching only two characters after the title would leave e.g. "II" behind
# from "VIII", which lowercases to a valid Latin token and pollutes the lemma
# list.) The numeral is optional so a bare title is removed as well.
aeneid_text = re.sub(r'Vergil: Aeneid(?: +[IVXLCDM]+\b)?', '', aeneid_raw)
# Latin heading plus the single word that follows "LIBER"; the dot after "P"
# is escaped so it only matches a literal period.
aeneid_text = re.sub(r'P\. VERGILI MARONIS AENEIDOS LIBER \b.+?\b', '', aeneid_text)
aeneid_text = re.sub(r'\bVergil\b', '', aeneid_text)
aeneid_text = re.sub(r'\bThe Latin Library\b', '', aeneid_text)
aeneid_text = re.sub(r'\bThe Classics Page\b', '', aeneid_text)
# Numeric HTML entity left in the raw text; replaced by a space
aeneid_text = aeneid_text.replace('&#151;', ' ')

# Lowercase
aeneid_text = aeneid_text.lower()

# Remove punctuation and numbers
# Both are replaced by a space (not deleted) so adjacent words stay separate;
# one table handles both in a single pass.
translator = str.maketrans({key: " " for key in string.punctuation + string.digits})
aeneid_text = aeneid_text.translate(translator)

# Normalize u/v
aeneid_text = replacer.replace(aeneid_text)

In [None]:
# Tokenize and lemmatize Aeneid

# Split the cleaned text into tokens, then lemmatize the token list
tokens = tokenizer.tokenize(aeneid_text)
lemmas = lemmatizer.lemmatize(tokens)

# The lemmatizer output is a list of (token, lemma) tuples;
# only the second element of each pair is needed from here on
aeneid_lemmas = [pair[1] for pair in lemmas]

In [None]:
# Postprocess Aeneid lemmas

# Lemmas can carry a trailing Morpheus number (e.g. accido1). The DCC core
# vocabulary has no such numbers, so every digit is deleted.
translator = str.maketrans('', '', '0123456789')

# Delete the digits, then normalize u/v in the output
aeneid_lemmas = [
    replacer.replace(lemma.translate(translator))
    for lemma in aeneid_lemmas
]

print(aeneid_lemmas[:10])

## Analysis

In [None]:
# List of words reported in the DCC blog post

# Kept as a literal (alphabetical, u/v already normalized) so it can be
# compared directly with the lemmatizer-derived list below.
dcc_missing = [
    'aegre', 'arbitror', 'auctoritas', 'beneficium', 'celeriter',
    'censeo', 'ceterum', 'cibus', 'cogito', 'comparo',
    'consuetudo', 'damnum', 'desidero', 'dignitas', 'disciplina',
    'dormio', 'eo', 'epistula', 'existimo', 'fabula',
    'facinus', 'familia', 'fructus', 'imperator', 'initium',
    'intellego', 'interficio', 'interim', 'interrogo', 'itaque',
    'iudico', 'libido', 'littera', 'magnitudo', 'maiores',
    'materia', 'memoria', 'multitudo', 'mundus', 'necessitas',
    'negotium', 'nolo', 'oportet', 'oratio', 'paene',
    'pecunia', 'pertineo', 'plerumque', 'plerusque', 'poeta',
    'postea', 'praetor', 'priuatus', 'prouincia', 'publicus',
    'quasi', 'quemadmodum', 'quidam', 'reliquus', 'reuerto',
    'sapiens', 'sapientia', 'scientia', 'scribo', 'seruus',
    'solum', 'statim', 'studeo', 'tamquam', 'tribunus',
    'uagus', 'uitium', 'utilis', 'utrum', 'uxor',
]

print(len(dcc_missing))
print(dcc_missing)

In [None]:
# Find the list of missing DCC words from the Aeneid text generated by the CLTK Backoff Latin lemmatizer

# Set difference: DCC headwords that never occur among the Aeneid lemmas.
# Using sets avoids scanning the full lemma list once per headword; the
# result (unique headwords, sorted) is the same as before.
cltk_missing = sorted(set(dcc_lemmas_simple) - set(aeneid_lemmas))

print(len(cltk_missing))
print(cltk_missing)

In [None]:
# Drop headwords whose mismatch is a technical artifact, not a real absence:
# - 'cum': the CLTK Backoff Latin lemmatizer (by default in beta version)
#   separates cum/with and cum/when as cum1 and cum2, so the author excludes it.
#   NOTE(review): the postprocessing cell already deletes digits from the
#   lemmas -- confirm whether 'cum' still needs to be excluded here.
# - 'que', 'ue': the lemmatizer returns enclitics with a leading hyphen
#   (e.g. '-que'), so the bare form never matches.

technical_forms = ['cum', 'que', 'ue']
cltk_missing = [
    headword
    for headword in cltk_missing
    if headword not in technical_forms
]
print(len(cltk_missing))

In [None]:
# Drop headwords that only fail to match because of a different base form:
# - The CLTK lemmatizer follows the practice of the "Ancient Greek and Latin
#   Dependency Treebank" (AGLDT) [https://perseusdl.github.io/treebank_data/],
#   which is the source of its default training data.
# - For some words AGLDT uses a different base form than DCC,
#   e.g. AGLDT has 'atque' where DCC has 'ac'.

alt_forms = ['a', 'ac', 'nec']
cltk_missing = [
    headword
    for headword in cltk_missing
    if headword not in alt_forms
]
print(len(cltk_missing))

In [None]:
# True positives: headwords the CLTK-based list reports as missing that the
# DCC blog post also reports as missing (order follows cltk_missing)

tp = [headword for headword in cltk_missing if headword in dcc_missing]

print(len(tp))
print(tp)

In [None]:
# False negatives: headwords the DCC blog post reports as missing that the
# CLTK-based list does not contain (order follows dcc_missing)

fn = [headword for headword in dcc_missing if headword not in cltk_missing]

print(len(fn))
print(fn)

In [None]:
# False positives: headwords in the CLTK-based list that the DCC blog post
# does not report as missing (order follows cltk_missing)

fp = [headword for headword in cltk_missing if headword not in dcc_missing]

print(len(fp))
print(fp)

## Repeat study for Lucan?

In [None]:
# Tokenize and lemmatize Lucan

# `lucan_text` was never defined anywhere in the notebook, so this cell failed
# with a NameError on a fresh kernel. Build it here from the same corpus.
# NOTE(review): assumes Lucan's files live under a 'lucan/' directory in the
# Latin Library corpus -- confirm against latinlibrary.fileids().
lucan_files = [fileid for fileid in latinlibrary.fileids() if 'lucan/' in fileid]
if not lucan_files:
    raise ValueError("No Latin Library files matching 'lucan/' were found")
lucan_raw = latinlibrary.raw(lucan_files)

# Same normalization as for the Aeneid: drop the site paratexts, lowercase,
# turn punctuation and digits into spaces, normalize u/v.
# TODO: Lucan-specific title/heading lines are not stripped here; add patterns
# for them once the raw text has been inspected.
lucan_text = re.sub(r'\bThe Latin Library\b', '', lucan_raw)
lucan_text = re.sub(r'\bThe Classics Page\b', '', lucan_text)
lucan_text = lucan_text.replace('&#151;', ' ').lower()
lucan_text = lucan_text.translate(
    str.maketrans({key: " " for key in string.punctuation + string.digits})
)
lucan_text = replacer.replace(lucan_text)

tokens = tokenizer.tokenize(lucan_text)
lemmas = lemmatizer.lemmatize(tokens)

# Lemmatizer returns a list of tuples in the form [(token, lemma)]
# Keep only the lemmas
lucan_lemmas = [lemma[1] for lemma in lemmas]

In [None]:
# Postprocess Lucan lemmas

# Lemmas can carry a trailing Morpheus number (e.g. accido1). The DCC core
# vocabulary has no such numbers, so every digit is deleted.
translator = str.maketrans('', '', '0123456789')

# Delete the digits, then normalize u/v in the output
lucan_lemmas = [
    replacer.replace(lemma.translate(translator))
    for lemma in lucan_lemmas
]

print(lucan_lemmas[:10])

In [None]:
# Find the list of missing DCC words from the Lucan text generated by the CLTK Backoff Latin lemmatizer

# Set difference: DCC headwords that never occur among the Lucan lemmas.
# Using sets avoids scanning the full lemma list once per headword; the
# result (unique headwords, sorted) is the same as before.
cltk_missing_lucan = sorted(set(dcc_lemmas_simple) - set(lucan_lemmas))

print(len(cltk_missing_lucan))
print(cltk_missing_lucan)

In [None]:
# Drop headwords whose mismatch is a technical artifact, not a real absence:
# - 'cum': the CLTK Backoff Latin lemmatizer (by default in beta version)
#   separates cum/with and cum/when as cum1 and cum2, so the author excludes it.
#   NOTE(review): the postprocessing cell already deletes digits from the
#   lemmas -- confirm whether 'cum' still needs to be excluded here.
# - 'que', 'ue': the lemmatizer returns enclitics with a leading hyphen
#   (e.g. '-que'), so the bare form never matches.

technical_forms = ['cum', 'que', 'ue']
cltk_missing_lucan = [
    headword
    for headword in cltk_missing_lucan
    if headword not in technical_forms
]
print(len(cltk_missing_lucan))

In [None]:
# Drop headwords that only fail to match because of a different base form:
# - The CLTK lemmatizer follows the practice of the "Ancient Greek and Latin
#   Dependency Treebank" (AGLDT) [https://perseusdl.github.io/treebank_data/],
#   which is the source of its default training data.
# - For some words AGLDT uses a different base form than DCC,
#   e.g. AGLDT has 'atque' where DCC has 'ac'.

alt_forms = ['a', 'ac', 'nec']
cltk_missing_lucan = [
    headword
    for headword in cltk_missing_lucan
    if headword not in alt_forms
]
print(len(cltk_missing_lucan))

In [None]:
# Show the DCC headwords still unmatched in Lucan after both filters above
print(cltk_missing_lucan)

In [None]:
# Headwords missing from Lucan that are not also missing from the Aeneid
lucan_only_missing = [
    headword
    for headword in cltk_missing_lucan
    if headword not in cltk_missing
]
print(lucan_only_missing)