In [15]:
import re
import treetaggerwrapper
# Shared TreeTagger instance created once at module level with TAGLANG="it";
# preprocess() below uses it for every cantica.
tagger = treetaggerwrapper.TreeTagger(TAGLANG="it")

from canticas import inferno, purgatorio, paradiso

# Strip non-poem material from each cantica before analysis:
#   * any "[...]" span (non-greedy; [\w\W] also matches newlines), and
#   * the word "CANTO" followed by one whitespace character and any run of
#     uppercase letters in the range C-X (presumably the Roman-numeral canto
#     headings -- confirm against the text in `canticas`).
regex_to_remove = r"\[[\w\W]*?\]|CANTO\s[C-X]*"
inferno_text, purgatorio_text, paradiso_text = (
    re.sub(regex_to_remove, "", cantica)
    for cantica in (inferno, purgatorio, paradiso)
)

# Preprocessing: remove punctuation, lemmatize, tokenize
def preprocess(text):
    """Lowercase *text*, strip punctuation and lemmatize it with TreeTagger.

    Args:
        text: The raw text to analyse.

    Returns:
        A list of lemmas, one per token that TreeTagger tagged, in the
        order the tagger returned them.
    """
    # Replace (rather than delete) every character that is neither a word
    # character nor whitespace, so words joined by punctuation stay separate.
    depunctuated = re.sub(r"[^\w\s]", " ", text.lower())
    tagged = treetaggerwrapper.make_tags(tagger.tag_text(depunctuated))
    # make_tags() may also return NotTag entries for tagger output lines it
    # cannot split into word/pos/lemma; those have a single field, so a
    # three-way tuple unpacking would raise ValueError. Keep real tags only.
    return [
        entry.lemma
        for entry in tagged
        if isinstance(entry, treetaggerwrapper.Tag)
    ]

# For each cantica: lemmatize the cleaned text, collect its distinct lemmas,
# then record the token count (all lemmas) and type count (distinct lemmas).
preprocessed_inferno = preprocess(inferno_text)
bow_inferno = set(preprocessed_inferno)
tokens_in_inferno, types_in_inferno = len(preprocessed_inferno), len(bow_inferno)

preprocessed_purgatorio = preprocess(purgatorio_text)
bow_purgatorio = set(preprocessed_purgatorio)
tokens_in_purgatorio, types_in_purgatorio = (
    len(preprocessed_purgatorio),
    len(bow_purgatorio),
)

preprocessed_paradiso = preprocess(paradiso_text)
bow_paradiso = set(preprocessed_paradiso)
tokens_in_paradiso, types_in_paradiso = len(preprocessed_paradiso), len(bow_paradiso)

# Report the counts and the type/token ratio (types divided by tokens).
print("Inferno has", tokens_in_inferno, "total words and", types_in_inferno, "unique words.")
print("The type/token ratio of Inferno is", types_in_inferno / tokens_in_inferno)

print("Purgatorio has", tokens_in_purgatorio, "total words and", types_in_purgatorio, "unique words.")
print("The type/token ratio in Purgatorio is", types_in_purgatorio / tokens_in_purgatorio)

print("Paradiso has", tokens_in_paradiso, "total words and", types_in_paradiso, "unique words.")
print("The type/token ratio in Paradiso is", types_in_paradiso / tokens_in_paradiso)

Inferno has 34142 total words and 4779 unique words.
The type/token ratio of Inferno is 0.13997422529435885
Purgatorio has 34053 total words and 4670 unique words.
The type/token ratio in Purgatorio is 0.13713916541861215
Paradiso has 33410 total words and 4522 unique words.
The type/token ratio in Paradiso is 0.13534869799461238
