In [1]:
# Imports

import os
import string
import re
from collections import Counter
from cltk.corpus.readers import get_corpus_reader


from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.latin.j_v import JVReplacer
from cltk.utils.file_operations import open_pickle

In [2]:
# Corpus reader for the 'latin_text_tesserae' corpus
tess = get_corpus_reader(
    language="latin",
    corpus_name="latin_text_tesserae",
)

In [3]:
# Instantiate the CLTK tools that the later cells rely on

replacer = JVReplacer()
lemmatizer = BackoffLatinLemmatizer()
sent_tokenizer = TokenizeSentence("latin")
word_tokenizer = WordTokenizer("latin")

In [4]:
# Materialize every text the corpus reader yields for its file ids
tess_texts = list(tess.texts(tess.fileids()))

In [None]:
# Preprocessing for the CLTK Tesserae texts

# Maps every ASCII punctuation character and every ASCII digit to a space.
# Built once here rather than on each call to preprocess().
_PREPROCESS_TABLE = str.maketrans(
    {char: " " for char in string.punctuation + string.digits}
)


def preprocess(text, jv_replacer=None):
    """Normalize a raw text into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. replace ASCII punctuation and digits with spaces;
      3. pass the result through ``jv_replacer.replace``;
      4. collapse every run of whitespace (newlines included) into one space
         and strip leading/trailing whitespace.

    Args:
        text: The raw text as a ``str``.
        jv_replacer: Optional object exposing ``replace(str) -> str``. When
            omitted, the notebook-level ``replacer`` is used.

    Returns:
        The normalized text as a single-line ``str``.
    """
    if jv_replacer is None:
        # Resolved at call time so the setup cell only has to run before the
        # first call, not before this definition.
        jv_replacer = replacer

    text = text.lower()
    text = text.translate(_PREPROCESS_TABLE)
    text = jv_replacer.replace(text)

    # str.split() with no argument splits on any whitespace, so newlines need
    # no separate handling.
    return " ".join(text.split())

In [None]:
# Normalize each Tesserae text, then break it into a list of words
tess_pp = [cleaned.split() for cleaned in map(preprocess, tess_texts)]

In [None]:
# Flatten the per-text word lists into one space-separated string
tess_flat = " ".join(word for words in tess_pp for word in words)

In [None]:
# Peek at the first ten characters of the flattened string
tess_flat[:10]

'galli caes'

In [None]:
# Tokenize the flattened CLTK Tesserae text and preview the first ten tokens

tess_tokens = word_tokenizer.tokenize(tess_flat)
print(tess_tokens[:10])

['galli', 'caesaris', 'saeuitia', 'post', 'emensos', 'insuperabilis', 'expeditionis', 'euentus', 'languentibus', 'partium']


In [None]:
# Get total token counts: all tokens first, then distinct tokens

tess_tokens_len = len(tess_tokens)
tess_tokens_set_len = len(set(tess_tokens))

In [None]:
# Report the total and unique token counts

print(
    "\n".join(
        "{} {}".format(label, value)
        for label, value in (
            ('Number of tokens in CLTK Tesserae:', tess_tokens_len),
            ('Number of unique tokens in CLTK Tesserae:', tess_tokens_set_len),
        )
    )
)

Number of tokens in CLTK Tesserae: 6864495
Number of unique tokens in CLTK Tesserae: 310930


In [None]:
# Build counter of top token counts and tabulate the most frequent tokens

tess_tokens_counter = Counter(tess_tokens)
tess_tokens_mc = tess_tokens_counter.most_common(10000)

# Number of rows shown below. The heading is built from the same value so the
# label and the slice cannot disagree.
top_n = 10
# Denominator for both percentage columns.
total_tokens = len(tess_tokens)

header_format = "{number:>5}  {token:<12}{count:<12}{percent:<12}{running:<12}"
row_format = "{number:>5}. {token:<12}{count:<12}{percent:<12}{running:<12}"

# Cumulative count of the rows printed so far, for the RUNNING % column.
running = 0

print('Top {} tokens in CLTK Tesserae:\n'.format(top_n))
print(header_format.format(number="", token="TOKEN", count="COUNT", percent="Type-Tok %", running="RUNNING %"))
for rank, (token, count) in enumerate(tess_tokens_mc[:top_n], start=1):
    running += count
    print(row_format.format(
        number=rank,
        token=token,
        count=count,
        # Fixed two decimals so values such as 0.80% line up with 0.87%.
        percent="{:.2f}%".format(count / total_tokens * 100),
        running="{:.2f}%".format(running / total_tokens * 100),
    ))

Top 25 tokens in CLTK Tesserae:

       TOKEN       COUNT       Type-Tok %  RUNNING %   
    1. et          225907      3.29%       3.29%       
    2. in          141415      2.06%       5.35%       
    3. est         81715       1.19%       6.54%       
    4. non         77337       1.13%       7.67%       
    5. ut          60016       0.87%       8.54%       
    6. ad          55140       0.8%        9.35%       
    7. cum         53193       0.77%       10.12%      
    8. quod        44106       0.64%       10.76%      
    9. qui         43352       0.63%       11.39%      
   10. si          38532       0.56%       11.96%      


In [None]:
# Lemmatize the CLTK Tesserae tokens and preview the first ten results

tess_lemma_pairs = lemmatizer.lemmatize(tess_tokens)
print(tess_lemma_pairs[:10])

In [None]:
# Keep element 1 of every lemmatizer result as the lemma, then count distinct lemmas

tess_lemmas = [pair[1] for pair in tess_lemma_pairs]
tess_lemmas_set_len = len(set(tess_lemmas))

In [None]:
# Report the total token count plus the unique token and lemma counts

print(
    "\n".join(
        "{} {}".format(label, value)
        for label, value in (
            ('Number of tokens in CLTK Tesserae:', tess_tokens_len),
            ('Number of unique tokens in CLTK Tesserae:', tess_tokens_set_len),
            ('Number of unique lemmas in CLTK Tesserae:', tess_lemmas_set_len),
        )
    )
)

In [None]:
# Build counter of top lemma counts and tabulate the most frequent lemmas

tess_lemmas_counter = Counter(tess_lemmas)
tess_lemmas_mc = tess_lemmas_counter.most_common(10000)

# Number of rows shown below. The heading is built from the same value so the
# label and the slice cannot disagree.
top_n = 10
# Both percentage columns use the token total as their denominator.
total_tokens = len(tess_tokens)

header_format = "{number:>5}  {lemma:<12}{count:<12}{percent:<12}{running:<12}"
row_format = "{number:>5}. {lemma:<12}{count:<12}{percent:<12}{running:<12}"

# Cumulative count of the rows printed so far, for the RUNNING % column.
running = 0

print('Top {} lemmas in CLTK Tesserae:\n'.format(top_n))
print(header_format.format(number="", lemma="LEMMA", count="COUNT", percent="TYPE-LEM %", running="RUNNING %"))
for rank, (lemma, count) in enumerate(tess_lemmas_mc[:top_n], start=1):
    running += count
    print(row_format.format(
        number=rank,
        lemma=lemma,
        count=count,
        # Fixed two decimals keeps the percentage columns aligned.
        percent="{:.2f}%".format(count / total_tokens * 100),
        running="{:.2f}%".format(running / total_tokens * 100),
    ))

In [None]:
# List every entry of the ranked token counts, one per line

print('Top 10,000 tokens in the CLTK Tesserae:\n')
for rank, (token, count) in enumerate(tess_tokens_mc, start=1):
    print("{number}. {token} ({count})".format(number=rank, token=token, count=count))

In [None]:
# List every entry of the ranked lemma counts, one per line

print('Top 10,000 lemmas in the CLTK Tesserae:\n')
for rank, (lemma, count) in enumerate(tess_lemmas_mc, start=1):
    print("{number}. {lemma} ({count})".format(number=rank, lemma=lemma, count=count))