In [1]:
# Some spec work done 12/21/18 following this tweet:
# https://twitter.com/MagisterConway/status/1075937446129471488
# Hasn't been reviewed yet. Tweet corrections, etc. to @diyclassics

In [2]:
# Imports

import os
import string
import re
from collections import Counter

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.latin.j_v import JVReplacer
from cltk.utils.file_operations import open_pickle

In [3]:
# Locate and load the pickled training sentences from the local cltk_data folder

file = 'latin_pos_lemmatized_sents.pickle'
rel_path = '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff'
path = os.path.expanduser(rel_path)
latin_pos_lemmatized_sents_path = os.path.join(path, file)

# Fall back to an empty list (and say so) when the pickle is not installed
if not os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
else:
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)

In [4]:
# Set up CLTK tools: Latin word and sentence tokenizers, the backoff
# lemmatizer, and a j/v replacer

word_tokenizer = WordTokenizer('latin')
# NOTE(review): sent_tokenizer is not used by any cell visible in this notebook
sent_tokenizer = TokenizeSentence('latin')
# Built from the training sentences loaded above; that list is empty when the
# pickle was not found in cltk_data
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
# Called by preprocess(); presumably normalizes j/v to i/u -- confirm against CLTK docs
replacer = JVReplacer()

In [5]:
# Collect the Metamorphoses files from the Latin Library corpus and order them
# by book number (a plain string sort would put 'met10' before 'met2').

def _met_book_number(fileid):
    """Return the book number embedded in a fileid like 'ovid/ovid.met12.txt'.

    Raises ValueError when the fileid has no number after 'ovid.met', so a
    malformed name is reported clearly rather than failing inside int().
    """
    match = re.search(r'ovid\.met(\d+)', fileid)
    if match is None:
        raise ValueError('No book number found in fileid: %r' % fileid)
    return int(match.group(1))


met_files = sorted(
    (fileid for fileid in latinlibrary.fileids() if 'ovid.met' in fileid),
    key=_met_book_number,
)
# Book numbers, aligned index-for-index with the sorted met_files
met_order = [_met_book_number(fileid) for fileid in met_files]
print(met_files)

['ovid/ovid.met1.txt', 'ovid/ovid.met2.txt', 'ovid/ovid.met3.txt', 'ovid/ovid.met4.txt', 'ovid/ovid.met5.txt', 'ovid/ovid.met6.txt', 'ovid/ovid.met7.txt', 'ovid/ovid.met8.txt', 'ovid/ovid.met9.txt', 'ovid/ovid.met10.txt', 'ovid/ovid.met11.txt', 'ovid/ovid.met12.txt', 'ovid/ovid.met13.txt', 'ovid/ovid.met14.txt', 'ovid/ovid.met15.txt']


In [6]:
# Get raw text of Metamorphoses: one latinlibrary.raw() call over the files
# listed in met_files (presumably concatenated in list order -- TODO confirm)

met_raw = latinlibrary.raw(met_files)

In [7]:
# Preprocessing script for the Latin Library

def preprocess(text, jv_replacer=None):
    """Normalize raw Latin Library text of the Metamorphoses for tokenizing.

    Steps, in order:
      1. Remove Latin Library boilerplate (page titles, book headers, site
         navigation labels).
      2. Lowercase the text.
      3. Decode the ``&lt;`` / ``&gt;`` entities so they are treated as
         punctuation instead of leaving stray "lt" / "gt" words behind.
      4. Replace ASCII punctuation and digits with spaces.
      5. Apply the j/v replacer.
      6. Collapse runs of spaces and reduce every whitespace run containing
         a newline to a single ``\\n``.

    Args:
        text: Raw text as a single string.
        jv_replacer: Optional object with a ``replace(str) -> str`` method.
            Defaults to the notebook-level ``replacer``.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace, with
        one line per non-empty input line.
    """
    if jv_replacer is None:
        # Look up the notebook-level replacer only at call time
        jv_replacer = replacer

    remove_list = [
        r'Ovid: Metamorph*oses .+',  # 'h*' also matches the 'Metamorposes' typo in the source
        r'P. OVIDI NASONIS METAMORPHOSEN LIBER .+',
        r'\bOvid\b',
        r'The Latin Library',
        r'The Classics Page',
    ]
    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = text.lower()

    # Decode before stripping punctuation; otherwise '&lt;' becomes ' lt '
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # One translation pass turns every punctuation mark and digit into a space
    to_space = str.maketrans({char: ' ' for char in string.punctuation + string.digits})
    text = text.translate(to_space)

    text = jv_replacer.replace(text)

    text = re.sub(r'[ ]+', ' ', text)  # Collapse runs of spaces
    # Trim whitespace (including '\r') around line breaks and drop blank lines.
    # The former pattern required whitespace on BOTH sides of the newline, so
    # trailing spaces at ordinary line ends were left in place.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text.strip()

In [8]:
# Peek at the first 100 characters of the raw text before preprocessing
met_raw[:100]

'Ovid: Metamorposes I\r\n\t\t \r\n\r\n\t\t \r\n\t\t \r\n\t \r\n\t\r\n \r\n\r\n P. OVIDI NASONIS METAMORPHOSEN LIBER PRIMVS\r\n \r\n'

In [9]:
# Preprocess Latin Library text

# Clean the raw text with preprocess() and preview the first 100 characters
met_pp = preprocess(met_raw)
print(met_pp[:100])

in noua fert animus mutatas dicere formas 
corpora di coeptis nam uos mutastis et illas 
adspirate


In [10]:
# Tokenize Latin Library text

# Split the preprocessed text into word tokens and preview the first 50.
# The recorded output shows enclitics as separate tokens such as '-que'.
met_tokens = word_tokenizer.tokenize(met_pp)
print(met_tokens[:50])

['in', 'noua', 'fert', 'animus', 'mutatas', 'dicere', 'formas', 'corpora', 'di', 'coeptis', 'nam', 'uos', 'mutastis', 'et', 'illas', 'adspirate', 'meis', 'prima', '-que', 'ab', 'origine', 'mundi', 'ad', 'mea', 'perpetuum', 'deducite', 'tempora', 'carmen', 'ante', 'mare', 'et', 'terras', 'et', 'quod', 'tegit', 'omnia', 'caelum', 'unus', 'erat', 'toto', 'naturae', 'uultus', 'in', 'orbe', 'quem', 'dixere', 'chaos', 'rudis', 'indigesta', '-que']


In [11]:
# Get total token counts

# Running tokens vs. distinct token forms (types)
met_tokens_len = len(met_tokens)
met_tokens_set_len = len(set(met_tokens))

In [12]:
# Print total and unique token counts

print('Number of tokens in Metamorphoses:', met_tokens_len)
print('Number of unique tokens in Metamorphoses:', met_tokens_set_len)

Number of tokens in Metamorphoses: 82834
Number of unique tokens in Metamorphoses: 18382


In [13]:
# Build counter of top token counts

met_tokens_counter = Counter(met_tokens)
# Only the first 25 entries are printed below; 10000 is the cap on the ranked list
met_tokens_mc = met_tokens_counter.most_common(10000)

# Denominator for both percentage columns: every running token in the text
token_total = len(met_tokens)
running = 0

print('Top 25 tokens in Metamorphoses:\n')
print("{number:>5}  {token:<12}{count:<12}{percent:<12}{running:<12}".format(
    number="", token="TOKEN", count="COUNT", percent="Type-Tok %", running="RUNNING %"))
for rank, (token, count) in enumerate(met_tokens_mc[:25], start=1):
    running += count  # cumulative count of this token and all higher-ranked ones
    print("{number:>5}. {token:<12}{count:<12}{percent:<12}{running:<12}".format(
        number=rank,
        token=token,
        count=count,
        # Fixed two-decimal formatting keeps the column uniform (0.40%, not 0.4%)
        percent="{:.2f}%".format(count / token_total * 100),
        running="{:.2f}%".format(running / token_total * 100),
    ))

Top 25 tokens in Metamorphoses:

       TOKEN       COUNT       Type-Tok %  RUNNING %   
    1. -que        4383        5.29%       5.29%       
    2. et          2131        2.57%       7.86%       
    3. in          1164        1.41%       9.27%       
    4. est         987         1.19%       10.46%      
    5. nec         629         0.76%       11.22%      
    6. non         588         0.71%       11.93%      
    7. cum         462         0.56%       12.49%      
    8. ut          379         0.46%       12.95%      
    9. per         331         0.4%        13.34%      
   10. -ne         319         0.39%       13.73%      
   11. quae        297         0.36%       14.09%      
   12. sed         292         0.35%       14.44%      
   13. tamen       290         0.35%       14.79%      
   14. mihi        275         0.33%       15.12%      
   15. ad          274         0.33%       15.45%      
   16. quoque      274         0.33%       15.78%      
   17. quod    

In [14]:
# Write every token with its count, most frequent first, as "token count" lines
met_tokens_out = "data/met_counts/met_tokens.txt"
# Create the output folder if needed so open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(met_tokens_out), exist_ok=True)
# Explicit encoding so the file does not depend on the platform default
with open(met_tokens_out, 'w', encoding='utf-8') as f:
    for k, v in met_tokens_counter.most_common():
        f.write("{} {}\n".format(k, v))

In [15]:
# Lemmatize Latin Library text

# Per the recorded output, lemmatize() returns one (token, lemma) tuple per
# input token; preview the first 100 pairs
met_lemma_pairs = lemmatizer.lemmatize(met_tokens)
print(met_lemma_pairs[:100])

[('in', 'in'), ('noua', 'nouus'), ('fert', 'fero'), ('animus', 'animus'), ('mutatas', 'muto'), ('dicere', 'dico'), ('formas', 'forma'), ('corpora', 'corpus'), ('di', 'deus'), ('coeptis', 'coepio'), ('nam', 'nam'), ('uos', 'tu'), ('mutastis', 'muto'), ('et', 'et'), ('illas', 'ille'), ('adspirate', 'adspiro'), ('meis', 'meus'), ('prima', 'primus'), ('-que', '-que'), ('ab', 'ab'), ('origine', 'origo'), ('mundi', 'mundus'), ('ad', 'ad'), ('mea', 'meus'), ('perpetuum', 'perpetuus'), ('deducite', 'deduco'), ('tempora', 'tempus'), ('carmen', 'carmen'), ('ante', 'ante'), ('mare', 'mare'), ('et', 'et'), ('terras', 'terra'), ('et', 'et'), ('quod', 'qui'), ('tegit', 'tego'), ('omnia', 'omnis'), ('caelum', 'caelum'), ('unus', 'unus'), ('erat', 'sum'), ('toto', 'totus'), ('naturae', 'natura'), ('uultus', 'uultus'), ('in', 'in'), ('orbe', 'orbis'), ('quem', 'qui'), ('dixere', 'dico'), ('chaos', 'chaos'), ('rudis', 'rudis'), ('indigesta', 'indigestus'), ('-que', '-que'), ('moles', 'moles'), ('nec', '

In [16]:
# Collect the lemma from each (token, lemma) pair and count the distinct lemmas

met_lemmas = [lemma for _, lemma in met_lemma_pairs]
met_lemmas_set_len = len(set(met_lemmas))

In [17]:
# Print total token, unique token, and unique lemma counts

print('Number of tokens in Metamorphoses:', met_tokens_len)
print('Number of unique tokens in Metamorphoses:', met_tokens_set_len)
print('Number of unique lemmas in Metamorphoses:', met_lemmas_set_len)

Number of tokens in Metamorphoses: 82834
Number of unique tokens in Metamorphoses: 18382
Number of unique lemmas in Metamorphoses: 7772


In [18]:
# Build counter of top lemma counts

met_lemmas_counter = Counter(met_lemmas)
# Only the first 25 entries are printed below; 10000 is the cap on the ranked list
met_lemmas_mc = met_lemmas_counter.most_common(10000)

# Denominator for both percentage columns: every running token in the text
token_total = len(met_tokens)
running = 0

print('Top 25 lemmas in Metamorphoses:\n')
print("{number:>5}  {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
    number="", lemma="LEMMA", count="COUNT", percent="TYPE-LEM %", running="RUNNING %"))
for rank, (lemma, count) in enumerate(met_lemmas_mc[:25], start=1):
    running += count  # cumulative count of this lemma and all higher-ranked ones
    print("{number:>5}. {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
        number=rank,
        lemma=lemma,
        count=count,
        # Fixed two-decimal formatting keeps the column uniform (0.40%, not 0.4%)
        percent="{:.2f}%".format(count / token_total * 100),
        running="{:.2f}%".format(running / token_total * 100),
    ))

Top 25 lemmas in Metamorphoses:

       LEMMA       COUNT       TYPE-LEM %  RUNNING %   
    1. -que        4385        5.29%       5.29%       
    2. sum         2166        2.61%       7.91%       
    3. et          2131        2.57%       10.48%      
    4. qui         1276        1.54%       12.02%      
    5. in          1164        1.41%       13.43%      
    6. ille        784         0.95%       14.37%      
    7. hic         773         0.93%       15.31%      
    8. neque       729         0.88%       16.19%      
    9. ego         608         0.73%       16.92%      
   10. non         588         0.71%       17.63%      
   11. tu          507         0.61%       18.24%      
   12. suus        472         0.57%       18.81%      
   13. cum2        462         0.56%       19.37%      
   14. uideo       427         0.52%       19.89%      
   15. do          403         0.49%       20.37%      
   16. ut          379         0.46%       20.83%      
   17. dico    

In [19]:
# Write every lemma with its count, most frequent first, as "lemma count" lines
met_lemmas_out = "data/met_counts/met_lemmas.txt"
# Create the output folder if needed so open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(met_lemmas_out), exist_ok=True)
# Explicit encoding so the file does not depend on the platform default
with open(met_lemmas_out, 'w', encoding='utf-8') as f:
    for k, v in met_lemmas_counter.most_common():
        f.write("{} {}\n".format(k, v))