In [1]:
# Imports

import os
import string
import re
from collections import Counter

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.latin.j_v import JVReplacer
from cltk.utils.file_operations import open_pickle

In [2]:
# Locate and load the pickled training sentences for the backoff lemmatizer

rel_path = '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff'
path = os.path.expanduser(rel_path)

# Name of the pickle expected inside the backoff model directory
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)

# Fall back to an empty training set (and report it) when the file is absent
if not os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
else:
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)

In [3]:
# Setup CLTK tools
# All four objects are created once here and reused by later cells.

# Word-level tokenizer configured for Latin
word_tokenizer = WordTokenizer('latin')
# Sentence tokenizer configured for Latin (not used by the cells visible in this notebook)
sent_tokenizer = TokenizeSentence('latin')
# Backoff lemmatizer built from the training sentences loaded above
# (an empty list when the pickle was not found)
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
# j/v normalizer; preprocess() reads this module-level name
replacer = JVReplacer()

In [4]:
# Get raw text of the Latin Library
# The result is passed unchanged to preprocess(), which treats it as a single string.

ll_raw = latinlibrary.raw()

In [5]:
# Preprocessing script for the Latin Library

def preprocess(text, jv_replacer=None):
    """Normalize raw Latin Library text before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. decode the HTML entities ``&lt;`` and ``&gt;``;
      3. replace every ASCII punctuation character and digit with a space;
      4. delete Latin Library site boilerplate phrases;
      5. normalize j/v spelling with ``jv_replacer``;
      6. collapse runs of spaces and blank lines.

    Args:
        text (str): Raw corpus text.
        jv_replacer: Optional object exposing ``replace(str) -> str``.
            When omitted, the notebook-level ``replacer`` is used, so
            existing ``preprocess(text)`` calls behave as before.

    Returns:
        str: The normalized text.
    """
    if jv_replacer is None:
        # Fall back to the JVReplacer instance created in the setup cell.
        jv_replacer = replacer

    text = text.lower()

    # Decode entities first: '&' and ';' are punctuation, so an undecoded
    # "&lt;" would otherwise survive as a stray "lt" token.
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # One translation pass blanks out punctuation and digits together.
    blank_out = str.maketrans(
        {char: ' ' for char in string.punctuation + string.digits})
    text = text.translate(blank_out)

    # Site boilerplate is removed BEFORE j/v normalization: afterwards
    # "medieval latin" reads "medieual latin" and would never match.
    # Every phrase is anchored on word boundaries so that only whole
    # phrases are deleted.
    boilerplate_patterns = (
        r'\bthe latin library\b',
        r'\bthe classics page\b',
        r'\bthe miscellany\b',
        r'\bchristian latin\b',
        r'\bmedieval latin\b',
        r'\bneo latin\b',
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text)

    text = jv_replacer.replace(text)

    # Collapse runs of spaces into one.
    text = re.sub(r' +', ' ', text)
    # Collapse blank lines and trim whitespace on either side of a newline.
    # The whitespace is optional so that "a\n\nb" and "a \nb" are handled too.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [6]:
# Preprocess Latin Library text
# Runs the full normalization defined in preprocess() over the raw corpus string.

ll_pp = preprocess(ll_raw)

In [15]:
# NOTE(review): debugging slice - keeps only a 498-character sample of the
# preprocessed text. Because ll_pp is rebound in place, this cell is not
# idempotent (running it a second time yields an empty string), and every
# later cell that reads ll_pp works on the sample rather than the full corpus.
ll_pp = ll_pp[50000:50498]

In [17]:
# Tokenize Latin Library text
# NOTE(review): the print below dumps the entire token list, so it is only
# practical while ll_pp holds a small sample.

ll_tokens = word_tokenizer.tokenize(ll_pp)
print(ll_tokens)

['erunt', 'ut', 'cornibus', 'uel', 'tubis', 'ductilibus', 'singuli', 'contenti', 'essent', 'quatinus', 'circumcirca', 'peruagantes', 'uocibus', 'aut', 'tubarum', 'strepitu', 'sibi', 'mutuo', 'innuerent', 'ne', 'aut', 'lustrata', 'repeterent', 'aut', 'non', 'lustrata', 'desererent', 'xiii', 'quod', 'ut', 'factum', 'est', 'res', 'dictu', 'mirabilis', 'et', 'saeculis', 'inaudita', 'contigit', 'quippe', 'caput', 'sancti', 'regis', 'longius', 'remotum', 'a', 'suo', 'corpore', 'prorupit', 'in', 'uocem', 'absque', 'fibrarum', 'opitulatione', 'aut', 'arteriarum', 'praecordiali', 'munere', 'uespillonum', 'sane', 'more', 'pluribus', 'pedetentim', 'inuia', 'perlustrantibus', 'cum', 'iam', 'posset', 'audiri']


In [8]:
# Get total token counts: all tokens, and distinct token forms

ll_tokens_len = len(ll_tokens)
ll_tokens_set_len = len(set(ll_tokens))

In [9]:
# Print total and unique token counts

print('Number of tokens in Latin Library:', ll_tokens_len)
print('Number of unique tokens in Latin Library:', ll_tokens_set_len)

Number of tokens in Latin Library: 13563476
Number of unique tokens in Latin Library: 386698


In [10]:
# Build counter of top token counts and print the most frequent ones

ll_tokens_counter = Counter(ll_tokens)
ll_tokens_mc = ll_tokens_counter.most_common(10000)

# The heading and the slice both use this constant, so they cannot disagree
# (the heading previously said "Top 25" while only 10 rows were printed).
TOP_N_TOKENS = 10

# Computed once instead of calling len() twice per printed row.
total_tokens = len(ll_tokens)

# Cumulative count of the rows printed so far, used for the RUNNING % column.
running = 0

print('Top {} tokens in Latin Library:\n'.format(TOP_N_TOKENS))
print("{number:>5}  {token:<12}{count:<12}{percent:<12}{running:<12}".format(
    number="", token="TOKEN", count="COUNT", percent="Type-Tok %", running="RUNNING %"))
for rank, (token, count) in enumerate(ll_tokens_mc[:TOP_N_TOKENS], start=1):
    running += count
    # Each percentage is a share of all tokens, rounded to two decimals.
    print("{number:>5}. {token:<12}{count:<12}{percent:<12}{running:<12}".format(
        number=rank,
        token=token,
        count=count,
        percent=str(round(count / total_tokens * 100, 2)) + "%",
        running=str(round(running / total_tokens * 100, 2)) + "%"))

Top 25 tokens in Latin Library:

       TOKEN       COUNT       Type-Tok %  RUNNING %   
    1. et          446474      3.29%       3.29%       
    2. in          274387      2.02%       5.31%       
    3. est         174413      1.29%       6.6%        
    4. non         166083      1.22%       7.83%       
    5. -que        135281      1.0%        8.82%       
    6. ad          133596      0.98%       9.81%       
    7. ut          119504      0.88%       10.69%      
    8. cum         109996      0.81%       11.5%       
    9. quod        104315      0.77%       12.27%      
   10. si          95511       0.7%        12.97%      


In [18]:
# Lemmatize Latin Library text
# The printed result below is a list of (token, lemma) pairs, one per input token.
# NOTE(review): the print dumps the entire list, so it is only practical
# while ll_tokens holds a small sample.

ll_lemma_pairs = lemmatizer.lemmatize(ll_tokens)
print(ll_lemma_pairs)

[('erunt', 'sum'), ('ut', 'ut'), ('cornibus', 'cornu'), ('uel', 'uel'), ('tubis', 'tuba'), ('ductilibus', 'ductilis'), ('singuli', 'singulis'), ('contenti', 'contendo'), ('essent', 'sum'), ('quatinus', 'quatenus'), ('circumcirca', 'circumcirca'), ('peruagantes', 'pervagor'), ('uocibus', 'uox'), ('aut', 'aut'), ('tubarum', 'tubaris'), ('strepitu', 'strepitus'), ('sibi', 'sui'), ('mutuo', 'mutuus'), ('innuerent', 'innuerent'), ('ne', 'ne'), ('aut', 'aut'), ('lustrata', 'lustro1'), ('repeterent', 'repeto'), ('aut', 'aut'), ('non', 'non'), ('lustrata', 'lustro1'), ('desererent', 'desero2'), ('xiii', 'xiii'), ('quod', 'qui'), ('ut', 'ut'), ('factum', 'facio'), ('est', 'sum'), ('res', 'res'), ('dictu', 'dico2'), ('mirabilis', 'mirabilis'), ('et', 'et'), ('saeculis', 'saeculis'), ('inaudita', 'inaudio'), ('contigit', 'contingo'), ('quippe', 'quippe'), ('caput', 'caput'), ('sancti', 'sanctus'), ('regis', 'rex'), ('longius', 'longus'), ('remotum', 'removeo'), ('a', 'ab'), ('suo', 'suus'), ('cor

In [12]:
# Keep the lemma half of every (token, lemma) pair, then count distinct lemmas

ll_lemmas = [lemma for _token, lemma in ll_lemma_pairs]
ll_lemmas_set_len = len(set(ll_lemmas))

In [13]:
# Print total token count, unique token count and unique lemma count

print('Number of tokens in Latin Library:', ll_tokens_len)
print('Number of unique tokens in Latin Library:', ll_tokens_set_len)
print('Number of unique lemmas in Latin Library:', ll_lemmas_set_len)

Number of tokens in Latin Library: 13563476
Number of unique tokens in Latin Library: 386698
Number of unique lemmas in Latin Library: 216868


In [14]:
# Count lemma frequencies and print a table of the ten most frequent lemmas

ll_lemmas_counter = Counter(ll_lemmas)
ll_lemmas_mc = ll_lemmas_counter.most_common(10000)

# Cumulative count of the rows printed so far, used for the RUNNING % column
running = 0

print('Top 10 lemmas in Latin Library:\n')
print("{number:>5}  {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
    number="", lemma="LEMMA", count="COUNT", percent="TYPE-LEM %", running="RUNNING %"))
for rank, (lemma, count) in enumerate(ll_lemmas_mc[:10], start=1):
    running += count
    # Percentages are relative to the total number of tokens
    share = str(round(count / len(ll_tokens) * 100, 2)) + "%"
    cumulative = str(round(running / len(ll_tokens) * 100, 2)) + "%"
    print("{number:>5}. {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
        number=rank, lemma=lemma, count=count, percent=share, running=cumulative))

Top 10 lemmas in Latin Library:

       LEMMA       COUNT       TYPE-LEM %  RUNNING %   
    1. et          446474      3.29%       3.29%       
    2. sum         438065      3.23%       6.52%       
    3. qui         365280      2.69%       9.21%       
    4. in          274387      2.02%       11.24%      
    5. is          213677      1.58%       12.81%      
    6. non         166083      1.22%       14.04%      
    7. -que        144790      1.07%       15.1%       
    8. hic         140414      1.04%       16.14%      
    9. ad          133613      0.99%       17.13%      
   10. ut          119506      0.88%       18.01%      


In [15]:
# List every entry of ll_tokens_mc (at most 10,000) as "rank. token (count)"

print('Top 10,000 tokens in the Latin Library:\n')
for rank, (token, count) in enumerate(ll_tokens_mc, start=1):
    print("{number}. {token} ({count})".format(number=rank, token=token, count=count))

Top 10,000 tokens in the Latin Library:

1. et (446474)
2. in (274387)
3. est (174413)
4. non (166083)
5. -que (135281)
6. ad (133596)
7. ut (119504)
8. cum (109996)
9. quod (104315)
10. si (95511)
11. qui (93423)
12. de (81344)
13. a (74477)
14. sed (74410)
15. quae (65472)
16. ex (59811)
17. quam (56452)
18. per (51190)
19. esse (49449)
20. nec (45391)
21. se (44896)
22. sunt (44296)
23. hoc (43795)
24. enim (42558)
25. uel (41716)
26. aut (40729)
27. autem (40662)
28. ab (39996)
29. etiam (37861)
30. -ne (36644)
31. eius (35578)
32. quid (32516)
33. sit (32272)
34. atque (30371)
35. me (30371)
36. te (29973)
37. quo (29786)
38. quia (28071)
39. id (27370)
40. ac (26949)
41. ne (26825)
42. tamen (26201)
43. ita (25116)
44. iam (24906)
45. dig (24841)
46. haec (24269)
47. eo (23665)
48. nam (23572)
49. eum (23044)
50. pro (22608)
51. mihi (21226)
52. uero (20921)
53. tibi (19658)
54. neque (19584)
55. ea (19555)
56. sic (19063)
57. quidem (18793)
58. quibus (18618)
59. quoque (18546)


In [17]:
# List every entry of ll_lemmas_mc (at most 10,000) as "rank. lemma (count)"

print('Top 10,000 lemmas in the Latin Library:\n')
for rank, (lemma, count) in enumerate(ll_lemmas_mc, start=1):
    print("{number}. {lemma} ({count})".format(number=rank, lemma=lemma, count=count))

Top 10,000 lemmas in the Latin Library:

1. et (446474)
2. sum (438065)
3. qui (365280)
4. in (274387)
5. is (213677)
6. non (166083)
7. -que (144790)
8. hic (140414)
9. ad (133613)
10. ut (119506)
11. ab (114503)
12. cum2 (113090)
13. si (95511)
14. alis (85619)
15. ille (83003)
16. de (81629)
17. tu (78390)
18. sed (74877)
19. suus (72813)
20. dico (70484)
21. ego (70445)
22. omnis (70423)
23. sui (66113)
24. possum (65552)
25. neque (64975)
26. ipse (62436)
27. facio (60464)
28. ex (59812)
29. atque (57320)
30. quam (56452)
31. res (53480)
32. per (51190)
33. habeo (48495)
34. quis (47878)
35. deus (43683)
36. enim (42558)
37. uideo (42379)
38. uel (41716)
39. aut (40729)
40. autem (40662)
41. multus (40256)
42. do (38086)
43. etiam (37980)
44. magnus (37256)
45. -ne (36644)
46. uerus (35281)
47. idem (35080)
48. tuus (33673)
49. homo (30541)
50. quo (29786)
51. meus (29703)
52. nos (29402)
53. quia (28074)
54. bonus (27639)
55. dominus (27493)
56. ne (26825)
57. pars (26764)
58. no