In [46]:
import html
import re
from collections import Counter

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [17]:
# Shared CLTK helpers for the rest of the notebook: the j/v replacer is used
# inside preprocess(), the Latin word tokenizer splits the cleaned corpus.
replacer = JVReplacer()
word_tokenizer = WordTokenizer('latin')

In [73]:
# Preprocess texts

def preprocess(text, jv_replacer=None):
    """Normalize raw Latin Library text ahead of tokenization.

    Steps, in order: decode HTML entities; turn leftover ``&nbsp`` markers,
    non-breaking spaces and NUL characters into plain spaces; lowercase;
    normalize u/v and i/j spellings; blank out punctuation and digits; strip
    the site's navigation phrases; collapse runs of spaces and blank lines.

    Args:
        text: Raw corpus text as a ``str``.
        jv_replacer: Optional object exposing ``replace(str) -> str``.
            Defaults to the notebook-level ``replacer``.

    Returns:
        The cleaned, lowercased text as a ``str``.
    """
    # Resolve lazily so the global is only required when no override is given.
    normalizer = replacer if jv_replacer is None else jv_replacer

    text = html.unescape(text)  # Handle html entities
    # html.unescape() decodes "&nbsp;" to U+00A0, which the space-collapsing
    # regex below would not touch, so map it to a plain space here together
    # with double-escaped "&nbsp" leftovers and stray NUL characters.
    text = re.sub(r'&nbsp;?|\xa0|\x00', ' ', text)

    text = text.lower()
    text = normalizer.replace(text)  # Normalize u/v & i/j

    # Punctuation and digits are replaced by spaces (not deleted) so that
    # adjacent words are never fused. The backslash is escaped explicitly;
    # the set of characters is unchanged.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»“"
    blank_out = str.maketrans(dict.fromkeys(punctuation + '0123456789', ' '))
    text = text.translate(blank_out)

    # Boilerplate phrases to drop. Hyphens have already been turned into
    # spaces at this point, so "neo-latin" must also match "neo latin".
    remove_list = [
        r'\bthe latin library\b',
        r'\bthe classics page\b',
        r'\bneo[- ]latin\b',
        r'\bmedieval latin\b',
        r'\bchristian latin\b',
        r'\bthe miscellany\b',
    ]
    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub(r'[ ]+', ' ', text)  # Remove double spaces
    # Collapse blank lines and trim whitespace on either side of a newline.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [74]:
# Clean the entire Latin Library corpus in one pass; `raw` is the normalized text.
raw = preprocess(latinlibrary.raw())

In [75]:
# Split the cleaned corpus into word tokens with the CLTK Latin tokenizer.
tokens = word_tokenizer.tokenize(raw)

In [76]:
# Display the first 100 tokens to eyeball the preprocessing and tokenization.
tokens[:100] 

['duodecim',
 'tabularum',
 'leges',
 'duodecim',
 'tabularum',
 'leges',
 'tabula',
 'i',
 'si',
 'in',
 'ius',
 'uocat',
 'ito',
 'ni',
 'it',
 'antestamino',
 'igitur',
 'em',
 'capito',
 'si',
 'caluitur',
 'pedem',
 '-ue',
 'struit',
 'manum',
 'endo',
 'iacito',
 'si',
 'morbus',
 'aeuitas',
 '-ue',
 'uitium',
 'escit',
 'iumentum',
 'dato',
 'si',
 'nolet',
 'arceram',
 'ne',
 'sternito',
 'assiduo',
 'uindex',
 'assiduus',
 'esto',
 'proletario',
 'iam',
 'ciui',
 'quis',
 'uolet',
 'uindex',
 'esto',
 'nex',
 'forti',
 'sanati',
 'rem',
 'ubi',
 'pacunt',
 'orato',
 'ni',
 'pacunt',
 'in',
 'comitio',
 'aut',
 'in',
 'foro',
 'ante',
 'meridiem',
 'caussam',
 'coiciunto',
 'com',
 'peroranto',
 'ambo',
 'praesentes',
 'post',
 'meridiem',
 'praesenti',
 'litem',
 'addicito',
 'si',
 'ambo',
 'praesentes',
 'solis',
 'occasus',
 'suprema',
 'tempestas',
 'esto',
 'uades',
 'subuades',
 'tabula',
 'ii',
 'actor',
 'dicito',
 'ex',
 'sponsione',
 'te',
 'mihi',
 'dare',
 'oporter

In [77]:
# Total token count; the denominator for every percentage reported below.
size = len(tokens)

### -ebus adverbs

In [80]:
# Forms ending in -ebus that are excluded from the count.
# TODO: decide whether 'iebus', 'phaebus' and 'estrebus' should be excluded too.
ebus_exceptions = [
    'phoebus',
    'nectanebus',
    'phebus',
    'ephebus',
    'coroebus',
    'erebus',
    'choroebus',
]
ebus_tokens = [
    token
    for token in tokens
    if token.endswith('ebus') and token not in ebus_exceptions
]

In [81]:
# Total number of -ebus tokens kept after the exception filter.
print('There are %d tokens with the ending -ebus.\n' % len(ebus_tokens))

# The 25 most frequent -ebus forms with their counts.
print('The most common -ebus tokens are: ')
pprint(Counter(ebus_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -ebus tokens make up {:.2%} of the Latin Library. Diederich reports .12%.'.format(len(ebus_tokens)/size))

There are 10653 tokens with the ending -ebus.

The most common -ebus tokens are: 
[('rebus', 7440),
 ('diebus', 2975),
 ('speciebus', 136),
 ('aciebus', 41),
 ('iebus', 12),
 ('superficiebus', 6),
 ('spebus', 5),
 ('faciebus', 4),
 ('phaebus', 3),
 ('omnibusrebus', 2),
 ('generationebus', 2),
 ('foebus', 2),
 ('bonisrebus', 1),
 ('aliisrebus', 1),
 ('cunctisdiebus', 1),
 ('estrebus', 1),
 ('humanisrebus', 1),
 ('prosperisrebus', 1),
 ('philebus', 1),
 ('alebus', 1),
 ('spetiebus', 1),
 ('exemplarebus', 1),
 ('uocebus', 1),
 ('manebus', 1),
 ('pedebus', 1)]

 -ebus tokens make up 0.08% of the Latin Library. Diederich reports .12%.


### -u adverb

In [91]:
# Forms ending in -u that are excluded from the count.
# TODO: decide whether 'uu' should be excluded as well.
u_exceptions = [
    'tu',
    'u',
    # roman numerals
    'iu',
    'xu',
    'xiu',
    # abbreviation
    'nou',
]

u_tokens = [
    token
    for token in tokens
    if token.endswith('u') and token not in u_exceptions
]

In [92]:
# Total number of -u tokens kept after the exception filter.
print('There are %d tokens with the ending -u.\n' % len(u_tokens))

# The 25 most frequent -u forms with their counts.
print('The most common -u tokens are: ')
pprint(Counter(u_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -u tokens make up {:.2%} of the Latin Library. Diederich reports .63%.'.format(len(u_tokens)/size))

There are 80164 tokens with the ending -u.

The most common -u tokens are: 
[('manu', 4489),
 ('seu', 3756),
 ('diu', 3238),
 ('exercitu', 2767),
 ('metu', 1892),
 ('casu', 1718),
 ('senatu', 1713),
 ('spiritu', 1529),
 ('usu', 1484),
 ('conspectu', 1220),
 ('cursu', 978),
 ('impetu', 944),
 ('uu', 943),
 ('iussu', 925),
 ('motu', 920),
 ('heu', 886),
 ('uultu', 882),
 ('iesu', 878),
 ('consensu', 865),
 ('sensu', 825),
 ('statu', 808),
 ('gradu', 749),
 ('aduentu', 739),
 ('quamdiu', 736),
 ('actu', 668)]

 -u tokens make up 0.59% of the Latin Library. Diederich reports .63%.


### -iter true adverb

### -ei adjective adverb

### -d subject object

### -ius subject object adjective, true comparative adverb

### -r acted upon

### -mur acted upon

### -tur acted upon

### -ntur acted upon

In [64]:
# No -ntur exceptions have been identified yet. The (currently empty) list is
# still applied so that this cell filters the same way as the other ending
# cells and any exception added later actually takes effect.
ntur_exceptions = []
ntur_tokens = [token for token in tokens if token.endswith('ntur') and token not in ntur_exceptions]

In [50]:
# Total number of -ntur tokens collected.
print('There are %d tokens with the ending -ntur.\n' % len(ntur_tokens))

# The 25 most frequent -ntur forms with their counts.
print('The most common -ntur tokens are: ')
pprint(Counter(ntur_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -ntur tokens make up {:.2%} of the Latin Library. Diederich reports .51%.'.format(len(ntur_tokens)/size))

There are 66712 tokens with the ending -ntur.

The most common -ntur tokens are: 
[('dicuntur', 1848),
 ('uidentur', 1665),
 ('uideantur', 781),
 ('uidebantur', 575),
 ('sequuntur', 502),
 ('continentur', 500),
 ('uocantur', 488),
 ('uiderentur', 411),
 ('utuntur', 389),
 ('nascuntur', 348),
 ('appellantur', 329),
 ('habentur', 324),
 ('referuntur', 309),
 ('feruntur', 309),
 ('tenentur', 294),
 ('dicantur', 264),
 ('aguntur', 262),
 ('inueniuntur', 259),
 ('patiuntur', 240),
 ('dantur', 239),
 ('loquuntur', 235),
 ('geruntur', 220),
 ('teneantur', 216),
 ('arbitrantur', 206),
 ('mouentur', 201)]

 -ntur tokens make up 0.49% of the Latin Library. Diederich reports .51%.


### -isti acted

In [54]:
# Forms ending in -isti that are excluded from the count.
isti_exceptions = [
    'christi',
    'isti',
    'tristi',
    'cristi',
]
isti_tokens = [
    token
    for token in tokens
    if token.endswith('isti') and token not in isti_exceptions
]

In [55]:
# Total number of -isti tokens kept after the exception filter.
print('There are %d tokens with the ending -isti.\n' % len(isti_tokens))

# The 25 most frequent -isti forms with their counts.
print('The most common -isti tokens are: ')
pprint(Counter(isti_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -isti tokens make up {:.2%} of the Latin Library. Diederich reports .05%.'.format(len(isti_tokens)/size))

There are 9246 tokens with the ending -isti.

The most common -isti tokens are: 
[('fecisti', 687),
 ('dedisti', 405),
 ('dixisti', 393),
 ('fuisti', 276),
 ('uidisti', 251),
 ('uoluisti', 246),
 ('audisti', 174),
 ('potuisti', 147),
 ('accepisti', 140),
 ('uenisti', 135),
 ('habuisti', 122),
 ('misisti', 121),
 ('scripsisti', 112),
 ('sisti', 92),
 ('posuisti', 88),
 ('meministi', 83),
 ('coepisti', 80),
 ('occidisti', 78),
 ('perdidisti', 74),
 ('debuisti', 73),
 ('egisti', 66),
 ('emisti', 64),
 ('uicisti', 63),
 ('noluisti', 59),
 ('tulisti', 59)]

 -isti tokens make up 0.07% of the Latin Library. Diederich reports .05%.


### -erunt acted

In [58]:
# Forms ending in -erunt that are excluded from the count.
erunt_exceptions = [
    'fuerunt',
    'erunt',
    'ferunt',
    'quaerunt',
]
erunt_tokens = [
    token
    for token in tokens
    if token.endswith('erunt') and token not in erunt_exceptions
]

In [59]:
# Total number of -erunt tokens kept after the exception filter.
print('There are %d tokens with the ending -erunt.\n' % len(erunt_tokens))

# The 25 most frequent -erunt forms with their counts.
print('The most common -erunt tokens are: ')
pprint(Counter(erunt_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -erunt tokens make up {:.2%} of the Latin Library. Diederich reports .20%.'.format(len(erunt_tokens)/size))

There are 40486 tokens with the ending -erunt.

The most common -erunt tokens are: 
[('dixerunt', 1396),
 ('fecerunt', 1158),
 ('uenerunt', 1065),
 ('coeperunt', 836),
 ('potuerunt', 743),
 ('dederunt', 561),
 ('uoluerunt', 528),
 ('uiderunt', 446),
 ('poterunt', 389),
 ('habuerunt', 383),
 ('miserunt', 372),
 ('ceciderunt', 370),
 ('acceperunt', 354),
 ('posuerunt', 331),
 ('peruenerunt', 309),
 ('responderunt', 297),
 ('inuenerunt', 278),
 ('receperunt', 268),
 ('conuenerunt', 245),
 ('crediderunt', 244),
 ('reliquerunt', 238),
 ('tulerunt', 238),
 ('rescripserunt', 235),
 ('perierunt', 231),
 ('tradiderunt', 217)]

 -erunt tokens make up 0.30% of the Latin Library. Diederich reports .20%.


### -ri to be acted upon

### -isse to have acted

In [62]:
# No -isse exceptions so far; the empty list is still applied by the filter below.
isse_exceptions = []
isse_tokens = [
    token
    for token in tokens
    if token.endswith('isse') and token not in isse_exceptions
]

In [63]:
# Total number of -isse tokens kept after the exception filter.
print('There are %d tokens with the ending -isse.\n' % len(isse_tokens))

# The 25 most frequent -isse forms with their counts.
print('The most common -isse tokens are: ')
pprint(Counter(isse_tokens).most_common(25))

# Share of all corpus tokens (size = len(tokens)), printed next to the figure attributed to Diederich.
print('\n -isse tokens make up {:.2%} of the Latin Library. Diederich reports .21%.'.format(len(isse_tokens)/size))

There are 30928 tokens with the ending -isse.

The most common -isse tokens are: 
[('fuisse', 5429),
 ('fecisse', 1239),
 ('potuisse', 820),
 ('dixisse', 760),
 ('uenisse', 740),
 ('habuisse', 635),
 ('dedisse', 515),
 ('meminisse', 393),
 ('uidisse', 392),
 ('uoluisse', 369),
 ('accepisse', 346),
 ('peruenisse', 238),
 ('egisse', 221),
 ('respondisse', 211),
 ('cecidisse', 208),
 ('scripsisse', 204),
 ('audisse', 204),
 ('odisse', 195),
 ('sensisse', 183),
 ('perisse', 182),
 ('accidisse', 171),
 ('meruisse', 165),
 ('inuenisse', 160),
 ('tulisse', 153),
 ('placuisse', 151)]

 -isse tokens make up 0.23% of the Latin Library. Diederich reports .21%.


### verb stem with or without -te (Act!)

### -ebus adverbs

### -ebus adverbs

### -ebus adverbs

### -ebus adverbs

### -ebus adverbs

### -ebus adverbs

### -ebus adverbs