In [1]:
import csv
import random

from collections import Counter

import nltk

import pickle
from pprint import pprint

In [2]:
from cltk.corpus.readers import get_corpus_reader
reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')

In [3]:
raw = reader.raw()

In [4]:
ve_words = [word.lower() for word in raw.split() if word.endswith('ve')]

In [5]:
len(ve_words)

14912

In [6]:
print(Counter(ve_words).most_common(25))

[('sive', 8442), ('neve', 467), ('grave', 428), ('iove', 300), ('breve', 235), ('cave', 230), ('leve', 186), ('nave', 147), ('quidve', 93), ('suave', 87), ('quive', 78), ('nive', 55), ('quaeve', 51), ('quove', 50), ('salve', 50), ('bove', 50), ('cive', 48), ('vive', 43), ('prave', 41), ('pluresve', 39), ('minusve', 38), ('solve', 36), ('intempestive', 35), ('abusive', 35), ('aliave', 33)]


In [7]:
with open('data/enclitics.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    options_ = list(reader)

In [8]:
options = []
for option in options_:
    o = (option[0], option[1])
    multiple = int(option[2])
    options.extend([o]*multiple)

In [9]:
def enclitics_features(word):
    return {'last_1_letters': word[-1:],
            'last_2_letters': word[-2:],
            'last_3_letters': word[-3:],
            'last_4_letters': word[-4:],
            'last_5_letters': word[-5:],
            'first_1_letters': word[0],
            'first_2_letters': word[:2],
            'first_3_letters': word[:3],
            'first_4_letters': word[:4],
            'length_gt_5': len(word) > 5,
           }

In [10]:
has_enc = set([option[0] for option in options if int(option[1]) is 1])
no_enc = set([option[0] for option in options if int(option[1]) is 0])

In [11]:
print(f'There are {len(has_enc)} words with enclitics.' )
print('Some examples include: {}'.format(', '.join(sorted(has_enc)[:10])))

There are 130 words with enclitics.
Some examples include: adipiscendive, aliamve, aliave, aliisve, aliorumve, aliove, aliudve, aliumve, argentove, bonisve


In [12]:
print(f'There are {len(no_enc)} words without enclitics.' )
print('Some examples include: {}'.format(', '.join(sorted(no_enc)[:10])))

There are 70 words without enclitics.
Some examples include: abusive, active, adiective, affirmative, agave, appositive, ave, bove, breve, calve


In [13]:
# Set up classifier, spec. Naive Bayers classifier

random.shuffle(options)
featuresets = [(enclitics_features(n), g) for (n,g) in options]
train_set, test_set = featuresets[10000:], featuresets[10000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
# Give accuracy 

print(nltk.classify.accuracy(classifier, test_set))

0.9824828636709825


In [15]:
classifier.show_most_informative_features(25)

Most Informative Features
          last_4_letters = 'sive'              0 : 1      =     80.7 : 1.0
         first_1_letters = 'e'                 1 : 0      =     70.7 : 1.0
          last_4_letters = 'aeve'              1 : 0      =     60.3 : 1.0
         first_2_letters = 'fi'                1 : 0      =     39.3 : 1.0
         first_2_letters = 'cu'                1 : 0      =     29.0 : 1.0
         first_1_letters = 'm'                 1 : 0      =     27.6 : 1.0
          last_4_letters = 'nove'              1 : 0      =     25.8 : 1.0
         first_1_letters = 's'                 0 : 1      =     22.7 : 1.0
             length_gt_5 = True                1 : 0      =     21.7 : 1.0
         first_1_letters = 'd'                 1 : 0      =     21.5 : 1.0
         first_2_letters = 'fa'                1 : 0      =     19.9 : 1.0
          last_4_letters = 'love'              1 : 0      =     18.9 : 1.0
         first_2_letters = 'de'                1 : 0      =     15.8 : 1.0

In [16]:
# Review errors from devtest

errors = []
for (name, tag) in options:
    guess = classifier.classify(enclitics_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

print('There were {} errors out of {}.'.format(len(errors), len(options)))
        
for (tag, guess, name) in sorted(set(errors)):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

There were 248 errors out of 12626.
correct=0        guess=1        name=active                        
correct=0        guess=1        name=adiective                     
correct=0        guess=1        name=agave                         
correct=0        guess=1        name=appositive                    
correct=0        guess=1        name=calve                         
correct=0        guess=1        name=comparative                   
correct=0        guess=1        name=conclave                      
correct=0        guess=1        name=curve                         
correct=0        guess=1        name=declive                       
correct=0        guess=1        name=demonstrative                 
correct=0        guess=1        name=dispensative                  
correct=0        guess=1        name=dispositive                   
correct=0        guess=1        name=dissolve                      
correct=0        guess=1        name=dive                          
correct=0   

In [19]:
%%time

# Time classifier

classifier.classify(enclitics_features('neve'))

CPU times: user 151 µs, sys: 1e+03 ns, total: 152 µs
Wall time: 160 µs


'0'

In [20]:
# Save a dictionary into a pickle file.

pickle.dump(classifier, open('models/ve_splitter.p', 'wb'))

In [21]:
# Work from pickled model

classifier = pickle.load(open('models/ve_splitter.p', 'rb'))

In [26]:
# Write function to validate input

def is_ve_enclitic(word, classifier, features):
    if word.endswith('ve'):
        return int(classifier.classify(features(word)))
    else:
        return 0

In [27]:
%%time
is_ve_enclitic('neve', classifier=classifier, features=enclitics_features)

CPU times: user 542 µs, sys: 277 µs, total: 819 µs
Wall time: 608 µs


0

In [28]:
from random import sample

random_ve = sample(ve_words, 100)

In [30]:
for word in random_ve:
    print(f'{word}: {is_ve_enclitic(word, classifier=classifier, features=enclitics_features)}')

sive: 0
sive: 0
sive: 0
bove: 0
parentive: 1
sive: 0
iove: 0
sive: 0
leve: 0
tacitusve: 1
sive: 0
sive: 0
medicusve: 1
conductoremve: 1
sive: 0
neptibusve: 1
dominave: 1
totumve: 1
nave: 0
sive: 0
raptave: 1
sive: 0
sive: 0
sive: 0
leve: 0
sive: 0
dispensative: 1
sive: 0
ove: 0
ove: 0
alterave: 1
sive: 0
sive: 0
sive: 0
sive: 0
flagitiumve: 1
sive: 0
neve: 0
prave: 0
successive: 1
breve: 0
sitve: 1
aluntve: 1
sive: 0
sive: 0
parve: 1
humanioresve: 1
active: 1
sive: 0
dolove: 1
sive: 0
sive: 0
sive: 0
sive: 0
sive: 0
grave: 0
solve: 0
sive: 0
sive: 0
iove: 0
sive: 0
balneove: 1
sive: 0
sive: 0
nolanove: 1
sive: 0
sive: 0
sive: 0
sive: 0
breve: 0
sive: 0
sive: 0
sive: 0
sive: 0
sive: 0
minoremve: 1
sive: 0
sive: 0
sive: 0
neve: 0
sive: 0
sive: 0
quamve: 1
sive: 0
sive: 0
nepotemve: 1
cererisve: 1
sive: 0
sive: 0
heredesve: 1
latronemve: 1
sive: 0
sive: 0
bustumve: 1
regnave: 1
sive: 0
iove: 0
adnumeratave: 1
grave: 0
sive: 0
