In [1]:
# Imports

import csv
import random

from collections import Counter

import nltk

import pickle

from pprint import pprint

In [2]:
# Load entire CLTK Latin Library corpus

from cltk.corpus.readers import get_corpus_reader
reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
raw = reader.raw()
filecount = len(reader.fileids())
# rough_wordcount = len(list(reader.words()))

In [3]:
print(f'There are {filecount} files in this corpus.')
# print(f'There are (roughly) {rough_wordcount} words in this corpus.')

There are 2141 files in this corpus.


In [4]:
# Count words that end with ve

ve_words = [word.lower() for word in raw.split() if word.endswith('ve')]
print(f'There are {len(ve_words)} words that end with -ve in this corpus.')

There are 14912 words that end with -ve in this corpus.


In [5]:
# Get frequency count for -ve words

print(Counter(ve_words).most_common(25))

[('sive', 8442), ('neve', 467), ('grave', 428), ('iove', 300), ('breve', 235), ('cave', 230), ('leve', 186), ('nave', 147), ('quidve', 93), ('suave', 87), ('quive', 78), ('nive', 55), ('quaeve', 51), ('quove', 50), ('salve', 50), ('bove', 50), ('cive', 48), ('vive', 43), ('prave', 41), ('pluresve', 39), ('minusve', 38), ('solve', 36), ('intempestive', 35), ('abusive', 35), ('aliave', 33)]


In [6]:
# Load 200 tagged examples of words with -ve enclitics vs. words that end with -ve

with open('data/enclitics.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    options_ = list(reader)
    
options = []
for option in options_:
    o = (option[0], option[1])
    multiple = int(option[2])
    options.extend([o]*multiple)    
    
has_enc = set([option[0] for option in options if int(option[1]) is 1])
no_enc = set([option[0] for option in options if int(option[1]) is 0])    

print(f'There are {len(has_enc)} words with enclitics.' )
print('Some examples include: {}'.format(', '.join(sorted(has_enc)[:10])))
print('\n')
print(f'There are {len(no_enc)} words without enclitics.' )
print('Some examples include: {}'.format(', '.join(sorted(no_enc)[:10])))

There are 130 words with enclitics.
Some examples include: adipiscendive, aliamve, aliave, aliisve, aliorumve, aliove, aliudve, aliumve, argentove, bonisve


There are 70 words without enclitics.
Some examples include: abusive, active, adiective, affirmative, agave, appositive, ave, bove, breve, calve


In [7]:
# Define some features for classification

def enclitics_features(word):
    return {'last_1_letters': word[-1:],
            'last_2_letters': word[-2:],
            'last_3_letters': word[-3:],
            'last_4_letters': word[-4:],
            'last_5_letters': word[-5:],
            'first_1_letters': word[0],
            'first_2_letters': word[:2],
            'first_3_letters': word[:3],
            'first_4_letters': word[:4],
            'length_gt_5': len(word) > 5,
           }

In [8]:
# Set up classifier, spec. Naive Bayers classifier

random.shuffle(options)
featuresets = [(enclitics_features(n), g) for (n,g) in options]
train_set, test_set = featuresets[10000:], featuresets[10000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
# Give accuracy 

print(nltk.classify.accuracy(classifier, test_set))

0.9813404417364814


In [10]:
# Show most informative features

classifier.show_most_informative_features(10)

Most Informative Features
         first_1_letters = 'm'                 1 : 0      =     48.1 : 1.0
          last_4_letters = 'aeve'              1 : 0      =     38.7 : 1.0
          last_4_letters = 'nove'              1 : 0      =     36.2 : 1.0
             length_gt_5 = True                1 : 0      =     32.9 : 1.0
         first_1_letters = 'd'                 1 : 0      =     28.7 : 1.0
         first_1_letters = 'f'                 1 : 0      =     25.4 : 1.0
          last_4_letters = 'sive'              0 : 1      =     25.0 : 1.0
          last_4_letters = 'dive'              1 : 0      =     19.5 : 1.0
          last_5_letters = 'ssive'             1 : 0      =     18.9 : 1.0
         first_2_letters = 'cu'                1 : 0      =     18.3 : 1.0


In [11]:
# Review errors from devtest

errors = []
for (name, tag) in options:
    guess = classifier.classify(enclitics_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

print('There were {} errors out of {}.'.format(len(errors), len(options)))
print('\nHere are 10 examples...')
        
for (tag, guess, name) in sorted(set(errors))[:10]:
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

There were 331 errors out of 12626.

Here are 10 examples...
correct=0        guess=1        name=active                        
correct=0        guess=1        name=adiective                     
correct=0        guess=1        name=affirmative                   
correct=0        guess=1        name=agave                         
correct=0        guess=1        name=appositive                    
correct=0        guess=1        name=calve                         
correct=0        guess=1        name=comparative                   
correct=0        guess=1        name=conclave                      
correct=0        guess=1        name=curve                         
correct=0        guess=1        name=declive                       


In [12]:
%%time

# Time classifier

classifier.classify(enclitics_features('neve'))

CPU times: user 169 µs, sys: 1 µs, total: 170 µs
Wall time: 179 µs


'0'

In [13]:
# Save a dictionary into a pickle file.

pickle.dump(classifier, open('models/ve_splitter.p', 'wb'))

In [14]:
# Work from pickled model

classifier = pickle.load(open('models/ve_splitter.p', 'rb'))

In [15]:
# Write function to validate input

def is_ve_enclitic(word, classifier, features):
    if word.endswith('ve'):
        return int(classifier.classify(features(word)))
    else:
        return 0

In [16]:
%%time
is_ve_enclitic('neve', classifier=classifier, features=enclitics_features)

CPU times: user 181 µs, sys: 1 µs, total: 182 µs
Wall time: 191 µs


0