In [1]:
# Imports & setup

import os
import csv
import string
import re
from collections import defaultdict

from cltk.utils.file_operations import open_pickle
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.corpus.latin import latinlibrary

datapath = '../data/'
datafile = "latin_vocabulary_list.csv"

In [2]:
# Set up training sentences

rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'      

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)  

In [3]:
# Set up NLP tools

word_tokenizer = WordTokenizer('latin')
sent_tokenizer = TokenizeSentence('latin')
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
replacer = JVReplacer()

In [4]:
# Load first column of DCC Core Vocabulary csv file

columns = defaultdict(list)

with open(datapath+datafile) as f:
    reader = csv.reader(f)
    next(reader)
    for row in reader:
        for (i,v) in enumerate(row):
            columns[i].append(v)

dcc_lemmas = columns[0]


# Split headword column by whitespace and keep only first word

dcc_lemmas_simple = [lemma.replace('/',' ').split()[0] for lemma in dcc_lemmas]


# Preprocess DCC lemmas

# Normalize u/v
dcc_lemmas_simple = [replacer.replace(lemma) for lemma in dcc_lemmas_simple]

# remove macrons
def remove_macrons(text):
    transmap = {ord('ā'): 'a', ord('ē'): 'e', ord('ī'): 'i', ord('ō'): 'o', ord('ū'): 'u', }
    return text.translate(transmap)

dcc_lemmas_simple = [remove_macrons(lemma) for lemma in dcc_lemmas_simple]

# Remove punctuation
translator = str.maketrans({key: None for key in string.punctuation})
dcc_lemmas_simple = [lemma.translate(translator) for lemma in dcc_lemmas_simple]

dcc_lemmas_simple.sort()  

In [5]:
# Setup Cicero texts from Latin Library corpus

files = latinlibrary.fileids()
cicero_files = [file for file in files if 'cicero/' in file]
cicero_names = [os.path.splitext(os.path.basename(file))[0] for file in cicero_files]
cicero_raw = latinlibrary.raw(cicero_files)

In [6]:
# Preprocess texts
def preprocess(text):    
    text = text.lower()
    
    text= re.sub(r'&lt;','<',text)
    text= re.sub(r'&gt;','>',text)    
    
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~" # Without periods, question marks, exclamation marks
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)
    
    text = replacer.replace(text)

    remove_list = [r'\bthe classics page', r'\bthe latin library\b', r'\bcicero: .+?$', r'\bliber \b\w+?\b\s+?', r'\s{3}\bcicero\b\s{3}']    
    for pattern in remove_list:
        text = re.sub(pattern, '', text)
    
    text = text.replace('&#151;', ' ')

    
    return text

def remove_spaces(text):
    text = '\n'.join(' '.join(line.split()) for line in text.split('\n'))
    text = '\n'.join([x for x in text.splitlines() if x.strip()])
    return text

In [7]:
# Prepare Cicero texts

cicero_texts = [latinlibrary.raw(file) for file in cicero_files]
cicero_texts = [preprocess(text) for text in cicero_texts]
cicero_texts = [remove_spaces(text) for text in cicero_texts]



In [8]:
cicero_paras = []

for text in cicero_texts:
    paras = [para for para in text.split('\n')]
    cicero_paras.append(paras)

In [9]:
# Postprocess lemmas

def postprocess_lemmas(lemmas):
    translator = str.maketrans({key: "" for key in '0123456789'})
    lemmas = [lemma.translate(translator) for lemma in lemmas]
    lemmas = [replacer.replace(lemma) for lemma in lemmas]
    return lemmas

In [10]:
def sent_count(text):
    sents = sent_tokenizer.tokenize(text)
    return(len(sents))

def word_count(text):
    words = word_tokenizer.tokenize(text)
    return(len(words))

def char_count(text):
    return len(text)

def difficult_words(text):
    tokens = word_tokenizer.tokenize(text)
    lemmas = lemmatizer.lemmatize(tokens)
    lemmas = [lemma[1] for lemma in lemmas if lemma[1] != 'punc']
    lemmas = postprocess_lemmas(lemmas)
    difficult_words = [lemma for lemma in lemmas if lemma not in dcc_lemmas_simple]
    #rint(difficult_words)
    return len(difficult_words)

In [11]:
def dale_chall(text):
    diff_words = difficult_words(text)
    words = word_count(text)
    sents = sent_count(text)
    return (0.1579 * ((diff_words / words) * 100)) + (0.0496 * (words / sents))

In [24]:
scores_texts = {}

for index, text in enumerate(cicero_paras):
    name = cicero_names[index]
    #print(name)
    scores = []
    #print(index)
    for para in text:
    #    score = index
        score = dale_chall(para)
        scores.append(score)
    scores_texts[name] = scores

#print(scores_texts)
    
    

acad
0
adbrutum1
1
adbrutum2
2
amic
3
arch
4
att1
5
att10
6
att11
7
att12
8
att13
9
att14
10
att15
11
att16
12
att2
13
att3
14
att4
15
att5
16
att6
17
att7
18
att8
19
att9
20
balbo
21
brut
22
caecilium
23
caecina
24
cael
25
cat1
26
cat2
27
cat3
28
cat4
29
cluentio
30
compet
31
consulatu
32
deio
33
divinatione
34
divinatione1
35
divinatione2
36
domo
37
fam1
38
fam10
39
fam11
40
fam12
41
fam13
42
fam14
43
fam15
44
fam16
45
fam2
46
fam3
47
fam4
48
fam5
49
fam6
50
fam7
51
fam8
52
fam9
53
fato
54
fin1
55
fin2
56
fin3
57
fin4
58
fin5
59
flacco
60
fonteio
61
fratrem1
62
fratrem2
63
fratrem3
64
haruspicum
65
imp
66
inventione1
67
inventione2
68
leg1
69
leg2
70
leg3
71
legagr1
72
legagr2
73
legagr3
74
lig
75
marc
76
milo
77
murena
78
nd
79
nd1
80
nd2
81
nd3
82
off1
83
off2
84
off3
85
optgen
86
orator
87
oratore1
88
oratore2
89
oratore3
90
paradoxa
91
partitione
92
phil1
93
phil10
94
phil11
95
phil12
96
phil13
97
phil14
98
phil2
99
phil3
100
phil4
101
phil5
102
phil6
103
phil7
104
phil8
105
phil

In [83]:
avgs = []

for k, v in scores_texts.items():
    l = len(v)
    s = sum(v)
    a = s/l
    avgs.append((k,a))

avgs.sort(key=lambda x: x[1])    
    
pprint(avgs)

[('fin3', 4.163267196278051),
 ('fin2', 4.236250223090476),
 ('lig', 4.3689744046581485),
 ('fin4', 4.383422021201318),
 ('rabiriopost', 4.506646813982484),
 ('leg3', 4.556283650441678),
 ('fin5', 4.559248366055288),
 ('phil1', 4.620397642534915),
 ('tusc5', 4.627930566597206),
 ('amic', 4.631185320972376),
 ('milo', 4.669700021831883),
 ('off3', 4.679333302686257),
 ('fin1', 4.703210396690284),
 ('marc', 4.770639874070674),
 ('tusc1', 4.79006123821022),
 ('sex.rosc', 4.824178325221478),
 ('topica', 4.843848093809633),
 ('phil2', 4.850584850182335),
 ('nd1', 4.862658416305973),
 ('partitione', 4.866168419687727),
 ('cat1', 4.88953170141591),
 ('repub3', 4.8982300316848),
 ('sulla', 4.911470320349661),
 ('off2', 4.932174129235842),
 ('off1', 4.934705446947915),
 ('leg2', 4.939167708653915),
 ('leg1', 4.94569317806525),
 ('cluentio', 4.952788061376929),
 ('orator', 4.955753181538236),
 ('fato', 4.959238195706703),
 ('plancio', 4.967754620098796),
 ('quinc', 4.9737299785326545),
 ('deio',

In [90]:
cicero_names.index('paradoxa')

91

In [91]:
print(len(cicero_paras[91]))

22


In [92]:
pprint(cicero_paras[91])

['cicero paradoxa',
 'm. tulli ciceronis paradoxa ad m. brutum',
 'animaduerti brute saepe catonem auunculum tuum cum in senatu sententiam '
 'diceret locos graues ex philosophia tractare abhorrentes ab hoc usu forensi '
 'et publico sed dicendo consequi tamen ut illa etiam populo probabilia '
 'uiderentur. quod eo maius est illi quam aut tibi aut nobis quia nos ea '
 'philosophia plus utimur quae peperit dicendi copiam et in qua dicuntur ea '
 'quae non multum discrepent ab opinione populari cato autem perfectus mea '
 'sententia stoicus et ea sentit quae non sane probantur in uolgus et in ea '
 'est haeresi quae nullum sequitur florem orationis neque dilatat argumentum '
 'minutis interrogatiunculis quasi punctis quod proposuit efficit. sed nihil '
 'est tam incredibile quod non dicendo fiat probabile nihil tam horridum tam '
 'incultum quod non splendescat oratione et tamquam excolatur. quod cum ita '
 'putarem feci etiam audacius quam ille ipse de quo loquor. cato enim dumtaxat '
 

In [93]:
def ari(text):
    c, w, s = char_count(text), word_count(text), sent_count(text)
    print(c,w,s)
    score = 4.71 * (c / w) + 0.5 * (w / s) - 21.43
    return score

In [94]:
ari(" ".join(cicero_paras[91]))

28740 4759 261


16.130943129331882

In [95]:
ari(" ".join(cicero_paras[90]))

111247 17573 477


26.807291555644312