## Imports

In [65]:
import re
from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

## CLTK Tools

In [None]:
# Shared CLTK tokenizers for Latin; word_count() and sent_count() below read
# these module-level instances.
word_tokenizer = WordTokenizer('latin')
sent_tokenizer = TokenizeSentence('latin')

## Process Files

In [80]:
# Every file id the Latin Library corpus reader knows about.
files = latinlibrary.fileids()

# Keep only the ids whose path contains 'cicero/'.
cicero_files = [fileid for fileid in files if 'cicero/' in fileid]

# Raw text for all of the selected files, fetched in a single call.
cicero_raw = latinlibrary.raw(cicero_files)

## Function for preprocessing texts

In [74]:
# Preprocess texts
def preprocess(text):
    """Normalize one Latin Library text before tokenizing and counting.

    Steps, in order:
      1. lowercase the text;
      2. decode the HTML entities ``&lt;`` and ``&gt;``;
      3. replace punctuation and ASCII digits with a single space each;
      4. run CLTK's ``JVReplacer`` over the text;
      5. delete the standalone words "the", "latin", "library", "classics"
         and "page".

    The characters ``.``, ``!`` and ``?`` are not in the punctuation set, so
    they are left in place.

    Args:
        text (str): Text of one document.

    Returns:
        str: The normalized text. Line breaks and tabs are not touched.
    """
    text = text.lower()

    # Literal substitutions, so plain str.replace is enough (no regex needed).
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # The backslash is escaped explicitly: "\]" in a normal string literal is
    # an invalid escape sequence (a SyntaxWarning on recent Python versions).
    # The resulting character set is unchanged and still contains both the
    # backslash and ']'.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~"
    digits = '0123456789'

    # One translation table covers punctuation and digits in a single pass.
    translator = str.maketrans({key: " " for key in punctuation + digits})
    text = text.translate(translator)

    text = JVReplacer().replace(text)

    # Word-boundary patterns, so these letters inside longer words are kept.
    remove_list = [r'\bthe\b', r'\blatin\b', r'\blibrary\b', r'\bclassics\b', r'\bpage\b']
    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    return text

## Process Texts

In [75]:
# Assemble a list of all of Cicero's texts. The raw versions are kept as well
# because the titles have to be read before preprocessing: preprocess()
# replaces ':' with a space, after which the "Author: Title" separator can no
# longer be found.
cicero_raw_texts = [latinlibrary.raw(file) for file in cicero_files]
cicero_texts = [preprocess(text) for text in cicero_raw_texts]

# Assemble the matching list of titles: take the first line of each raw text,
# drop everything up to the first ': ', then normalize the remainder the same
# way as the texts themselves.
cicero_titles = [text.split('\n', 1)[0] for text in cicero_raw_texts]
cicero_titles = [preprocess(title.split(': ', 1)[-1]).strip() for title in cicero_titles]

In [None]:
def sent_count(text):
    """Return how many sentences the module-level `sent_tokenizer` finds in `text`."""
    return len(sent_tokenizer.tokenize(text))

def word_count(text):
    """Return how many tokens the module-level `word_tokenizer` finds in `text`."""
    return len(word_tokenizer.tokenize(text))

def char_count(text):
    """Return the length of `text`; every character counts, whitespace included."""
    n_chars = len(text)
    return n_chars


In [None]:
def ari(text):
    """Score `text` with the Automated Readability Index formula.

    Computes ``4.71 * (chars / words) + 0.5 * (words / sentences) - 21.43``
    using char_count(), word_count() and sent_count() from this notebook.

    Note that char_count() is simply ``len(text)``, so whitespace and any
    remaining punctuation are included in the character total.

    Args:
        text (str): The text to score.

    Returns:
        float: The readability score.

    Raises:
        ZeroDivisionError: If word_count() or sent_count() returns 0 for
            `text` (for example, an empty string).
    """
    # The per-call debug print of the three counts was removed: it produced a
    # line of output for every text scored.
    c, w, s = char_count(text), word_count(text), sent_count(text)
    score = 4.71 * (c / w) + 0.5 * (w / s) - 21.43
    return score

In [79]:
# Preview the first 250 characters of every text, with a rule after each one.
for text in cicero_texts:
    print(text[:250], '----------', sep='\n')

cicero  academica
		 

		 
		 
	 
	
 

 m. tulli ciceronis academici libri quattuor
 

 
 
liber primus      
     
     
     
     
     
     
     
     
      
      
      
 liber secundus 
 liber tertius 
 
 

 
     in cumano nuper cum mecum 
----------
cicero  ad brutum i
		 

		 
		 
	 
	
 

 m. tulli ciceronis epistularum ad brutum liber primus
 

 

 
 i
 ii
 iii
 iu
 u
 ui
 uii
 uiii
 ix
 x
 xi
 xii
 xiii
 xiu
 xu
 xui
 xuii
 xuiii 
 

 

 
  i  scr. eodem die quo ep.  a.          cicero bruto 
----------
cicero  ad brutum ii
		 

		 
		 
	 
	
 

 m. tulli ciceronis epistularum ad brutum liber secundus
 

 

 
 i 
 ii 
 iii 
 iu 
 u 
 

 

 
  i  scr. romae ex. m. mart. ant in. apr. a.         . cicero bruto sal. 
 

 
 cum haec scribebam  res existim
----------
cicero  de amicitia
		 

		 
		 
	 
	
 

 m. tulli ciceronis laelius de amicitia
 

 

 
   	   	   	   	   	   	   	   	   	    	    	    	    	    	    	    	    	    	    	    	    	    	    	    	    	    	    

In [None]:
# cicero_texts already holds preprocessed texts, so running preprocess() again
# is redundant. Show a short preview rather than printing the whole document.
print(cicero_texts[0][:1000])

In [None]:
# Start the working copy of the combined corpus from its lowercased raw text.
cicero_edit = cicero_raw.lower()

In [None]:
# Replace punctuation with spaces. '.', '!' and '?' are not in the set, so
# they are left in the text.
# The backslash is escaped explicitly: "\]" in a normal string literal is an
# invalid escape sequence (a SyntaxWarning on recent Python versions). The
# character set itself is unchanged.
punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~"
translator = str.maketrans({key: " " for key in punctuation})
cicero_edit = cicero_edit.translate(translator)

In [None]:
# Blank out every ASCII digit.
translator = str.maketrans(dict.fromkeys('0123456789', " "))
cicero_edit = cicero_edit.translate(translator)

In [None]:
# JVReplacer is imported with the other CLTK tools in the imports cell at the
# top of the notebook, so it is not re-imported here.
replacer = JVReplacer()

cicero_edit = replacer.replace(cicero_edit)

In [None]:
# Replace the Latin Library boilerplate words with a space.
# Matching is on whole words only: a plain str.replace would also cut these
# letter sequences out of longer words (e.g. 'latin' inside 'latine').
remove_list = ['the', 'latin', 'library', 'classics', 'page']
boilerplate = re.compile(r'\b(?:' + '|'.join(map(re.escape, remove_list)) + r')\b')

cicero_edit = boilerplate.sub(' ', cicero_edit)

In [None]:
# Turn every tab into a single space.
cicero_edit = cicero_edit.replace('\t', " ")

In [None]:
# Collapse blank (or whitespace-only) lines into a single line break.
# `re` is already imported in the imports cell, so it is not re-imported here.
cicero_edit = re.sub(r'\n\s*\n', '\n', cicero_edit)

In [None]:
# Treat every line break as a sentence boundary by turning it into '. '.
cicero_edit = cicero_edit.replace('\n', '. ')

# Tidy up what that leaves behind. The patterns are raw strings: '\.' in a
# normal string literal is an invalid escape sequence.
cicero_edit = re.sub(r' +', ' ', cicero_edit)    # squeeze runs of spaces
cicero_edit = re.sub(r' \.', '.', cicero_edit)   # no space before a period
# Collapse any run of periods to one. Replacing only '..' pairs would leave
# '..' behind wherever three or more periods end up next to each other.
cicero_edit = re.sub(r'\.{2,}', '.', cicero_edit)

In [None]:
# Spot-check the cleaned corpus: first 1000 characters only.
print(cicero_edit[:1000])

In [None]:
# Sentence tokenizer for the combined corpus. TokenizeSentence is already
# imported in the imports cell, so it is not re-imported here.
tokenizer = TokenizeSentence('latin')

In [None]:
# Split the cleaned corpus into sentences; later cells index into this result.
cicero_sents = tokenizer.tokenize(cicero_edit)

In [None]:
# Per-sentence lengths, parallel to cicero_sents:
# characters (whitespace included) and whitespace-separated words.
cicero_char_count = [len(sent) for sent in cicero_sents]
cicero_word_count = [len(sent.split()) for sent in cicero_sents]

In [None]:
# One (sentence, char_count, word_count) tuple per sentence.
data = [*zip(cicero_sents, cicero_char_count, cicero_word_count)]

In [None]:
# Inspect the first (sentence, char_count, word_count) record.
data[0]

In [None]:
def ari(sentence):
    """Score a single sentence with the ARI formula.

    The sentence count is implicitly 1, so the formula reduces to
    ``4.71 * (chars / words) + 0.5 * words - 21.43``, where `chars` is
    ``len(sentence)`` and `words` is the number of whitespace-separated tokens.

    This definition shares its name with the text-level ``ari(text)`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        sentence (str): The sentence to score.

    Returns:
        float: The readability score.

    Raises:
        ZeroDivisionError: If `sentence` contains no words.
    """
    n_chars = len(sentence)
    n_words = len(sentence.split())
    return 4.71*(n_chars/n_words) + 0.5*(n_words)-21.43

In [None]:
# ARI score for every sentence, in the same order as cicero_sents.
aris = [ari(item) for item in cicero_sents]


In [None]:
# Position of the highest-scoring sentence (first occurrence if there is a tie).
aris.index(max(aris))

In [None]:
# NOTE(review): the index is hard-coded — presumably copied from the output of
# aris.index(max(aris)) in an earlier run; confirm, since it goes stale
# whenever the corpus or the cleaning steps change.
sent_idx = 39497

# Character count, then word count, then the sentence itself.
print(*data[sent_idx][1:3], sep='\n')
print(cicero_sents[sent_idx])

In [None]:
# Word count of every sentence (third field of each data record).
word_counts = [n_words for _, _, n_words in data]

In [None]:
# Position of the sentence with the most words (first occurrence if tied).
word_counts.index(max(word_counts))

In [None]:
# Average characters per word for sentences with more than one word.
# Because shorter sentences are skipped, positions in this list do not line
# up with positions in `data` / `cicero_sents`.
chars_per_word = [n_chars/n_words for _, n_chars, n_words in data if n_words > 1]

In [None]:
# Position of the largest characters-per-word ratio within chars_per_word.
# chars_per_word skips sentences of one word or fewer, so this is an index
# into that filtered list, not into data or cicero_sents.
chars_per_word.index(max(chars_per_word))

In [None]:
# NOTE(review): hard-coded index — presumably taken from the
# chars_per_word.index(...) result of an earlier run. If so, it is an index
# into the filtered chars_per_word list and may not point at the intended
# sentence in cicero_sents; confirm.
cicero_sents[15215]

In [None]:
# Mean number of words per sentence.
sum(word_counts) / len(word_counts)

In [None]:
# Character count of every sentence (second field of each data record).
char_counts = [n_chars for _, n_chars, _ in data]

In [None]:
# Mean number of characters per sentence.
sum(char_counts) / len(char_counts)

In [1]:
markers = ["[1]", "[I]", "[ 1 ]", "[ 1]", "I. "]

# For each text, record the earliest position at which any marker occurs,
# falling back to len(text) when none of them is present.
# NOTE(review): preprocess() lowercases its input and replaces brackets and
# digits with spaces, so if cicero_texts holds preprocessed texts none of
# these markers can occur and every entry will be len(text) — confirm whether
# this search was meant to run on the raw texts.
start = []
for text in cicero_texts:
    hits = [pos for pos in map(text.find, markers) if pos != -1]
    start.append(min(hits, default=len(text)))

print(start)

print(cicero_texts[5])

NameError: name 'cicero_texts' is not defined

In [2]:
def dalechall(sentence):
    """Score a single sentence.

    NOTE(review): the body is the same computation as ``ari(sentence)`` —
    ``4.71 * (chars / words) + 0.5 * words - 21.43`` — so this looks like a
    copied placeholder that has not yet been adapted to the measure its name
    suggests; confirm.

    Args:
        sentence (str): The sentence to score.

    Returns:
        float: The computed score.

    Raises:
        ZeroDivisionError: If `sentence` contains no words.
    """
    n_chars = len(sentence)
    n_words = len(sentence.split())
    return 4.71*(n_chars/n_words) + 0.5*(n_words)-21.43