In [116]:
from tqdm import tqdm 
from docx import Document

from docx.oxml import OxmlElement
from docx.oxml.ns import qn

doc = Document()

from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from collections import Counter
from nltk.tokenize.punkt import PunktLanguageVars
import os
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer()

path = 'lexica/Lewis_Short_XML/lat.ls.perseus-eng1.xml'
path2 = 'lexica/Lewis_Short_XML/lat.ls.perseus-eng2.xml'

from collections import Counter
from cltk.corpus.utils.importer import CorpusImporter
corpus_importer = CorpusImporter('latin')
corpus_importer.list_corpora

from cltk.corpus.readers import get_corpus_reader
corpus_importer.import_corpus('latin_text_perseus')
reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')

stops = list(stops[stops['cumsum'] < .705].lemma) # set stop limit 

reader._fileids = ['ammianus/14.txt'] # ammianus book 14

paras = list(reader.paras())

paras = [item for sublist in paras for item in sublist]

numbers = ('1', '2', '3', '4', '5', '6', '7', '8', '9')
punc = ['.', ',', ';', '"', "'", '-que', '-ne', '-ve']

In [213]:
df = pd.read_csv('data/latin_word_counts.csv')

In [215]:
df.head(20)

Unnamed: 0,lemma,token_count,pct_of_tokens,cumsum
0,sum,132427,0.030682,0.030682
1,et,121212,0.028084,0.058766
2,qui,109323,0.025329,0.084096
3,in,78831,0.018265,0.10236
4,-que,65771,0.015239,0.117599
5,is,55632,0.01289,0.130489
6,non,48894,0.011328,0.141817
7,hic,48074,0.011138,0.152955
8,ut,37646,0.008722,0.161678
9,cum2,35809,0.008297,0.169974


'this is:test text'

In [243]:
document.save('bold_test.docx')

In [221]:
lemmatizer.lemmatize(['subditicii'])

[('subditicii', 'subditicii')]

In [206]:
#####
## REFINEMENTS
#### 

# gender appearing on verbs 
# bold type for vocab words 
# alphabetize vocab words 


def divide_chunks(l, n): 
      
    # looping till length l 
    for i in range(0, len(l), n-1):  
        yield l[i:i + n]

def parse_paragraph(paragraph: 'str') -> "str":
    """Function to take paragraph of a Latin text and return a dictionary including definitions (but not citations). 
    The goal is to use this function to create short entries for a paragraph of a text. We can then use the paragraphs to build our Pharr formatted document. 
    
    :param paragraph: paragraph of parsed text
    """
    
    in_list = paragraph.split(' ')
    # lemmatize 
    in_list = [lemmatizer.lemmatize([x])[0][1] for x in in_list]
    in_list = [_ for _ in in_list if _ not in stops]
    out_str = '' 
    path = 'lexica/Lewis_Short_XML/lat.ls.perseus-eng1.xml'
    tree = ET.parse(path)
    entries = tree.xpath('//entryFree')
    endings = ''
    gender = ""
    out_list = []
    
    for word in in_list: 
        lemma = lemmatizer.lemmatize([word])[0][1]
        
        for entry in entries:
            senses = []
            if entry.get('key') == lemma:
                if entry.find('itype') is not None: 
                    endings = f'{lemma} {entry.find("itype").text}'
                if entry.find('gen') is not None: 
                    gender = entry.find('gen').text
                for sense in entry.findall('sense')[:4]:
                    # print(sense.get('level'))
                    if sense.get("level") in ['1', '2']:
                        for tr in sense.findall('hi')[1:3]:
                            senses.append(tr.text)
#                 print(senses)
                if endings != '':
                    out_string = f"""{endings} {gender}: {'; '.join(senses).strip('., ')}"""
                else:
                    out_string = f'{lemma} {gender}: {"; ".join(senses).strip("., ")}'
                if senses == []:
                    pass 
                else:
                    out_list.append(out_string)
    return '\n'.join(out_list)

def clean_paragraph(ls):
    out = f""
    for i in range(len(ls) - 1):  
        if ls[i + 1] in punc: 
            out += ''.join([ls[i], ls[i + 1].strip('-')]) + ' '
        elif ls[i] not in punc:
            out += f'{ls[i]} '
            
        else: 
            pass
    return out



In [217]:
x = divide_chunks(paras, 150)

In [218]:
br = '———————————————————————————————————————'

In [219]:
from docx.shared import Inches, Cm
doc = Document()
for i in tqdm(x): 
    p = clean_paragraph(i)
    t = parse_paragraph(p)
    doc.add_paragraph(p)
    doc.add_paragraph(br)
    doc.add_section(0)
    
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'2')
    doc.add_paragraph(t)

    doc.add_section(0)
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'1')
    doc.add_page_break()
    
# clear margins 

sections = doc.sections
for section in sections:
    section.top_margin = Inches(0.5)
    section.bottom_margin = Inches(0.5)
    section.left_margin = Inches(.75)
    section.right_margin = Inches(.75)

doc.save('test.docx')

64it [05:46,  5.42s/it]


In [161]:
test = '''de Caesare quisque sentiret. et haec confidenter agebat in urbe ubi pernoctantium luminum claritudo dierum solet imitari fulgorem. postremo agnitus saepe iamque -que, si prodisset, conspicuum se fore contemplans, non nisi luce palam egrediens ad agenda quae putabat seria cernebatur. et haec quidem medullitus multis gementibus agebantur. 10. Thalassius vero ea tempestate praefectus praetorio praesens ipse quoque adrogantis ingenii, considerans incitationem eius ad multorum augeri discrimina, non maturitate vel consiliis mitigabat, ut aliquotiens celsae potestates iras principum molliverunt, sed adversando iurgandoque cum parum congrueret, eum ad rabiem potius evibrabat, Augustum actus eius exaggerando creberrime docens, idque -que, incertum qua mente, ne lateret adfectans. quibus mox Caesar acrius efferatus, velut contumaciae quoddam vexillum altius erigens, sine respectu salutis alienae vel suae ad vertenda opposita instar rapidi fluminis irrevocabili impetu ferebatur. '
'''

125720