In [2]:
# modules 
import pandas as pd
import os
import sys 

sys.path.append('/Users/aleedom/cltk/open_words/')
from open_words.parse import Parse

from tqdm import tqdm 

from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt


from collections import Counter

from cltk.corpus.utils.importer import CorpusImporter

from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.corpus.readers import get_corpus_reader

# variables

lemmatizer = BackoffLatinLemmatizer()

path = '../lexica/Lewis_Short_XML/lat.ls.perseus-eng1.xml'

corpus_importer = CorpusImporter('latin')
corpus_importer.list_corpora
corpus_importer.import_corpus('latin_text_perseus')

reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
reader._fileids = ['ammianus/14.txt'] # ammianus book 14

stops = pd.read_csv('../data/latin_word_counts.csv')
stops = list(stops[stops['cumsum'] < .705].lemma) # set stop limit 

paras = list(reader.paras())
paras = [item for sublist in paras for item in sublist]

numbers = ('1', '2', '3', '4', '5', '6', '7', '8', '9')
punc = ['.', ',', ';', '"', "'", '-que', '-ne', '-ve']

doc = Document()
parser = Parse()

Help on function divide_chunks in module __main__:

divide_chunks(l, n)



In [52]:
#####
## REFINEMENTS
#### 

# gender appearing on verbs 
# bold type for vocab words 
# alphabetize vocab words 


def divide_chunks(l, n): 
      
    # looping till length l 
    for i in range(0, len(l), n-1):  
        yield l[i:i + n]

def parse_paragraph(paragraph: 'str') -> "str":
    """Function to take paragraph of a Latin text and return a dictionary including definitions (but not citations). 
    The goal is to use this function to create short entries for a paragraph of a text. We can then use the paragraphs to build our Pharr formatted document. 
    
    :param paragraph: paragraph of parsed text
    """
    
    in_list = paragraph.split(' ')
    # lemmatize 
    in_list = [lemmatizer.lemmatize([x])[0][1] for x in in_list]
    in_list = [_ for _ in in_list if _ not in stops]
    out_str = '' 
    path = '../lexica/Lewis_Short_XML/lat.ls.perseus-eng1.xml'
    tree = ET.parse(path)
    entries = tree.xpath('//entryFree')
    endings = ''
    gender = ""
    out_list = []
    
    for word in in_list: 
        lemma = lemmatizer.lemmatize([word])[0][1]
        
        for entry in entries:
            senses = []
            if entry.get('key') == lemma:
                if entry.find('itype') is not None: 
                    endings = f'{lemma} {entry.find("itype").text}'
                if entry.find('gen') is not None: 
                    gender = entry.find('gen').text
                for sense in entry.findall('sense')[:4]:
                    # print(sense.get('level'))
                    if sense.get("level") in ['1', '2']:
                        for tr in sense.findall('hi')[1:3]:
                            senses.append(tr.text)
#                 print(senses)
                if endings != '':
                    out_string = f"""{endings} {gender}: {'; '.join(senses).strip('., ')}"""
                else:
                    out_string = f'{lemma} {gender}: {"; ".join(senses).strip("., ")}'
                if senses == []:
                    pass 
                else:
                    out_list.append(out_string)
    return '\n'.join(out_list)

def clean_paragraph(ls):
    out = f""
    for i in range(len(ls) - 1):  
        if ls[i + 1] in punc: 
            out += ''.join([ls[i], ls[i + 1].strip('-')]) + ' '
        elif ls[i] not in punc:
            out += f'{ls[i]} '
            
        else: 
            pass
    return out.strip(str(punc)) # added .strip(punc) to clear '-que' '-ne' etc.

# helper function from stackoverflow https://stackoverflow.com/questions/6039103/counting-depth-or-the-deepest-level-a-nested-list-goes-to

def depth(l):
    if isinstance(l, list):
        return 1 + max(depth(item) for item in l)
    else:
        return 0

def flatten_paragraphs(paras): 
    while depth(paras) >= 2: 
        paras = [item for sublist in paras for item in sublist]
    return paras

In [4]:
paras = list(reader.paras())

paras = flatten_paragraphs(paras)

In [186]:
#  out_dict['insuperabilis'] # test

{'word': 'insuperabilis',
 'dictionary_form': ['insuperabile', 'insuperabil'],
 'senses': ['insurmountable', 'unconquerable']}

## todo: 
- fix endings so they're included 
- ensure stop words are registered properly

Otherwise, works well

In [27]:
parser.latin_to_english('pavore')

[{'w': {'pos': 'N',
   'n': [3, 1],
   'parts': ['pavor', 'pavor'],
   'senses': ['fear, panic'],
   'form': '3 1 M T',
   'orth': 'pavor',
   'id': 29589},
  'stems': [{'st': {'pos': 'N',
     'form': '3 1 M T',
     'orth': 'pavor',
     'n': [3, 1],
     'wid': 29589},
    'infls': [{'ending': 'e',
      'pos': 'N',
      'note': '',
      'n': [3, 0],
      'form': 'LOC S X'},
     {'ending': 'e', 'pos': 'N', 'note': '', 'n': [3, 0], 'form': 'DAT S X'},
     {'ending': 'e', 'pos': 'N', 'note': '', 'n': [3, 0], 'form': 'ABL S C'},
     {'ending': 'e',
      'pos': 'N',
      'note': 'stem_ends_in_cons',
      'n': [3, 2],
      'form': 'ABL S N'},
     {'ending': 'e',
      'pos': 'N',
      'note': 'greek',
      'n': [3, 8],
      'form': 'VOC S X'}]}]}]

In [75]:
from docx.shared import Inches, Cm

path = '../lexica/Lewis_Short_XML/lat.ls.perseus-eng1.xml'
tree = ET.parse(path)
entries = tree.xpath('//entryFree')


x = divide_chunks(paras, 125)

doc = Document()
br = '———————————————————————————————————————'
for i in tqdm(x): 
    passage_dict = {}
    # format text 
    p = clean_paragraph(i)
    
    # add text to document 
    main_paragraph = doc.add_paragraph().add_run(p)
    main_paragraph.font.size = Pt(12)
    # visual line break 
    p2 = doc.add_paragraph()
    p2.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p2.add_run(br)
    
    # change formatting
    
    doc.add_section(0)
    
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'2')
    
    # dictionary lookup 
    for w in i: 
        endings = []
        gender = []
        
        # create dictionary for entries on the page 
        
        lemma = lemmatizer.lemmatize([w])[0][1]
        if lemma in stops: # expensive but maybe saves lookup time in the next step
            continue # should need to add all the remaining words, though there is doubtless some slippage between the two systems 
        else: 
            parsed = parser.parse(w)
            word = parsed['word']
            # word 
            passage_dict[word] = {}
            passage_dict[word]['word'] = word
            passage_dict[word]['lemma'] = lemma
            if len(parsed['defs']) >=1:
                passage_dict[word]['senses'] = parsed['defs'][0]['senses']
            else:
                continue
            # inflection
            for entry in entries:
                if entry.get('key') == lemma:
                    if entry.find('itype') is not None: 
                        endings.append(f'{entry.find("itype").text} ')
                    if entry.find('gen') is not None: 
                        endings.append(entry.find('gen').text)
            passage_dict[word]['dictionary_form'] = endings
    
    # now format dictionary for page 
    
    # cleanup so the dictionary won't break 
    delete = [k for k in passage_dict.keys() if passage_dict[k].get('dictionary_form') is None]
    
    for k in delete: 
        del passage_dict[k]
    
    # sort vocab words so they're easy to find on the page   
    vocab_words_sorted = sorted(list(passage_dict.keys()))
    
    for word in vocab_words_sorted:
        bold_text = f'{" ".join(passage_dict[word]["dictionary_form"]).strip(".")}: '
        plain_text = f"{', '.join(passage_dict[word]['senses'])}" 
        
        p = doc.add_paragraph()
        
        
        bold_run = p.add_run()
        bold_run.bold = True 
        bold_run.text = f'{passage_dict[word]["lemma"]} ' + bold_text
        bold_run.font.size = Pt(10)
        
        plain_run = p.add_run()
        plain_run.bold = False
        plain_run.text = plain_text
        plain_run.font.size = Pt(10)
        
        p.paragraph_format.space_after = Pt(0)
        
    # reset column formatting and move to next page 
    
    doc.add_section(0)
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'1')
    doc.add_page_break()
    
sections = doc.sections
for section in sections:
    section.top_margin = Inches(0.5)
    section.bottom_margin = Inches(0.5)
    section.left_margin = Inches(.75)
    section.right_margin = Inches(.75)
    
doc.save('test_2.docx')

77it [1:23:46, 65.28s/it] 


In [None]:
def lookup_words_in_paragraph(paragraph: 'list') -> 'dictionary': 
    """Uses open words to define words"""
    

In [97]:
parser.parse('campestrem')

{'word': 'campestrem',
 'defs': [{'orth': ['campestris', 'campestr'],
   'senses': ['deities who presided over contests/games (pl.)',
    'country deities'],
   'infls': [{'ending': 'em',
     'pos': 'noun',
     'form': {'declension': 'accusative',
      'number': 'singular',
      'gender': 'C'}},
    {'ending': 'em',
     'pos': 'noun',
     'form': {'declension': 'accusative',
      'number': 'singular',
      'gender': ''}}]},
  {'orth': ['campestre', 'campestr'],
   'senses': ['flat/level country/ground (pl.)', 'plains'],
   'infls': [{'ending': 'em',
     'pos': 'noun',
     'form': {'declension': 'accusative',
      'number': 'singular',
      'gender': 'C'}},
    {'ending': 'em',
     'pos': 'noun',
     'form': {'declension': 'accusative',
      'number': 'singular',
      'gender': ''}}]},
  {'orth': ['campestre', 'campestr'],
   'senses': ['level, even, flat, of level field',
    'on open plain/field',
    'plain-dwelling'],
   'infls': [{'ending': 'em',
     'pos': 'adject

In [35]:
br = '———————————————————————————————————————'

In [36]:
from docx.shared import Inches, Cm
doc = Document()
for i in tqdm(x): 
    p = clean_paragraph(i)
    t = parse_paragraph(p)
    doc.add_paragraph(p)
    doc.add_paragraph(br)
    doc.add_section(0)
    
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'2')
    doc.add_paragraph(t)

    doc.add_section(0)
    section = doc.sections[-1]

    sectPr = section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'),'1')
    doc.add_page_break()
    
# clear margins 

sections = doc.sections
for section in sections:
    section.top_margin = Inches(0.5)
    section.bottom_margin = Inches(0.5)
    section.left_margin = Inches(.75)
    section.right_margin = Inches(.75)

doc.save('test.docx')

0it [00:57, ?it/s]


KeyboardInterrupt: 