In [1]:
import cltk
import os
from pathlib import Path
import json
import pandas as pd
from beta_code import beta_code_to_greek, greek_to_beta_code
from xmlUtils import XMLTag, TagType, TagListContentExtractor
from collections import Counter

cltk.curr_version

cltk 1.1.6 (c:\users\annet\anaconda3\lib\site-packages)

In [2]:
fc = cltk.data.fetch.FetchCorpus("grc")

In [3]:
lexica_dir = f"{str(Path.home())}\\cltk_data\\grc\\lexicon\\greek_lexica_perseus"
if not os.path.exists(lexica_dir):
    fc.import_corpus("greek_lexica_perseus")
lex1, lex2 = json.load(open(f"{lexica_dir}\\greek-analyses_1.json", "r", encoding='utf-8')), json.load(open(f"{lexica_dir}\\greek-analyses_2.json", "r", encoding='utf-8'))

In [4]:
perseus_dir = f"{str(Path.home())}\\cltk_data\\grc\\text\\grc_text_perseus"
working_dir = f"{perseus_dir}\\Herodotus\\opensource"
if not os.path.exists(perseus_dir):
    fc.import_corpus("grc_text_perseus")

In [6]:
raw_gk = open(f"{working_dir}\\hdt_gk.xml", "r").read()

In [7]:
def get_all_elements(corpora):
    processed_content = []
    curr_element = ""
    for i in range(len(corpora)):
        c = corpora[i]
        if c == "<":
            processed_content.append(curr_element)
            curr_element = "<"
        elif c == ">":
            processed_content.append(curr_element + ">")
            curr_element = ""
        else:
            curr_element += c
    return processed_content


#tags_to_be_removed = ["/l", "\n", ".\n", "\n\n", "\n\n\n", "l", "p", "/p", "/body", "/text", '/TEI.2', ""]
tags_to_be_removed = ["", "\n", "\n"*2, "\n"*3, "<p>", "</p>"]
def clean_tags(all_tags):
    all_tags = all_tags[all_tags.index("<body>")+1:]
    all_tags = all_tags[:all_tags.index("</body>")]
    all_tags = [t.strip("\n") for t in all_tags if t not in tags_to_be_removed]
    return all_tags

def remove_between_tag_old(all_tags, tag_type="note"):
    filtering_out = False
    filtered_tags = []
    for tag in all_tags:
        #if tag.isTag and tag.tagName == tag_type and tag.tagType == TagType.OPENING:
        if tag.startswith(f"<{tag_type} ") or tag.startswith(f"<{tag_type}>"):
            filtering_out = True
            continue
        #elif tag.isTag and tag.tagName == tag_type and tag.tagType == TagType.CLOSING:
        elif tag.startswith(f"</{tag_type}") or tag.startswith(f"</{tag_type}>"):
            filtering_out = False
            continue
        if not filtering_out:
            filtered_tags.append(tag)
    return filtered_tags


def remove_tags(all_tags, tag_type, keep_content=True):
    filtering_out = False
    filtered_tags = []
    for tag in all_tags:
        if tag.isTag and tag.tagName == tag_type and tag.tagType == TagType.OPENING:
        #if tag.startswith(f"<{tag_type} ") or tag.startswith(f"<{tag_type}>"):
            filtering_out = True
            continue
        elif tag.isTag and tag.tagName == tag_type and tag.tagType == TagType.CLOSING:
        # elif tag.startswith(f"</{tag_type}") or tag.startswith(f"</{tag_type}>"):
            filtering_out = False
            continue
        if not filtering_out or keep_content:
            filtered_tags.append(tag)
    return filtered_tags

xml_args_regex = r'\w+=\"[^\"]+\"'
# hierarchy = ["chapter", "section", "para"]
def args_to_obj(args):
    o = {}
    for arg in args:
        k,v = arg.split("=")
        o[k] = v.strip('"')
    return o

empty_properties = {
    "book": None,
    "chapter": None,
    "section": None,
    "quote": None,
    "source": None
}

def flatten_text_tags(all_tags):
    output = []
    flattening = False
    text_so_far = []
    for tag in all_tags:
        if tag.tagName == "text":
            if not flattening:
                flattening = True
            text_so_far.append(tag)
        else:
            flattening = False
            if text_so_far != []:
                newTag = XMLTag("".join(t.string for t in text_so_far))
                output.append(newTag)
                text_so_far = []
            output.append(tag)
    return output

# then, group text. if not quote, join by concat
# if quote is not none, join with "\n"

In [178]:
all_elements_gk = get_all_elements(raw_gk)
all_elements_gk = clean_tags(all_elements_gk)
all_elements_gk = [XMLTag(e) for e in all_elements_gk]
all_elements_gk = remove_tags(all_elements_gk, "note", keep_content=False)
print(len(all_elements_gk))
all_elements_gk = flatten_text_tags(all_elements_gk)
print(len(all_elements_gk))
text = TagListContentExtractor(all_elements_gk).parse_greek()
text_df = pd.DataFrame(text).astype(str)
text_df.text = text_df.text.str.strip(" \n")
text_df["text_g"] = text_df.text.apply(beta_code_to_greek)

12327
12254


In [179]:
text_df

Unnamed: 0,book,chapter,section,quote,source,text,text_g
0,1,1,0,,,*(hrodo/tou *(alikarnhsse/os i(stori/hs a)po/d...,"Ἡροδότου Ἁλικαρνησσέος ἱστορίης ἀπόδεξις ἥδε, ..."
1,1,1,1,,,*perse/wn me/n nun oi( lo/gioi *foi/nikas ai)t...,Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασ...
2,1,1,2,,,to\ de\ *)/argos tou=ton to\n xro/non proei=xe...,τὸ δὲ Ἄργος τοῦτον τὸν χρόνον προεῖχε ἅπασι τῶ...
3,1,1,3,,,pe/mpth| de\ h)\ e(/kth| h(me/rh| a)p' h(=s a)...,"πέμπτῃ δὲ ἢ ἕκτῃ ἡμέρῃ ἀπ' ἧς ἀπίκοντο, ἐξεμπο..."
4,1,1,4,,,tau/tas sta/sas kata/ pru/mnhn th=s neo\s w)ne...,ταύτας στάσας κατά πρύμνην τῆς νεὸς ὠνέεσθαι τ...
...,...,...,...,...,...,...,...
4385,9,121,1,,,tau=ta de\ poih/santes a)pe/pleon e)s th\n *(e...,"ταῦτα δὲ ποιήσαντες ἀπέπλεον ἐς τὴν Ἑλλάδα, τά..."
4386,9,122,1,,,tou/tou de\ *)artau/+ktew tou= a)nakremasqe/nt...,τούτου δὲ Ἀρταΰκτεω τοῦ ἀνακρεμασθέντος προπάτ...
4387,9,122,2,,,&ldquo;e)pei\ *zeu\s *pe/rsh|si h(gemoni/hn di...,"&λδθυο;ἐπεὶ Ζεὺς Πέρσῃσι ἡγεμονίην διδοῖ, ἀνδρ..."
4388,9,122,3,,,&rdquo; *ku=ros de\ tau=ta a)kou/sas kai\ ou) ...,&ρδθυο; Κῦρος δὲ ταῦτα ἀκούσας καὶ οὐ θωμάσας ...


In [180]:
#[(e.tagName, e.tagType, e.string, e.tagProps) for e in all_elements_gk]

In [182]:
raw_eng = open(f"{working_dir}\\hdt_eng.xml", "r").read()

In [183]:
def fix_foreign(all_tags):
    output = []
    parsing_foreign = False
    foreign_list = []
    foreign_lang = None
    for tag in all_tags:
        if tag.tagName == "foreign" and tag.tagType == TagType.OPENING:
            parsing_foreign = True
            foreign_list = []
            foreign_lang = tag.tagProps["lang"]
        elif tag.tagName == "foreign" and tag.tagType == TagType.CLOSING:
            parsing_foreign = False
            foreign_str = "".join(t.string for t in foreign_list if t.tagName == "text")
            if foreign_lang == "greek":
                foreign_str = beta_code_to_greek(foreign_str)
            newTag = XMLTag(foreign_str)
            output.append(newTag)
            foreign_lang = None
        else:
            if parsing_foreign:
                foreign_list.append(tag)
            else:
                output.append(tag)
    return output

def taglist_to_string(all_tags):
    s = "".join(e.string for e in all_tags)
    return s

def process_notes(all_tags):
    output = []
    looping_text = False
    #note_count = 0
    note_count = 0
    looping_note = False
    text_so_far = []
    note_so_far = []
    notes_so_far = []
    for tag in all_tags:
        #print(f"{tag.string=}, {looping_text=}, {looping_note=}")
        if tag.tagName == "text":
            if looping_note:
                note_so_far.append(tag)
            elif looping_text:
                text_so_far.append(tag)
            else:
                looping_text = True
                text_so_far = [tag]
                notes_so_far = []
        elif tag.tagName == "note":
            if tag.tagType == TagType.OPENING:
                looping_note = True
                note_so_far = []
                text_so_far.append(XMLTag(f"[^{note_count}]"))
                note_count += 1
            else:
                looping_note = False
                note_content = "".join(e.string for e in note_so_far)
                notes_so_far.append(note_content)
        else:
            if text_so_far != []:
                if notes_so_far != []:
                    text_so_far.append(XMLTag("\n\n"))
                    for i, note in enumerate(notes_so_far):
                        text_so_far.append(XMLTag(f"[^{i}]: {note}\n"))
                newTag = XMLTag("".join(e.string for e in text_so_far))
                output.append(newTag)
            note_count = 0
            looping_text = False
            looping_note = False
            text_so_far = []
            note_so_far = []
            notes_so_far = []
            output.append(tag)
    return output



In [184]:
# for testing purposes
notes_processed = process_notes([
    XMLTag('<milestone n="18" unit="chapter"/>'),
    XMLTag('<milestone n="1" unit="section"/>'),
    XMLTag('<milestone unit="para"/>'),
    XMLTag('When the horsemen had ridden away, Mardonius sent a herald, with this message: &ldquo;Men of Phocis, be of good courage, for you have shown yourselves to be valiant men, and not as it was reported to me. Now push this war zealously forward, for you will outdo neither myself nor the king in the rendering of service.&rdquo;'),
    XMLTag('<note anchored="yes" resp="ed">'),
    XMLTag('That is, serve us and we will serve you.'),
    XMLTag('</note>'),
    XMLTag(' This is how the matter of the Phocians'),
    XMLTag('<note anchored="yes" resp="ed">'),
    XMLTag('From the region of Phocis.'),
    XMLTag('</note>'),
    XMLTag(' turned out.'),
    XMLTag('<milestone n="19" unit="chapter"/>'),
    XMLTag('<milestone n="19" unit="chapter"/>'),
    XMLTag('When the horsemen had ridden away, Mardonius sent a herald, with this message: &ldquo;Men of Phocis, be of good courage, for you have shown yourselves to be valiant men, and not as it was reported to me. Now push this war zealously forward, for you will outdo neither myself nor the king in the rendering of service.&rdquo;'),
    XMLTag('<note anchored="yes" resp="ed">'),
    XMLTag('That is, serve us and we will serve you.'),
    XMLTag('</note>'),
    XMLTag(' This is how the matter of the Phocians'),
    XMLTag('<note anchored="yes" resp="ed">'),
    XMLTag('From the region of Phocis.'),
    XMLTag('</note>'),
    XMLTag(' turned out.'),
    XMLTag('<milestone n="19" unit="chapter"/>'),
    XMLTag('<milestone n="19" unit="chapter"/>')
])
#notes_processed
#[e.string for e in notes_processed]

In [185]:
all_elements_eng = get_all_elements(raw_eng)
all_elements_eng = clean_tags(all_elements_eng)
all_elements_eng = [XMLTag(e) for e in all_elements_eng]
tags_to_be_removed_keep_content = ["name", "placeName", "date", "title", "dateRange", "cit", "bibl"]
for tagName in tags_to_be_removed_keep_content:
    all_elements_eng = remove_tags(all_elements_eng, tagName, keep_content=True)
# TODO: handle bibl separately otherwise problems with quotes?
all_elements_eng = fix_foreign(all_elements_eng)
all_elements_eng = flatten_text_tags(all_elements_eng)
all_elements_eng = process_notes(all_elements_eng)
text_eng = TagListContentExtractor(all_elements_eng).parse_eng()
df_text_eng = pd.DataFrame(text_eng).astype(str)
df_text_eng.text = df_text_eng.text.str.strip(" \n")
df_text_eng = df_text_eng.rename(columns={"text": "text_eng"})


# DONE name: remove tag, retain contents
# DONE milestone: handle as before
# DONE placeName: remove tag, retain contents
# DONE note: loop over texts, same logic as flattening. when happen upon a note node, do not stop loop, but rather, save the note into a list, and in its place, leave [^notecount] as text tag. When loop over (happen upon non-text, non-node), add as many new text tags as there are nodes in the format [^notecount]: note content
# DONE foreign: if <foreign lang="greek", replace beta code. otherwise, remove tag, do not touch in-between.
# DONE date: remove tag, retain contents
# DONE title: remove tag, retain contents
# DONE div1: handle as before
# DONE dateRange: remove tag, retain contents
# DONE? bibl: remove tag, retain contents # NVM when bibl inside note, remove tag, do not touch in-between. Otherwise, add to last text row
# DONE quote: join l's within
# DONE cit: remove tag, keep content?. TODO: later, maybe make words within it italic?
# DONE text: merge consecutive TEXT tags

In [186]:
def get_rows(df, book, chapter, section):
    return df[(df.book == str(book)) & (df.chapter == str(chapter)) & (df.section == str(section))]

In [187]:
# fix 1: wrong section number in eng file
idx = df_text_eng[(df_text_eng.book == "1") & (df_text_eng.chapter == "74") & (df_text_eng.section == "6")].index[0]
df_text_eng.loc[idx, "section"] = "5"

In [188]:
# fix 2: double section
idx = df_text_eng[(df_text_eng.book == "3") & (df_text_eng.chapter == "39") & (df_text_eng.section == "2")].index[1]
df_text_eng.loc[idx, "section"] = "3"

In [189]:
# fix 3: double section
idx = get_rows(df_text_eng, 4, 203, 1).index[1]
df_text_eng.loc[idx, "section"] = "3"

In [190]:
# fix 4: misnumbered section
idx = get_rows(df_text_eng, 6, 11, 4).index[0]
df_text_eng.loc[idx, "section"] = "3"

In [191]:
# fix 5: misnumbered section
idx = get_rows(df_text_eng, 6, 49, 3).index[0]
df_text_eng.loc[idx, "section"] = "2"

In [192]:
# fix 6: misnumbered section
idx = get_rows(df_text_eng, 6, 58, 6).index[0]
df_text_eng.loc[idx, "section"] = "3"

In [193]:
# fix 7: misnumbered section
idx = get_rows(df_text_eng, 7, 19, 3).index[0]
df_text_eng.loc[idx, "section"] = "2"

In [194]:
# fix 8: misnumbered section
idx = get_rows(df_text_eng, 7, 37, 4).index[0]
df_text_eng.loc[idx, "section"] = "3"

In [195]:
# fix 9: misnumbered section
idx = get_rows(df_text_eng, 7, 41, 3).index[0]
df_text_eng.loc[idx, "section"] = "2"

In [196]:
# fix 10: misnumbered section
idx = get_rows(df_text_eng, 7, 67, 3).index[0]
df_text_eng.loc[idx, "section"] = "2"

In [197]:
# fix 11: misnumbered section
idx = get_rows(df_text_eng, 7, 83, 3).index[0]
df_text_eng.loc[idx, "section"] = "2"

In [198]:
# fix 12: double section
idx = get_rows(text_df, 6, 5, 2).index[1]
text_df.loc[idx, "section"] = "3"

In [199]:
# fix 13: double section
idx = get_rows(text_df, 8, 135, 2).index[1]
text_df.loc[idx, "section"] = "3"

In [204]:
assert len(set(text_df.set_index(["book", "chapter", "section"]).index) - set(df_text_eng.set_index(["book", "chapter", "section"]).index)) == 0

In [205]:
assert len(set(df_text_eng.set_index(["book", "chapter", "section"]).index) - set(text_df.set_index(["book", "chapter", "section"]).index)) == 0

In [206]:
text_df.to_parquet("herodotus_books_grc.parquet")
df_text_eng.to_parquet("herodotus_books_eng.parquet")

In [880]:
#text_df[text_df.quote != "None"]["text_g"].iloc[0]
#text_df["text_g"].iloc[0]

In [884]:
text_df

Unnamed: 0,book,chapter,section,quote,source,text,text_g
0,1,1,0,,,*(hrodo/tou *(alikarnhsse/os i(stori/hs a)po/d...,"Ἡροδότου Ἁλικαρνησσέος ἱστορίης ἀπόδεξις ἥδε, ..."
1,1,1,1,,,*perse/wn me/n nun oi( lo/gioi *foi/nikas ai)t...,Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασ...
2,1,1,2,,,to\ de\ *)/argos tou=ton to\n xro/non proei=xe...,τὸ δὲ Ἄργος τοῦτον τὸν χρόνον προεῖχε ἅπασι τῶ...
3,1,1,3,,,pe/mpth| de\ h)\ e(/kth| h(me/rh| a)p' h(=s a)...,"πέμπτῃ δὲ ἢ ἕκτῃ ἡμέρῃ ἀπ' ἧς ἀπίκοντο, ἐξεμπο..."
4,1,1,4,,,tau/tas sta/sas kata/ pru/mnhn th=s neo\s w)ne...,ταύτας στάσας κατά πρύμνην τῆς νεὸς ὠνέεσθαι τ...
...,...,...,...,...,...,...,...
4457,9,121,1,,,tau=ta de\ poih/santes a)pe/pleon e)s th\n *(e...,"ταῦτα δὲ ποιήσαντες ἀπέπλεον ἐς τὴν Ἑλλάδα, τά..."
4458,9,122,1,,,tou/tou de\ *)artau/+ktew tou= a)nakremasqe/nt...,τούτου δὲ Ἀρταΰκτεω τοῦ ἀνακρεμασθέντος προπάτ...
4459,9,122,2,,,&ldquo;e)pei\ *zeu\s *pe/rsh|si h(gemoni/hn di...,"&λδθυο;ἐπεὶ Ζεὺς Πέρσῃσι ἡγεμονίην διδοῖ, ἀνδρ..."
4460,9,122,3,,,&rdquo; *ku=ros de\ tau=ta a)kou/sas kai\ ou) ...,&ρδθυο; Κῦρος δὲ ταῦτα ἀκούσας καὶ οὐ θωμάσας ...
