In [1]:
import cltk
import os
from pathlib import Path
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tei_reader import TeiReader
from beta_code import beta_code_to_greek, greek_to_beta_code
import re

# Display the installed CLTK version so the environment used for this run is recorded.
cltk.curr_version

cltk 1.1.6 (c:\users\annet\anaconda3\lib\site-packages)

In [2]:
# Corpus fetcher created for the "grc" language code; its import_corpus() method is
# called in later cells when a corpus directory is not yet present on disk.
fc = cltk.data.fetch.FetchCorpus("grc")

In [3]:
# Directory of the Perseus Greek lexica inside the user's cltk_data folder.
# os.path.join is used instead of hard-coded "\\" so the path is valid on any OS
# (on Windows the resulting string is identical to the previous one).
lexica_dir = os.path.join(str(Path.home()), "cltk_data", "grc", "lexicon", "greek_lexica_perseus")
if not os.path.exists(lexica_dir):
    # Fetch the corpus only when the directory is missing.
    fc.import_corpus("greek_lexica_perseus")

# Load the two analysis files. The context managers guarantee the file handles are
# closed; the previous bare open() calls left them open.
with open(os.path.join(lexica_dir, "greek-analyses_1.json"), "r", encoding="utf-8") as lex_file:
    lex1 = json.load(lex_file)
with open(os.path.join(lexica_dir, "greek-analyses_2.json"), "r", encoding="utf-8") as lex_file:
    lex2 = json.load(lex_file)

In [4]:
# Root of the Perseus Greek text corpus inside the user's cltk_data folder, and the
# Herodotus sub-directory the later cells read from. Built with os.path.join rather
# than hard-coded "\\" separators; both values remain plain strings.
perseus_dir = os.path.join(str(Path.home()), "cltk_data", "grc", "text", "grc_text_perseus")
working_dir = os.path.join(perseus_dir, "Herodotus", "opensource")
if not os.path.exists(perseus_dir):
    # Fetch the corpus only when the directory is missing.
    fc.import_corpus("grc_text_perseus")

In [5]:
# Load the JSON rendering of the Herodotus Greek text. The file is opened with an
# explicit UTF-8 encoding (matching how the lexica files are read) and inside a
# context manager so the handle is closed once parsing finishes.
with open(os.path.join(working_dir, "hdt_gk.xml.json"), "r", encoding="utf-8") as hdt_file:
    hdt_gk = json.load(hdt_file)

In [211]:
# Read the TEI XML file and take its text content.
reader = TeiReader()
# os.path.join replaces the hard-coded "\\" separator (same result on Windows).
corpora_gk = reader.read_file(os.path.join(working_dir, "hdt_gk.xml"))
txt_gk_beta = corpora_gk.text
# Convert the text from beta code to Greek characters.
txt_gk = beta_code_to_greek(txt_gk_beta)
# Normalise spacing after periods:
#   1. a period immediately followed by a non-whitespace character gets a space inserted;
#   2. a period followed by one or more spaces is reduced to a period plus one space.
rx = r"\.(?=\S)"
txt_gk = re.sub(rx, ". ", txt_gk)
txt_gk = re.sub(r"\.[ ]{1,}", ". ", txt_gk)
# Drop periods at the very start/end of the whole text.
txt_gk = txt_gk.strip(".")
# One entry per line of the text, each with leading/trailing periods removed.
# NOTE(review): presumably each line of the TEI text is one book - confirm against the file.
books_gk = [b.strip(".") for b in txt_gk.split("\n")]

In [212]:
# First newline-separated chunk of the normalised text; used as "book 1" below.
book1 = books_gk[0]

In [213]:
# Display the text of book1 for a visual check of the beta-code conversion and spacing.
book1

"Ἡροδότου Ἁλικαρνησσέος ἱστορίης ἀπόδεξις ἥδε, ὡς μήτε τὰ γενόμενα ἐξ ἀνθρώπων τῷ χρόνῳ ἐξίτηλα γένηται, μήτε ἔργα μεγάλα τε καὶ θωμαστά, τὰ μὲν Ἕλλησι τὰ δὲ βαρβάροισι ἀποδεχθέντα, ἀκλεᾶ γένηται, τά τε ἄλλα καὶ δι' ἣν αἰτίην ἐπολέμησαν ἀλλήλοισι. Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασὶ γενέσθαι τῆς διαφορῆς. τούτους γὰρ ἀπὸ τῆς Ἐρυθρῆς καλεομένης θαλάσσης ἀπικομένους ἐπὶ τήνδε τὴν θάλασσαν, καὶ οἰκήσαντας τοῦτον τὸν χῶρον τὸν καὶ νῦν οἰκέουσι, αὐτίκα ναυτιλίῃσι μακρῇσι ἐπιθέσθαι, ἀπαγινέοντας δὲ φορτία Αἰγύπτιά τε καὶ Ἀσσύρια τῇ τε ἄλλῃ ἐσαπικνέεσθαι καὶ δὴ καὶ ἐς Ἄργος. τὸ δὲ Ἄργος τοῦτον τὸν χρόνον προεῖχε ἅπασι τῶν ἐν τῇ νῦν Ἑλλάδι καλεομένῃ χωρῇ. ἀπικομένους δὲ τούς Φοίνικας ἐς δὴ τὸ Ἄργος τοῦτο διατίθεσθαι τὸν φόρτον. πέμπτῃ δὲ ἢ ἕκτῃ ἡμέρῃ ἀπ' ἧς ἀπίκοντο, ἐξεμπολημένων σφι σχεδόν πάντων, ἐλθεῖν ἐπὶ τὴν θάλασσαν γυναῖκας ἄλλας τε πολλάς καὶ δὴ καὶ τοῦ βασιλέος θυγατέρα· τὸ δέ οἱ οὔνομα εἶναι, κατὰ τὠυτὸ τὸ καὶ Ἕλληνές λέγουσι, Ἰοῦν τὴν  Ἰνάχου·ταύτας στάσας κατά πρύμνην 

In [214]:
from cltk import NLP
import pandas as pd
# Build the CLTK pipeline for language code "grc"; the processes it contains are
# listed in the message printed when this cell runs.
cltk_nlp = NLP(language="grc")

‎𐤀 CLTK version '1.1.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


In [331]:
def analyze_sentences(text, nlp=None):
    """Analyze ``text`` chunk by chunk and return one record per analyzed word.

    The text is split on ".", and every non-blank chunk is passed to
    ``nlp.analyze(text=chunk)``. Each word yielded by the analysis becomes a dict
    with the keys ``string``, ``lemma``, ``features``, ``upos``, ``sentence``,
    ``sentencepart``, ``idxtoken`` and ``idxgovernor``.

    Parameters
    ----------
    text : str
        Text to analyze; "." is treated as the chunk delimiter.
    nlp : object, optional
        Any object with an ``analyze(text=...)`` method whose result can be
        iterated word by word. Defaults to the notebook-level ``cltk_nlp``
        pipeline, which keeps existing calls working unchanged.

    Returns
    -------
    list of dict
        Word records in reading order. ``sentence`` is the position of the chunk
        in ``text.split(".")``; ``sentencepart`` is a running counter that is
        incremented every time a word with ``index_token == 0`` is seen.
    """
    if nlp is None:
        # Fall back to the pipeline built in the notebook.
        nlp = cltk_nlp

    analyzed_words = []
    spartcount = -1
    for scount, sentence in enumerate(text.split(".")):
        # Blank chunks (e.g. from "..", ". ." or a trailing ".") contain nothing to
        # analyze, so the pipeline is not invoked for them. The chunk still consumes
        # its index so that `sentence` numbering is unchanged for all other chunks.
        if not sentence.strip():
            continue
        analysis = nlp.analyze(text=sentence)
        for word in analysis:
            # A token index of 0 marks the start of a new part within the chunk.
            if int(word.index_token) == 0:
                spartcount += 1
            analyzed_words.append({
                "string": word.string,
                "lemma": word.lemma,
                "features": word.features,
                "upos": word.upos,
                "sentence": scount,
                "sentencepart": spartcount,
                "idxtoken": word.index_token,
                "idxgovernor": word.governor
            })
    return analyzed_words
def analyze_book(text):
    """Analyze one book of text and return its word records.

    Prints a short progress message and then returns the result of
    ``analyze_sentences(text)`` unchanged.
    """
    print("Analyzing sentences...")  # the call below takes some time
    return analyze_sentences(text)
# Run the analysis on the first entry of books_gk (slow, see analyze_book).
analyzed_words = analyze_book(text=books_gk[0])

Analyzing sentences...


In [333]:
def not_none(l, excluded=("[definition unavailable]", "")):
    """Return the usable entries of ``l`` with surrounding ",", "." and ";" removed.

    Parameters
    ----------
    l : iterable
        Candidate strings; may also contain ``None`` or NaN values.
    excluded : collection of str, optional
        Values that must not appear in the result. The default is an immutable
        tuple so the default cannot be modified between calls; lists passed by
        callers are still accepted.

    Returns
    -------
    list of str
        Entries in their original order. An entry is dropped when it is ``None``,
        NaN, or listed in ``excluded`` either before or after stripping, so a
        value such as ";" no longer ends up as an empty string in the result.
    """
    cleaned = []
    for e in l:
        # `e != e` is only true for NaN.
        if e is None or e != e or e in excluded:
            continue
        stripped = e.strip(",.;")
        # Re-check after stripping: punctuation-only entries collapse to "".
        if stripped in excluded:
            continue
        cleaned.append(stripped)
    return cleaned

def get_definition(row):
    """Look up the definitions for the token in ``row["string_beta"]``.

    ``lex1`` is consulted first and ``lex2`` second. The first lexicon that holds
    a non-``None``, non-empty entry list for the token supplies the result: the
    "definition" field of every entry, passed through ``not_none``. Returns
    ``None`` when neither lexicon has such an entry list.
    """
    key = row["string_beta"]
    for lexicon in (lex1, lex2):
        if key in lexicon and lexicon[key] is not None and len(lexicon[key]) != 0:
            return not_none([entry["definition"] for entry in lexicon[key]])
    return None

def get_detail(row):
    """Look up the "pos" details for the token in ``row["string_beta"]``.

    The entry list is taken from ``lex1`` when it has a non-``None``, non-empty
    list for the token, otherwise from ``lex2`` under the same condition. The
    "pos" field of every entry is passed through ``not_none`` and returned.
    Returns ``None`` when neither lexicon has such an entry list.
    """
    token = row["string_beta"]
    hits = None
    if token in lex1 and lex1[token] is not None and len(lex1[token]) != 0:
        hits = lex1[token]
    elif token in lex2 and lex2[token] is not None and len(lex2[token]) != 0:
        hits = lex2[token]

    if hits is None:
        return None
    return not_none([hit["pos"] for hit in hits])

def get_morph_features(feat):
    """Flatten a feature container into a plain ``{str: str}`` dict.

    ``feat`` must provide ``keys()`` and item access. Each key and each value is
    converted with ``str``; leading and trailing "[" / "]" characters are then
    removed from the value text.
    """
    flattened = {}
    for name in feat.keys():
        flattened[str(name)] = str(feat[name]).strip("[]")
    return flattened

def get_df(analyzed_words):
    """Turn the word records into the per-token table used by the notebook.

    Adds beta-code versions of the token and lemma, lexicon lookups
    (``definition``, ``detail``), flattened morphological features and the
    ``token_id`` / ``gov_id`` identifiers, then returns only the output columns.
    """
    tokens = pd.DataFrame(analyzed_words)

    # Strip "," and "·" from both ends of each token so that its beta-code form
    # can be used as a dictionary key.
    stripped = tokens["string"].str.strip(",·")
    tokens["string_nocomma"] = stripped
    tokens["string_beta"] = stripped.apply(greek_to_beta_code)
    tokens["lemma_beta"] = tokens["lemma"].apply(greek_to_beta_code)

    # Definition and detail are looked up row by row from the beta-code columns.
    tokens["definition"] = tokens.apply(get_definition, axis=1)
    tokens["detail"] = tokens.apply(get_detail, axis=1)
    tokens["morph_features"] = tokens["features"].apply(get_morph_features)

    # Ids have the form "<sentencepart>_<index>"; the governor id uses the same
    # prefix as the token it belongs to.
    part_prefix = tokens["sentencepart"].astype(str) + "_"
    tokens["token_id"] = part_prefix + tokens["idxtoken"].astype(str)
    tokens["gov_id"] = part_prefix + tokens["idxgovernor"].astype(str)

    output_columns = [
        "string", "lemma", "upos", "sentence", "sentencepart",
        "string_beta", "lemma_beta", "definition", "detail",
        "morph_features", "token_id", "gov_id",
    ]
    return tokens[output_columns]
# Build the token table from the word records produced by analyze_book above.
df_book1 = get_df(analyzed_words)

In [334]:
# Display the token table for inspection.
df_book1

Unnamed: 0,string,lemma,upos,sentence,sentencepart,string_beta,lemma_beta,definition,detail,morph_features,token_id,gov_id
0,Ἡροδότου,Ἡρόδοτος,PROPN,0,0,*(hrodo/tou,*(hro/dotos,[],[noun sg masc gen],"{'Case': 'genitive', 'Gender': 'masculine', 'N...",0_0,0_2
1,Ἁλικαρνησσέος,Ἁλικαρνασσεύς,NOUN,0,0,*(alikarnhsse/os,*(alikarnasseu/s,,,"{'Case': 'genitive', 'Gender': 'masculine', 'N...",0_1,0_2
2,ἱστορίης,ἱστορία,NOUN,0,0,i(stori/hs,i(stori/a,[inquiry],[noun sg fem gen epic ionic],"{'Case': 'genitive', 'Gender': 'feminine', 'Nu...",0_2,0_3
3,ἀπόδεξις,ἀπόδειξις,NOUN,0,0,a)po/decis,a)po/deicis,"[acceptance, showing forth, making]","[noun sg fem nom, noun pl fem acc epic doric i...","{'Case': 'nominative', 'Gender': 'feminine', '...",0_3,0_-1
4,"ἥδε,",ὅδε,DET,0,0,h(/de,o(/de,[this],[pron sg fem nom indeclform],"{'Case': 'genitive', 'Gender': 'feminine', 'Nu...",0_4,0_2
...,...,...,...,...,...,...,...,...,...,...,...,...
29047,τῶν,ὁ,DET,1112,2010,tw=n,o(,[the following],[article pl neut gen indeclform],"{'Case': 'genitive', 'Definiteness': 'definite...",2010_10,2010_11
29048,θνητῶν,θνητός,NOUN,1112,2010,qnhtw=n,qnhto/s,"[liable to death, mortal]",[adj pl fem gen],"{'Case': 'genitive', 'Gender': 'masculine', 'N...",2010_11,2010_8
29049,τὸ,ὁ,DET,1112,2010,to\,o(,,,"{'Case': 'accusative', 'Definiteness': 'defini...",2010_12,2010_13
29050,τάχιστον,ταχύς,ADJ,1112,2010,ta/xiston,taxu/s,"[swift, fleet]",[adj sg masc acc irreg_superl],"{'Case': 'accusative', 'Degree': 'superlative'...",2010_13,2010_14


In [335]:
# Save the token table to a Parquet file in the current working directory.
df_book1.to_parquet("herodotus_book_1.parquet")

In [336]:
# Reload the token table from the Parquet file written in the previous cell.
df_book1 = pd.read_parquet("herodotus_book_1.parquet")

In [337]:
# Sanity check: smallest and largest value of the "sentence" column in the reloaded table.
print(df_book1.sentence.min())
print(df_book1.sentence.max())

0
1112
