In [7]:
# Directory containing the JSON source files that this notebook lists and loads.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_DIR = "../raw_data/cltk_json"

In [8]:
import json
import os
import re

import pandas as pd
from cltk import NLP
from cltk.core.data_types import Word, Doc, Sentence
from tqdm import tqdm
# import pickle

In [9]:
# File names in DATA_DIR whose name contains "__grc.json".
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the processing order (and therefore the row order of the resulting table)
# reproducible between runs and machines.
grc_json_list = sorted(f for f in os.listdir(DATA_DIR) if "__grc.json" in f)

In [10]:
# Characters that postprocess_string deletes outright (replaced by the empty string).
removed_chars = ["\n", "\t"]
def recursive_traversal(d: dict):
    """Flatten a nested dict into a flat list of its leaf values.

    Values are visited in the dict's iteration order.

    * If every value of ``d`` is exactly a ``str`` (this includes the empty
      dict), the result is a one-element list holding those values joined by
      single spaces.
    * Otherwise each ``dict`` value is flattened recursively and every other
      value is appended unchanged (so non-string leaves pass through as-is).

    Args:
        d: The (possibly nested) dict to flatten.

    Returns:
        A list of leaf values / joined strings.
    """
    leaves = list(d.values())

    # for/else: the else branch runs only when no non-str value was found.
    for leaf in leaves:
        if type(leaf) is not str:
            break
    else:
        return [" ".join(leaves)]

    collected = []
    for leaf in leaves:
        if type(leaf) is dict:
            collected += recursive_traversal(leaf)
        else:
            collected.append(leaf)
    return collected

def postprocess_string(t: str):
    """Clean up one raw text fragment.

    Steps, in order:

    1. Delete every character listed in the module-level ``removed_chars``.
    2. Collapse each run of two or more spaces into a single space.
    3. Collapse each run of two or more full stops into a single full stop.
    4. Strip surrounding whitespace, then strip any leading full stops/spaces.

    The previous implementation used fixed ``"   "``/``"  "``/``".."``
    replacements, which left residue for longer runs (five spaces became two,
    ``"..."`` became ``".."``). Runs of any length are now collapsed.

    Args:
        t: The raw text fragment.

    Returns:
        The cleaned string.
    """
    for r in removed_chars:
        # NOTE(review): the characters are deleted, not replaced by a space, so
        # "a\nb" becomes "ab" — confirm this is intended for the source texts.
        t = t.replace(r, "")
    t = re.sub(r" {2,}", " ", t)
    t = re.sub(r"\.{2,}", ".", t)
    return t.strip().lstrip(". ")

def get_file_texts(filename: str, dir: str=DATA_DIR):
    """Load one JSON text file and return its cleaned text fragments and metadata.

    Args:
        filename: Name of the JSON file inside ``dir``.
        dir: Directory containing the file; defaults to ``DATA_DIR``. (The
            name shadows the ``dir`` builtin but is kept so existing keyword
            callers keep working.) Previously this argument was ignored and
            ``DATA_DIR`` was always used.

    Returns:
        A ``(strings, metadata)`` tuple. ``strings`` is the list obtained by
        flattening ``file_json["text"]`` with ``recursive_traversal`` and
        passing each item through ``postprocess_string``. ``metadata`` is a
        dict with the keys ``language``, ``title``, ``urn``, ``author`` and
        ``edition``; missing fields default to ``"unknown"`` for the language
        and ``""`` otherwise, and the title falls back from ``englishTitle``
        to ``work``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON document has no ``"text"`` entry.
    """
    full_path = os.path.join(dir, filename)
    # Explicit UTF-8 so decoding does not depend on the platform's locale;
    # the context manager closes the handle even if parsing fails.
    with open(full_path, encoding="utf-8") as fh:
        file_json = json.load(fh)
    metadata = {
        "language": file_json.get("language", "unknown"),
        "title": file_json.get("englishTitle", file_json.get('work', '')),
        "urn": file_json.get("urn", ""),
        "author": file_json.get("author", ""),
        "edition": file_json.get("edition", "")
    }
    text_strings = recursive_traversal(file_json["text"])
    postprocessed_strings = list(map(postprocess_string, text_strings))
    return postprocessed_strings, metadata

In [11]:
def sent_to_raw_text(s: Sentence):
    """Rebuild a plain-text sentence from the tokens of ``s``.

    The ``string`` attribute of every item in ``s.words`` is joined with single
    spaces; the space that this puts in front of a comma, a middle dot or a
    full stop is then removed again.

    Args:
        s: Object exposing ``words``, each with a ``string`` attribute.

    Returns:
        The reconstructed sentence text.
    """
    joined = " ".join(word.string for word in s.words)
    # Applied in this order: comma, middle dot, full stop.
    reattach = ((" ,", ","), (" ·", "·"), (" .", "."))
    for spaced, tight in reattach:
        joined = joined.replace(spaced, tight)
    return joined

In [12]:
# Build one row per sentence: the analysed sentence object plus a metadata dict
# describing the file it came from and its position within that file.
df_sents_raw = pd.DataFrame(columns=("sentence", "metadata"))
grc_nlp = NLP(language="grc", suppress_banner=True)
for path in tqdm(grc_json_list):
    try:
        texts, metadata = get_file_texts(path)
        metadata["file"] = path
        for t_i, text in enumerate(texts):
            nlp_doc = grc_nlp.analyze(text=text) # piecewise, rather than a concat of the entire document, to save memory
            for s_i, s in enumerate(nlp_doc.sentences):
                # A fresh dict per row. Storing the shared `metadata` dict and
                # mutating it afterwards made every row of a file point at the
                # same object, so they all ended up with the last sentence_idx.
                # sentence_idx is "<text fragment index>_<sentence index>".
                row_metadata = {**metadata, "sentence_idx": f"{t_i}_{s_i}"}
                df_sents_raw.loc[len(df_sents_raw)] = s, row_metadata
    except Exception as e:
        # The first failing file stops the whole run; rows already appended
        # (including those from the failing file) are kept.
        print(f"Error reading {path}: {e}")
        break

100%|██████████| 221/221 [11:11<00:00,  3.04s/it]


In [13]:
# Work on a copy so the following cells can add/drop columns on df_sents
# without modifying df_sents_raw, which took ~11 minutes to build above.
df_sents = df_sents_raw.copy()

In [14]:
import string
# Attribute names of a word object that are kept by extract_features.
relevant_vars = [
    "string",
    "pos",
    "lemma",
    "upos",
    "features",
    "category",
    "index_token",
    "governor",
]
def extract_features(w: Word):
    """Convert a word object into a plain dict of the attributes in ``relevant_vars``.

    ``pos`` is converted with ``str``. ``features`` and ``category`` are
    rebuilt as dicts whose keys and values are stringified, with leading and
    trailing ``[``/``]`` characters stripped from each value.

    Args:
        w: The word whose instance attributes (``vars(w)``) are read.

    Returns:
        A dict containing only the retained attributes.
    """
    feats = {}
    for attr_name, attr_value in vars(w).items():
        if attr_name in relevant_vars:
            feats[attr_name] = attr_value

    feats["pos"] = str(feats["pos"])
    for bundle_key in ("features", "category"):
        feats[bundle_key] = {
            str(key): str(value).strip("[]")
            for key, value in feats[bundle_key].items()
        }
    return feats
# Per-sentence list of word-feature dicts, and the reconstructed sentence text.
df_sents["sentence_obj"] = df_sents["sentence"].apply(
    lambda sent: [extract_features(word) for word in sent.words]
)
df_sents["sentence_txt"] = df_sents["sentence"].apply(sent_to_raw_text)

# Rows whose text contains at least one ASCII Latin letter.
latin_alphabet_indices = df_sents["sentence_txt"].str.contains(r"[A-Za-z]")

# drop latin characters, there are some mistakenly in a few places
# In the flagged rows only: remove ASCII letters and digits everywhere, then
# strip any printable-ASCII characters (letters, digits, punctuation,
# whitespace) from both ends.
# NOTE(review): the strip also removes a leading/trailing "." or "," in these
# rows — confirm that is intended.
df_sents.loc[latin_alphabet_indices, "sentence_txt"] = (
    df_sents.loc[latin_alphabet_indices, "sentence_txt"]
    .str.replace("[A-Za-z0-9]", "", regex=True)
    .str.strip(string.printable)
)

# The original sentence objects are no longer needed once both derived columns exist.
df_sents = df_sents.drop(columns="sentence")

In [15]:
# len_words: number of entries in sentence_obj; len_chars: length of sentence_txt.
df_sents = df_sents.assign(
    len_words=df_sents["sentence_obj"].str.len(),
    len_chars=df_sents["sentence_txt"].str.len(),
)
# Keep only sentences whose text is at least 4 characters long.
df_sents = df_sents.loc[df_sents["len_chars"] >= 4]

In [17]:
# Write the filtered sentence table to disk as Parquet (path is relative to the
# kernel's working directory).
df_sents.to_parquet("../data/sentences.parquet")