In [34]:
import cltk
from cltk import NLP
import pandas as pd
import os
import json
from pathlib import Path
from beta_code import beta_code_to_greek, greek_to_beta_code
cltk_nlp = NLP(language="grc")

‎𐤀 CLTK version '1.1.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


In [30]:
fc = cltk.data.fetch.FetchCorpus("grc")

In [31]:
df = pd.read_parquet("herodotus_books_grc.parquet")

In [32]:
lexica_dir = f"{str(Path.home())}\\cltk_data\\grc\\lexicon\\greek_lexica_perseus"
if not os.path.exists(lexica_dir):
    fc.import_corpus("greek_lexica_perseus")
lex1, lex2 = json.load(open(f"{lexica_dir}\\greek-analyses_1.json", "r", encoding='utf-8')), json.load(open(f"{lexica_dir}\\greek-analyses_2.json", "r", encoding='utf-8'))

In [75]:
def get_rows(df, book, chapter, section):
    return df[(df.book == str(book)) & (df.chapter == str(chapter)) & (df.section == str(section))]

def analyze_section(subdf): #, book, chapter, section):
    #rows = get_rows(df, book, chapter, section)

    analyzed_words = []
    spartcount = -1
    # print(rows)
    for row in subdf.to_dict('records'):
        # print(row)
        #if row["quote"] != "None":
        #    analyzed_words.append('\n\"')
        analysis = cltk_nlp.analyze(text=row["text_g"])
        for word in analysis:
            if int(word.index_token) == 0:
                spartcount += 1
            analyzed_words.append({
                "string": word.string,
                "lemma": word.lemma,
                "features": word.features,
                "upos": word.upos,
                "sentencepart": spartcount,
                "idxtoken": word.index_token,
                "idxgovernor": word.governor
            })
        #if row["quote"] != "None":
        #    analyzed_words.append('"\n')
    return analyzed_words

In [47]:
def not_none(l, excluded=["[definition unavailable]", ""]):
    return [e.strip(",.;") for e in l if e == e and e is not None and e not in excluded]

def get_definition(row):
    s = row["string_beta"]
    if s in lex1 and lex1[s] is not None and len(lex1[s]) != 0:
        return not_none([l["definition"] for l in lex1[s]])
    elif s in lex2 and lex2[s] is not None and len(lex2[s]) != 0:
        return not_none([l["definition"] for l in lex2[s]])
    return None

def get_detail(row):
    s = row["string_beta"]
    if s in lex1 and lex1[s] is not None and len(lex1[s]) != 0:
        return not_none([l["pos"] for l in lex1[s]])
    elif s in lex2 and lex2[s] is not None and len(lex2[s]) != 0:
        return not_none([l["pos"] for l in lex2[s]])
    return None

def get_morph_features(feat):
    keys = feat.keys()
    return {str(key): str(feat[key]).strip("[]") for key in keys}

def get_df(analyzed_words):
    #analyzed_words = [(w if type(w) == dict else {"string": w} ) for w in analyzed_words]
    df_book = pd.DataFrame(analyzed_words)
    # remove commas for strings to get beta code for dictionary lookup
    df_book["string_nocomma"] = df_book["string"].str.strip(",·.")
    df_book["string_beta"] = df_book["string_nocomma"].apply(greek_to_beta_code)
    df_book["lemma_beta"] = df_book["lemma"].apply(greek_to_beta_code)
    # definition and detail are constructed based on the beta code
    df_book["definition"] = df_book.apply(get_definition, axis=1)
    df_book["detail"] = df_book.apply(get_detail, axis=1)
    df_book["morph_features"] = df_book.features.apply(get_morph_features)
    # generate unique token ids for each token
    df_book["token_id"] = df_book["sentencepart"].astype(str) + "_" + df_book["idxtoken"].astype(str)
    df_book["gov_id"] = df_book["sentencepart"].astype(str) + "_" + df_book["idxgovernor"].astype(str)
    used_cols = ["string", "lemma", "upos", "sentencepart", "string_beta", "lemma_beta", "definition", "detail", "morph_features", "token_id", "gov_id"]
    return df_book[used_cols]

In [53]:
from collections import Counter

In [61]:
df = df.set_index(["book", "chapter", "section"]) #.loc["5", "60", "1"]

In [72]:
df.index.value_counts().sort_values(ascending=False)

(5, 60, 1)     3
(5, 59, 1)     3
(8, 20, 2)     3
(8, 96, 2)     3
(4, 29, 1)     3
              ..
(9, 98, 4)     1
(9, 99, 1)     1
(9, 99, 2)     1
(9, 99, 3)     1
(9, 122, 4)    1
Length: 4338, dtype: int64

In [66]:
all_indices = df.index.unique()

In [71]:
all_indices

MultiIndex([('1',   '1', '0'),
            ('1',   '1', '1'),
            ('1',   '1', '2'),
            ('1',   '1', '3'),
            ('1',   '1', '4'),
            ('1',   '2', '1'),
            ('1',   '2', '2'),
            ('1',   '2', '3'),
            ('1',   '3', '1'),
            ('1',   '3', '2'),
            ...
            ('9', '119', '2'),
            ('9', '120', '1'),
            ('9', '120', '2'),
            ('9', '120', '3'),
            ('9', '120', '4'),
            ('9', '121', '1'),
            ('9', '122', '1'),
            ('9', '122', '2'),
            ('9', '122', '3'),
            ('9', '122', '4')],
           names=['book', 'chapter', 'section'], length=4338)

In [85]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [86]:
# print(len(all_indices))
for i, index in enumerate(all_indices):
    print(f"{round(100*(i+1)/len(all_indices), 2)}% done")
    analyzed_df = get_df(analyze_section(df.loc[index]))
    book, chapter, section = index[0], index[1], index[2]
    if "books" not in os.listdir():
        os.mkdir("books")
    if f"b_{book}" not in os.listdir("books"):
        os.mkdir(f"books/b_{book}")
    if f"c_{chapter}" not in os.listdir(f"books/b_{book}"):
        os.mkdir(f"books/b_{book}/c_{chapter}")
    analyzed_df.to_parquet(f"books/b_{book}/c_{chapter}/s_{section}.parquet")

0.02% done
0.05% done
0.07% done
0.09% done
0.12% done
0.14% done
0.16% done
0.18% done
0.21% done
0.23% done
0.25% done
0.28% done
0.3% done
0.32% done
0.35% done
0.37% done
0.39% done
0.41% done
0.44% done
0.46% done
0.48% done
0.51% done
0.53% done
0.55% done
0.58% done
0.6% done
0.62% done
0.65% done
0.67% done
0.69% done
0.71% done
0.74% done
0.76% done
0.78% done
0.81% done
0.83% done
0.85% done
0.88% done
0.9% done
0.92% done
0.95% done
0.97% done
0.99% done
1.01% done
1.04% done
1.06% done
1.08% done
1.11% done
1.13% done
1.15% done
1.18% done
1.2% done
1.22% done
1.24% done
1.27% done
1.29% done
1.31% done
1.34% done
1.36% done
1.38% done
1.41% done
1.43% done
1.45% done
1.48% done
1.5% done
1.52% done
1.54% done
1.57% done
1.59% done
1.61% done
1.64% done
1.66% done
1.68% done
1.71% done
1.73% done
1.75% done
1.78% done
1.8% done
1.82% done
1.84% done
1.87% done
1.89% done
1.91% done
1.94% done
1.96% done
1.98% done
2.01% done
2.03% done
2.05% done
2.07% done
2.1% done
2.12% 

In [99]:
!pip install natsort

Collecting natsort
  Downloading natsort-8.3.1-py3-none-any.whl (38 kB)
Installing collected packages: natsort
Successfully installed natsort-8.3.1


In [100]:
import natsort
def get_books_chapters():
    idx_to_path = []
    # books = os.listdir("books")
    ibook, ichapt, isect = 0, 0, 0
    for b in natsort.natsorted(os.listdir("books")):
        ibook = b[2:]
        for c in natsort.natsorted(os.listdir(f"books/{b}")):
            print(c)
            ichapt = c[2:]
            for s in natsort.natsorted(os.listdir(f"books/{b}/{c}")):
                isect = s.split(".parquet")[0][2:]
                idx = (ibook, ichapt, isect)
                path = f"books/{b}/{c}/{s}"
                idx_to_path.append({"book": ibook, "chapter": ichapt, "section": isect, "idx": idx, "path": path})
    return idx_to_path
idx_to_path = get_books_chapters()
print(idx_to_path)

c_1
c_2
c_3
c_4
c_5
c_6
c_7
c_8
c_9
c_10
c_11
c_12
c_13
c_14
c_15
c_16
c_17
c_18
c_19
c_20
c_21
c_22
c_23
c_24
c_25
c_26
c_27
c_28
c_29
c_30
c_31
c_32
c_33
c_34
c_35
c_36
c_37
c_38
c_39
c_40
c_41
c_42
c_43
c_44
c_45
c_46
c_47
c_48
c_49
c_50
c_51
c_52
c_53
c_54
c_55
c_56
c_57
c_58
c_59
c_60
c_61
c_62
c_63
c_64
c_65
c_66
c_67
c_68
c_69
c_70
c_71
c_72
c_73
c_74
c_75
c_76
c_77
c_78
c_79
c_80
c_81
c_82
c_83
c_84
c_85
c_86
c_87
c_88
c_89
c_90
c_91
c_92
c_93
c_94
c_95
c_96
c_97
c_98
c_99
c_100
c_101
c_102
c_103
c_104
c_105
c_106
c_107
c_108
c_109
c_110
c_111
c_112
c_113
c_114
c_115
c_116
c_117
c_118
c_119
c_120
c_121
c_122
c_123
c_124
c_125
c_126
c_127
c_128
c_129
c_130
c_131
c_132
c_133
c_134
c_135
c_136
c_137
c_138
c_139
c_140
c_141
c_142
c_143
c_144
c_145
c_146
c_147
c_148
c_149
c_150
c_151
c_152
c_153
c_154
c_155
c_156
c_157
c_158
c_159
c_160
c_161
c_162
c_163
c_164
c_165
c_166
c_167
c_168
c_169
c_170
c_171
c_172
c_173
c_174
c_175
c_176
c_177
c_178
c_179
c_180
c_181
c_182
c_183
c_184
c_18

In [103]:
pd.DataFrame(idx_to_path).to_parquet("nlp_paths.parquet")