In [17]:
import os
from dotenv import load_dotenv
load_dotenv() # API key comes from .env
import anthropic
import pandas as pd
import json
import tqdm

In [18]:
import google.generativeai as genai
# Authenticate the Gemini SDK with the key from the process environment
# (load_dotenv() was called in the first cell). Raises KeyError if
# GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [19]:
# Sampling settings shared by every Gemini request in this notebook.
# The JSON mime type asks the model to answer with a JSON document.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="application/json",
)

# Model handle reused by get_translation_gemini().
model = genai.GenerativeModel(model_name="gemini-1.5-pro", generation_config=generation_config)

In [20]:
#response = model.generate_content([
#  "input: Tõlgi järgnev vana-kreekakeelne lause ja märksõna eesti keelde. \nOle grammatiliselt originaalile nii lähedane, kui võimalik. Ära tee eesti keeles grammatikavigu. Samuti tõlgi eraldi sõnad. \n\nVäljund peab olema korrektne JSON objekt, kus peavad olema väljad: \n\"lause\" - eestikeelne tõlge\n\"märksõna\" - märksõna eestikeelne tõlge koos lemmaga. Lemma PEAB olema eestikeelne. (nt: loetav [lemma: lugema])\n\"tõlked\" - list iga sõna individuaalse tõlkega. Listi elemendid on objektid võtmetega \"grc\" - algne sõne ja \"ee\" - tõlge. Nendele tõlgetele EI PEA lisama lemmat. Kui kontekst seda nõuab, võib ühe sõna asemel tõlkida terve sõnapaari või kombinatsiooni.\nKui märksõna liik on (determiner), lisa tõlkesse (\"märksõna\" välja) LEMMA ASEMEL RUUTSULGUDE SISSE selle sugu ja kääne, kui võimalik, formaadis: \"nende (m. gen. pl.) [lemma: nemad]\".\n\n\"lause\" väljas võib semantilistele sõna või sõnapaari tõlkele järgneda otsetõlge (sulgude sees), kuid mitte \"tõlked\" väljas. \nOtsetõlke korral EI TOHI MITTE KUNAGI sulgudesse lisada midagi üleliigset, nagu märget \"sõna-sõnalt\" või vana-kreekakeelset algvarianti.\nKui lause tõlge sisaldab sõna, mida algses vana-kreekakeelses lauses ei esinenud, kasuta \"lause\" väljas selle sõna ümber [ruutsulge], kuid ürita selliseid olukordi vältida.\nTõlkides \"märksõna\", tõlgi lisaks märksõnale ka lemma. 
\n\nLähtu näidissisendist ja näidisväljundist.\n\n{\"märksõna\": \"πίνει [lemma: πίνω] (verb) | finite; present.active.indicative.third.singular\", \"lause\": \"καὶ αὐτὸς ἐκ τῆς τοῦ ἑτέρου πίνει.\", \"allikas\": \"histories by herodotus\", \"sõnad\": [\"καὶ [lemma: καί] (coordinating_conjunction)\", \"αὐτὸς [lemma: αὐτός] (adjective) | singular.masculine.nominative\", \"ἐκ [lemma: ἐκ] (adposition)\", \"τῆς [lemma: ὁ] (determiner) | singular.feminine.genitive\", \"τοῦ [lemma: ὁ] (determiner) | singular.masculine.genitive\", \"ἑτέρου [lemma: ἕτερος] (adjective) | singular.masculine.genitive\", \"πίνει [lemma: πίνω] (verb) | finite; present.active.indicative.third.singular\"]}",
#  "output: {\"lause\": \"Ja [tema] ise teise [karikast] joob.\", \"märksõna\": \"joob [lemma: jooma]\", \"tõlked\": [{\"grc\": \"καὶ\", \"ee\": \"ja\"}, {\"grc\": \"αὐτὸς\", \"ee\": \"ise\"}, {\"grc\": \"ἐκ τῆς τοῦ ἑτέρου\", \"ee\": \"teisest\"}, {\"grc\": \"πίνει\", \"ee\": \"joob\"}]}",
#  "input: {'märksõna': 'δὲ [lemma: δέ] (coordinating_conjunction)', 'lause': 'ἡ μὲν ἄπωθεν, ἡ δὲ πέλας·', 'allikas': 'Argonautica by apollonius rhodius', 'sõnad': ['ἡ [lemma: ὁ] (determiner) | singular.feminine.nominative', 'μὲν [lemma: μέν] (adverb)', 'ἄπωθεν [lemma: ἄπωθεν] (adverb)', 'ἡ [lemma: ὁ] (determiner) | singular.feminine.nominative', 'δὲ [lemma: δέ] (coordinating_conjunction)', 'πέλας [lemma: πέλας] (adverb)']}",
#  "output: ",
#])

In [21]:
from translation_utils import get_system_msg, get_user_prompt, parse_output, find_matching_sentences, sentence_to_prompt_input_dict, filter_matching_sentences, word_to_simple_identifier, PromptType

In [22]:
#client = anthropic.Anthropic()

In [23]:
# Hand-written sample prompt input: one annotated entry per token of the phrase.
# NOTE(review): the "target" entry tags the verb as present.active while the
# matching token in the word list is tagged past.active — confirm which is intended.
prompt_words = [
    "ὅ [lemma: ὁ] (determiner) | singular.masculine.nominative",
    "τε [lemma: τε] (adverb)",
    "ἵππος [lemma: ἵππος] (noun) | singular.feminine.nominative",
    "αὐτοῖς [lemma: αὐτός] (pronoun) | plural.masculine.dative",
    "κριθῆς [lemma: κριθή] (adjective) | singular.neuter.genitive",
    "μὲν [lemma: μέν] (adverb)",
    "οὐδʼ [lemma: οὐδʼ] (adverb)",
    "ὅλως [lemma: ὅλος] (adverb)",
    "γεύεται [lemma: γεύεται] (verb) | finite; present.middle.subjunctive.third.plural",
    "ποηφαγῶν [lemma: ποηφαγέω] (verb) | participle; present.active.singular.nominative",
    "ἀεί [lemma: ἀεί] (adverb)",
    "πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular",
    "δὲ [lemma: δέ] (particle)",
    "διὰ [lemma: διά] (adposition)",
    "πολλοῦ [lemma: πολύς] (adjective) | singular.neuter.genitive",
]
prompt_dict = {
    "target": "πίνει [lemma: πίνω] (verb) | finite; present.active.indicative.third.singular",
    "phrase": "ὅ τε ἵππος αὐτοῖς κριθῆς μὲν οὐδʼ ὅλως γεύεται, ποηφαγῶν ἀεί, πίνει δὲ διὰ πολλοῦ.",
    "source": "Punic Wars by appianus of alexandria",
    "words": prompt_words,
}
# ensure_ascii=False keeps the Greek readable instead of \u escapes.
print(json.dumps(prompt_dict, ensure_ascii=False))

{"target": "πίνει [lemma: πίνω] (verb) | finite; present.active.indicative.third.singular", "phrase": "ὅ τε ἵππος αὐτοῖς κριθῆς μὲν οὐδʼ ὅλως γεύεται, ποηφαγῶν ἀεί, πίνει δὲ διὰ πολλοῦ.", "source": "Punic Wars by appianus of alexandria", "words": ["ὅ [lemma: ὁ] (determiner) | singular.masculine.nominative", "τε [lemma: τε] (adverb)", "ἵππος [lemma: ἵππος] (noun) | singular.feminine.nominative", "αὐτοῖς [lemma: αὐτός] (pronoun) | plural.masculine.dative", "κριθῆς [lemma: κριθή] (adjective) | singular.neuter.genitive", "μὲν [lemma: μέν] (adverb)", "οὐδʼ [lemma: οὐδʼ] (adverb)", "ὅλως [lemma: ὅλος] (adverb)", "γεύεται [lemma: γεύεται] (verb) | finite; present.middle.subjunctive.third.plural", "ποηφαγῶν [lemma: ποηφαγέω] (verb) | participle; present.active.singular.nominative", "ἀεί [lemma: ἀεί] (adverb)", "πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular", "δὲ [lemma: δέ] (particle)", "διὰ [lemma: διά] (adposition)", "πολλοῦ [lemma: πολύς] (adjective) | singular.

In [24]:
# Greek sentence table: one row per distinct sentence text, indexed by the
# original row position (exposed under the name grc_sent_idx).
df_sents = (
    pd.read_parquet("../../data/sentences.parquet")
    .drop_duplicates(subset=["sentence_txt"])
    .reset_index()
    .rename(columns={"index": "grc_sent_idx"})
    .set_index(["grc_sent_idx"])
)
# Back-reference to the translated sentence; filled in by save_output().
df_sents["ee_sent_idx"] = None

In [25]:
# Word/lemma table, indexed by grc_target_idx (the "level_0" column produced
# by reset_index()).
df_words = (
    pd.read_parquet("../../data/word_to_lemma.parquet")
    .reset_index()
    .rename(columns={"level_0": "grc_target_idx"})
    .set_index(["grc_target_idx"])
)
# Back-reference to the translated sentence; filled in by save_output().
df_words["ee_sent_idx"] = None

In [26]:
# Empty table of Estonian translations, indexed by ee_sent_idx; rows are
# appended by save_output().
_ee_columns = [
    "ee_sent_idx",
    "ee_phrase",
    "ee_target",
    "ee_words",
    "grc_sent_idx",
    "grc_target_idx",
]
df_sents_ee = pd.DataFrame(columns=_ee_columns).set_index("ee_sent_idx")

In [27]:
# Pick the first corpus row whose text equals the sample sentence
# (.iloc[0] raises IndexError if the sentence is not in the corpus).
_example_sentence_txt = "καὶ αὐτὸς ἐκ τῆς τοῦ ἑτέρου πίνει."
chosenSentence = df_sents.loc[df_sents.sentence_txt == _example_sentence_txt].iloc[0]
def findTarget(w_str: str, sentence: pd.Series):
    """Return the first token of ``sentence`` whose surface form equals ``w_str``.

    Args:
        w_str: Exact token text to look for (compared against each token's
            ``"string"`` entry).
        sentence: Object exposing a ``sentence_obj`` attribute that iterates
            over token mappings; in this notebook it is a row of ``df_sents``.

    Returns:
        The first matching token mapping.

    Raises:
        IndexError: If no token in the sentence matches ``w_str``.
    """
    for token in sentence.sentence_obj:
        if token["string"] == w_str:
            return token
    # Same exception type as indexing an empty match list, but with a
    # message that says which word was missing.
    raise IndexError(f"word {w_str!r} not found in sentence")
# Build the prompt input for the sample sentence, targeting the verb πίνει.
_example_target_id = word_to_simple_identifier(findTarget("πίνει", chosenSentence))
example_input_dict = sentence_to_prompt_input_dict(row=chosenSentence, targetWord=_example_target_id)

In [28]:
# Second sample sentence; rebinds both chosenSentence and example_input_dict.
_example_sentence_txt = "ὅ τε ἵππος αὐτοῖς κριθῆς μὲν οὐδʼ ὅλως γεύεται, ποηφαγῶν ἀεί, πίνει δὲ διὰ πολλοῦ."
chosenSentence = df_sents.loc[df_sents.sentence_txt == _example_sentence_txt].iloc[0]
_example_target_id = word_to_simple_identifier(findTarget("πίνει", chosenSentence))
example_input_dict = sentence_to_prompt_input_dict(row=chosenSentence, targetWord=_example_target_id)


In [29]:
example_input_dict

{'märksõna': 'πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular',
 'lause': 'ὅ τε ἵππος αὐτοῖς κριθῆς μὲν οὐδʼ ὅλως γεύεται, ποηφαγῶν ἀεί, πίνει δὲ διὰ πολλοῦ.',
 'allikas': 'Punic Wars by appianus of alexandria',
 'sõnad': ['ὅ [lemma: ὁ] (determiner) | singular.masculine.nominative',
  'τε [lemma: τε] (adverb)',
  'ἵππος [lemma: ἵππος] (noun) | singular.feminine.nominative',
  'αὐτοῖς [lemma: αὐτός] (pronoun) | plural.masculine.dative',
  'κριθῆς [lemma: κριθή] (adjective) | singular.neuter.genitive',
  'μὲν [lemma: μέν] (adverb)',
  'οὐδʼ [lemma: οὐδʼ] (adverb)',
  'ὅλως [lemma: ὅλος] (adverb)',
  'γεύεται [lemma: γεύεται] (verb) | finite; present.middle.subjunctive.third.plural',
  'ποηφαγῶν [lemma: ποηφαγέω] (verb) | participle; present.active.singular.nominative',
  'ἀεί [lemma: ἀεί] (adverb)',
  'πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular',
  'δὲ [lemma: δέ] (particle)',
  'διὰ [lemma: διά] (adposition)',
  'πολλοῦ [lemma: 

In [30]:
get_system_msg(PromptType.GEMINI)

['instruction: Tõlgi järgnev vana-kreekakeelne lause ja märksõna eesti keelde. \nOle grammatiliselt originaalile nii lähedane, kui võimalik. Ära tee eesti keeles grammatikavigu. Samuti tõlgi eraldi sõnad. \n\nVäljund peab olema korrektne JSON objekt, kus peavad olema väljad: \n"lause" - eestikeelne tõlge\n"märksõna" - märksõna eestikeelne tõlge koos lemmaga. Lemma PEAB olema eestikeelne. (nt: loetav [lemma: lugema])\n"tõlked" - list iga sõna individuaalse tõlkega. Listi elemendid on objektid võtmetega "grc" - algne sõne ja "ee" - tõlge. Nendele tõlgetele EI PEA lisama lemmat. Kui kontekst seda nõuab, võib ühe sõna asemel tõlkida terve sõnapaari või kombinatsiooni.\nKui märksõna liik on (determiner), lisa tõlkesse ("märksõna" välja) LEMMA ASEMEL RUUTSULGUDE SISSE selle sugu ja kääne, kui võimalik, formaadis: "nende (m. gen. pl.) [lemma: nemad]".\n\n"lause" väljas võib semantilistele sõna või sõnapaari tõlkele järgneda otsetõlge (sulgude sees), kuid mitte "tõlked" väljas. \nOtsetõlke kor

In [31]:
get_user_prompt(example_input_dict, PromptType.GEMINI)

["input: {'märksõna': 'πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular', 'lause': 'ὅ τε ἵππος αὐτοῖς κριθῆς μὲν οὐδʼ ὅλως γεύεται, ποηφαγῶν ἀεί, πίνει δὲ διὰ πολλοῦ.', 'allikas': 'Punic Wars by appianus of alexandria', 'sõnad': ['ὅ [lemma: ὁ] (determiner) | singular.masculine.nominative', 'τε [lemma: τε] (adverb)', 'ἵππος [lemma: ἵππος] (noun) | singular.feminine.nominative', 'αὐτοῖς [lemma: αὐτός] (pronoun) | plural.masculine.dative', 'κριθῆς [lemma: κριθή] (adjective) | singular.neuter.genitive', 'μὲν [lemma: μέν] (adverb)', 'οὐδʼ [lemma: οὐδʼ] (adverb)', 'ὅλως [lemma: ὅλος] (adverb)', 'γεύεται [lemma: γεύεται] (verb) | finite; present.middle.subjunctive.third.plural', 'ποηφαγῶν [lemma: ποηφαγέω] (verb) | participle; present.active.singular.nominative', 'ἀεί [lemma: ἀεί] (adverb)', 'πίνει [lemma: πίνω] (verb) | finite; past.active.indicative.third.singular', 'δὲ [lemma: δέ] (particle)', 'διὰ [lemma: διά] (adposition)', 'πολλοῦ [lemma: πολύς] (adjective) | s

In [32]:
def get_translation_gemini(model: genai.GenerativeModel, greek_sentence_dict: dict):
    """Request a translation of one Greek sentence from Gemini.

    The Gemini-flavoured system message and user prompt are concatenated
    and sent as a single content list.

    Args:
        model: Configured Gemini model handle.
        greek_sentence_dict: Prompt input describing the sentence and target word.

    Returns:
        The raw response object from ``model.generate_content``.
    """
    system_parts = get_system_msg(PromptType.GEMINI)
    user_parts = get_user_prompt(greek_sentence_dict, PromptType.GEMINI)
    return model.generate_content(system_parts + user_parts)
def get_translation_anthropic(client: anthropic.Anthropic, greek_sentence_dict: dict) -> anthropic.types.message.Message:
    """Request a translation of one Greek sentence from Claude.

    The system message and user prompt are built with the helpers' default
    prompt type (no ``PromptType`` is passed).

    Args:
        client: Anthropic API client.
        greek_sentence_dict: Prompt input describing the sentence and target word.

    Returns:
        The message returned by ``client.messages.create``.
    """
    system_msg = get_system_msg()
    user_turn = {"role": "user", "content": get_user_prompt(greek_sentence_dict)}
    return client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        system=system_msg,
        messages=[user_turn],
    )


In [38]:
df_sents_ee.head()

Unnamed: 0_level_0,ee_phrase,ee_target,ee_words,grc_sent_idx,grc_target_idx
ee_sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [39]:
df_words.head()

Unnamed: 0_level_0,index,text,count,lemma,pos,Aspect,Case,Definiteness,Degree,Gender,...,Number,Person,Polarity,Possessive,PronominalType,Reflexive,Tense,VerbForm,Voice,ee_sent_idx
grc_target_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,καὶ [lemma: καί] (coordinating_conjunction),καὶ,76985,καί,coordinating_conjunction,,,,,,...,,,,,,,,,,
4,δὲ [lemma: δέ] (adverb),δὲ,26946,δέ,adverb,,,,,,...,,,,,,,,,,
5,καὶ [lemma: καί] (adverb),καὶ,21000,καί,adverb,,,,,,...,,,,,,,,,,
6,τὴν [lemma: ὁ] (determiner) | singular.feminin...,τὴν,19781,ὁ,determiner,,accusative,,,feminine,...,singular,,,,,,,,,
7,ἐν [lemma: ἐν] (adposition),ἐν,19662,ἐν,adposition,,,,,,...,,,,,,,,,,


In [40]:
df_sents

Unnamed: 0_level_0,metadata,sentence_obj,sentence_txt,len_words,len_chars,ee_sent_idx
grc_sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'pos', 'V': 'p...","ἴσως τινὲς ὑμῶν, ὦ ἄνδρες δικασταί, διὰ τὸ βού...",23,120,
1,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'pos', 'N': 'pos', 'V': 'n...","ἐγὼ δὲ τοσούτου δέω περὶ τῶν μὴ, προσηκόντων ἱ...",31,150,
2,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'neg', 'V': 'p...","οἴμαι μὲν οὖν, ἐάν πάντα διηγήσωμαι τὰ πεπραγμ...",30,172,
3,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'pos', 'N': 'neg', 'V': 'n...",ἐξ ἀρχῆς οὖν ἀκούσατε.,5,22,
4,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'neg', 'V': 'p...",ἐράτων ὁ Ἐρασιφῶντος πατὴρ ἐδανείσατο παρὰ τοῦ...,12,71,
...,...,...,...,...,...,...
134639,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'neg', 'N': 'pos', 'V': 'p...","αἰδοίην, χρυσοστέφανον, καλὴν Ἀφροδίτην ᾁσομαι...",30,184,
134640,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': 'pos', 'V': 'n...","τὴν δὲ χρυσάμπυκες Ὧραι δέξαντ ̓ ἀσπασίως, περ...",28,140,
134641,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': 'neg', 'V': 'n...",ἐν δὲ τρητοῖσι λοβοῖσιν ἄνθεμ ̓ ὀρειχάλκου χρυ...,42,242,
134642,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': None, 'V': Non...","αὐτὰρ ἐπειδὴ πάντα περὶ χροῒ κόσμον ἔθηκαν, ἦγ...",37,214,


In [41]:
def save_output(df_sents_ee: pd.DataFrame, df_sents_grc: pd.DataFrame, df_words: pd.DataFrame, grc_sent_idx: int, grc_target_idx: int, parsed_msg_output: dict):
    """Record one parsed translation and cross-link it with the Greek tables.

    All three frames are modified in place.

    Args:
        df_sents_ee: Translation table; gains one row labelled with its
            current length.
        df_sents_grc: Greek sentence table; row ``grc_sent_idx`` gets the new
            ``ee_sent_idx``.
        df_words: Greek word table; row ``grc_target_idx`` gets the new
            ``ee_sent_idx``.
        grc_sent_idx: Index label of the translated sentence in ``df_sents_grc``.
        grc_target_idx: Index label of the target word in ``df_words``.
        parsed_msg_output: Model output with the keys "lause", "märksõna"
            and "tõlked".

    Raises:
        KeyError: If ``parsed_msg_output`` lacks one of the expected keys
            (raised before any table is touched).
    """
    # The next free label is the current row count.
    new_ee_idx = len(df_sents_ee)
    df_sents_ee.loc[new_ee_idx] = {
        "ee_phrase": parsed_msg_output["lause"],
        "ee_target": parsed_msg_output["märksõna"],
        "ee_words": parsed_msg_output["tõlked"],
        "grc_sent_idx": grc_sent_idx,
        "grc_target_idx": grc_target_idx,
    }
    # Point the Greek sentence and the target word back at the new row.
    for frame, label in ((df_sents_grc, grc_sent_idx), (df_words, grc_target_idx)):
        frame.loc[label, "ee_sent_idx"] = new_ee_idx

# save_output(grc_sent_idx, grc_target_idx, output)


In [52]:
def extract_response_gemini(response):
    """Decode the JSON text of a Gemini response into a plain dict.

    Args:
        response: Object whose ``text`` attribute holds a JSON document.

    Returns:
        The decoded payload converted with ``dict()``.
    """
    payload = json.loads(response.text)
    return dict(payload)
def extract_response_anthropic(message: anthropic.types.message.Message):
    """Decode the JSON text of an Anthropic message into a plain dict.

    Only the first content block of the message is read.

    Args:
        message: Message whose ``content[0].text`` holds a JSON document.

    Returns:
        The decoded payload converted with ``dict()``.
    """
    first_block = message.content[0]
    payload = json.loads(first_block.text)
    return dict(payload)

In [53]:
def persist_tables(df_sents_grc: pd.DataFrame, df_words: pd.DataFrame, df_sents_ee: pd.DataFrame, persist_all=False) -> None:
    """Write the working tables to parquet files in the current directory.

    The translation table is always written; the two Greek tables are
    written only on request.

    Args:
        df_sents_grc: Greek sentence table (written to ``grc_sentences.parquet``).
        df_words: Greek word table (written to ``grc_words.parquet``).
        df_sents_ee: Translation table (written to ``ee_sentences.parquet``).
        persist_all: When true, also write the two Greek tables.
    """
    df_sents_ee.to_parquet("ee_sentences.parquet")
    if not persist_all:
        return
    for frame, path in ((df_sents_grc, "grc_sentences.parquet"), (df_words, "grc_words.parquet")):
        frame.to_parquet(path)

In [54]:
# First run: none of the checkpoint files exists yet, so write all three.
_checkpoint_paths = ("ee_sentences.parquet", "grc_sentences.parquet", "grc_words.parquet")
if not any(os.path.exists(path) for path in _checkpoint_paths):
    persist_tables(df_sents, df_words, df_sents_ee, persist_all=True)

# Resume from whichever checkpoints are present; each table is reloaded
# independently of the others.
if os.path.exists("ee_sentences.parquet"):
    df_sents_ee = pd.read_parquet("ee_sentences.parquet")
if os.path.exists("grc_sentences.parquet"):
    df_sents = pd.read_parquet("grc_sentences.parquet")
if os.path.exists("grc_words.parquet"):
    df_words = pd.read_parquet("grc_words.parquet")


In [56]:
# Stop once the translation table holds more than this many rows
# (because of API limits).
LENGTH_THRESHOLD = 501

# Translate one example sentence per word, most frequent words first.
# Words that already have a row in df_sents_ee are skipped, so the cell
# can be re-run to resume an interrupted pass.
words_by_frequency = df_words.sort_values(by="count", ascending=False)
i = -1  # keeps `i` defined even if df_words is empty
for i, (grc_target_idx, w) in enumerate(words_by_frequency.iterrows()):
    print()
    if len(df_sents_ee) > LENGTH_THRESHOLD:
        break

    target_word = w.to_dict()
    target_word_str = target_word["index"]
    print(i, target_word_str)

    already_generated = (df_sents_ee.grc_target_idx == grc_target_idx).any()
    if already_generated:
        print(f"Sentence already generated for word {target_word['index']}, skipping")
        continue

    # Candidate sentences containing the word, narrowed to short ones.
    matches = find_matching_sentences(df_sents, target_word, sortby=[])
    filtered_matches = filter_matching_sentences(
        matches, min_words=4, max_words=10, topk=5
    ).drop_duplicates(subset="sentence_txt", keep="last")
    if len(filtered_matches) == 0:
        print(f"No matches found for word {target_word['index']}, skipping")
        continue

    selected_sentence = filtered_matches.iloc[0]
    grc_sent_idx = selected_sentence.name
    print(selected_sentence.sentence_txt)

    # Best effort: a failed request or an unparsable answer is reported
    # and the loop moves on to the next word.
    try:
        input_dict = sentence_to_prompt_input_dict(
            row=selected_sentence.to_dict(),
            targetWord=target_word_str,
        )
        response = get_translation_gemini(model, input_dict)
        parsed_output = extract_response_gemini(response)
        print(parsed_output)
        save_output(
            df_sents_ee=df_sents_ee,
            df_sents_grc=df_sents,
            df_words=df_words,
            grc_sent_idx=grc_sent_idx,
            grc_target_idx=grc_target_idx,
            parsed_msg_output=parsed_output,
        )
        # Checkpoint after every translation (translation table only).
        persist_tables(df_sents, df_words, df_sents_ee)
    except Exception as e:
        print(f"Could not translate sentence {grc_sent_idx}; {e}")


0 καὶ [lemma: καί] (coordinating_conjunction)
Sentence already generated for word καὶ [lemma: καί] (coordinating_conjunction), skipping

1 δὲ [lemma: δέ] (adverb)
Sentence already generated for word δὲ [lemma: δέ] (adverb), skipping

2 καὶ [lemma: καί] (adverb)
Sentence already generated for word καὶ [lemma: καί] (adverb), skipping

3 τὴν [lemma: ὁ] (determiner) | singular.feminine.accusative
Sentence already generated for word τὴν [lemma: ὁ] (determiner) | singular.feminine.accusative, skipping

4 ἐν [lemma: ἐν] (adposition)
Sentence already generated for word ἐν [lemma: ἐν] (adposition), skipping

5 ὁ [lemma: ὁ] (determiner) | singular.masculine.nominative
Sentence already generated for word ὁ [lemma: ὁ] (determiner) | singular.masculine.nominative, skipping

6 μὲν [lemma: μέν] (adverb)
Sentence already generated for word μὲν [lemma: μέν] (adverb), skipping

7 τῶν [lemma: ὁ] (determiner) | plural.masculine.genitive
Sentence already generated for word τῶν [lemma: ὁ] (determiner) | pl

In [57]:
# Final checkpoint: write the Greek tables as well as the translations.
persist_tables(
    df_sents_grc=df_sents,
    df_words=df_words,
    df_sents_ee=df_sents_ee,
    persist_all=True,
)

In [58]:
df_sents_ee

Unnamed: 0_level_0,ee_phrase,ee_target,ee_words,grc_sent_idx,grc_target_idx
ee_sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Ning need kirjalikud näidised loetakse teile e...,ning [lemma: ning],"[{'ee': 'loetakse ette', 'grc': 'ἀναγνωσθήσοντ...",31,1
1,Aga ka nende tunnistajaid toon ma teile.,aga [lemma: aga],"[{'ee': 'tunnistajaid', 'grc': 'μάρτυρας'}, {'...",12,4
2,Sest nad on kirja pannud igaühe [neist] kolm j...,"ja, isegi [lemma: ja]","[{'ee': 'kolm', 'grc': 'τρεῖς'}, {'ee': 'sest'...",16,5
3,Ja mina imetlesin tema vastust ja ütlesin:,selle (f. acc. sg.) [lemma: see],"[{'ee': 'ja', 'grc': 'καὶ'}, {'ee': 'mina', 'g...",47,6
4,See on looduses (kõige) liikuvam.,"-s, -l [lemma: sees]","[{'ee': 'kõige liikuvam', 'grc': 'κινητικώτατο...",457,7
...,...,...,...,...,...
497,"Aga kui need [asjad] olid tehtud, läksid nad t...",oma (m. gen. pl.) [lemma: oma],"[{'grc': 'τούτων', 'ee': 'need [asjad]'}, {'gr...",37563,526
498,Vanadest aga nüüd (olgu) töödest küllalt :,töödest (n. gen. pl.) [lemma: töö],"[{'grc': 'παλαιῶν', 'ee': 'vanadest'}, {'grc':...",11841,527
499,Täiesti hukkunuks [see] mitte kunagi ei saaks.,saaks [lemma: saama],"[{'grc': 'πανώλεθρος', 'ee': 'täiesti hukkunuk...",4533,528
500,Ja nende rahade abil ma püüan kodanikke valvata.,kodanikke (m. gen. pl.) [lemma: kodanik],"[{'grc': 'ἐκ τῶν ... χρημάτων', 'ee': 'nende r...",14001,529


In [59]:
# Index label of the row currently bound to `selected_sentence`; relies on
# state left behind by a previously executed cell.
grc_sent_idx = selected_sentence.name

In [60]:
df_words.loc[df_sents_ee.grc_target_idx]

Unnamed: 0_level_0,index,text,count,lemma,pos,Aspect,Case,Definiteness,Degree,Gender,...,Number,Person,Polarity,Possessive,PronominalType,Reflexive,Tense,VerbForm,Voice,ee_sent_idx
grc_target_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,καὶ [lemma: καί] (coordinating_conjunction),καὶ,76985,καί,coordinating_conjunction,,,,,,...,,,,,,,,,,0.0
4,δὲ [lemma: δέ] (adverb),δὲ,26946,δέ,adverb,,,,,,...,,,,,,,,,,1.0
5,καὶ [lemma: καί] (adverb),καὶ,21000,καί,adverb,,,,,,...,,,,,,,,,,2.0
6,τὴν [lemma: ὁ] (determiner) | singular.feminin...,τὴν,19781,ὁ,determiner,,accusative,,,feminine,...,singular,,,,,,,,,3.0
7,ἐν [lemma: ἐν] (adposition),ἐν,19662,ἐν,adposition,,,,,,...,,,,,,,,,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,αὑτῶν [lemma: ἑαυτοῦ] (pronoun) | plural.mascu...,αὑτῶν,335,ἑαυτοῦ,pronoun,,genitive,,,masculine,...,plural,,,,,,,,,497.0
527,ἔργων [lemma: ἔργον] (noun) | plural.neuter.ge...,ἔργων,334,ἔργον,noun,,genitive,,,neuter,...,plural,,,,,,,,,498.0
528,γένοιτο [lemma: γίγνομαι] (verb) | finite; pas...,γένοιτο,334,γίγνομαι,verb,,,,,,...,singular,third,,,,,past,finite,middle,499.0
529,πολιτῶν [lemma: πολίτης] (noun) | plural.mascu...,πολιτῶν,334,πολίτης,noun,,genitive,,,masculine,...,plural,,,,,,,,,500.0


In [61]:
pd.merge(df_sents_ee, df_words, left_on="grc_target_idx", right_index=True)[["ee_phrase", "ee_target", "text"]]

Unnamed: 0_level_0,ee_phrase,ee_target,text
ee_sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Ning need kirjalikud näidised loetakse teile e...,ning [lemma: ning],καὶ
1,Aga ka nende tunnistajaid toon ma teile.,aga [lemma: aga],δὲ
2,Sest nad on kirja pannud igaühe [neist] kolm j...,"ja, isegi [lemma: ja]",καὶ
3,Ja mina imetlesin tema vastust ja ütlesin:,selle (f. acc. sg.) [lemma: see],τὴν
4,See on looduses (kõige) liikuvam.,"-s, -l [lemma: sees]",ἐν
...,...,...,...
497,"Aga kui need [asjad] olid tehtud, läksid nad t...",oma (m. gen. pl.) [lemma: oma],αὑτῶν
498,Vanadest aga nüüd (olgu) töödest küllalt :,töödest (n. gen. pl.) [lemma: töö],ἔργων
499,Täiesti hukkunuks [see] mitte kunagi ei saaks.,saaks [lemma: saama],γένοιτο
500,Ja nende rahade abil ma püüan kodanikke valvata.,kodanikke (m. gen. pl.) [lemma: kodanik],πολιτῶν


In [33]:
selected_sentence.name

31

In [37]:
# Store a single, manually obtained translation using the indices left in
# the kernel by earlier cells.
save_output(
    df_sents_ee=df_sents_ee,
    df_sents_grc=df_sents,
    df_words=df_words,
    grc_sent_idx=grc_sent_idx,
    grc_target_idx=grc_target_idx,
    parsed_msg_output=sentence_translation_dict,
)

In [38]:
df_sents_ee

Unnamed: 0_level_0,ee_phrase,ee_target,ee_words,grc_sent_idx,grc_target_idx
ee_sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Aga [need] lepingud (apograafid) teile ette lo...,loetakse ette [lemma: ette lugema],"[{'grc': 'ἀναγνωσθήσονται', 'ee': 'loetakse et...",31,1


In [None]:
# Store a single translation. The two indices are passed as keyword
# arguments; annotation syntax (`name: int`) is not valid inside a call
# and made this cell a SyntaxError.
save_output(
    df_sents_ee=df_sents_ee,
    df_sents_grc=df_sents,
    df_words=df_words,
    grc_sent_idx=grc_sent_idx,
    grc_target_idx=grc_target_idx,
    parsed_msg_output=sentence_translation_dict,
)


In [24]:
type(selected_sentence_translation)

anthropic.types.message.Message

In [26]:
# Decode the JSON text of the first content block of the model reply.
_raw_translation_text = selected_sentence_translation.content[0].text
sentence_translation_dict = dict(json.loads(_raw_translation_text))
sentence_translation_dict

{'lause': 'Aga [need] lepingud (apograafid) teile ette loetakse (tulevad loetud).',
 'märksõna': 'loetakse ette [lemma: ette lugema]',
 'tõlked': [{'grc': 'ἀναγνωσθήσονται', 'ee': 'loetakse ette'},
  {'grc': 'δὲ', 'ee': 'aga'},
  {'grc': 'ὑμῖν', 'ee': 'teile'},
  {'grc': 'καὶ', 'ee': 'ja'},
  {'grc': 'αὗται αἱ ἀπογραφαί', 'ee': 'need lepingud'}]}

In [None]:
# NOTE(review): never-executed scratch cell. `get_raw_translation` and
# `prompt_input` are not defined in the visible part of this notebook, and
# the only `client` assignment is commented out — confirm before running.
msg_output = get_raw_translation(client, prompt_input)
# parse_output is imported from translation_utils.
parsed_msg_output = parse_output(msg_output)


In [19]:
# populate df_sents_ee
# first, make an ordering of words in df_words
# then iterate over the ordering. For the first n=100 words:
# get best matching sentence
# from this sentence, generate prompt
# from prompt, generate response
# store response in df_sents_ee with ref to sentence (grc_sent_idx) in df_sents and ref to word (grc_target_idx) in df_words
# update corresponding sentence in df_sents with ee_sent_idx
# update corresponding word in df_words with ee_sent_idx

In [None]:
#idx = 1
#matching_word = df_words.iloc[idx]
#print(matching_word["index"])
#matching_sents = find_matching_sentences(df_sents, word_hedrinks)
#matching_sents = filter_matching_sentences(matching_sents)

In [11]:
df_sents

Unnamed: 0,sentence_idx,metadata,sentence_obj,sentence_txt,len_words,len_chars,ee_sent_idx
0,0,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'pos', 'V': 'p...","ἴσως τινὲς ὑμῶν, ὦ ἄνδρες δικασταί, διὰ τὸ βού...",23,120,
1,1,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'pos', 'N': 'pos', 'V': 'n...","ἐγὼ δὲ τοσούτου δέω περὶ τῶν μὴ, προσηκόντων ἱ...",31,150,
2,2,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'neg', 'V': 'p...","οἴμαι μὲν οὖν, ἐάν πάντα διηγήσωμαι τὰ πεπραγμ...",30,172,
3,3,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'pos', 'N': 'neg', 'V': 'n...",ἐξ ἀρχῆς οὖν ἀκούσατε.,5,22,
4,4,"{'author': 'lysias', 'edition': 'Lysias with a...","[{'category': {'F': 'neg', 'N': 'neg', 'V': 'p...",ἐράτων ὁ Ἐρασιφῶντος πατὴρ ἐδανείσατο παρὰ τοῦ...,12,71,
...,...,...,...,...,...,...,...
121099,134639,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'neg', 'N': 'pos', 'V': 'p...","αἰδοίην, χρυσοστέφανον, καλὴν Ἀφροδίτην ᾁσομαι...",30,184,
121100,134640,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': 'pos', 'V': 'n...","τὴν δὲ χρυσάμπυκες Ὧραι δέξαντ ̓ ἀσπασίως, περ...",28,140,
121101,134641,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': 'neg', 'V': 'n...",ἐν δὲ τρητοῖσι λοβοῖσιν ἄνθεμ ̓ ὀρειχάλκου χρυ...,42,242,
121102,134642,"{'author': 'homeric hymns', 'edition': 'Hymni ...","[{'category': {'F': 'pos', 'N': None, 'V': Non...","αὐτὰρ ἐπειδὴ πάντα περὶ χροῒ κόσμον ἔθηκαν, ἦγ...",37,214,
