In [1]:
from typing import TypedDict, Annotated
from langgraph.graph.message import add_messages
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, SystemMessage
from langgraph.prebuilt import ToolNode
from langgraph.graph import START, StateGraph
from langgraph.prebuilt import tools_condition
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

In [2]:
import os
import json

In [3]:
import copy

In [4]:
from pydantic_model import Definicion, Palabra, Sufijo, DefinicionSufijo, Posposicion

* 'anystr_strip_whitespace' has been renamed to 'str_strip_whitespace'
* 'min_anystr_length' has been renamed to 'str_min_length'


In [5]:
import pandas as pd

In [6]:
# Read the Aymara-Spanish dictionary JSON and wrap every entry in its pydantic model.
with open("data/diccionarios/aymara/dic-2-ay-es.json", "r", encoding="utf-8") as f:
    dictionary = json.load(f)
    # The four metadata fields are copied through unchanged; the three entry
    # sections are rebuilt as {key: model instance} mappings.
    dictionary = dict(
        diccionario=dictionary["diccionario"],
        version=dictionary["version"],
        fecha=dictionary["fecha"],
        fuente=dictionary["fuente"],
        palabras={lema: Palabra(**campos) for lema, campos in dictionary["palabras"].items()},
        sufijos={lema: Sufijo(**campos) for lema, campos in dictionary["sufijos"].items()},
        posposiciones={lema: Posposicion(**campos) for lema, campos in dictionary["posposiciones"].items()},
    )

In [7]:
# Load the two sides of the training corpus; the files are paired row by row
# when the DataFrame is built below.
# The encoding is passed explicitly (as for the dictionary JSON) so that decoding
# of characters such as "ñ" / "ä" does not depend on the platform's default locale.
with open("americasnlp2025/ST1_MachineTranslation/data/aymara-spanish/train.aym", encoding="utf-8") as f:
    aym_list = f.readlines()
with open("americasnlp2025/ST1_MachineTranslation/data/aymara-spanish/train.es", encoding="utf-8") as f:
    es_list = f.readlines()
# readlines() keeps each line's trailing newline, so every cell value ends in "\n".
df = pd.DataFrame({"es": es_list, "aym": aym_list})

In [8]:
len(df)

6531

In [9]:
# Keep only the rows whose Spanish text does not occur anywhere in the Aymara column.
# NOTE(review): isin() tests membership against the whole "aym" column, not the
# Aymara sentence on the same row. If the goal is to drop untranslated pairs,
# a row-wise comparison (df["es"] != df["aym"]) may be what was meant — confirm.
df_filtered = df[~df["es"].isin(df["aym"])]
len(df_filtered)

6138

In [10]:
from tqdm import tqdm

In [11]:
# Build a "stem -> dictionary entry" table by removing one known suffix from the
# end of each headword. Suffixes are tried in the dictionary's own iteration
# order and the first usable one wins. Several headwords can reduce to the same
# stem; the entry stored last overwrites the earlier ones.
palabras_sin_sufijo = {}
counter = 0  # how many headwords had a suffix removed
for key, value in tqdm(dictionary["palabras"].items()):
    for sufijo in dictionary["sufijos"].values():
        terminacion = sufijo.sufijo
        # Skip suffixes that do not close the headword, and the case where the
        # headword consists of nothing but the suffix.
        if not key.endswith(terminacion) or len(key) == len(terminacion):
            continue
        palabras_sin_sufijo[key[:-len(terminacion)]] = value
        counter += 1
        break
    else:
        # No suffix could be removed: index the headword unchanged.
        palabras_sin_sufijo[key] = value

print(counter)
palabras_sin_sufijo

100%|██████████| 6512/6512 [00:00<00:00, 80606.85it/s]

5785





{'äa!': Palabra(palabra='äa!', definiciones=[Definicion(definicion='Expresión para manifestar que “ahora ya entiendo o he comprendido”. Se responde así cuando se recibe una segunda explicación de algún asunto.', pronunciacion='[á:a]', PoS='interjección', otras_etiquetas=None, sinonimos=None, variantes=None)], pronunciaciones=['[á:a]'], PoS=['interjección'], sinonimos_comunes=[]),
 'a': Palabra(palabra='at', definiciones=[Definicion(definicion='ì Gibado. Corcovado, jorobado.', pronunciacion='[áti -k’úmu]', PoS='adjetivo', otras_etiquetas=None, sinonimos=['atì qulu'], variantes=None), Definicion(definicion='ì Jorobado. Corcovado, gibado.', pronunciacion='[áti -qólu]', PoS='adjetivo', otras_etiquetas=None, sinonimos=None, variantes=None)], pronunciaciones=['[áti -k’úmu]', '[áti -qólu]'], PoS=['adjetivo'], sinonimos_comunes=['atì qulu']),
 'acha': Palabra(palabra='achala', definiciones=[Definicion(definicion='m. Lascivo. Se refiere al hombre o animal macho dominado por el deseo sexual.', p

In [12]:
# Copy of the stem table keyed with the typographic apostrophe (’) instead of
# the ASCII one ('); each copied entry's `palabra` field is set to its new key.
palabras_formateadas = {}
for raiz, entrada in tqdm(palabras_sin_sufijo.items()):
    raiz_tipografica = raiz.replace("'", "’")
    entrada_copia = copy.deepcopy(entrada)  # leave the source entry untouched
    palabras_formateadas[raiz_tipografica] = entrada_copia
    palabras_formateadas[raiz_tipografica].palabra = raiz_tipografica

100%|██████████| 5623/5623 [00:00<00:00, 33499.00it/s]


In [13]:
# Merge the lookup tables into a single dict. Each update() overwrites keys that
# already exist, so on a key clash the precedence is:
# posposiciones > full headwords > suffix-stripped stems.
# NOTE(review): the merge starts from palabras_sin_sufijo, not from the
# palabras_formateadas table built in the previous cell, and nothing in the
# visible notebook reads palabras_formateadas — confirm which one is intended.
diccionario_general = copy.deepcopy(palabras_sin_sufijo)
diccionario_general.update(dictionary["palabras"])
diccionario_general.update(dictionary["posposiciones"])

In [14]:
# Tab-separated morphology table with no header row; the three columns are named
# here: lemma, inflected word form, and a ";"-separated tag string
# (for example "V;CF;PRS/PST+IMMED;1;SG;AC2").
df_morph = pd.read_csv("data/fuentes/aymara/aym/aym", sep="\t", header=None, names=["lemma", "word", "morph"])
df_morph

Unnamed: 0,lemma,word,morph
0,ñach'a,ñach'iriksma,V;CF;PRS/PST+IMMED;1;SG;AC2
1,ñach'a,ñach'irikta,V;CF;PRS/PST+IMMED;1;SG;AC3
2,ñach'a,ñach'iriksta,V;CF;PRS/PST+IMMED;1;SG;AC3;FORM2
3,ñach'a,ñach'sña,V;CF;PRS/PST+IMMED;1+INCL;AC3
4,ñach'a,ñach'itasma,V;CF;PRS/PST+IMMED;2;SG;AC1
...,...,...,...
336337,yuxch'a,yuxch'apanakakata,N;VERS;PL;PSS3S
336338,yuxch'a,yuxch'asakata,N;VERS;PSS1PL
336339,yuxch'a,yuxch'axakata,N;VERS;PSS1S
336340,yuxch'a,yuxch'amakata,N;VERS;PSS2S


In [15]:
# Index the morphology table by lemma.
# A lemma appears on many rows and the dict keeps one value per key, so the
# "morph" stored for a lemma is the tag string of that lemma's LAST row.
# The two columns are zipped directly instead of going through
# DataFrame.iterrows(), which builds a Series per row and is slow on a table
# of this size (~336k rows); the resulting dict is the same.
morph_dict = {
    lemma: {"lemma": lemma, "morph": morph}
    for lemma, morph in zip(df_morph["lemma"], df_morph["morph"])
}
    

In [16]:
len(morph_dict)

3410

In [17]:
from langchain.tools import Tool
import re


def tokenize(sentence: str) -> list[str]:
    """Tokenize a sentence by splitting it on runs of whitespace.

    Returns the list of tokens; an empty or all-whitespace sentence gives ``[]``.
    """
    return sentence.split()


def search_dictionary(sentence: str, dictionary_to_search: dict, max_candidates: int = 8) -> tuple[list, int, int]:
    """Look up every token of a sentence by longest matching prefix.

    The sentence is stripped of the punctuation listed in the regex below,
    lower-cased and tokenized. For each token the full token is tried first,
    then successively shorter prefixes (dropping one trailing character at a
    time); the first prefix that is a key of ``dictionary_to_search`` wins.

    Args:
        sentence: Raw sentence to look up.
        dictionary_to_search: Mapping from word/stem to its dictionary entry.
        max_candidates: Maximum number of prefix lengths tried per token
            (the full token counts as one). Defaults to 8.

    Returns:
        A tuple ``(translations, in_words, out_words)``. ``translations`` has
        one item per token: the dictionary value on a hit, or a one-element
        list holding a "No translation found" message on a miss. ``in_words``
        and ``out_words`` count hits and misses respectively.
    """
    sentence = re.sub(r"[\,\.\!\?\(\)\"\”\“\:\;\¿\¡\/]", "", sentence).lower()
    translations = []
    in_words = 0
    out_words = 0
    for token in tokenize(sentence):
        shortest_excluded = max(0, len(token) - max_candidates)
        for end in range(len(token), shortest_excluded, -1):
            prefix = token[:end]
            if prefix in dictionary_to_search:
                translations.append(dictionary_to_search[prefix])
                in_words += 1
                break
        else:
            # No prefix in the window is a dictionary key.
            translations.append([f"No translation found for '{token}'"])
            out_words += 1
    return translations, in_words, out_words

In [18]:
from rapidfuzz import fuzz
from heapq import heappush, heappop
from collections import deque

def fast_similarity(a, b):
    """Return ``fuzz.ratio(a, b)`` rescaled from the 0-100 range to 0-1."""
    percent = fuzz.ratio(a, b)
    return percent / 100.0

def smart_similarity(candidate, target):
    """Score how well ``candidate`` matches ``target``, favouring good prefixes.

    The score is the mean of two ``fuzz.ratio`` values (each scaled to 0-1):
    ``candidate`` against the first ``len(candidate)`` characters of ``target``,
    and ``candidate`` against the whole ``target``. The result is capped at 1.0.
    """
    prefix_score = fuzz.ratio(candidate, target[:len(candidate)]) / 100.0
    whole_score = fuzz.ratio(candidate, target) / 100.0
    return min((prefix_score + whole_score) / 2.0, 1.0)

def find_best_concat(words: dict, suffixes: dict, target: str, beam_width=5, max_depth=8):
    """Beam-search for "word + suffix + suffix ..." strings that resemble ``target``.

    Every key of ``words`` seeds the search. At each level the ``beam_width``
    best-scoring candidates (by ``smart_similarity`` against the lower-cased
    target) are kept, recorded as results, and extended with every key of
    ``suffixes`` to form the next level. Candidates deeper than ``max_depth``
    parts are discarded.

    Args:
        words: Mapping from base word/stem to its dictionary entry.
        suffixes: Mapping from suffix string to its dictionary entry.
        target: Surface form to decompose.
        beam_width: Candidates expanded per level and maximum number of
            results returned.
        max_depth: Maximum number of parts (base word included) in a candidate.

    Returns:
        Up to ``beam_width`` tuples ``(candidate, similarity, parts)`` sorted by
        decreasing similarity, where ``parts`` is the list of dictionary entries
        (the base word's entry followed by each suffix's entry). Only
        candidates with similarity above 0.5 are returned; the list is empty
        when there are none (including when ``words`` is empty).
    """
    target = target.lower()
    visited = set()
    heap = []

    # Seed the beam with every base word. Similarities are negated because
    # heapq is a min-heap and the best candidates must pop first.
    for word in words.keys():
        sim = smart_similarity(word.lower(), target)
        heappush(heap, (-sim, word, 1, [word]))

    results = []

    while heap:
        new_heap = []
        for _ in range(min(len(heap), beam_width)):
            neg_sim, current, depth, path = heappop(heap)
            sim = -neg_sim

            # Skip strings already expanded at this depth and over-long chains.
            if ((current, depth) in visited) or (depth > max_depth):
                continue
            visited.add((current, depth))

            results.append((current, sim, path))

            for suffix_key in suffixes.keys():
                new_str = current + suffix_key
                new_sim = smart_similarity(new_str, target)
                heappush(new_heap, (-new_sim, new_str, depth + 1, path + [suffix_key]))

        # Only the expansions of this level survive into the next one.
        heap = new_heap

    # Sort by similarity and return top matches
    results.sort(key=lambda x: -x[1])
    final_results = []
    # Slice rather than index by range(beam_width): there may be fewer
    # results than beam_width (e.g. an empty or very small dictionary).
    for word, sim, path in results[:beam_width]:
        if sim <= 0.5:
            break
        # Replace the string keys in the path by their dictionary entries.
        path = [words[path[0]]] + [suffixes[sufijo] for sufijo in path[1:]]
        final_results.append((word, sim, path))
    return final_results

In [19]:
def fix_apostrophes_and_quotes(text):
    """Rewrite every ASCII ``'`` as a typographic apostrophe or a double quote.

    For each ``'`` the neighbouring characters decide the replacement:

    * letter on both sides                      -> ``’``
    * next character is not a letter (or none)  -> ``"``
    * next character is an upper-case letter    -> ``"``
    * anything else (lower-case letter follows) -> ``’``

    Afterwards any whitespace touching a ``’`` is removed, so that a split
    form such as ``nuwt 'asiwip`` becomes ``nuwt’asiwip``.
    """
    def _replacement(position):
        before = text[position - 1] if position > 0 else ''
        after = text[position + 1] if position + 1 < len(text) else ''
        if before.isalpha() and after.isalpha():
            return '’'
        if not after.isalpha() or after.isupper():
            return '"'
        return "’"

    rewritten = ''.join(
        _replacement(position) if char == "'" else char
        for position, char in enumerate(text)
    )
    return re.sub(r'\s*’\s*', '’', rewritten)

In [20]:
fix_apostrophes_and_quotes("Bolivia: Pulliran warminakan nuwt 'asiwip uñacht 'äwi")

'Bolivia: Pulliran warminakan nuwt’asiwip uñacht’äwi'

In [21]:
import unicodedata

def get_filtered_characters(sentence):
    """Return ``sentence`` with punctuation and symbols removed.

    A character is kept when its Unicode general category starts with ``L``
    (letter), ``N`` (number), ``Z`` (separator) or ``M`` (mark), or when it is
    one of the explicitly preserved characters ´ ` ^ ¨ ’. Everything else is
    dropped.
    """
    always_keep = ["´", "`", "^", "¨", "’"]
    kept_classes = ('L', 'N', 'Z', 'M')
    return ''.join(
        char
        for char in sentence
        if unicodedata.category(char).startswith(kept_classes) or char in always_keep
    )

In [22]:
get_filtered_characters(fix_apostrophes_and_quotes("Uka jaqi jiway jaqinakaxa (mercenarios) walja qullqinakpunwa katuqasipxatapa ukjam jiwkatayañatakixa … Serbia tuqit jutapxatapa, khä bosnia tuqita croata tuqinkirinakaw jupanakarux akatakix jawsthapipxatayna … jupanakasti \"legionarios serbios\" satanakapxataynawa, kunalaykutix khaysa ex Yugoslavia ch ’ axwawin chikañchasipxatayna."))

'Uka jaqi jiway jaqinakaxa mercenarios walja qullqinakpunwa katuqasipxatapa ukjam jiwkatayañatakixa  Serbia tuqit jutapxatapa khä bosnia tuqita croata tuqinkirinakaw jupanakarux akatakix jawsthapipxatayna  jupanakasti legionarios serbios satanakapxataynawa kunalaykutix khaysa ex Yugoslavia ch’axwawin chikañchasipxatayna'

In [23]:
word_list = list(diccionario_general.keys())
suffix_list = list(dictionary["sufijos"].keys())
find_best_concat(diccionario_general, dictionary["sufijos"], "Chanipa", beam_width=8, max_depth=5)

[('chanipa',
  1.0,
  [Palabra(palabra='chani', definiciones=[Definicion(definicion='Precio. Valor, costo.', pronunciacion=None, PoS='sustantivo', otras_etiquetas=None, sinonimos=None, variantes=None)], pronunciaciones=[], PoS=['sustantivo'], sinonimos_comunes=[]),
   Sufijo(sufijo='pa', definiciones=[DefinicionSufijo(definicion='3ª persona', descripcion='. Utapa ‘su casa’. –', tipo='Flexivo nominal', naturaleza=' de naturaleza normal'), DefinicionSufijo(definicion='iterativo', descripcion='. Isapaña ‘Escuchar con frecuencia, entender’; laphapiña ‘flamear’. –', tipo='Derivativo verbal', naturaleza=' de naturaleza normal')])]),
 ('chanipa',
  1.0,
  [Palabra(palabra='charu', definiciones=[Definicion(definicion='Piezas de carrizo cortadas a lo largo.', pronunciacion=None, PoS='sustantivo', otras_etiquetas=None, sinonimos=None, variantes=None), Definicion(definicion='Prenda de vestir rota por la costura ( CS ).', pronunciacion=None, PoS='adjetivo', otras_etiquetas=None, sinonimos=None, va

In [24]:
def tokenize(sentence: str, strategy: str = "split") -> list[str]:
    """Tokenize a sentence and return the list of tokens.

    Only whitespace splitting exists so far: any ``strategy`` value currently
    produces ``sentence.split()``.
    """
    # NOTE: this redefinition replaces the one-argument tokenize() defined in an
    # earlier cell of this notebook.
    if strategy == "split":
        return sentence.split()
    # TODO: implement other strategies if needed
    return sentence.split()

In [278]:
def build_prompt(word_list: list, possible_translations_sentence: list):
    """Render the candidate decompositions of each word as prompt text.

    Args:
        word_list: The tokens of the sentence, in order.
        possible_translations_sentence: One list per token of
            ``(candidate, similarity, parts)`` tuples, where ``parts`` holds
            ``Palabra`` / ``Sufijo`` / ``Posposicion`` entries. The first two
            tuple items are ignored: the surface form is rebuilt from the
            parts and its similarity to the token is recomputed.

    Returns:
        A single string. Each token gets a "Posibles composiciones" header
        followed by its numbered candidates, each with its "a + -b + -c"
        composition, recomputed similarity, and the ``str()`` of every part.
        Tokens and candidate lists are paired with ``zip``, so extra items on
        either side are ignored.
    """
    # Collect fragments and join once instead of growing a string in the loops.
    fragments = []
    for word, possible_compositions in zip(word_list, possible_translations_sentence):
        fragments.append(f"\nPosibles composiciones para palabra: {word}\n")
        for i, (_, _, partes) in enumerate(possible_compositions):
            composition_str = []
            definitions = []
            for part in partes:
                if isinstance(part, Palabra):
                    composition_str.append(f"{part.palabra}")
                elif isinstance(part, Sufijo):
                    # Suffixes are shown with a leading hyphen.
                    composition_str.append(f"-{part.sufijo}")
                elif isinstance(part, Posposicion):
                    composition_str.append(f"{part.posposicion}")
                definitions.append(f"{part}\n")
            # NOTE: this removes every "-", including any hyphen that is part
            # of a dictionary form, not only the suffix markers added above.
            palabra = ''.join(composition_str).replace("-", "")
            similitud = smart_similarity(palabra, word)
            fragments.append(f"\n{i+1}. {palabra}: {' + '.join(composition_str)} (similitud: {similitud})")
            fragments.append("\n" + "".join(definitions))
    return "".join(fragments)

In [279]:
def process_sentence(sentence: str, dictionary_to_search: dict, suffixes: dict):
    """Normalize a sentence, decompose each word and build the prompt text.

    Pipeline: collapse whitespace -> fix apostrophes/quotes -> drop
    punctuation and lower-case -> tokenize -> ``find_best_concat`` per token
    -> ``build_prompt``. The prompt is printed and also returned.
    """
    # Steps 1-2: whitespace clean-up, then apostrophe/quote normalization.
    normalized = fix_apostrophes_and_quotes(re.sub(r"\s+", " ", sentence).strip())
    # Step 3: remove punctuation, lower-case, split into tokens.
    word_list = tokenize(get_filtered_characters(normalized).lower())
    # Step 4: candidate decompositions for every token.
    possible_translations = [
        find_best_concat(dictionary_to_search, suffixes, word) for word in word_list
    ]
    # Step 5: assemble the prompt.
    prompt = build_prompt(word_list, possible_translations)
    print(prompt)
    return prompt

In [280]:
text = "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?"
# text = "Kunawsati, jaqulipañax utjki ukhax mä uru, pacha churañatakix apst 'astxa."
process_sentence(text, diccionario_general, dictionary["sufijos"])


Posibles composiciones para palabra: kunjams

1. kunkajams: kunka + -ja + -m + -s (similitud: 0.875)
	kunka: 
		- Cuello. Parte que une la cabeza con el tronco en la mayoría de los seres vertebrados. Cogote, pescuezo.
		- Voz. El sonido y timbre característico de los hablantes.
	ja: 
		- 1ª persona: . Utaja ‘mi casa’. El sufijo -ja presenta variación regional, como ser la forma -ña en Oruro y occidente de Cochabamba, la forma -xa en provincias Pacajes e Ingavi de La Paz y la forma -nha [ηa] en Chile. Así equivale a ‘mi casa’.
		- partitivo/divisor: . Aljaña ‘vender’.
		- cuantificador: . Akja ‘este tanto’.
		- locativo puntual: . Akjawa ‘Es exactamente este lugar’. –
		- comparativo: . Utjama ‘como la casa’. –
	m: 
		- 2ª persona, imperativo: . Apam ‘lleva usted’.
	s: 
		- dubitativo: . Qillqchi ‘(él) tal vez escribe’.

2. kunkajam: kunka + -ja + -m (similitud: 0.8)
	kunka: 
		- Cuello. Parte que une la cabeza con el tronco en la mayoría de los seres vertebrados. Cogote, pescuezo.
		-

'\nPosibles composiciones para palabra: kunjams\n\n1. kunkajams: kunka + -ja + -m + -s (similitud: 0.875)\n\tkunka: \n\t\t- Cuello. Parte que une la cabeza con el tronco en la mayoría de los seres vertebrados. Cogote, pescuezo.\n\t\t- Voz. El sonido y timbre característico de los hablantes.\n\tja: \n\t\t- 1ª persona: . Utaja ‘mi casa’. El sufijo -ja presenta variación regional, como ser la forma -ña en Oruro y occidente de Cochabamba, la forma -xa en provincias Pacajes e Ingavi de La Paz y la forma -nha [ηa] en Chile. Así equivale a ‘mi casa’.\n\t\t- partitivo/divisor: . Aljaña ‘vender’.\n\t\t- cuantificador: . Akja ‘este tanto’.\n\t\t- locativo puntual: . Akjawa ‘Es exactamente este lugar’. –\n\t\t- comparativo: . Utjama ‘como la casa’. –\n\tm: \n\t\t- 2ª persona, imperativo: . Apam ‘lleva usted’.\n\ts: \n\t\t- dubitativo: . Qillqchi ‘(él) tal vez escribe’.\n\n2. kunkajam: kunka + -ja + -m (similitud: 0.8)\n\tkunka: \n\t\t- Cuello. Parte que une la cabeza con el tronco en la mayoría d

In [281]:
def search_sentence_in_dictionary(sentence: str):
    """Tool entry point: decompose every word of ``sentence`` and return the prompt text.

    The sentence is normalized here (whitespace, apostrophes/quotes,
    punctuation removal, lower-casing) and then handed to ``process_sentence``
    together with the notebook-level ``diccionario_general`` and the suffix
    table; ``process_sentence`` applies the same normalization again.
    """
    collapsed = re.sub(r"\s+", " ", sentence).strip()
    filtered_sentence = get_filtered_characters(fix_apostrophes_and_quotes(collapsed)).lower()
    return process_sentence(filtered_sentence, diccionario_general, dictionary["sufijos"])

In [685]:
# Initialize the tool
# Exposes search_sentence_in_dictionary to the model under the tool name
# "search_dictionary". The description is the (Spanish) text attached to the tool.
search_dictionary_tool = Tool(
    name="search_dictionary",
    func=search_sentence_in_dictionary,
    description="Entrega las posibles composiciones de cada palabra y sus definiciones en una oracion."
)

In [686]:
from huggingface_hub import InferenceClient

# Remote LLM endpoint. The API token is read from the HF_TOKEN environment
# variable; os.environ[...] raises KeyError if it is not set.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen3-30B-A3B",
    provider="fireworks-ai",
    huggingfacehub_api_token=os.environ['HF_TOKEN'],
    
)

# Chat wrapper around the endpoint, with the dictionary tool bound to it.
chat = ChatHuggingFace(llm=llm, verbose=True)
tools = [search_dictionary_tool]
model = chat.bind_tools(tools)

In [687]:
ex_system_prompt = """You are an expert linguist specializing in low-resource languages of America, with native-level fluency in <target_language>{TARGET_LANGUAGE}</target_language>. Your task is to analyze and interpret a sentence in <source_language>{SOURCE_LANGUAGE}</source_language>, a low-resource language, using the provided tool to search for definitions and translations.

Your task is to carefully analyze the sentence that the user provide you and provide the most plausible interpretations based on the definitions get by the tool and the given context. Before providing any analysis, always call the tool by giving it whole sentence or the whole provided text, and then work through the following steps inside <linguistic_breakdown> tags in your thinking block:

0. Call the tool with the provided sentence to get definitions and translations, if it is not already done. Do NOT THINK ANYTHING before calling the tool! You must call the tool with this format
<tool_call>
{{"name": TOOL_NAME, "arguments": {{"_arg1": SENTENCE}}}}
</tool_call>

1. List each word in the sentence along with its corresponding definition(s).
2. For each word:
   a. Consider all possible meanings provided by the tool, maybe the appropiate base word is not the one with most repetitions.
   b. Note any grammatical markers or structures.
   c. Identify if it could be a potential loanword, name, or typo if the word does not fit with the rest of the context.
   d. List possible interpretations based on the provided definitions.
   e. Rate the likelihood of each interpretation (e.g., very likely, possible, unlikely).
3. Analyze the overall sentence structure and any notable features.
4. Propose 5-10 possible overall interpretations of the sentence, considering the context and your analysis of individual words/phrases.

After your linguistic breakdown, provide your final linguistic analysis using the following structure:

<linguistic_analysis>
1. Sentence Overview:
   [Provide a brief overview of the sentence structure and any notable features]

2. Word List:
   [List all words/phrases in the sentence, numbered for reference and all the possible definitions] 

3. Grammatical Structure:
   [Identify any grammatical structures or patterns in the sentence]

4. Word-by-Word Analysis:
   [For each word or phrase in the sentence]
   - Word/Phrase: [Insert word or phrase]
   - Most likely interpretation(s): [Explain your choice(s)]
   - Alternative interpretations (if applicable): [Discuss other plausible options]
   - Notes: [Include any observations about potential loanwords, names, or lack of suitable options]

5. Cultural and historical Context:
   [Discuss any cultural or historical factors that might influence interpretation]

6. Overall Interpretation:
   [Summarize the most plausible composition(s) of the sentence]
   [Discuss any ambiguities or uncertainties]
   [Explain how the context influenced your interpretation]

7. Confidence Level:
   [Indicate your level of confidence in the overall interpretation and explain why]
</linguistic_analysis>

Remember to consider the unique challenges of low-resource language analysis throughout your examination. Your expertise in navigating these complexities is crucial for providing an accurate and insightful interpretation.

Your final output should consist only of the linguistic analysis and should not duplicate or rehash any of the work you did in the linguistic breakdown section.
"""

user_prompt = """Analiza el siguiente texto:

<sentence>
{SENTENCE}
</sentence>"""

In [688]:
system_prompt = """You are an expert linguist specializing in low-resource languages of America. You have native-level fluency in the following target language:

<target_language>
{TARGET_LANGUAGE}
</target_language>

Your task is to analyze and interpret a sentence in the following source language:

<source_language>
{SOURCE_LANGUAGE}
</source_language>

This is a low-resource language, and you always use a provided tool to search for definitions and translations.

IMPORTANT: NEVER attempt any analysis, interpretation, or suposition before calling the tool with the entire sentence. This is crucial to ensure a good interpretation of the sentence.

When presented with a sentence to analyze, follow these steps:

1. Call the tool with the entire sentence using this exact format:
<tool_call>
{{"name": TOOL_NAME, "arguments": {{"_arg1":_SENTENCE}}}}
</tool_call>

2. After receiving the tool's output, perform a linguistic breakdown inside <analysis_workbench> tags in your thinking block. In this breakdown:
   a. List each word in the sentence with all of its corresponding definition(s) without omitting any of them.
   b. For each word:
      - Consider all possible meanings provided by the tool, maybe the appropiate base word is not the one with most repetitions.
      - Note any grammatical markers or structures.
      - Identify potential loanwords, names, or typos if the options do not fit in the context of the sentence or the proposed options are more different from the reference word.
      - List possible interpretations based on the provided definitions.
      - Rate the likelihood of each interpretation (e.g., very likely, possible, unlikely).
      - Consider the word's role in the overall sentence structure.
   c. Analyze the overall sentence structure and any notable features.
   d. Propose 5-10 possible overall interpretations of the sentence, considering the context and your analysis of individual words/phrases.
   e. Compare and contrast different interpretations, noting similarities and differences.
   f. List potential cultural or historical references that might be relevant to the sentence.

3. After completing the linguistic breakdown, provide your final linguistic analysis using the following structure:

<linguistic_analysis>
1. Sentence Overview:
   [Provide a brief overview of the sentence structure and any notable features]

2. Word List:
   [List all words/phrases in the sentence and all the possible definitions] 

3. Grammatical Structure:
   [Identify any grammatical structures or patterns in the sentence]

4. Word-by-Word Analysis:
   [For each word or phrase in the sentence]
   - Word/Phrase: [Insert word or phrase]
   - Most likely interpretation(s): [Explain your choice(s)]
   - Alternative interpretations (if applicable): [Discuss other plausible options]
   - Notes: [Include any observations about potential loanwords, names, or lack of suitable options if the word does not fit with the context]

5. Cultural and Historical Context:
   [Discuss any cultural or historical factors that might influence interpretation]

6. Overall Interpretation:
   [Summarize the most plausible composition(s) of the sentence]
   [Discuss any ambiguities or uncertainties]
   [Explain how the context influenced your interpretation]

7. Confidence Level:
   [Indicate your level of confidence in the overall interpretation and explain why]
</linguistic_analysis>

Remember to consider the unique challenges of low-resource language analysis throughout your examination. Your expertise in navigating these complexities is crucial for providing an accurate and insightful interpretation.

Your final output should consist only of the linguistic analysis and should not duplicate or rehash any of the work you did in the analysis workbench section.

When you receive a sentence to analyze, begin your response with the tool call, followed by your linguistic breakdown and analysis.
"""

user_prompt = """Analiza el siguiente texto:

<sentence>
{SENTENCE}
</sentence>"""

In [689]:
from dataclasses import dataclass, field
from typing import cast, Sequence, Literal
from langchain_core.messages import ToolMessage

In [690]:
def try_parse_tool_calls(content: str):
    """Extract ``<tool_call>...</tool_call>`` JSON payloads from raw model text.

    Each payload must be a JSON object with a ``"name"``; its ``"arguments"``
    may be an object, a JSON-encoded string (decoded here), or absent
    (treated as ``{}``). Payloads that cannot be parsed are reported with
    ``print`` and skipped instead of raising.

    Args:
        content: The full text returned by the model.

    Returns:
        ``{"role": "assistant", "content": content, "tool_calls": [...]}`` when
        at least one tool call was parsed, each item shaped as
        ``{"type": "function", "function": {...}, "id": "tool_call_id_<n>"}``
        with ``<n>`` the 1-based position of the tag pair in the text.
        Otherwise ``{"role": "assistant", "content": ...}`` with a trailing
        ``<|im_end|>`` marker removed from the content.
    """
    tool_calls = []
    # DOTALL lets a payload span several lines (pretty-printed JSON). The
    # negative lookahead keeps a payload from swallowing another opening tag,
    # so a stray "<tool_call>" mentioned in free text does not hide a real call.
    pattern = r"<tool_call>\s*((?:(?!<tool_call>).)*?)\s*</tool_call>"
    for i, m in enumerate(re.finditer(pattern, content, re.DOTALL)):
        payload = m.group(1)
        try:
            func = json.loads(payload)
            if not isinstance(func, dict) or "name" not in func:
                raise ValueError("a tool call must be a JSON object with a 'name' key")
            arguments = func.get("arguments", {})
            if isinstance(arguments, str):
                arguments = json.loads(arguments)
            func["arguments"] = arguments
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            print(f"Failed to parse tool calls: the content is {payload} and {e}")
            continue
        # Register the call only once it has been fully validated.
        tool_calls.append({"type": "function", "function": func, "id": f"tool_call_id_{i+1}"})
    if tool_calls:
        return {"role": "assistant", "content": content, "tool_calls": tool_calls}
    return {"role": "assistant", "content": re.sub(r"<\|im_end\|>$", "", content)}

In [691]:
# Generate the AgentState and Agent graph
@dataclass
class AgentState:
    """State carried through the agent graph: the conversation messages."""

    # Message history; starts as an empty list. The Annotated metadata attaches
    # add_messages (from langgraph.graph.message) to this field.
    messages: Annotated[list[AnyMessage], add_messages] = field(
        default_factory=list
    )

def call_model(state: AgentState):
    """Graph node: query the model with the system prompt plus the history.

    The system prompt is formatted for Aymara -> Spanish, prepended to
    ``state.messages`` and sent to ``model``. The raw reply text is run through
    ``try_parse_tool_calls`` so that ``<tool_call>`` blocks become structured
    tool calls.

    Returns:
        ``{"messages": [reply]}`` where ``reply`` is the dict produced by
        ``try_parse_tool_calls``. ``typing.cast`` only informs type checkers;
        it does not convert the dict at runtime.
    """
    prompt_text = system_prompt.format(SOURCE_LANGUAGE="Aymara", TARGET_LANGUAGE="Spanish")
    conversation = [SystemMessage(prompt_text), *state.messages]
    raw_reply = model.invoke(conversation)

    # Turn any <tool_call> blocks in the reply text into structured tool calls.
    parsed_reply = try_parse_tool_calls(raw_reply.content)

    # Trace of the state this node received.
    print(state)

    # Returned as a one-element list so it is added to the existing messages.
    return {"messages": [cast(AIMessage, parsed_reply)]}



## The graph
# Two-node loop: "evaluator" (the model) and "tools" (runs requested tools).
builder = StateGraph(AgentState, input=AgentState)

# Define nodes: these do the work
builder.add_node("evaluator", call_model)
builder.add_node("tools", ToolNode(tools))

# Define edges: these determine how the control flow moves
builder.add_edge(START, "evaluator")
builder.add_conditional_edges(
    "evaluator",
    # If the latest message requires a tool, route to tools
    # Otherwise, provide a direct response
    tools_condition,
)
# After the tools node runs, control goes back to the model.
builder.add_edge("tools", "evaluator")
agent = builder.compile()

In [692]:
text_to_check = "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?"

In [693]:
messages = [
    # SystemMessage(system_prompt.format(SOURCE_LANGUAGE="Aymara", TARGET_LANGUAGE="Spanish")),
    HumanMessage(user_prompt.format(SENTENCE=text_to_check)),
]

In [694]:
response = agent.invoke(AgentState(messages=messages))

AgentState(messages=[HumanMessage(content='Analiza el siguiente texto:\n\n<sentence>\nKunjams aru jaqukipañataki, qilqañatak pacha apsta?\n</sentence>', additional_kwargs={}, response_metadata={}, id='eb4175e4-d9aa-405a-af23-4815070eccad')])

Posibles composiciones para palabra: kunjams

1. kunkajams: kunka + -ja + -m + -s (similitud: 0.875)
	kunka: 
		- Cuello. Parte que une la cabeza con el tronco en la mayoría de los seres vertebrados. Cogote, pescuezo.
		- Voz. El sonido y timbre característico de los hablantes.
	ja: 
		- 1ª persona: . Utaja ‘mi casa’. El sufijo -ja presenta variación regional, como ser la forma -ña en Oruro y occidente de Cochabamba, la forma -xa en provincias Pacajes e Ingavi de La Paz y la forma -nha [ηa] en Chile. Así equivale a ‘mi casa’.
		- partitivo/divisor: . Aljaña ‘vender’.
		- cuantificador: . Akja ‘este tanto’.
		- locativo puntual: . Akjawa ‘Es exactamente este lugar’. –
		- comparativo: . Utjama ‘como la casa’. –
	m: 
		- 2ª persona, imperativo: . Ap

In [695]:
response

{'messages': [HumanMessage(content='Analiza el siguiente texto:\n\n<sentence>\nKunjams aru jaqukipañataki, qilqañatak pacha apsta?\n</sentence>', additional_kwargs={}, response_metadata={}, id='eb4175e4-d9aa-405a-af23-4815070eccad'),
  AIMessage(content='<think>\nOkay, I need to analyze the Aymara sentence "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?" using the provided tools. First, I should call the search_dictionary function with the entire sentence as the argument. Let me make sure I format the tool call correctly. The function name is "search_dictionary" and the argument is the sentence. So the tool call should be:\n\n{"name": "search_dictionary", "arguments": {"_arg1": "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?"}}\n\nOnce I get the tool\'s response, I\'ll break down each word. Let\'s assume the tool returns definitions for each term. For example, "Kunjams" might be a name or a verb, "aru" could be a preposition, "jaqukipañataki" might be a verb phrase, "qilqañat

In [696]:
print(response["messages"][-1].content)

<think>
Okay, let's tackle this analysis. The user provided the Aymara sentence "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?" and the tool responses with possible word compositions. First, I need to parse each word based on the tool's suggestions.

Starting with "Kunjams." The tool breaks it down into possible parts like "kunka" (neck), "ja" (1st person), "m" (2nd person imperative), and "s" (dubitativo). But since "Kunjams" is likely a name or a verb, maybe "kunka" + "jams"? Wait, but the tool's options don't have "jams." Hmm, maybe "Kunjams" is a proper noun, like a person's name, given the lack of clear definitions. Alternatively, "kunja" (to go) + "ms" as a suffix? Not sure. The tool's top suggestion is "kunkajams" with "kunka" and "ja" as parts. But without more context, it's tricky. I'll note that as a potential name or a verb with uncertain meaning.

Next, "aru" is listed as "aru" with meanings like language, word, or voice. That seems straightforward. It could be "langu

In [697]:
print(response["messages"][-3].content)

<think>
Okay, I need to analyze the Aymara sentence "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?" using the provided tools. First, I should call the search_dictionary function with the entire sentence as the argument. Let me make sure I format the tool call correctly. The function name is "search_dictionary" and the argument is the sentence. So the tool call should be:

{"name": "search_dictionary", "arguments": {"_arg1": "Kunjams aru jaqukipañataki, qilqañatak pacha apsta?"}}

Once I get the tool's response, I'll break down each word. Let's assume the tool returns definitions for each term. For example, "Kunjams" might be a name or a verb, "aru" could be a preposition, "jaqukipañataki" might be a verb phrase, "qilqañatak" could be related to a place, "pacha" might mean world or earth, and "apsta" could be a question word. I'll list each word with its definitions, note grammatical structures, check for loanwords, and consider possible interpretations. Then, I'll structure the a