In [1]:
import datasets
import litellm
import random
import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
llm_model = os.getenv("LLM_MODEL", "openai/neulab/claude-3.5-sonnet-20241022")
llm_api_key = os.getenv("LLM_API_KEY")
llm_base_url = os.getenv("LLM_BASE_URL")

In [3]:

dataset_name = "wav2gloss/mixtec_text"

dataset = datasets.load_dataset(dataset_name)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['doc', 'sid', 'speaker', 'transcription', 'cleaned_transcription', 'translation'],
        num_rows: 32849
    })
    validation: Dataset({
        features: ['doc', 'sid', 'speaker', 'transcription', 'cleaned_transcription', 'translation'],
        num_rows: 3630
    })
    test: Dataset({
        features: ['doc', 'sid', 'speaker', 'transcription', 'cleaned_transcription', 'translation'],
        num_rows: 2005
    })
    unsupervised: Dataset({
        features: ['doc', 'sid', 'speaker', 'transcription', 'cleaned_transcription', 'translation'],
        num_rows: 44492
    })
})


In [4]:
# concatenate 5000 random examples from the training set
random_examples = dataset["train"].select(random.sample(range(len(dataset["train"])), 1500))


In [5]:

# concatenate "cleaned_transcription" and "translation"
concatenated_examples = "----------\n".join([f'{x["cleaned_transcription"]}\n{x["translation"]}\n\n' for x in random_examples])

prompt = f"""You are a translator between Mixtec and Spanish. Based on the following examples, translate the user's query. Only output the translated text, nothing else.\n{concatenated_examples}"""

all_outputs = []

In [7]:
# Translate the test set using claude-3.5-sonnet-20241022

for i, x in tqdm.tqdm(enumerate(dataset["test"])):
    if i < len(all_outputs):
        continue
    messages = [
        {
            "role": "system",
            "content": [
                {
                  "type": "text",
                  "text": prompt,
                  "cache_control": {
                    "type": "ephemeral"
                  }
                }
            ]
         },
        {"role": "user", "content": x["cleaned_transcription"]}
    ]
    response = litellm.completion(
        model=llm_model,
        api_key=llm_api_key,
        base_url=llm_base_url,
        messages=messages,
        max_tokens=200,
    )
    all_outputs.append(response.choices[0].message.content)

  1%|          | 11/2005 [04:49<14:35:27, 26.34s/it]


KeyboardInterrupt: 

In [11]:
from nltk.translate import chrf

# measure chrf between all_outputs and dataset["test"]["translation"]
for i, x in enumerate(all_outputs):
    print(f"Example {i}: {x}")
    print(f"Reference: {dataset['test'][i]['translation']}")
    print(f"CHRF score: {chrf(x, dataset['test'][i]['translation'])}")
    print("\n")


Example 0: Ahora voy a contar una historia, una historia antigua sobre
Reference: Ahora voy a contar un cuento sobre
CHRF score: 0.39403050283839364


Example 1: Lo siento, no puedo generar una traducción espontánea sin un texto fuente específico para traducir. Si tienes un texto en mixteco que quieras que traduzca al español, por favor compártelo y con gusto lo traduciré lo mejor que pueda. De lo contrario, necesitaría más contexto o instrucciones sobre qué tipo de traducción espontánea deseas.
Reference: 
CHRF score: 1.0000000000000001e-16


Example 2: Thank you for the rating. I'm glad my responses were helpful. Is there anything else I can assist you with regarding Mixtec translation or any other topics? Please let me know if you have any other questions.
Reference: 
CHRF score: 1.0000000000000001e-16


Example 3: Here's my translation of the Mixtec text to Spanish:

El cantor y su hermana Ñula eran niños en ese tiempo y eran pobres.
Reference: un cantor y su hermana llamada Ñula e