# Translation

> Translate Finnish words/phrases to English and Japanese using the OpenAI API

In [None]:
#| default_exp xlate

In [None]:
#| export

import json
import time
from openai import OpenAI

## Translation Functions

In [None]:
#| exporti
def __cli(
    cli: OpenAI,  # OpenAI client instance
    msgs: list[dict],  # List of message dicts
    model: str = "gpt-4o-mini",  # Model name
    temperature: float = 0.3  # Temperature parameter
) -> str:
    """Send a single chat-completion request and return the reply text (no retries).

    The request asks for a JSON-object response format; the content of the
    first returned choice is handed back unchanged.
    """
    request = {
        "model": model,
        "messages": msgs,
        "response_format": {"type": "json_object"},
        "temperature": temperature,
    }
    completion = cli.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
#| export
def cli(
    cli: OpenAI,
    msgs: list[dict],
    model: str = "gpt-4o-mini",
    max_retries: int = 3
) -> str:
    """Call OpenAI API with exponential backoff retry.

    Args:
        cli: OpenAI client instance used for the request.
        msgs: Chat messages forwarded to the API.
        model: Model name.
        max_retries: Total number of attempts (not extra retries); must be >= 1.

    Returns:
        The response content string from the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        RuntimeError: If every attempt fails; the last error is chained as
            the cause.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of a response string.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            return __cli(cli, msgs=msgs, model=model)
        except Exception as e:
            if attempt == max_retries - 1:
                # Chain explicitly so the original failure is kept as __cause__.
                raise RuntimeError(
                    f"Translation failed after {max_retries} attempts: {e}"
                ) from e
            wait_time = 2 ** attempt  # 1s, 2s, 4s, ...
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)

In [None]:
#| exporti
def build_prompts(texts: list[str]) -> tuple[str, str]:
    """Return the (system, user) prompt pair for a batch of Finnish texts.

    The system prompt is fixed translator guidance. The user prompt carries
    the JSON output instructions followed by the texts as a 1-based numbered
    list, one entry per line.
    """
    numbered_entries = [
        f"{position}. {entry}" for position, entry in enumerate(texts, start=1)
    ]

    system_lines = (
        "You are a professional translator specializing in Finnish to English and Japanese translations.",
        "Your translations should be:",
        "- Natural and contextually appropriate (not overly literal)",
        "- Suitable for language learning (clear and commonly used expressions)",
        "- Consistent in formality level",
        "- Accurate to the source meaning",
        "",
        "When translating single words, provide the most common meaning.",
        "When translating phrases or sentences, provide natural conversational translations.",
        "For Japanese, use appropriate formality and include kanji with hiragana where appropriate.",
    )
    system_prompt = "\n".join(system_lines)

    user_lines = (
        "Translate the following Finnish texts to English and Japanese.",
        'Return your response as a JSON object with a "translations" key '
        'containing an array where each object has: '
        '{"Finnish": "...", "English": "...", "Japanese": "..."}',
        "",
        "Finnish texts to translate:",
        # Joined separately so an empty batch still ends with a newline.
        "\n".join(numbered_entries),
    )
    user_prompt = "\n".join(user_lines)

    return system_prompt, user_prompt

In [None]:
#| exporti
def parse_response(content: str, expected_count: int) -> list[dict]:
    """Parse OpenAI API JSON response into a list of translation dicts.

    Accepted JSON shapes:
    - an object with a "translations" key holding the array
    - a bare array
    - any other non-empty object: its first list-valued entry is used
    - an empty/falsy value (e.g. ``{}``): treated as an empty list

    Args:
        content: Raw JSON text.
        expected_count: Number of translation entries the caller expects.

    Returns:
        The list of translation objects found in the response.

    Raises:
        ValueError: If the JSON is invalid (``json.JSONDecodeError`` is a
            ``ValueError`` subclass), no list of translations can be found,
            the count differs from ``expected_count``, or an entry is not a
            JSON object.
    """
    result = json.loads(content)

    # Handle different JSON structures
    if isinstance(result, dict) and "translations" in result:
        translations = result["translations"]
    elif isinstance(result, list):
        translations = result
    elif not result:
        translations = []
    elif isinstance(result, dict):
        # Fallback: take the first value that is a list, so a leading
        # non-list entry (e.g. a note or error string) is never mistaken for
        # the translations.
        translations = next(
            (value for value in result.values() if isinstance(value, list)),
            None,
        )
    else:
        translations = None

    if not isinstance(translations, list):
        raise ValueError(
            "Could not find a list of translations in response of type "
            f"{type(result).__name__}"
        )

    # Validate count
    if len(translations) != expected_count:
        raise ValueError(
            f"Expected {expected_count} translations, got {len(translations)}"
        )

    # Validate entries so callers can rely on the declared list[dict] shape.
    if not all(isinstance(item, dict) for item in translations):
        raise ValueError("Each translation entry must be a JSON object")

    return translations

In [None]:
#| export
def xtexts(
    texts: list[str],          # List of Finnish words/phrases
    client: OpenAI | None = None  # OpenAI client (creates new if None)
) -> list[dict]:             # List of dicts with keys: Finnish, English, Japanese
    """Translate a batch of Finnish texts into English and Japanese.

    Entries are stripped of surrounding whitespace and blank entries are
    dropped. If nothing remains, an empty list is returned without creating
    a client or calling the API.

    Args:
        texts: Finnish words/phrases to translate.
        client: OpenAI client to use; a new ``OpenAI()`` is created when None.

    Returns:
        The parsed translation entries, one per non-blank input text.
    """
    cleaned = [stripped for stripped in (t.strip() for t in texts) if stripped]
    if not cleaned:
        return []

    api_client = OpenAI() if client is None else client

    system_prompt, user_prompt = build_prompts(cleaned)
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    raw_reply = cli(api_client, conversation)
    return parse_response(raw_reply, len(cleaned))

## Tests

**Note on API Integration Tests:**

Full end-to-end tests for `xtexts()` and `cli()` require:
- `OPENAI_API_KEY` environment variable set
- Network access
- A budget for the cost of real API calls

For CI/CD environments, consider:
- Mocking the OpenAI API calls
- Using `#| eval: false` to skip expensive tests
- Separate integration test suite

The unit tests below cover the logic without requiring API calls.

In [None]:
#| test
# Test: build_prompts creates correct format

texts = ["kissa", "koira"]
sys_prompt, user_prompt = build_prompts(texts)

# The system prompt must carry the key translation instructions
for required in (
    "Finnish to English and Japanese",
    "Natural and contextually appropriate",
):
    assert required in sys_prompt

# The user prompt must number each text and describe the JSON shape
for required in ("1. kissa", "2. koira", "JSON object", '"translations"'):
    assert required in user_prompt

print("✓ build_prompts format test passed")

✓ build_prompts format test passed


In [None]:
#| test
# Test: parse_response raises error on count mismatch

json_one_item = '{"translations": [{"Finnish": "a", "English": "b", "Japanese": "c"}]}'

# One item supplied while two are expected: must raise ValueError
try:
    parse_response(json_one_item, 2)
except ValueError as e:
    assert "Expected 2 translations, got 1" in str(e), f"Wrong error message: {e}"
else:
    assert False, "Should raise ValueError for count mismatch"

# One item supplied while none are expected: must raise ValueError
try:
    parse_response(json_one_item, 0)
except ValueError as e:
    assert "Expected 0 translations, got 1" in str(e)
else:
    assert False, "Should raise ValueError for count mismatch"

print("✓ parse_response error handling tests passed")

✓ parse_response error handling tests passed


In [None]:
#| test
# Test: parse_response handles different JSON structures

# Case 1: object wrapping the array under the "translations" key
json1 = '{"translations": [{"Finnish": "pää", "English": "head", "Japanese": "頭"}]}'
result1 = parse_response(json1, 1)
assert len(result1) == 1
entry1 = result1[0]
assert (entry1["Finnish"], entry1["English"], entry1["Japanese"]) == ("pää", "head", "頭")

# Case 2: bare top-level array
json2 = '[{"Finnish": "käsi", "English": "hand", "Japanese": "手"}]'
result2 = parse_response(json2, 1)
assert len(result2) == 1
entry2 = result2[0]
assert (entry2["Finnish"], entry2["English"]) == ("käsi", "hand")

# Case 3: fallback, array stored under a non-standard key
json3 = '{"data": [{"Finnish": "jalka", "English": "foot", "Japanese": "足"}]}'
result3 = parse_response(json3, 1)
assert len(result3) == 1
entry3 = result3[0]
assert (entry3["Finnish"], entry3["English"]) == ("jalka", "foot")

print("✓ parse_response JSON format tests passed")

✓ parse_response JSON format tests passed


In [None]:
#| test
# Test: xtexts handles whitespace-only list
# Blank and whitespace-only entries are filtered out, so no API call is made
blank_inputs = ["  ", "", "  "]
result = xtexts(blank_inputs)
assert result == [], f"Expected empty list for whitespace-only input, got {result}"

print("✓ xtexts whitespace-only list test passed")

✓ xtexts whitespace-only list test passed


In [None]:
#| test
# Test: xtexts handles empty list
# An empty batch returns early, so no client or API call is needed
no_inputs = []
result = xtexts(no_inputs)
assert result == [], f"Expected empty list, got {result}"

print("✓ xtexts empty list test passed")

✓ xtexts empty list test passed


In [None]:
#| hide
import nbdev

# Export the notebook's tagged cells to the Python module
nbdev.nbdev_export()