# FF7R Translation & Localization Script

## Setup and Initialization

In [None]:
# Import Libraries
import os
import json
import time
import traceback
import re
import pandas as pd
from openai import AzureOpenAI
from datetime import datetime

# Azure OpenAI connection settings are read from environment variables
# (key-based authentication); any variable that is not set yields None.
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY")

# Shared client used by every chat-completion request in this notebook
client = AzureOpenAI(
    api_version="2024-05-01-preview",
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

In [None]:
# Run configs
# file_name is the base name (without extension) used to build every input/output path
# under ./testing/ in the cells below.
file_name = "8300-CORLE_TxtRes"
RERUN_MODE = True  # Set rerun mode to False to translate all scenes or True to translate only error scenes
RESPONSE_FROM_CHATGPT = True  # Set to True if the response is obtained from ChatGPT manually
complete_investigation = False  # Set to True to resolve investigation for already existing scenes in translation
chatgpt_response = {
    "scene_id": "COSTT_BOS_06_1100",
    "translations": [
        {"id": "COSTT_BOS_06_1100_0100_00_hjo", "translation": "Oh? Running away, are we?"},
        {"id": "COSTT_BOS_06_1100_0110_00_hjo", "translation": "I see. So you still don't wish to reunite."},
        {"id": "COSTT_BOS_06_1100_0200_00_hjo", "translation": "There's nothing to fear. Come with me, and you can contribute to the evolution of mankind."},
        {"id": "COSTT_BOS_06_1100_0300_00_hjo", "translation": "Your friends seem lonely. Shall we make them call out to you with their screams? Heh heh!"},
        {"id": "COSTT_BOS_06_1100_0400_00_hjo", "translation": "Heh heh... Truly fascinating. I must cut you open and examine every last cell."},
        {"id": "COSTT_BOS_06_1100_0500_00_hjo", "translation": "Why not surrender quietly, just like your pitiful brethren?"},
        {"id": "COSTT_BOS_06_1100_0600_00_hjo", "translation": "How does it feel to be slowly cornered? Hmm?"},
        {"id": "COSTT_BOS_06_1100_0700_00_hjo", "translation": "Are you running around because you want to? Heh heh!"},
        {"id": "COSTT_BOS_06_1100_0800_00_hjo", "translation": "Are you really that afraid of coming with me?"},
        {"id": "COSTT_BOS_06_1100_0900_00_cld", "translation": "This is bad..."},
        {"id": "COSTT_BOS_06_1100_1000_00_cld", "translation": "Now's my chance!"},
        {"id": "COSTT_BOS_06_1100_1100_00_cld", "translation": "Hit 'em all at once!"},
        {"id": "COSTT_BOS_06_1100_1200_00_cld", "translation": "Can I hold out...?"},
        {"id": "COSTT_BOS_06_1100_1300_00_cld", "translation": "Not going down that easy."},
        {"id": "COSTT_BOS_06_1100_1400_00_hjo", "translation": "You're a valuable specimen, but I have no choice. You'll contribute to science as a corpse."},
        {"id": "COSTT_BOS_06_1100_1500_00_hjo", "translation": "I have no desire to waste any more of my precious time on you."},
        {"id": "COSTT_BOS_06_1100_1600_00_hjo", "translation": "Do you understand? Brilliant scientists bear the responsibility of guiding foolish humanity toward evolution."},
        {"id": "COSTT_BOS_06_1100_1700_00_hjo", "translation": "Now, once I return to the company... Ah, yes. I must resume that experiment."},
        {"id": "COSTT_BOS_06_1100_1800_00_hjo", "translation": "The variety of specimens is excellent, but the real question is... can the host endure?"},
        {"id": "COSTT_BOS_06_1100_1900_00_hjo", "translation": "Heh heh... The creation of a hero will soon become a reality."},
        {"id": "COSTT_BOS_06_1100_2000_00_hjo", "translation": "How dull... How much longer do you plan to drag this out?"},
        {"id": "COSTT_BOS_06_1100_2100_00_hjo", "translation": "Enough with the pointless resistance."},
        {"id": "COSTT_BOS_06_1100_2200_00_hjo", "translation": "Oh? Not bad at all."},
        {"id": "COSTT_BOS_06_1100_2300_00_hjo", "translation": "Do give it your best effort."},
        {"id": "COSTT_BOS_06_1100_2400_00_hjo", "translation": "Heh... Is that the best you can do?"},
        {"id": "COSTT_BOS_06_1100_2500_00_hjo", "translation": "Hmph... I'll give you a chance."},
        {"id": "COSTT_BOS_06_1100_2600_00_hjo", "translation": "Come now, hurry, hurry! Heh heh!"},
        {"id": "COSTT_BOS_06_1100_2700_00_hjo", "translation": "You should consider yourself honored."},
        {"id": "COSTT_BOS_06_1100_2800_00_hjo", "translation": "Breathe it in—deep into your lungs."},
        {"id": "COSTT_BOS_06_1100_2900_00_hjo", "translation": "Heh heh... How are you feeling?"},
        {"id": "COSTT_BOS_06_1100_3000_00_hjo", "translation": "Heh heh heh... Think you can find a way out?"},
        {"id": "COSTT_BOS_06_1100_3100_00_hjo", "translation": "Come now, show me your face twisted in fear."},
        {"id": "COSTT_BOS_06_1100_3200_00_hjo", "translation": "It's about time I reduced you to dust."},
        {"id": "COSTT_BOS_06_1100_3300_00_hjo", "translation": "I'll cut you down to your very cells."},
        {"id": "COSTT_BOS_06_1100_3400_00_hjo", "translation": "Heh heh... Go on, run around in vain!"},
        {"id": "COSTT_BOS_06_1100_3500_00_hjo", "translation": "I do so enjoy toying with my prey."}
    ]
}


In [None]:
# # Test the client
# prompt = "Describe Tifa Lockhart from Final Fantasy VII in a json format"

# completion = client.chat.completions.create(
#     model=deployment,
#     messages=[
#         {"role": "system", "content": "You are an expert in the lore and story of the game Final Fantasy VII which includes the orignal game, remakes and spin-offs."},
#         {"role": "user", "content": prompt}
#     ]
# )

# # Print the response
# print(completion.choices[0].message.content)

## Main Workflow

### Dialogue Extraction and Processing

Reads the game's dialogue files from 
- `f"./testing/ModifiedExports/{file_name}.csv"`
- `f"./testing/ModifiedExports/{file_name}_jp.csv"`

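and merges them on `id` (English left-joined with Japanese), writing
- `f"./testing/ModifiedExports/{file_name}_merged.csv"` (unsorted)
- `f"./testing/ModifiedExports/{file_name}_merged_sorted.csv"` (sorted by scene id and timestamp)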

In [None]:
def process_dialogue_file(file_path):
    """Extract speaker/dialogue pairs from an exported dialogue CSV.

    The CSV must contain the columns ``id``, ``sub_id`` and ``text``.
    Rows are grouped by ``id``:

    * two rows per id: the row with ``sub_id == "ACTOR"`` supplies the speaker
      and the row with an empty ``sub_id`` supplies the dialogue; the id is
      kept only if both texts are non-empty after stripping;
    * one row per id: the text is kept as dialogue with speaker ``"NPC"``;
    * any other row count: the id is skipped.

    Args:
        file_path: Path to the CSV file (read as UTF-8).

    Returns:
        A DataFrame with the columns ``id``, ``speaker`` and ``dialogue``.
        The columns are present even when no dialogue was extracted, so that
        callers can always merge on ``id``.
    """
    output_columns = ["id", "speaker", "dialogue"]

    # Load CSV file
    df = pd.read_csv(file_path, encoding="utf-8")

    # Remove metadata row where id is 'language'
    df = df[df["id"] != "language"]

    # Remove rows where both 'sub_id' and 'text' are empty; copy so the column
    # assignment below works on an independent frame rather than a slice.
    df = df.dropna(subset=["sub_id", "text"], how="all").copy()

    # Ensure 'text' column is treated as a string and replace NaN with empty string
    df["text"] = df["text"].fillna("").astype(str)

    # Initialize a list to store structured dialogues
    dialogues = []

    # Count occurrences of each ID (this also fixes the output order)
    id_counts = df["id"].value_counts()

    # Group once instead of re-filtering the whole frame for every id
    rows_by_id = df.groupby("id", sort=False)

    # Iterate through unique IDs
    for unique_id, count in id_counts.items():
        rows = rows_by_id.get_group(unique_id)

        if count == 2:
            # If there are two rows, determine speaker and dialogue
            speaker_row = rows[rows["sub_id"] == "ACTOR"]
            dialogue_row = rows[rows["sub_id"].isna()]

            if not speaker_row.empty and not dialogue_row.empty:
                speaker = speaker_row.iloc[0]["text"].strip()
                dialogue = dialogue_row.iloc[0]["text"].strip()

                if speaker and dialogue:
                    dialogues.append({"id": unique_id, "speaker": speaker, "dialogue": dialogue})

        elif count == 1:
            # If there is only one row, assume it's an NPC/system dialogue
            dialogue = rows.iloc[0]["text"].strip()
            if dialogue:
                dialogues.append({"id": unique_id, "speaker": "NPC", "dialogue": dialogue})

    # Convert structured dialogues into a DataFrame with a stable schema
    return pd.DataFrame(dialogues, columns=output_columns)

# Locations of the English and Japanese exports for the configured file
en_file_path = f"./testing/ModifiedExports/{file_name}.csv"
jp_file_path = f"./testing/ModifiedExports/{file_name}_jp.csv"

# Structured (id, speaker, dialogue) frames for each language
en_dialogue_df = process_dialogue_file(en_file_path)
jp_dialogue_df = process_dialogue_file(jp_file_path)

# Left-join Japanese onto English by id, then discard the Japanese speaker
# column because only the dialogue text is translated.
merged_dialogue_df = (
    en_dialogue_df
    .merge(jp_dialogue_df, on="id", how="left", suffixes=("_en", "_jp"))
    .drop(columns=["speaker_jp"])
)
merged_dialogue_df.to_csv(f"./testing/ModifiedExports/{file_name}_merged.csv", index=False)

print(f"Relevant dialogues to be translated in {file_name}: {len(merged_dialogue_df)}")

def extract_sort_keys(id_str):
    """Split a dialogue id into its ``(scene_id, timestamp)`` sort key.

    The id is split on "_"; the third-from-last token is converted to an
    integer timestamp and every token before it is re-joined with "_" to
    form the scene id.
    """
    tokens = id_str.split('_')
    return '_'.join(tokens[:-3]), int(tokens[-3])

# Order the rows by (scene id, timestamp) as derived from each dialogue id,
# then persist the sorted table for the translation step.
merged_dialogue_df = merged_dialogue_df.sort_values(
    by="id",
    key=lambda id_column: id_column.map(extract_sort_keys),
)
merged_dialogue_df.to_csv(f"./testing/ModifiedExports/{file_name}_merged_sorted.csv", index=False)

### Utils

- `clean_json_output`: Takes an `ai_response` string and cleans it up for loading as a JSON object.

In [None]:
def clean_json_output(ai_response):
    """Strip a Markdown code fence from a model response.

    Models often wrap JSON in a fenced block. The fence may be tagged
    (```json, in any letter case) or untagged (```); in both cases only the
    content between the first opening fence and the next closing fence is
    returned, with surrounding whitespace removed.

    Args:
        ai_response: Raw response text from the model.

    Returns:
        The fenced content if a complete fence is found, otherwise
        ``ai_response`` unchanged.
    """
    # "(?:json)?" makes the language tag optional so untagged fences are handled too
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", ai_response, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1)  # Extract only the JSON part
    return ai_response  # Return as-is if no fenced block is found

### Get scene ids from the error logs for re-run

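The scene IDs read from the `.json` error logs are collected into the list `error_scene_ids` for the re-run and written to `f"./testing/logs/{file_name}_error_scene_ids.txt"`; the error logs are then deleted.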

In [None]:
# Obtain error scene ids from the error log
def get_error_scene_ids(file_name):
    """Collect the scene ids recorded in this file's error logs, then clear those logs.

    Only logs named ``error_{file_name}_*.json`` inside ``./testing/logs`` are
    considered, so error logs that belong to other dialogue files are neither
    mixed into the result nor deleted. The collected ids are also written,
    one per line, to ``./testing/logs/{file_name}_error_scene_ids.txt``.

    A log that cannot be parsed or has no ``scene_id`` is reported and left
    on disk instead of aborting the whole collection.

    Args:
        file_name: Base name of the dialogue file whose error logs are read.

    Returns:
        List of scene ids, in sorted log-file-name order.
    """
    error_log_folder = "./testing/logs"
    log_prefix = f"error_{file_name}_"

    # A fresh checkout may not have the folder yet; treat that as "no errors"
    os.makedirs(error_log_folder, exist_ok=True)

    error_scene_ids = []
    processed_logs = []

    # sorted() makes the result independent of the file system's listing order
    for error_file in sorted(os.listdir(error_log_folder)):
        if not (error_file.startswith(log_prefix) and error_file.endswith(".json")):
            continue

        log_path = os.path.join(error_log_folder, error_file)
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                error_scene_id = json.load(f)["scene_id"]
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            print(f"Skipping unreadable error log {log_path}: {exc!r}")
            continue

        error_scene_ids.append(error_scene_id)
        processed_logs.append(log_path)

    # Write the error scene ids to a text file
    error_scene_ids_file = f"./testing/logs/{file_name}_error_scene_ids.txt"
    with open(error_scene_ids_file, "w", encoding="utf-8") as f:
        for scene_id in error_scene_ids:
            f.write(f"{scene_id}\n")

    # Delete only the logs whose scene id was actually collected
    for log_path in processed_logs:
        os.remove(log_path)

    return error_scene_ids


### AI Workflow

- Takes the JP and EN dialogues from the merged file `f"./testing/ModifiedExports/{file_name}_merged_sorted.csv"`, prepares the prompt and runs the AI model to get the translation.
- Error logs get written to `f"./testing/logs/error_{file_name}_{scene_id}.json"`
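- When `RERUN_MODE` is `False`, progress is backed up after every scene to `f"./testing/translations_backup_{file_name}.json"`, and the translations of all scenes that processed successfully are written to `f"./testing/ModifiedExports/{file_name}_translated.csv"`. In rerun mode the translations are only kept in `final_translation_df`.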

In [None]:
# Load the sorted, merged EN/JP dialogue table produced earlier
merged_file_path = f"./testing/ModifiedExports/{file_name}_merged_sorted.csv"
merged_df = pd.read_csv(merged_file_path, encoding="utf-8")

# Decide which scenes to translate: only the previously failed ones in rerun
# mode, otherwise every scene found in the merged table.
if RERUN_MODE:
    scene_ids = get_error_scene_ids(file_name).copy()
    # scene_ids = ['$C3210_MAIN_0200', '$NIBLE_QST_03_1400', '$NIBLM_BOS_00_1100']
    print(f"Rerun mode enabled. Translating {len(scene_ids)} scenes.")
    print(scene_ids)
else:
    # A scene id is the dialogue id without its last three "_"-separated parts
    id_tokens = merged_df["id"].str.split("_")
    scene_ids = sorted(set(id_tokens.apply(lambda x: "_".join(x[:-3]))))
    print(f"Translating {len(scene_ids)} scenes.")

# Accumulates one {"scene_id", "translations"} entry per successfully translated scene
all_translations = []

# Scene id of every row (the dialogue id minus its last three "_"-separated
# parts), computed once so each scene is selected by exact match. A plain
# prefix test would also pull in rows of other scenes whose id merely starts
# with the same characters (e.g. scene "A_B" vs. scene "A_B_C").
row_scene_ids = merged_df["id"].str.split("_").apply(lambda x: "_".join(x[:-3]))

# Error logs are written below; make sure their folder exists
os.makedirs("./testing/logs", exist_ok=True)

for scene_id in scene_ids:
    # Reset per scene so the error log never reports a missing name or the
    # prompt/response of a previous scene.
    prompt = None
    ai_response = None

    try:
        # Filter the dataframe for the current scene
        scene_df = merged_df[row_scene_ids == scene_id].copy()
        
        # Split the scene into chunks of fixed dialogues
        max_dialogues_per_request = 10
        num_chunks = (len(scene_df) + max_dialogues_per_request - 1) // max_dialogues_per_request  # Round up
        
        # Store translated chunks
        scene_translations = {
            "scene_id": scene_id,
            "translations": []
        }

        for chunk_index in range(num_chunks):
            chunk_df = scene_df.iloc[chunk_index * max_dialogues_per_request:(chunk_index + 1) * max_dialogues_per_request]
            
            # Construct the prompt for AI translation
            prompt = f"""
You are an expert Japanese-to-English translator specializing in video game localization, particularly for *Final Fantasy VII Rebirth*. Your task is to translate the following lines while following the guidelines provided below.

### **Guidelines:**  
- Ensure the translation remains faithful to the original Japanese, while making it sound natural in English.  
- Maintain character-specific speech patterns, formality levels, and personality traits.  
- The provided official English localization is included as a reference. Use it for context, but prioritize accuracy to the original Japanese text.  
- If the official localization takes creative liberties, your translation should focus on capturing the original intent while still reading smoothly.

### **Output Format (JSON):**  
{{
    "scene_id": "{scene_id}",
    "translations": [
        {{"id": "<original_id>", "translation": "<your improved English translation>"}} 
        ...
    ]
}}

Here is the Japanese dialogue for a chunk of a scene along with its official English localization for reference:
"""

            for _, row in chunk_df.iterrows():
                prompt += f"\nID: {row['id']}"
                prompt += f"\n{row['speaker_en']} (JP): {row['dialogue_jp']}"
                prompt += f"\n{row['speaker_en']} (EN): {row['dialogue_en']}\n"

            prompt += "\nReturn the output in the specified **valid JSON format**"

            # Clear the previous chunk's response so a failed request is not
            # logged together with an unrelated, older response.
            ai_response = None

            # Ping the client
            completion = client.chat.completions.create(
                model=deployment,
                messages=[
                    {"role": "system", "content": "You are an expert translator specializing in Final Fantasy VII localization."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=4096
            )

            # Parse the JSON response safely
            ai_response = completion.choices[0].message.content
            ai_response_clean = clean_json_output(ai_response)
            translated_data = json.loads(ai_response_clean)

            # Merge translations into the scene's full list
            scene_translations["translations"].extend(translated_data["translations"])

            # Small delay to avoid hitting rate limits
            time.sleep(2)

            # Log progress for each chunk
            print(f"Completed chunk {chunk_index + 1}/{num_chunks} for scene {scene_id}")

        # Append full scene translations (only reached if every chunk succeeded)
        all_translations.append(scene_translations)

        # Save progress after each scene if not in rerun mode
        if not RERUN_MODE:
            with open(f"./testing/translations_backup_{file_name}.json", "w", encoding="utf-8") as f:
                json.dump(all_translations, f, ensure_ascii=False, indent=4)

        # Log progress for full scene completion
        print(f"Completed scene {scene_id}")
        # TBD: We need a percentage completion logger (number of scenes translated/total number of scenes or something similar)

    except Exception as e:
        # Capture error details; the whole scene is skipped and can be re-run
        # later from the log written here.
        time_now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        error_details = {
            "timestamp": time_now,
            "scene_id": scene_id,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
            "prompt_used": prompt,
            "response_received": ai_response
        }

        error_log_file = f"./testing/logs/error_{file_name}_{scene_id}.json"

        # Save error details to a file
        with open(error_log_file, "w", encoding="utf-8") as f:
            json.dump(error_details, f, ensure_ascii=False, indent=4)

        print(f"Error processing scene {scene_id}. Logged details to {error_log_file}")

# Flatten the per-scene translation lists into one table (one row per dialogue id)
final_translation_df = pd.DataFrame([t for scene in all_translations for t in scene["translations"]])

# With no successful translations the frame above has no columns at all. Give
# it the expected schema so that later cells can still access "id" and the CSV
# written below has a header row that can be read back.
if final_translation_df.empty:
    final_translation_df = pd.DataFrame(columns=["id", "translation"])

# Save the final translations to a CSV file if not in rerun mode
if not RERUN_MODE:
    final_translation_df.to_csv(f"./testing/ModifiedExports/{file_name}_translated.csv", index=False, encoding="utf-8")

In [None]:
# Optionally replace the rerun result with a manually obtained ChatGPT response
# (the `chatgpt_response` dict from the config cell).
use_manual_response = RESPONSE_FROM_CHATGPT and RERUN_MODE

if not use_manual_response:
    print("Response from ChatGPT skipped")
else:
    # Round-trip through JSON text so the config dict itself is never shared
    chatgpt_response_json = json.dumps(chatgpt_response, ensure_ascii=False, indent=4)
    translated_data = json.loads(chatgpt_response_json)
    final_translation_df = pd.DataFrame(translated_data["translations"])
    print(f"Response from ChatGPT written to final_translation_df for scene {translated_data['scene_id']}")

### Manual Fixing

- After a rerun for specific scene IDs, the new translations are stored in `final_translation_df`.
- We review it and append them to the existing translations in `f"./testing/ModifiedExports/{file_name}_translated.csv"`

In [None]:
def _row_scene_ids(id_series):
    """Map each dialogue id to its scene id (the id minus its last three "_"-separated parts)."""
    return id_series.str.split("_").apply(lambda x: "_".join(x[:-3]))


if RERUN_MODE:
    new_translations = final_translation_df.copy()

    # Read the existing translation file
    existing_translation_df = pd.read_csv(f"./testing/ModifiedExports/{file_name}_translated.csv")

    # Snapshot of the scenes already translated BEFORE anything is appended.
    # The investigation below must compare against this snapshot; comparing
    # against the frame after the append would report every new scene as
    # "already existing".
    existing_scene_ids = _row_scene_ids(existing_translation_df["id"]).unique()

    # For all ids in the new translations, check if it already exists in the existing translations
    new_ids = new_translations["id"].unique()
    existing_ids = existing_translation_df["id"].unique()
    already_existing_ids = set(new_ids).intersection(existing_ids)
    if len(already_existing_ids) > 0:
        print("New translations contain IDs that already exist in the existing translations. Please check for duplicates.")
    else:
        print("No duplicate IDs found in the new translations. Appending to existing translations...")
        existing_translation_df = pd.concat([existing_translation_df, new_translations], ignore_index=True)
        existing_translation_df.to_csv(f"./testing/ModifiedExports/{file_name}_translated.csv", index=False)

    # Investigate already existing scene ids
    print("-----------------------------------")
    print("Investigating scene ids present in both new and existing translations...")

    # Obtain all scene ids from the new translations
    new_scene_ids = _row_scene_ids(final_translation_df["id"]).unique()

    # Find scene ids that were already present in the existing translations
    common_scene_ids = set(new_scene_ids).intersection(existing_scene_ids)
    print(f"Common scene ids between new and existing translations: {len(common_scene_ids)}")
    print(common_scene_ids)

    # Find scene ids that are present in the new translations but not in the existing translations
    new_only_scene_ids = set(new_scene_ids) - set(existing_scene_ids)
    print(f"Scene ids present only in new translations: {len(new_only_scene_ids)}")
    print(new_only_scene_ids)

    print("-----------------------------------")
    if complete_investigation:
        print("Resolving already existing scene ids...")

        # Filter out the scene ids that are already present in the existing translations
        final_translation_df = final_translation_df[~_row_scene_ids(final_translation_df["id"]).isin(common_scene_ids)]

        print("Scenes deleted from new translations:")
        print(common_scene_ids)
    else:
        print("Skipping investigation resolution.")
else:
    print("Rerun mode disabled. Skipping manual fixing.")
    print("Rerun mode disabled. Skipping investigation.")
    print("Rerun mode disabled. Skipping investigation resolution.")
        

### Post-Processing

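Applies the translations in `f"./testing/ModifiedExports/{file_name}_translated.csv"` to the dialogue rows of the original file `f"./testing/ModifiedExports/{file_name}.csv"` and saves the result as `f"./testing/ModifiedExports/{file_name}_updated.csv"` (the original file is not modified).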

In [None]:
# Load the original and translated CSV files
df_original = pd.read_csv(f"./testing/ModifiedExports/{file_name}.csv")
df_translated = pd.read_csv(f"./testing/ModifiedExports/{file_name}_translated.csv")

# Keep exactly one non-empty translation per id (the most recently appended
# one) and only the two columns needed. Duplicate ids would otherwise multiply
# rows of the original file in the left merge, and any extra column of the
# translated file would leak into the output.
translation_lookup = (
    df_translated[["id", "translation"]]
    .dropna(subset=["translation"])
    .drop_duplicates(subset="id", keep="last")
)

# Merge original and translated datasets on 'id'; validate guards the
# "one translation per id" assumption so the row count cannot change.
df_merged = df_original.merge(translation_lookup, on="id", how="left", validate="many_to_one")

# Identify dialogue rows that have a matching translation (sub_id is NaN, text is not blank, and translation is not blank)
dialogue_mask = df_merged["sub_id"].isna() & df_merged["text"].notna() & df_merged["translation"].notna()

# Replace text column only for matching instances with a translation
df_merged.loc[dialogue_mask, "text"] = df_merged.loc[dialogue_mask, "translation"]

# Drop the extra 'translation' column after updating
df_merged = df_merged.drop(columns=["translation"])

# Save the updated file
df_merged.to_csv(f"./testing/ModifiedExports/{file_name}_updated.csv", index=False)