<a href="https://colab.research.google.com/github/clangreformers/clangreformers-test.github.io/blob/master/translate_tw_to_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install openai
%pip install tiktoken

In [16]:
from openai import OpenAI
import os
import tiktoken
from google.colab import userdata

OPENAI_API_KEY = userdata.get('openai_api_key')
# Set the API key as an environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Directory containing files to translate
input_directory = "drive/MyDrive/clangreformers/tw"
output_directory = "drive/MyDrive/clangreformers/en"

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Function to count tokens
def count_tokens(text, model="gpt-4o-mini"):
    """Calculate the number of tokens in a given text for the specified model."""
    encoding = tiktoken.get_encoding("gpt2")
    return len(encoding.encode(text))

# Function to split text into manageable chunks
def split_text(text, max_tokens, model="gpt-4o-mini"):
    """Split text into chunks that fit within the token limit."""
    encoding = tiktoken.get_encoding("gpt2")
    tokens = encoding.encode(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]
        chunks.append(encoding.decode(chunk))
    return chunks

# Function to translate text using OpenAI o1mini API
def translate_text(text, source_lang="zh-TW", target_lang="en", model="gpt-4o-mini"):
    """Translate text using OpenAI o1mini API."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}:\n\n{text}"
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful translation assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        print(response)
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error during translation: {e}")
        return None

# Token limit for the model
MODEL = "gpt-4o-mini"
MAX_TOKENS = 2000  # Adjust based on model's max context size
SAFE_LIMIT = MAX_TOKENS - 500  # Leave space for prompt and response

client = OpenAI()

# Iterate through all files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith(".txt"):  # Process only .txt files
        input_file_path = os.path.join(input_directory, filename)
        output_file_path = os.path.join(output_directory, filename)

        print(input_file_path)

        with open(input_file_path, "r", encoding="utf-8") as input_file:
            traditional_text = input_file.read()

        # Split text into manageable chunks if necessary
        if count_tokens(traditional_text, model=MODEL) > SAFE_LIMIT:
            text_chunks = split_text(traditional_text, SAFE_LIMIT, model=MODEL)
        else:
            text_chunks = [traditional_text]

        translated_text = ""

        # Translate each chunk
        for i, chunk in enumerate(text_chunks):
            print(f"Translating chunk {i + 1}/{len(text_chunks)} of {filename}...")
            result = translate_text(chunk, source_lang="Traditional Chinese", target_lang="English", model=MODEL)
            if result:
                translated_text += result + "\n"
            else:
                print(f"Failed to translate chunk {i + 1} of {filename}")

        # Save the translated text to the output file
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(translated_text)

        print(f"Translated and saved: {filename}")

print("Translation completed.")


drive/MyDrive/clangreformers/tw/2024-12-21-profWangLostVoice-tw.txt
Translating chunk 1/7 of 2024-12-21-profWangLostVoice-tw.txt...
ChatCompletion(id='chatcmpl-Amx1r0DHcSSRiJBb7dzjwFMWmBBQi', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='---\n\nlayout: post  \ntitle: "Wah Wah Wah for Three Days, Can\'t Take It Anymore - Mr. Wang Shou-Kang"  \ndate: 2024-12-21 14:00:00 -0000  \ncategories: jekyll update  \nref: prof-wang-lost-voice  \nlang: Traditional Chinese  \n---\n\nAuthor: Wang Zhengfang\n\nExcerpt from "A Man with Ambition"\n\n![image](/assets/imgs/peterwang_trilogy3.jpg "A Man with Ambition"){ : width="200" }\n\nTen, Wah Wah Wah for Three Days, Can\'t Take It Anymore!\n\nI slept on the floor next to my father\'s hospital bed, unable to sleep all night. The ordinary ward housed sixteen patients, and among them was a liver disease patient whose painkillers were ineffective; he screamed loudly from the pain, straining his throat

# New Section