In [6]:
import os, openai
from pathlib import Path
import random, asyncio, aiohttp
from typing import List

async def translate_chunk_async(text: str, session) -> str:
    """Translate one chunk of text to English via the OpenAI chat completions API.

    Args:
        text: Text to translate; sent unchanged as the user message.
        session: HTTP client whose ``post(url, headers=..., json=...)`` is an
            async context manager yielding the response
            (``process_files_async`` passes an ``aiohttp.ClientSession``).

    Returns:
        The message content of the first choice, with surrounding whitespace
        stripped.

    Raises:
        RuntimeError: If the API answers with an HTTP status >= 400. The
            response body is included so the server's reason is not lost.
        KeyError, IndexError: If a successful response does not have the
            expected ``choices[0].message.content`` shape.
    """
    async with session.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {openai.api_key}"},
        json={
            "model": "gpt-4o-mini-2024-07-18",
            "messages": [
                {"role": "system", "content": "Translate the following text to English. Keep the original structure and formatting."},
                {"role": "user", "content": text}
            ]
        }
    ) as response:
        # Error responses carry no 'choices' key; without this check a bad
        # key or a rate limit would surface as an uninformative KeyError.
        if response.status >= 400:
            detail = await response.text()
            raise RuntimeError(
                f"OpenAI API request failed with HTTP {response.status}: {detail}"
            )
        result = await response.json()
        return result['choices'][0]['message']['content'].strip()

async def process_batch(paragraphs: List[str], session) -> List[str]:
    """Translate several paragraphs concurrently, keeping only the successes.

    Args:
        paragraphs: Paragraphs to translate; one request is issued per item.
        session: HTTP session forwarded to ``translate_chunk_async``.

    Returns:
        The successful translations in input order. Failed paragraphs are
        omitted, so the result may be shorter than ``paragraphs`` and its
        indices do not necessarily line up with the input.
    """
    tasks = [translate_chunk_async(p, session) for p in paragraphs]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    translations = []
    for paragraph, result in zip(paragraphs, results):
        # BaseException rather than Exception: asyncio.CancelledError derives
        # from BaseException and must not be returned as if it were text.
        if isinstance(result, BaseException):
            # Still best-effort, but say what was dropped instead of hiding it.
            print(
                f"Translation failed ({type(result).__name__}: {result}); "
                f"skipping paragraph starting {paragraph[:40]!r}"
            )
            continue
        translations.append(result)
    return translations

def split_into_paragraphs(text: str) -> List[str]:
    """Split text on blank lines and return the non-empty, stripped pieces.

    Args:
        text: Input text in which paragraphs are separated by ``'\\n\\n'``.

    Returns:
        Each piece with leading and trailing whitespace removed; pieces that
        are empty after stripping are dropped.
    """
    paragraphs: List[str] = []
    for raw_piece in text.split('\n\n'):
        cleaned = raw_piece.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

async def process_files_async(input_dir: str, output_dir: str, batch_size: int = 5, min_paragraphs: int = 100):
    """Translate a paragraph sample from each ``*.txt`` file and save aligned pairs.

    For every ``*.txt`` file directly inside ``input_dir`` the text is split
    into paragraphs, translated ``batch_size`` paragraphs at a time, and
    written to ``<output_dir>/source/<stem>.txt`` and
    ``<output_dir>/translations/<stem>.txt``. Paragraphs are joined with a
    ``===`` line, and the n-th section of one file corresponds to the n-th
    section of the other.

    Args:
        input_dir: Directory containing the ``*.txt`` files to translate.
        output_dir: Directory under which ``source`` and ``translations``
            are created if missing.
        batch_size: Number of paragraphs translated concurrently; must be
            at least 1.
        min_paragraphs: Despite the name, an upper bound: files with more
            paragraphs than this are reduced to a random sample of exactly
            this many.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    source_dir = Path(output_dir) / 'source'
    translations_dir = Path(output_dir) / 'translations'

    for dir_path in [source_dir, translations_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        for file_path in Path(input_dir).glob('*.txt'):
            # Explicit UTF-8 so non-ASCII text does not depend on the
            # platform's default encoding.
            text = file_path.read_text(encoding='utf-8')
            paragraphs = split_into_paragraphs(text)

            if len(paragraphs) > min_paragraphs:
                paragraphs = random.sample(paragraphs, min_paragraphs)

            source_chunks = []
            translated_chunks = []

            for i in range(0, len(paragraphs), batch_size):
                batch = paragraphs[i:i+batch_size]
                # process_batch drops failed paragraphs without saying which
                # ones. Calling it once per paragraph (still concurrently)
                # yields [] or [translation] for each, so a failure can be
                # dropped from BOTH lists and the output files stay aligned.
                outcomes = await asyncio.gather(
                    *(process_batch([paragraph], session) for paragraph in batch)
                )
                for paragraph, outcome in zip(batch, outcomes):
                    if outcome:
                        source_chunks.append(paragraph)
                        translated_chunks.append(outcome[0])
                print(f"{file_path.stem}: {len(translated_chunks)}/{len(paragraphs)} paragraphs translated")

            # Save source and translation with === separators
            (source_dir / f"{file_path.stem}.txt").write_text('\n===\n'.join(source_chunks), encoding='utf-8')
            (translations_dir / f"{file_path.stem}.txt").write_text('\n===\n'.join(translated_chunks), encoding='utf-8')

if __name__ == "__main__":
    with open('../../secret.txt') as f:
        openai.api_key = f.read().strip()
    await process_files_async('../rawdata', 'trans')

Balladyna: 5/100 paragraphs translated
Balladyna: 10/100 paragraphs translated
Balladyna: 15/100 paragraphs translated
Balladyna: 20/100 paragraphs translated
Balladyna: 25/100 paragraphs translated
Balladyna: 30/100 paragraphs translated
Balladyna: 35/100 paragraphs translated
Balladyna: 40/100 paragraphs translated
Balladyna: 45/100 paragraphs translated
Balladyna: 50/100 paragraphs translated
Balladyna: 55/100 paragraphs translated
Balladyna: 60/100 paragraphs translated
Balladyna: 65/100 paragraphs translated
Balladyna: 70/100 paragraphs translated
Balladyna: 75/100 paragraphs translated
Balladyna: 80/100 paragraphs translated
Balladyna: 85/100 paragraphs translated
Balladyna: 90/100 paragraphs translated
Balladyna: 95/100 paragraphs translated
Balladyna: 100/100 paragraphs translated
Dziady_(Mickiewicz): 5/100 paragraphs translated
Dziady_(Mickiewicz): 10/100 paragraphs translated
Dziady_(Mickiewicz): 15/100 paragraphs translated
Dziady_(Mickiewicz): 20/100 paragraphs translated
D