In [2]:
import os, openai
from pathlib import Path
import asyncio, aiohttp, random
from typing import List, Tuple

async def summarize_chunk_async(text: str, session) -> str:
    """Ask the OpenAI chat-completions endpoint for a Polish summary of *text*.

    Args:
        text: The chunk to summarize; sent verbatim as the user message.
        session: HTTP client whose ``post(...)`` is usable as an async
            context manager yielding the response (the rest of this file
            passes an ``aiohttp.ClientSession``).

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.

    Raises:
        RuntimeError: If the API answers with an HTTP status >= 400. The
            message carries the status code and the start of the body.
    """
    async with session.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {openai.api_key}"},
        json={
            "model": "gpt-4o-mini-2024-07-18",
            "messages": [
                {"role": "system", "content": "Provide a concise summary in Polish focusing on key points. Make it into a paragraph in one line."},
                {"role": "user", "content": text}
            ]
        }
    ) as response:
        # Error payloads carry no 'choices' key; fail here with the status and
        # body instead of an uninformative KeyError further down.
        if response.status >= 400:
            detail = await response.text()
            raise RuntimeError(
                f"OpenAI API request failed with HTTP {response.status}: {detail[:500]}"
            )
        result = await response.json()
        return result['choices'][0]['message']['content'].strip()

async def process_batch(chunks: List[str], session) -> List[str]:
    """Summarize every chunk of a batch concurrently.

    Args:
        chunks: Texts to summarize; one request is issued per chunk.
        session: HTTP session forwarded to ``summarize_chunk_async``.

    Returns:
        The summaries of the chunks whose request succeeded, in input order.
        Failed chunks are omitted, so the result may be shorter than
        ``chunks``.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    tasks = [summarize_chunk_async(chunk, session) for chunk in chunks]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    summaries = []
    for index, result in enumerate(results):
        # gather() may hand back BaseException instances (e.g. CancelledError),
        # not only Exception subclasses; none of them is a summary.
        if isinstance(result, BaseException):
            logging.getLogger(__name__).warning(
                "Chunk %d of the batch failed and was skipped: %r", index, result
            )
        else:
            summaries.append(result)
    return summaries

def get_chunks(text: str, chunk_size: int = 2000, num_samples: int = 100) -> List[Tuple[int, str]]:
    """Cut *text* into consecutive pieces and pair each with its start offset.

    Args:
        text: The string to split.
        chunk_size: Length of every piece; the final piece may be shorter.
        num_samples: Upper bound on the number of pieces returned.

    Returns:
        A list of ``(offset, piece)`` tuples. When the text yields more than
        ``num_samples`` pieces, a random selection of exactly ``num_samples``
        of them is returned in the order chosen by ``random.sample``;
        otherwise all pieces are returned in text order.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append((start, text[start:start + chunk_size]))

    # Few enough pieces: hand them all back untouched.
    if len(pieces) <= num_samples:
        return pieces
    return random.sample(pieces, num_samples)

async def process_files_async(input_dir: str, output_dir: str, batch_size: int = 5, chunk_size: int = 2000, encoding: str = "utf-8"):
    """Summarize every ``*.txt`` file in *input_dir* chunk by chunk.

    For each input file, the chunks that were summarized are written to
    ``<output_dir>/source/<stem>.txt`` and the summaries to
    ``<output_dir>/summaries/<stem>.txt``; entries are separated by a line
    containing ``===``.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.txt`` files.
        output_dir: Directory under which ``source`` and ``summaries`` are
            created if missing.
        batch_size: Number of chunks summarized concurrently per batch.
        chunk_size: Chunk length passed to ``get_chunks``.
        encoding: Text encoding used to read inputs and write outputs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the batching loop a silent no-op.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    source_dir = Path(output_dir) / 'source'
    summaries_dir = Path(output_dir) / 'summaries'

    for dir_path in (source_dir, summaries_dir):
        dir_path.mkdir(parents=True, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        # glob() order depends on the filesystem; sort for reproducible runs.
        for file_path in sorted(Path(input_dir).glob('*.txt')):
            # Explicit encoding: the platform default is not reliable for
            # non-ASCII text.
            text = file_path.read_text(encoding=encoding)
            # get_chunks may return a random sample; restore text order.
            chunk_pairs = sorted(get_chunks(text, chunk_size), key=lambda pair: pair[0])
            chunks = [chunk for _, chunk in chunk_pairs]

            summaries = []
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i+batch_size]
                batch_summaries = await process_batch(batch, session)
                summaries.extend(batch_summaries)
                print(f"{file_path.stem}: {len(summaries)}/{len(chunks)} chunks summarized")

            # Save source chunks and summaries with === separators.
            # NOTE: process_batch omits failed chunks, so the two files can
            # contain different numbers of entries.
            (source_dir / f"{file_path.stem}.txt").write_text('\n===\n'.join(chunks), encoding=encoding)
            (summaries_dir / f"{file_path.stem}.txt").write_text('\n===\n'.join(summaries), encoding=encoding)

if __name__ == "__main__":
    with open('../../secret.txt') as f:
        openai.api_key = f.read().strip()
    await process_files_async('../rawdata', 'summary')

Balladyna: 5/86 chunks summarized
Balladyna: 10/86 chunks summarized
Balladyna: 15/86 chunks summarized
Balladyna: 20/86 chunks summarized
Balladyna: 25/86 chunks summarized
Balladyna: 30/86 chunks summarized
Balladyna: 35/86 chunks summarized
Balladyna: 40/86 chunks summarized
Balladyna: 45/86 chunks summarized
Balladyna: 50/86 chunks summarized
Balladyna: 55/86 chunks summarized
Balladyna: 60/86 chunks summarized
Balladyna: 65/86 chunks summarized
Balladyna: 70/86 chunks summarized
Balladyna: 75/86 chunks summarized
Balladyna: 80/86 chunks summarized
Balladyna: 85/86 chunks summarized
Balladyna: 86/86 chunks summarized
Dziady_(Mickiewicz): 5/100 chunks summarized
Dziady_(Mickiewicz): 10/100 chunks summarized
Dziady_(Mickiewicz): 15/100 chunks summarized
Dziady_(Mickiewicz): 20/100 chunks summarized
Dziady_(Mickiewicz): 25/100 chunks summarized
Dziady_(Mickiewicz): 30/100 chunks summarized
Dziady_(Mickiewicz): 35/100 chunks summarized
Dziady_(Mickiewicz): 40/100 chunks summarized
Dzia