In [8]:
import os
import openai
from pathlib import Path
import random
import asyncio
import aiohttp
from typing import List, Dict

async def get_qa_from_gpt_async(paragraph: str, session, previous_questions: List[str]) -> List[str]:
    """Request question-answer pairs about ``paragraph`` from the chat-completions API.

    Args:
        paragraph: Text sent to the model as the user message.
        session: Object whose ``post(...)`` result is used as an async context
            manager (an ``aiohttp.ClientSession`` in this file).
        previous_questions: Questions collected so far; they are listed in the
            system prompt so the model is asked not to repeat them.

    Returns:
        The non-blank lines of the model's reply. The prompt asks for one
        'question|answer' pair per line, but the format is not validated here.

    Raises:
        aiohttp.ClientResponseError: If the API answers with an HTTP error status.
        KeyError, IndexError: If the JSON body lacks the expected ``choices`` layout.
    """
    asked_so_far = "\n".join(previous_questions) if previous_questions else 'None'
    system_prompt = f"""Generate 3 question-answer pairs about the following text. Format each pair as 'question|answer'. Both Question and Answer should be in polish. If you can't generate a question, just skip it.
Previous questions asked:
{asked_so_far}
Please generate unique questions that haven't been asked before."""

    payload = {
        "model": "gpt-4o-mini-2024-07-18",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": paragraph},
        ],
    }

    async with session.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {openai.api_key}"},
        json=payload,
    ) as response:
        # Fail with the HTTP status (rate limit, bad key, ...) instead of a
        # misleading KeyError on the missing 'choices' field of an error body.
        response.raise_for_status()
        result = await response.json()

    content = result['choices'][0]['message']['content']
    # Models often separate pairs with empty lines; those carry no pair.
    return [line for line in content.strip().split('\n') if line.strip()]

async def process_files_async(input_dir: str, output_dir: str, batch_size: int = 5, min_qa_pairs: int = 100):
    """Generate QA pairs for every ``*.txt`` file in ``input_dir`` and save them.

    For each file, paragraphs are visited in random order, ``batch_size`` at a
    time, until at least ``min_qa_pairs`` questions were collected or every
    paragraph has been used once. Questions and answers are written to
    ``<output_dir>/questions/<stem>_questions.txt`` and
    ``<output_dir>/answers/<stem>_answers.txt``, one entry per line; nothing is
    written for a file that produced no pairs.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.txt`` files.
        output_dir: Directory under which ``questions`` and ``answers`` are created.
        batch_size: Number of paragraphs requested concurrently per round.
        min_qa_pairs: Target number of pairs per file; the final batch may overshoot it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero-sized batch would never consume a paragraph and loop forever.
        raise ValueError("batch_size must be at least 1")

    base_dir = Path(output_dir)
    questions_dir = base_dir / 'questions'
    answers_dir = base_dir / 'answers'

    for dir_path in (questions_dir, answers_dir):
        dir_path.mkdir(parents=True, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        for file_path in Path(input_dir).glob('*.txt'):
            file_questions, file_answers = [], []
            # Explicit UTF-8: the platform default encoding may not be able to
            # represent the Polish text read here and written below.
            text = file_path.read_text(encoding='utf-8')
            # NOTE(review): split_into_paragraphs is defined outside this block;
            # it is assumed to return an indexable sequence of strings.
            paragraphs = split_into_paragraphs(text)

            # Track paragraphs by index rather than by value, so duplicate
            # paragraphs are each consumed exactly once. Shuffling once and
            # slicing is equivalent to repeated sampling without replacement.
            remaining = list(range(len(paragraphs)))
            random.shuffle(remaining)

            while remaining and len(file_questions) < min_qa_pairs:
                batch_indices, remaining = remaining[:batch_size], remaining[batch_size:]
                batch = [paragraphs[i] for i in batch_indices]

                questions, answers = await process_paragraph_batch(batch, session, file_questions)
                file_questions.extend(questions)
                file_answers.extend(answers)

                print(f"{file_path.stem}: {len(file_questions)}/{min_qa_pairs} QA pairs generated")

            if len(file_questions) < min_qa_pairs:
                # Every paragraph was used (or there were none) and the target was missed.
                print(f"Warning: Could not generate {min_qa_pairs} QA pairs from {file_path.stem}")

            if file_questions:
                (questions_dir / f"{file_path.stem}_questions.txt").write_text(
                    '\n'.join(file_questions), encoding='utf-8')
                (answers_dir / f"{file_path.stem}_answers.txt").write_text(
                    '\n'.join(file_answers), encoding='utf-8')

async def process_paragraph_batch(paragraphs: List[str], session, previous_questions: List[str]) -> tuple[List[str], List[str]]:
    """Request QA pairs for several paragraphs concurrently and parse the replies.

    Args:
        paragraphs: Paragraph texts; one request is issued per entry.
        session: Passed through unchanged to ``get_qa_from_gpt_async``.
        previous_questions: Questions already collected; any new question equal
            to one of these is discarded. The list itself is not modified.

    Returns:
        ``(questions, answers)`` — two parallel lists holding the new, unique
        pairs found in lines that contain exactly one ``'|'`` separator.
    """
    questions, answers = [], []
    tasks = [get_qa_from_gpt_async(p, session, previous_questions) for p in paragraphs]
    # return_exceptions=True keeps one failed request from discarding the
    # replies of the other paragraphs in the batch.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # All requests in a batch see the same previous_questions, so two paragraphs
    # can yield the same question; the set also dedupes within this batch.
    seen = set(previous_questions)

    for qa_pairs in results:
        # BaseException also covers a cancelled request (CancelledError),
        # which is not iterable as a list of lines.
        if isinstance(qa_pairs, BaseException):
            print(f"Warning: QA request failed: {qa_pairs!r}")
            continue
        for pair in qa_pairs:
            # Exactly one separator is required; anything else is ambiguous.
            if pair.count('|') != 1:
                continue
            question, _, answer = pair.partition('|')
            question, answer = question.strip(), answer.strip()
            if not question or not answer or question in seen:
                continue
            seen.add(question)
            questions.append(question)
            answers.append(answer)

    return questions, answers

# Usage
with open('../../secret.txt') as f:
    openai.api_key = f.read().strip()
#openai.api_key = ''
await process_files_async('../rawdata', 'q_a')

Balladyna: 8/100 QA pairs generated
Balladyna: 21/100 QA pairs generated
Balladyna: 30/100 QA pairs generated
Balladyna: 39/100 QA pairs generated
Balladyna: 46/100 QA pairs generated
Balladyna: 54/100 QA pairs generated
Balladyna: 60/100 QA pairs generated
Balladyna: 70/100 QA pairs generated
Balladyna: 79/100 QA pairs generated
Balladyna: 87/100 QA pairs generated
Balladyna: 100/100 QA pairs generated
Dziady_(Mickiewicz): 10/100 QA pairs generated
Dziady_(Mickiewicz): 22/100 QA pairs generated
Dziady_(Mickiewicz): 37/100 QA pairs generated
Dziady_(Mickiewicz): 48/100 QA pairs generated
Dziady_(Mickiewicz): 63/100 QA pairs generated
Dziady_(Mickiewicz): 74/100 QA pairs generated
Dziady_(Mickiewicz): 86/100 QA pairs generated
Dziady_(Mickiewicz): 101/100 QA pairs generated
Konrad_Wallenrod: 11/100 QA pairs generated
Konrad_Wallenrod: 23/100 QA pairs generated
Konrad_Wallenrod: 38/100 QA pairs generated
Konrad_Wallenrod: 53/100 QA pairs generated
Konrad_Wallenrod: 65/100 QA pairs genera