In [None]:
from openai import OpenAI
from docx import Document
import json
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file, if one exists
load_dotenv()

# Module-wide OpenAI client; the API key is taken from OPENAI_API_KEY
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def read_docx(file_path):
    """Return the text of a .docx file, one non-blank paragraph per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with newline characters.
    """
    kept = []
    for para in Document(file_path).paragraphs:
        # Drop blank paragraphs so the result has no empty lines.
        if para.text.strip():
            kept.append(para.text)
    return '\n'.join(kept)

def chunk_text(text, chunk_size=300):
    """Break *text* into consecutive pieces of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace acts
    as a separator and each piece is re-joined with single spaces.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def _chat_once(api_client, model, system_prompt, user_prompt):
    """Send one system+user exchange and return the stripped reply text."""
    completion = api_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # The API types message.content as optional, so guard against None
    # before stripping surrounding whitespace.
    reply = (completion.choices[0].message.content or "").strip()
    if not reply:
        raise ValueError("Model returned an empty response.")
    return reply


def generate_qa_from_chunk(
    chunk,
    *,
    model="gpt-4o-mini-2024-07-18",
    system_prompt="You are EssamRafie, a factual chatbot that is also sarcastic.",
    api_client=None,
):
    """Generate a matching question and answer from a text chunk.

    Two chat-completion requests are made: the first asks for a question
    about ``chunk``; the second asks for an answer to *that* question, again
    grounded in ``chunk``, so the returned pair belongs together.

    Args:
        chunk: Source text the question and answer are based on.
        model: Chat model name used for both requests.
        system_prompt: System message sent with both requests.
        api_client: Object exposing ``chat.completions.create``. Defaults to
            the module-level ``client``.

    Returns:
        A ``(question, answer)`` tuple of stripped strings.

    Raises:
        ValueError: If either reply is empty.
        Exception: Any error raised by the underlying client call propagates.
    """
    # Resolve the default lazily so the module-level client is only touched
    # when the caller did not supply one.
    api = client if api_client is None else api_client

    question = _chat_once(
        api,
        model,
        system_prompt,
        f"Based on the following text, generate an insightful question:\n\n{chunk}\n\nQuestion:",
    )

    # Include the generated question; without it the model produces an
    # "answer" that is unrelated to the question it is paired with.
    answer = _chat_once(
        api,
        model,
        system_prompt,
        "Based on the following text, generate an accurate answer to the "
        f"question below:\n\n{chunk}\n\nQuestion: {question}\n\nAnswer:",
    )

    return question, answer


def create_dataset_from_article(article_text, min_examples=10):
    """Create a chat-format Q&A dataset from an article.

    The article is split on whitespace and cut into chunks of
    ``len(words) // min_examples`` words (at least one word each). This
    yields at least ``min_examples`` chunks whenever the article has that
    many words. One question/answer pair is generated per chunk.

    Args:
        article_text: Full text of the article.
        min_examples: Target minimum number of chunks. Must be >= 1.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts, one per
        chunk that was processed successfully. The list can be shorter than
        ``min_examples`` when the article has fewer words than that or when
        generation fails for some chunks, and is empty when the article
        contains no words.

    Raises:
        ValueError: If ``min_examples`` is less than 1.
    """
    if min_examples < 1:
        raise ValueError("min_examples must be at least 1")

    words = article_text.split()
    if not words:
        # Nothing to chunk; avoids indexing into an empty chunk list.
        return []

    num_words = len(words)
    chunk_size = max(1, num_words // min_examples)  # At least 1 word per chunk
    chunks = [
        ' '.join(words[i:i + chunk_size])
        for i in range(0, num_words, chunk_size)
    ]
    # No padding step is needed: fewer than min_examples chunks only happens
    # when every chunk is already a single word, which cannot be split.

    dataset = []
    for chunk in chunks:
        # Best effort: one failed chunk is reported and skipped so the rest
        # of the article still produces examples.
        try:
            question, answer = generate_qa_from_chunk(chunk)
        except Exception as e:
            print(f"Error processing chunk: {e}")
            continue
        dataset.append({
            "messages": [
                {"role": "system", "content": "You are EssamRafie, a factual chatbot that is also sarcastic."},
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        })

    return dataset



def save_dataset_to_jsonl(dataset, file_name):
    """Write *dataset* to *file_name* in JSON Lines format.

    Each entry is serialized with ``json.dumps`` onto its own line. The
    file is opened in write mode, so existing content is replaced.
    """
    with open(file_name, 'w') as out:
        # Stream one serialized record per line rather than building the
        # whole payload in memory first.
        out.writelines(json.dumps(record) + '\n' for record in dataset)

# Example usage: turn one Word document into a JSONL dataset
docx_file_path = "../datasets/personal_statement.docx"  # Source document
article_text = read_docx(docx_file_path)

# One question/answer example is requested per chunk of the article
dataset = create_dataset_from_article(article_text)

# Write the examples out, one JSON object per line
save_dataset_to_jsonl(dataset, file_name="../datasets/generated_dataset.jsonl")

print("Dataset created and saved as 'generated_dataset.jsonl'.")


Dataset created and saved as 'generated_dataset.jsonl'.
