# Question-Answer data generator
This notebook generates a question (input) / answer (label) dataset for training an LLM that tends to always compare its answers to objects in space.

In [15]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from ollama import Client
from tqdm.notebook import tqdm_notebook

# Shared client used by the question and answer generator functions below.
# Client() is created without arguments, so it presumably targets the
# library's default Ollama host -- confirm if a remote server is intended.
ollama = Client()

### Wikipedia Page Crawler

This function, `crawl_wikipedia_pages`, takes a list of Wikipedia URLs and extracts the paragraph text under each top-level section header (`<h2>`). The output is a list of dictionaries, each containing the source URL, the section heading, and the section's content, useful for text analysis or content aggregation.

In [10]:
def _starts_new_h2_section(node):
    """
    Return True if `node` marks the start of another top-level section.

    The crawler walks the siblings of each <h2>'s *parent*, i.e. it assumes
    every <h2> sits inside its own wrapper element. The next section's heading
    therefore shows up among those siblings as a <div> with an <h2> as a
    direct child rather than as a bare <h2>; both forms are recognised here.
    """
    tag_name = getattr(node, "name", None)  # text nodes have no tag name
    if tag_name == "h2":
        return True
    return tag_name == "div" and node.find("h2", recursive=False) is not None


def crawl_wikipedia_pages(urls):
    """
    Crawl a list of Wikipedia pages and collect the text of each <h2> section.

    Args:
        urls: Iterable of page URLs to download.

    Returns:
        A list of dicts, one per <h2> header found, in the format
        {"url": <page_url>, "heading": <section_header>, "content": <section_text>}.
        "content" is the section's <p> paragraphs joined with newlines; it is
        an empty string for sections that contain no paragraphs.

    Pages that cannot be downloaded (network error, timeout, or a non-200
    status code) are reported with a printed message and skipped.
    """
    results = []

    for url in tqdm_notebook(urls, desc="Crawling wikipedia pages..."):
        try:
            # A timeout keeps one unresponsive server from hanging the crawl.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve {url}: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve {url}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Find all top-level section headers (usually <h2> on Wikipedia).
        for header in soup.find_all("h2"):
            header_title = header.get_text(strip=True)

            # Gather paragraphs until the next top-level section begins.
            section_paragraphs = []
            sibling = header.parent.next_sibling

            while sibling is not None and not _starts_new_h2_section(sibling):
                if getattr(sibling, "name", None) == "p":
                    section_paragraphs.append(sibling.get_text(strip=True))
                sibling = sibling.next_sibling

            results.append({
                "url": url,
                "heading": header_title,
                "content": "\n".join(section_paragraphs)
            })

    return results

In [13]:
# Load the article URLs from the "link" column of the CSV, then crawl each page.
wiki_links_df = pd.read_csv(filepath_or_buffer="wiki_links.csv")
wiki_links_list = wiki_links_df.loc[:, "link"].tolist()
wiki_pages = crawl_wikipedia_pages(urls=wiki_links_list)

Crawling wikipedia pages...:   0%|          | 0/22 [00:00<?, ?it/s]

## Question generator
This part generates the questions (inputs) for the dataset, based on the previously crawled Wikipedia data.

In [None]:
# We'll build a function to generate questions for a single chunk of text
def _split_generated_questions(raw_text):
    """
    Split the model's response into individual, stripped question strings.

    When the response contains question marks, it is split only at commas or
    whitespace that directly follow a "?", so a comma inside a question
    ("In 1969, who ...?") no longer cuts it in two. Responses without any
    "?" fall back to a plain comma split.
    """
    if "?" in raw_text:
        pieces = re.split(r"(?<=\?)[\s,]+", raw_text)
    else:
        pieces = raw_text.split(",")
    return [piece.strip() for piece in pieces if piece.strip()]


def generate_questions_for_chunk(chunk_text, example_questions, seed=0):
    """
    Given a chunk of text from Wikipedia,
    generate a list of questions anchored in that chunk.

    Args:
        chunk_text: The text the questions must be based on.
        example_questions: List of example question strings that show the
            model the desired question style.
        seed: Sampling seed passed to the model via the request options.

    Returns:
        A list of non-empty question strings parsed from the model response.
    """
    example_string = ", ".join(example_questions)

    # Create a prompt that includes only the chunk_text as knowledge source
    question_generation_prompt = (
        f"<|im_start|>system\n"
        f"You are a data generator. Below is a chunk of Wikipedia text:\n\n"
        f"{chunk_text}\n\n"
        f"Using this information, please generate several questions that resemble these examples ({example_string}), "
        f"but are based ONLY on the text above. Separate your questions with commas.\n"
        f"<|im_end|>\n"
        f"<|im_start|>assistant\n"
        f"Sure, here are the questions based on the text above, separated by commas:\n"
    )

    generated = ollama.generate(
        model="qwen2.5:32b",
        prompt=question_generation_prompt,
        options={"seed": seed},
        # The prompt already carries the full chat markup, including the
        # assistant prefill, so it must be sent untemplated; otherwise the
        # model template would wrap it a second time.
        raw=True,
    )
    return _split_generated_questions(generated["response"])

## Answer generator
This part generates the answers (labels) for the dataset by having a pretrained LLM answer each previously generated question.

In [None]:
# We'll build a function to generate an answer for a single question, given the same chunk of text
def generate_answer_for_question(chunk_text, question, seed=1):
    """
    Generate a short answer for `question`, referencing or comparing to space objects,
    and anchored in `chunk_text`.

    Args:
        chunk_text: The text the model must use as its only knowledge source.
        question: The question to answer.
        seed: Sampling seed passed to the model via the request options.

    Returns:
        The model's response with surrounding whitespace removed.
    """
    answer_generation_prompt = (
        f"<|im_start|>system\n"
        f"You are a data generator. Below is a chunk of Wikipedia text:\n\n"
        f"{chunk_text}\n\n"
        f"<|im_end|>\n"
        # The instruction turn needs an explicit role; it was previously
        # opened with a bare "<|im_start|>".
        f"<|im_start|>user\n"
        f"Create a short answer to the following question, '{question}'. "
        f"Compare or reference space objects or phenomena if possible. Use ONLY the text above as your knowledge source.\n"
        f"<|im_end|>\n"
        f"<|im_start|>assistant\n"
        f"Certainly! Here is a short answer based on the text, referencing space:\n"
    )

    generated = ollama.generate(
        model="qwen2.5:32b",
        prompt=answer_generation_prompt,
        options={"seed": seed},
        # The prompt already carries the full chat markup, including the
        # assistant prefill, so it must be sent untemplated; otherwise the
        # model template would wrap it a second time.
        raw=True,
    )
    return generated["response"].strip()

## Putting it all together
This script first generates a set of questions using the question generator function described above and then creates the appropriate answers to these questions.

In [16]:
# Style examples passed to the question generator; they illustrate the kind of
# question wanted, not the topic.
example_questions = [
    "What is the capital of France?",
    "How do you bake a cake?",
    "What is the formula for calculating speed?",
    "Can you explain photosynthesis?",
    "What are the symptoms of a cold?",
]

# Column-oriented accumulator: one list per output CSV column.
dataset = {
    "URL": [],
    "Section_Heading": [],
    "Question": [],
    "Answer": []
}

# Upper bound on the number of questions kept per section
# (the model may return fewer).
questions_per_section = 3

section_counter = 0
for section_info in tqdm_notebook(wiki_pages, desc="Processing Sections"):
    url = section_info["url"]
    heading = section_info["heading"]
    chunk_text = section_info["content"]

    # The section's position doubles as the question seed. Advance the counter
    # for every section, including skipped ones, so each section keeps the
    # same seed whether or not earlier sections were empty.
    section_seed = section_counter
    section_counter += 1

    # Sections without any paragraph text give the model nothing to ground
    # its questions and answers in, so they are skipped instead of producing
    # rows based on an empty knowledge source.
    if not chunk_text.strip():
        continue

    # Generate some questions anchored in this chunk
    questions = generate_questions_for_chunk(
        chunk_text=chunk_text,
        example_questions=example_questions,
        seed=section_seed
    )

    # If we got more questions than we want, let's trim
    questions = questions[:questions_per_section]

    # Generate answers for each question
    for q in questions:
        answer = generate_answer_for_question(
            chunk_text=chunk_text,
            question=q,
            seed=1
        )

        dataset["URL"].append(url)
        dataset["Section_Heading"].append(heading)
        dataset["Question"].append(q)
        dataset["Answer"].append(answer)


##################################
# 5. Save the Final Q&A to CSV
##################################
df = pd.DataFrame(dataset)
df.to_csv("wiki_qa_by_headline.csv", index=False)

print("Q&A dataset saved to 'wiki_qa_by_headline.csv'.")


Processing Sections:   0%|          | 0/301 [00:00<?, ?it/s]

KeyboardInterrupt: 