# Parse XML Data into SQuAD2.0 JSON Format

Written for the XML data export from fitness.stackexchange.com.

In [None]:
import xml.etree.ElementTree as ET
import html
import re
import json

In [None]:
def clean_html(raw_html):
    """
    Strips HTML tags from a string with a regular expression.

    Every span running from a `<` to the nearest following `>` is deleted.
    The match does not cross line breaks, so a tag split over several
    lines is left in place.

    Parameters:
    - raw_html: str, text that may contain HTML markup

    Returns:
    - str, the input with all matched tags removed
    """
    return re.sub('<.*?>', '', raw_html)

In [None]:
def parse_to_squad_format(file_path, output_path="fitness_squad.json", min_score=2):
    """
    Parses StackExchange Posts.xml and converts to a SQuAD-style JSON dataset.

    Questions (PostTypeId "1") are kept when they have an accepted answer and
    a score of at least `min_score`. Each kept question is paired with its
    accepted answer (PostTypeId "2"); the answer text is used both as the
    context and as the answer span, starting at offset 0. Questions whose
    accepted answer is not present in the file are dropped.

    Parameters:
    - file_path: str, path to the Posts.xml file
    - output_path: str, path to save the resulting JSON
    - min_score: int, minimum score for a question to be included

    Returns:
    - None. The dataset is written to `output_path` and a summary line is
      printed.
    """

    def to_plain_text(body):
        # Strip tags BEFORE decoding entities. Decoding first turns escaped
        # literal text such as "&lt;b&gt;" or "5 &lt; 6 and 7 &gt; 3" into
        # "<...>" sequences that the tag regex would then delete.
        return html.unescape(clean_html(body)).strip()

    # Parse the XML
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Qualifying questions keyed by post id, and raw answer bodies keyed by
    # post id. Answer bodies are cleaned later, only for accepted answers.
    questions = {}
    answer_bodies = {}

    # First pass: extract all questions and answers
    for row in root:
        attrib = row.attrib
        post_type = attrib.get("PostTypeId")

        # --- Handle Questions ---
        if post_type == "1":
            post_id = attrib.get("Id")
            score = int(attrib.get("Score", "0"))  # Default to 0 if missing
            accepted_id = attrib.get("AcceptedAnswerId")

            if accepted_id and score >= min_score:
                title = attrib.get("Title", "")
                body = attrib.get("Body", "")

                # Clean and combine title + body
                questions[post_id] = {
                    "question": title.strip() + " " + to_plain_text(body),
                    "accepted_answer_id": accepted_id,
                }

        # --- Handle Answers ---
        elif post_type == "2":
            answer_bodies[attrib.get("Id")] = attrib.get("Body", "")

    # Match questions with their accepted answers
    paragraphs = []
    for q_id, q_data in questions.items():
        accepted_id = q_data["accepted_answer_id"]

        # Skip questions whose accepted answer is not in the file
        if accepted_id not in answer_bodies:
            continue

        context = to_plain_text(answer_bodies[accepted_id])

        # Build one QA entry
        paragraphs.append({
            "context": context,
            "qas": [
                {
                    "id": f"fitness-{q_id}",
                    "question": q_data["question"],
                    "answers": [
                        {
                            "text": context,
                            "answer_start": 0,  # Whole answer as the span
                        }
                    ],
                }
            ],
        })

    # Assemble the SQuAD-style JSON structure
    squad_data = {
        "data": [
            {
                "title": "FitnessStackExchange",
                "paragraphs": paragraphs,
            }
        ]
    }

    # Save the SQuAD-style JSON to a file; the explicit encoding keeps the
    # output independent of the platform's default locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(squad_data, f, indent=2)

    print(f"Saved {len(paragraphs)} high-quality QA pairs to {output_path}")


In [None]:
# Run the conversion on the Posts.xml export, keeping questions scored 2+
file_path = 'Data/fitness.stackexchange.com/Posts.xml'
parse_to_squad_format(
    file_path,
    output_path="fitness_squad_filtered.json",
    min_score=2,
)


Saved 3262 high-quality QA pairs to fitness_squad_filtered.json


In [18]:
# load in the data as json format
json_filepath = 'fitness_squad_filtered.json'

# Read with an explicit encoding so decoding does not depend on the
# platform's default locale.
with open(json_filepath, 'r', encoding='utf-8') as f:
    fit_data = json.load(f)

In [20]:
# report how many context/QA entries were loaded
paragraphs = fit_data['data'][0]['paragraphs']
print("Number of question and answer pairs:", len(paragraphs))

Number of question and answer pairs: 3262


In [None]:
# display the first context/QA entry of the dataset
first_entry = fit_data['data'][0]['paragraphs'][0]
first_entry

{'context': 'The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.\n\nIf you have trouble digesting the lactose or are trying to minimize carbohydrate content, then whey isolate would be a good choice. Otherwise, it probably doesn\'t matter; just pick the concentrate since it\'s cheaper in terms of protein grams/dollar.',
 'qas': [{'id': 'fitness-2',
   'question': "What's the difference between Whey Isolate and Whey Concentrate in shakes? What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.",
   'answers': [{'text': 'The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.\n\nIf you have trouble digesting the lactose or are trying 