In [25]:
import json
# --- File locations -------------------------------------------------------
# Source data is in JSON Lines form: one JSON object per line.
input_file = "aqua_train.json"
# input_file = "aqua_test.json"   # alternative source file, currently disabled

# Destinations for the two processed splits.
train_output_file = "aqua_train_processed.json"
test_output_file = "aqua_test_processed.json"

# --- Split sizes ----------------------------------------------------------
# Maximum number of examples placed in each split.
train_limit, test_limit = 10_000, 10_000

In [26]:
# Read the JSON-lines source file and split its records into two dictionaries:
# the first `train_limit` records go to `train_examples`, the following
# `test_limit` records go to `test_examples`. Both are keyed by a 1-based
# record number (as a string) that restarts at "1" for the test split.
#
# Raises:
#     ValueError: if a non-empty line is not valid JSON (the message includes
#         the physical line number and the offending text).
train_examples = {}
test_examples = {}

total_limit = train_limit + test_limit
# Counts parsed records rather than physical lines, so blank lines in the
# source can neither leave gaps in the keys nor shrink a split.
record_no = 0

with open(input_file, "r", encoding="utf-8") as fin:
    for line_no, line in enumerate(fin, start=1):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        if record_no >= total_limit:
            break  # Both splits are full; nothing further needs parsing.

        # Each line is a JSON object with keys: "question", "options", "rationale", "correct"
        try:
            data = json.loads(line)
        except json.JSONDecodeError as exc:
            # Raise instead of print + quit(): quit() is not a reliable way to
            # abort a running notebook cell, and a bare `except:` would also
            # trap KeyboardInterrupt.
            raise ValueError(f"Invalid JSON on line {line_no}: {line}") from exc
        record_no += 1

        # Build the input string: question + newline + "Answer Choices: " + joined options.
        question = data.get("question", "").strip()
        options = data.get("options", [])
        options_text = " ".join(opt.strip() for opt in options)
        input_text = f"{question}\nAnswer Choices: {options_text}"

        # Normalize the correct answer (e.g., " e" becomes "E")
        correct_answer = data.get("correct", "").strip().upper()

        example = {
            "input": input_text,
            "output": correct_answer
        }
        if record_no <= train_limit:
            train_examples[str(record_no)] = example
        else:
            # Test keys restart at "1".
            test_examples[str(record_no - train_limit)] = example

# Wrap each split in the output layout: a "metadata" section holding the
# example count, followed by the "examples" mapping itself.
train_processed_data = dict(
    metadata=dict(num_examples=len(train_examples)),
    examples=train_examples,
)

test_processed_data = dict(
    metadata=dict(num_examples=len(test_examples)),
    examples=test_examples,
)

# Pair every split with its destination so both are handled by the same code.
_split_outputs = (
    (train_output_file, train_processed_data, train_examples),
    (test_output_file, test_processed_data, test_examples),
)

# Serialize each split as indented UTF-8 JSON, keeping non-ASCII characters as-is.
for _out_path, _payload, _ in _split_outputs:
    with open(_out_path, "w", encoding="utf-8") as fout:
        json.dump(_payload, fout, ensure_ascii=False, indent=4)

# Report how many examples ended up in each file (after all writes succeeded).
for _out_path, _, _examples in _split_outputs:
    print(f"Processed {len(_examples)} examples. Output written to {_out_path}.")

Processed 10000 examples. Output written to aqua_train_processed.json.
Processed 10000 examples. Output written to aqua_test_processed.json.


In [24]:
# import json
# import re

# def normalize_option(option_str):
#     """
#     Normalize an option string so that the option letter is formatted as 'A)',
#     'B)', etc. This function uses a regex pattern to:
#       - Skip any leading whitespace.
#       - Capture the first letter.
#       - Skip any extra occurrences of that letter or punctuation.
#       - Capture the rest of the text.
#     For example:
#       "A)A)$60.00"    --> "A)$60.00"
#       "B)b.) I and II only" --> "B)I and II only"
#       "A)a.) I"       --> "A)I"
#     """
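#     # The regex explanation:
#     # ^\s*            : Start at the beginning, skipping leading spaces.
#     # ([A-Za-z])      : Capture the first letter (the option letter).
#     # \W*             : Match zero or more non-word characters (e.g. a closing parenthesis).
#     # (?:[A-Za-z]\W*)*: Match zero or more further letters, each followed by optional non-word characters.
#     # (.*)$           : Capture the rest of the option text.
#     # NOTE(review): the repeated letter group is greedy and is not restricted to the
#     # option letter, so it also consumes letters (and punctuation such as "$") that
#     # belong to the option text itself. For example "B)b.) I and II only" yields "B)"
#     # and "A)A)$60.00" yields "A)60.00", not the results listed in the docstring above.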
#     m = re.match(r'^\s*([A-Za-z])\W*(?:[A-Za-z]\W*)*(.*)$', option_str)
#     if m:
#         # Ensure the letter is uppercase
#         letter = m.group(1).upper()
#         # Remove any extra spaces from the remaining text
#         text = m.group(2).strip()
#         return f"{letter}){text}"
#     else:
#         # If it doesn't match, simply strip the option string
#         return option_str.strip()

# # Define the file paths
# input_file = "aqua_train.json"            # Source file: one JSON object per line
# train_output_file = "aqua_train_processed.json"
# test_output_file = "aqua_test_processed.json"

# # Define limits for how many examples go into train and test sets
# train_limit = 10000
# test_limit = 10000

# train_examples = {}
# test_examples = {}

# with open(input_file, "r", encoding="utf-8") as fin:
#     for i, line in enumerate(fin, start=1):
#         line = line.strip()
#         if not line:
#             continue  # Skip empty lines

#         # Parse the JSON; if there's an error, print the line and quit.
#         try:
#             data = json.loads(line)
#         except Exception as e:
#             print(f"Error parsing JSON on line {i}: {line}\n{e}")
#             quit(1)
        
#         # Get the question and the list of options
#         question = data.get("question", "").strip()
#         options = data.get("options", [])

#         # Normalize each option using our helper function
#         norm_options = [normalize_option(opt) for opt in options]
#         # Join the normalized options with a space between them
#         options_text = " ".join(norm_options)
        
#         # Format the input string to include the question
#         # and the normalized answer choices, exactly as desired.
#         input_text = f"{question}\nAnswer Choices: {options_text}"
        
#         # Normalize the correct answer by stripping and converting to uppercase.
#         correct_answer = data.get("correct", "").strip().upper()
        
#         # Depending on the current line number, add the example to the train or test dictionary.
#         if i <= train_limit:
#             train_examples[str(i)] = {
#                 "input": input_text,
#                 "output": correct_answer
#             }
#         elif i <= train_limit + test_limit:
#             # For test examples, use (i - train_limit) as the key.
#             test_examples[str(i - train_limit)] = {
#                 "input": input_text,
#                 "output": correct_answer
#             }
#         else:
#             break

# # Build the final output dictionary for both train and test data.
# train_processed_data = {
#     "metadata": {
#         "num_examples": len(train_examples)
#     },
#     "examples": train_examples
# }

# test_processed_data = {
#     "metadata": {
#         "num_examples": len(test_examples)
#     },
#     "examples": test_examples
# }

# # Write the processed train data to the output file.
# with open(train_output_file, "w", encoding="utf-8") as fout:
#     json.dump(train_processed_data, fout, ensure_ascii=False, indent=4)

# # Write the processed test data to the output file.
# with open(test_output_file, "w", encoding="utf-8") as fout:
#     json.dump(test_processed_data, fout, ensure_ascii=False, indent=4)

# print(f"Processed {len(train_examples)} train examples. Output written to {train_output_file}.")
# print(f"Processed {len(test_examples)} test examples. Output written to {test_output_file}.")


Processed 10000 train examples. Output written to aqua_train_processed.json.
Processed 10000 test examples. Output written to aqua_test_processed.json.
