In [None]:
import os
from openai import OpenAI
import pandas as pd
import tqdm

# Take the API key from the environment rather than embedding it in the
# notebook, so the credential never ends up in a saved or shared file.
# NOTE(review): when OPENAI_API_KEY is unset this passes None, and the SDK is
# expected to fail at construction instead of on the first request - confirm.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))



In [None]:
def prompt_for_question_rephrasing(question):
    """Build the LLM prompt asking for six lexically divergent rephrasings.

    Args:
        question: The original question text; it is interpolated verbatim
            between double quotes after ``Input Question:``.

    Returns:
        The full prompt string. It ends with ``Output:`` and a newline so the
        model continues with the numbered list described in the prompt.
    """
    # The backslash in "$\\to$" is escaped on purpose: inside a regular
    # (non-raw) f-string, "\t" would otherwise be interpreted as a TAB and the
    # model would receive "$<TAB>o$" instead of the LaTeX arrow.
    prompt = f"""You are a professional question rephraser specializing in generating lexically diverse variations. Your task is to produce **6 maximally divergent** rephrasings of the input question.

### CORE INSTRUCTIONS FOR MINIMAL OVERLAP

1.  **Preserve Intent:** Each rephrasing must maintain the **exact original meaning, specificity, and scope**. Do not add, remove, or change information.

2.  **Maximize Lexical Distance:**
    * **Goal:** Achieve the lowest possible Jaccard Similarity score (lexical overlap) across all seven questions (Original + 6 Variants).
    * **Prohibited:** Do not use the same content word (nouns, verbs, adjectives, adverbs) more than **once** across all 6 rephrased versions, even if it's a synonym.
    * **Essential Term Handling:** Proper nouns, named entities, dates, and numbers must be preserved, but **must be textually transformed** using alternatives like:
        * **Dates:** Use *ordinal numbers* (e.g., "the 16th of July"), *written-out numbers* (e.g., "July sixteenth"), or *alternative formats* (e.g., "7/16/2025").
        * **Numbers:** Use *written-out words* (e.g., "seven point three"), *fractions* (e.g., "seven and three-tenths"), or *technical abbreviations* (e.g., "M7.3").
        * **Locations:** Use *hypernyms* (e.g., "tri-state area," "the region") or *abbreviations* (e.g., "MI, OH, PA").

3.  **Structural Transformation:**
    * Vary the core grammatical structure for **every** version. Use: passive voice, nominalization (converting a verb to a noun, e.g., "analyze" $\\to$ "analysis"), embedded clauses, and rhetorical devices.
    * Change the interrogative word (Who, What, Where, How, Describe, Identify) for **at least four** of the six versions.

4.  **Natural Language Quality:** Each question must be highly fluent, professional, and grammatically impeccable. Avoid any awkward or forced substitutions.

### DIVERSITY GUARANTEE CHECK

Before outputting, confirm that:
1.  All 6 versions use distinct starting words/phrases.
2.  The set of content words (non-stopwords) in version 1 is almost entirely disjoint from the set of content words in version 2, and so on.

### OUTPUT FORMAT
```
1. Rephrase the question using a structural change transformation.
2. Rephrase the question using a different structural change and transformation.
3. Rephrase the question using a different structural change and transformation.    
4. Rephrase the question using a different structural change and transformation.
5. Rephrase the question using a different structural change and transformation.
6. Rephrase the question using a different structural change and transformation.
```

Input Question: "{question}"

Output:
"""

    return prompt

In [51]:
import json
import os


# Collect the QA records from every *.json file in extracted_QA into one list.
# - `data` is a list (it is later passed to random.sample and iterated as
#   records), so it starts as [] rather than {}.
# - Records are accumulated instead of being overwritten per file, so the
#   result no longer depends on which file os.listdir happens to return last.
# - The listing is sorted to make the record order reproducible across runs.
# The resumable output Rephrased_QA.jsonl lives in the same folder but is not
# picked up here because it does not end in ".json".
data = []
for filename in sorted(os.listdir("extracted_QA")):
    if filename.endswith(".json"):
        with open(os.path.join("extracted_QA", filename), "r", encoding="utf-8") as f:
            loaded = json.load(f)
        if isinstance(loaded, list):
            data.extend(loaded)
        else:
            # A file holding a single record (one JSON object) is kept as one entry.
            data.append(loaded)




In [52]:
# Sanity check: how many records were loaded into `data`.
len(data)

3300

In [53]:
# Randomly pick 30 records from data with a fixed seed of 41.
# data is a list of dicts; the records displayed by this cell carry the keys
# category, fact, question and answer.
import random
random.seed(41)
sampled_questions = random.sample(data, 30)
sampled_questions

[{'category': 'abrupt',
  'fact': 'More than 1.4 million people have returned or been forced to return to Afghanistan in 2023, including over 1 million from Iran, according to a senior U.N. official.',
  'question': '"How many people have returned or been forced to return to Afghanistan in 2023, and how many of those were from Iran?"',
  'answer': '"More than 1.4 million people have returned or been forced to return to Afghanistan in 2023, including over 1 million from Iran, according to a senior U.N. official."'},
 {'category': 'emergency',
  'fact': 'Russian troops attacked Kryvyi Rih in Dnipropetrovsk Oblast on July 4, 2025, resulting in eight injuries, including one person in serious condition and three in moderate condition, according to Oleksandr Vilkul, Head of Kryvyi Rih Defence Council.',
  'question': '"What were the consequences of the Russian troops attacking Kryvyi Rih in Dnipropetrovsk Oblast on July 4, 2025?"',
  'answer': '"The attack resulted in eight injuries, includi

In [54]:
def extract_rephrased_questions(response_text):
    """Split a numbered "1." .. "6." list into its six question strings.

    The markers are located one after another, each search starting where the
    previous marker ended. A marker at the start of a line is preferred; only
    when no such marker exists does the search fall back to the first plain
    occurrence of "<n>." (this keeps single-line replies such as
    "1. A 2. B ..." working). Looking for markers this way, instead of
    splitting on the bare substrings, keeps decimals inside a question
    (e.g. "1.4 million", "42.5") from being mistaken for list numbering.

    Args:
        response_text: Text containing the six numbered rephrasings.

    Returns:
        A 6-tuple of stripped question strings, in list order.

    Raises:
        IndexError: If any of the markers "1." to "6." cannot be found.
    """
    import re  # local import so the function does not rely on another cell

    marker_starts = []
    marker_ends = []
    cursor = 0
    for number in range(1, 7):
        # With a start position, "^" still only matches at true line starts,
        # never at the arbitrary offset where the search resumes.
        line_marker = re.compile(rf"^[ \t]*{number}\.", re.MULTILINE)
        found = line_marker.search(response_text, cursor)
        if found is not None:
            start, end = found.start(), found.end()
        else:
            literal = f"{number}."
            start = response_text.find(literal, cursor)
            if start == -1:
                raise IndexError(f"list marker '{literal}' not found in response")
            end = start + len(literal)
        marker_starts.append(start)
        marker_ends.append(end)
        cursor = end

    # Each question runs from the end of its marker to the start of the next
    # one; the last question runs to the end of the text.
    marker_starts.append(len(response_text))
    return tuple(
        response_text[marker_ends[i]:marker_starts[i + 1]].strip()
        for i in range(6)
    )

In [61]:
import json
import os
import tqdm

OUTPUT_FILE = "extracted_QA/Rephrased_QA.jsonl"

# Resume support: collect the questions already written to OUTPUT_FILE so a
# re-run of this cell only processes the remaining examples.
done = set()
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                done.add(obj["question"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Skip blank, truncated or malformed lines (e.g. from an
                # interrupted write). Only parsing problems are ignored here,
                # so KeyboardInterrupt and similar still propagate.
                continue

for example in tqdm.tqdm(data):
    if example["question"] in done:
        continue

    prompt = prompt_for_question_rephrasing(example['question'])

    # At most two attempts per example; after the second failure the example
    # is skipped and will be retried on the next run of this cell.
    for attempt in range(2):
        try:
            response = client.responses.create(
                model="gpt-4o-mini",
                instructions="You are a helpful assistant",
                input=prompt,
                max_output_tokens=500,
                temperature=0.9
            )

            # The prompt asks for the list inside a ``` fence. Use the fenced
            # part when present, otherwise fall back to the whole reply
            # instead of failing the attempt on a missing fence.
            raw_text = response.output_text
            fence_parts = raw_text.split("```")
            answer_block = (fence_parts[1] if len(fence_parts) > 1 else raw_text).strip()

            q1, q2, q3, q4, q5, q6 = extract_rephrased_questions(answer_block)

            # The record is updated in place, so `data` also carries the
            # rephrasings after this cell has run.
            example['rephrased_1'] = q1
            example['rephrased_2'] = q2
            example['rephrased_3'] = q3
            example['rephrased_4'] = q4
            example['rephrased_5'] = q5
            example['rephrased_6'] = q6

            # Append immediately so progress survives an interruption.
            with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

            break

        except Exception as e:
            if attempt == 0:
                print(f"First attempt failed for '{example['question']}', retrying...")
            else:
                print(f"Second attempt failed for '{example['question']}', moving on...")
            # Surface the cause; without it API errors and parsing errors
            # are indistinguishable in the log.
            print(f"    reason: {type(e).__name__}: {e}")


 86%|████████▌ | 2828/3300 [00:24<00:06, 77.13it/s] 

First attempt failed for '"What was the maximum sustained wind speed of Tropical Storm Chantal when it made landfall along the Carolina coast?"', retrying...


 86%|████████▋ | 2851/3300 [00:30<00:07, 56.44it/s]

Second attempt failed for '"What was the maximum sustained wind speed of Tropical Storm Chantal when it made landfall along the Carolina coast?"', moving on...


 95%|█████████▌| 3142/3300 [01:09<00:18,  8.65it/s]

First attempt failed for '"What did the 2019 Conservative government announce for chess players, and what key issues did it fail to address?"', retrying...


100%|██████████| 3300/3300 [01:21<00:00, 40.56it/s]
