In [1]:
import os
import dspy
import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm import tqdm
from typing import List, Literal, List, Dict, Any
from dspy.evaluate import Evaluate
from dspy import LabeledFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import numpy as np
import seaborn as sns

from datasets import load_dataset
import random
import json
import re
from functools import partial
from datasets import Dataset
from copy import deepcopy
import evaluate
import nltk
from scipy.stats import ttest_ind
import string
from collections import Counter

from openai import OpenAI
import os
import time
import pandas as pd
from dotenv import load_dotenv
import json
import random
from ragas.llms import LangchainLLMWrapper
from langchain_deepseek import ChatDeepSeek
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import AnswerAccuracy

load_dotenv()

True

In [63]:
# Extract 20 random samples with F1 score of 0 from JSONL file
def extract_f1_zero_samples(input_file, output_file, num_samples=20):
    """
    Randomly extract specified number of samples with F1 score of 0 from JSONL file
    
    Args:
        input_file: Input JSONL file path
        output_file: Output JSONL file path
        num_samples: Number of samples to extract
    """
    f1_zero_samples = []
    
    # Read all data and filter samples with F1 score of 0
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            if data.get('f1') == 0:
                f1_zero_samples.append(data)
    
    print(f"Found {len(f1_zero_samples)} samples with F1 score of 0")
    
    # If F1=0 samples are fewer than requested, use all available
    if len(f1_zero_samples) < num_samples:
        print(f"Only {len(f1_zero_samples)} samples with F1=0 available, less than requested {num_samples}")
        selected_samples = f1_zero_samples
    else:
        # Randomly sample the specified number of samples
        selected_samples = random.sample(f1_zero_samples, num_samples)
    
    # Write to new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in selected_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    
    print(f"Successfully extracted {len(selected_samples)} samples with F1=0 to {output_file}")
    return selected_samples

# Set file paths
input_file = "produced_files/GoogleNQ_UND_gpt_with_most_metrics.jsonl"
output_file = "produced_files/GoogleNQ_UND_gpt_f1_zero_samples.jsonl"

# Set random seed for reproducible results
random.seed(42)

# Execute extraction
selected_samples = extract_f1_zero_samples(input_file, output_file, num_samples=20)

Found 116 samples with F1 score of 0
Successfully extracted 20 samples with F1=0 to produced_files/GoogleNQ_UND_gpt_f1_zero_samples.jsonl


In [64]:
def retrieve_all_low_f1_samples(input_file, output_file, threshold):
    """
    Randomly extract specified number of samples with low f1 score from JSONL file
    
    Args:
        input_file: Input JSONL file path
        output_file: Output JSONL file path
    """
    target_samples = []
    
    # Read all data and filter samples with F1 score of 0
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            if data.get('f1') <= threshold:
                target_samples.append(data)
    
    print(f"Found {len(target_samples)} samples satisfying the defined condition")
    
    # Write to new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in target_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    
    print(f"Successfully extracted {len(target_samples)} samples with F1 <= {threshold} to {output_file}")
    return target_samples

In [65]:
# Set file paths
input_file = "produced_files/GoogleNQ_UND_gpt_with_most_metrics.jsonl"
output_file = "produced_files/GoogleNQ_UND_gpt_low_f1_samples.jsonl"

# Execute extraction

selected_samples = retrieve_all_low_f1_samples(input_file, output_file, 0.2)

Found 156 samples satisfying the defined condition
Successfully extracted 156 samples with F1 <= 0.2 to produced_files/GoogleNQ_UND_gpt_low_f1_samples.jsonl


## Question Modification using Gemini 2.5 Flash

In [66]:
client = OpenAI(
    api_key=os.environ.get("GOOGLE_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

In [67]:
def modify_question_with_gemini(question, short_answer, reasoning, model="gemini-2.5-flash", temperature=0, max_retries=5, sleep_time=2.0):

    system_prompt = (
        "You are a professional question optimization expert. Please modify the underspecified question to a fully specified version based on the provided clues.\n\n"
        "Requirements:\n"
        "1. Keep the core intent of the question unchanged\n"
        "2. Add necessary contextual information\n"
        "3. Eliminate underspecified elements and make the question clear\n"
        "4. Ensure the modified question can be directly answered with the provided short answer without dispute\n\n"
        "Please only return the modified question, do not include any other explanations."
    )
    
    user_prompt = f"""
The original question: {question}
Short answer: {short_answer}
Reasoning: {reasoning}

Please analyze the underspecified elements in the original question, then modify the question to a fully specified version based on the short answer and reasoning.
"""
    retries = 0
    while retries < max_retries:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=temperature
            )
            content = response.choices[0].message.content
            modified_question = content.strip()
            return modified_question
        except Exception as e:
            retries += 1
            print(f"Attempt {retries} failed: {str(e)}")
            if retries < max_retries:
                print(f"Waiting {sleep_time * retries} seconds before retry...")
                time.sleep(sleep_time * retries)
            else:
                print(f"All retries failed, returning original question")
                return question  # If error occurs, return original question




In [None]:
# Test function
test_question = "when did the smoking ban in public places start"
test_short_answer = ['1995']
test_reasoning = "The request is underspecified because the phrase 'public places' is vague and depends on the specific jurisdiction or country being referenced. Smoking bans vary significantly across different regions, and without specifying the location, the question lacks a necessary component to determine the correct answer."

print("Testing Gemini API connection...")
try:
    test_result = modify_question_with_gemini(test_question, test_short_answer, test_reasoning)
    print(f"Original question: {test_question}")
    print(f"Modified question: {test_result}")
    print("API connection successful!")
except Exception as e:
    print(f"API connection failed: {str(e)}")

In [68]:

def modification_in_batch(input_file, output_file, batch_size=5):
    """
    按批次处理所有样本，提高处理效率
    
    Args:
        input_file: 输入JSONL文件路径
        output_file: 输出JSONL文件路径
        batch_size: 每批处理的样本数量
    
    Returns:
        list: 所有处理过的样本
    """
    
    all_processed_samples = []
    
    # Loading all the data from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    total_samples = len(lines)
    print(f"Total samples to process: {total_samples}")
    print(f"Batch size: {batch_size}")
    
    # Process all samples in batches
    for batch_start in tqdm(range(0, total_samples, batch_size), desc="Processing batches"):
        batch_end = min(batch_start + batch_size, total_samples)
        batch_lines = lines[batch_start:batch_end]
        
        batch_processed_samples = []
        
        # Process each sample in the current batch
        for i, line in enumerate(batch_lines):
            try:
                data = json.loads(line.strip())
                
                # Extract necessary fields
                question = data['question']
                short_answer = data['short_answers']
                reasoning = data['reasoning']
                
                # Modify questions
                modified_question = modify_question_with_gemini(question, short_answer, reasoning)
                
                # Create new data structure
                new_sample = {
                    'original_question': question,
                    'modified_question': modified_question,
                    'short_answer': short_answer,
                    'model_original_answer': data.get('model_short_answer', 'undefined'),
                    'classifier_reasoning': reasoning,
                    'category': data.get('category', 'undefined'),
                    'original_f1': data.get('f1', 'undefined'),
                    'original_em': data.get('em', 'undefined')
                }
                
                batch_processed_samples.append(new_sample)
                
                # Add delay to avoid API rate limits
                time.sleep(1)
                
            except Exception as e:
                print(f"Error processing sample {batch_start + i + 1}: {e}")
                # Create error sample to maintain consistency
                error_sample = {
                    'original_question': data.get('question', 'error'),
                    'modified_question': 'error',
                    'short_answer': data.get('short_answers', ['error']),
                    'model_original_answer': data.get('model_short_answer', 'error'),
                    'classifier_reasoning': data.get('reasoning', 'error'),
                    'category': data.get('category', 'error'),
                    'original_f1': data.get('f1', 'error'),
                    'original_em': data.get('em', 'error')
                }
                batch_processed_samples.append(error_sample)
        
        # Add batch results to all processed samples
        all_processed_samples.extend(batch_processed_samples)
        
        # Write intermediate results to file (append mode)
        with open(output_file, 'a', encoding='utf-8') as f:
            for sample in batch_processed_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        
    
    print(f"\nAll batch processing completed! Total processed: {len(all_processed_samples)} samples")
    print(f"Results saved to: {output_file}")
    
    return all_processed_samples


In [69]:
# 清空输出文件（如果存在）
modified_output_file = "produced_files/GoogleNQ_UND_gpt_low_f1_samples_modified.jsonl"
if os.path.exists(modified_output_file):
    os.remove(modified_output_file)
    print(f"Cleared existing output file: {modified_output_file}")

# 处理所有样本（按批次）
question_modification = modification_in_batch(output_file, modified_output_file, batch_size=3)


Total samples to process: 156
Batch size: 3


Processing batches:   0%|          | 0/52 [00:00<?, ?it/s]

Processing batches:  25%|██▌       | 13/52 [09:51<26:13, 40.36s/it]  

Attempt 1 failed: 'NoneType' object has no attribute 'strip'
Waiting 2.0 seconds before retry...
Attempt 2 failed: 'NoneType' object has no attribute 'strip'
Waiting 4.0 seconds before retry...


Processing batches:  83%|████████▎ | 43/52 [36:57<05:17, 35.26s/it]   

Attempt 1 failed: 'NoneType' object has no attribute 'strip'
Waiting 2.0 seconds before retry...
Attempt 2 failed: 'NoneType' object has no attribute 'strip'
Waiting 4.0 seconds before retry...
Attempt 3 failed: 'NoneType' object has no attribute 'strip'
Waiting 6.0 seconds before retry...
Attempt 4 failed: 'NoneType' object has no attribute 'strip'
Waiting 8.0 seconds before retry...
Attempt 5 failed: 'NoneType' object has no attribute 'strip'
All retries failed, returning original question


Processing batches: 100%|██████████| 52/52 [1:05:31<00:00, 75.61s/it]   


All batch processing completed! Total processed: 156 samples
Results saved to: produced_files/GoogleNQ_UND_gpt_low_f1_samples_modified.jsonl





In [70]:
df_view = pd.DataFrame(question_modification)
#df_view.to_csv("produced_files/modification_pilot.csv")
df_view

Unnamed: 0,original_question,modified_question,short_answer,model_original_answer,classifier_reasoning,category,original_f1,original_em
0,where does the modern view of history originat...,"When did modern historiography, emphasizing cr...",[approximately in the early 16th century],[The modern view of history originates from th...,The request is underspecified because the phra...,Undetermined standard or preference,0.111111,0
1,when was the first book made into a movie,When was the first feature film adaptation of ...,[1924],[1900 - 'Sherlock Holmes Baffled' based on Art...,The request is underspecified because the term...,Undetermined standard or preference,0.000000,0
2,when did the first wireless beats come out,When did the Beats Wireless headphones first c...,[October 2012],[2014],The request is underspecified because the term...,Undetermined lexicons or references,0.000000,0
3,what is the collection of the districts to the...,What are the major regions and countries locat...,"[Golan Heights, Jordan]",[Transjordan],The request is underspecified because the term...,Undetermined lexicons or references,0.000000,0
4,factories that assemble parts made in other co...,What are the designated geographical areas whe...,[special economic zones],"[Assembly plants, Maquiladoras]",The request is underspecified because the term...,Undetermined lexicons or references,0.000000,0
...,...,...,...,...,...,...,...,...
151,the origins of the stations of the cross,What is the name of the path in Jerusalem that...,[Via Dolorosa in Jerusalem which is believed t...,[- The Stations of the Cross originated as a d...,The request is underspecified because the term...,Undetermined standard or preference,0.169014,0
152,what's the medal count for canada in the olympics,What was the total medal count for Canada at t...,[302],"[Canada has won 199 gold, 173 silver, and 230 ...",The request is underspecified because it lacks...,Missing necessary components,0.000000,0
153,when was the last time the womens hockey team ...,When was the most recent time the United State...,[2018],[The Canadian women's hockey team won gold at ...,The request is underspecified because the phra...,Undetermined perspective or granularity,0.000000,0
154,when does the good doctor episode 8 air,When does The Good Doctor Season 1 Episode 8 air?,"[November 20, 2017]",[The air date for Episode 8 of The Good Doctor...,The request is underspecified because the phra...,Missing necessary components,0.000000,0


## Implementing QA on modified questions using GPT-4o

#### Loading GPT-4o and helper functions

In [20]:
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://api.openai.com/v1"
) # Delete when sharing

In [21]:
def ask_short_answer(question, client, model="gpt-4o-2024-11-20", temperature=0, max_retries=5, sleep_time=2.0):
    system_prompt = (
        "Answer the question with a concise response. "
        "Return answers as a list of strings. If there's only one answer, return a single-item list. "
        "Each answer should be brief and direct."
    )
    retries = 0
    while retries < max_retries:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                temperature=temperature
            )
            content = response.choices[0].message.content
            if content.startswith("["):
                return eval(content)
            else:
                return [content.strip()]
        except Exception as e:
            retries += 1
            time.sleep(sleep_time * retries)
            
    return ["[Error]: Max retries exceeded"]

In [22]:
def run_batch_shortQA_api(batch, client, **kwargs):
    short_answers = []
    for q in batch["modified_question"]:
        try:
            answer = ask_short_answer(q, client=client, **kwargs)
            short_answers.append(answer)
        except Exception as e:
            print(f"Error: {e}")
            short_answers.append(["error"])
    return {"model_new_answer": short_answers}

def batch_QA_with_progress(dataset, batch_fn, output_key, batch_size=10, fill_value="error", **batch_fn_kwargs):
    all_outputs = []
    for i in tqdm(range(0, len(dataset), batch_size), desc=f"Running {output_key}"):
        batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
        try:
            output = batch_fn(batch, **batch_fn_kwargs)
            if output_key not in output:
                raise ValueError(f"Missing key '{output_key}' in batch result")
            all_outputs.extend(output[output_key])
        except Exception as e:
            print(f"Batch error at {i}: {e}")
            all_outputs.extend([fill_value] * len(batch))

    if len(all_outputs) != len(dataset):
        print(f"[Warning] Output length mismatch, auto-filling")
        all_outputs.extend([fill_value] * (len(dataset) - len(all_outputs)))

    return {output_key: all_outputs}

#### Implementation

In [23]:
modified_set = load_dataset("json",
    data_files="BASELINE_GoogleNQ_UND_gpt_low_AA_samples_modified.jsonl",
    split="train"  # 必须指定 split，否则默认返回 DatasetDict
)

Generating train split: 0 examples [00:00, ? examples/s]

In [24]:
modified_results = batch_QA_with_progress(
    modified_set,
    batch_fn=run_batch_shortQA_api,
    output_key="model_new_answer",
    fill_value=["error"],
    client=client,
    model="gpt-4o-2024-11-20",
    temperature=0.0
)

Running model_new_answer: 100%|██████████| 25/25 [03:20<00:00,  8.04s/it]


In [25]:
qa_modified = deepcopy(modified_set)
for key in modified_results:
    qa_modified = qa_modified.add_column(key, modified_results[key])

qa_modified.to_json("BASELINE_GoogleNQ_UND_gpt_low_AA_samples_modified.jsonl", orient="records", lines=True)
df_qa_modified = pd.read_json("BASELINE_GoogleNQ_UND_gpt_low_AA_samples_modified.jsonl", lines=True)
#df_qa_modified.to_csv('produced_files/modification_pilot_qa.csv')
df_qa_modified

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Unnamed: 0,original_question,modified_question,short_answer,model_original_answer,classifier_reasoning,original_f1,original_em,original_AA,model_new_answer
0,where does the modern view of history originat...,When did the initial development of a critical...,['approximately in the early 16th century'],['The modern view of history originates from t...,The query asks about the 'modern view of histo...,0.086957,0,0.00,[19th century]
1,this inventor co-created the film fred ott’s s...,"Which American inventor, known for developing ...",['Edison'],['William K.L. Dickson'],The query refers to an 'inventor' who co-creat...,0.000000,0,0.25,[Thomas Edison]
2,what is the collection of the districts to the...,What are the primary geopolitical entities or ...,['Golan Heights' 'Jordan'],['Transjordan'],"The query refers to 'the Jordan River,' which ...",0.000000,0,0.50,[The Hashemite Kingdom of Jordan]
3,factories that assemble parts made in other co...,What are the specific geographic areas establi...,['special economic zones'],['Assembly plants'],The query lacks specificity regarding critical...,0.000000,0,0.50,[Export Processing Zones (EPZs)]
4,who sang it my party and i'll cry if i want to...,"Who sang the 1980s version of the song ""It's M...",['Dave Stewart and Barbara Gaskin'],['Lesley Gore'],The query appears to reference two distinct so...,0.000000,0,0.00,[Dave Stewart and Barbara Gaskin]
...,...,...,...,...,...,...,...,...,...
241,what's the medal count for canada in the olympics,What was Canada's total medal count across all...,['302'],"['Canada has won 199 gold, 173 silver, and 230...",The query asks for Canada's medal count in the...,0.000000,0,0.00,[302 medals]
242,when was the last time the womens hockey team ...,When was the last time the United States women...,['2018'],"[""The Canadian women's hockey team won gold at...",The query lacks specification of which nationa...,0.000000,0,0.00,[2018]
243,when does the good doctor episode 8 air,When does The Good Doctor Season 1 episode 8 air?,"['November 20, 2017']","['The air date for The Good Doctor Season 7, E...",The query asks for the air date of 'The Good D...,0.000000,0,0.00,"[November 20, 2017]"
244,where do royal families get their money from,What is the primary historical source of incom...,['the hereditary revenues of the Crown'],['- Inherited wealth and assets \n- Governmen...,The query is underspecified because 'royal fam...,0.000000,0,0.50,[The Crown Estate]


## Evaluations

In [26]:
def evaluate_squad_per_sample_multi_ref_pred(dataset, pred_col="model_new_answer", ref_col="short_answer"):
    """
    对每个样本逐一计算 EM 和 F1，支持多个参考答案和多个预测答案（list[str]）。
    返回带 "em", "f1" 列的新 Dataset，以及 f1/em 列表用于统计分析。
    Also considering multiple answers in both gold and pred and take the maximum score
    """

    def normalize_answer(s):
        def remove_articles(text):
            return re.sub(r'\b(a|an|the)\b', ' ', text)
        def white_space_fix(text):
            return ' '.join(text.split())
        def remove_punc(text):
            return ''.join(ch for ch in text if ch not in string.punctuation)
        def lower(text):
            return text.lower()
        return white_space_fix(remove_articles(remove_punc(lower(s))))

    def compute_exact(a_pred, a_gold):
    # 如果是 list，转成 set 并 normalize 每个元素
        if isinstance(a_pred, list) and isinstance(a_gold, list):
          pred_set = set(normalize_answer(a) for a in a_pred)
          gold_set = set(normalize_answer(a) for a in a_gold)
          return int(pred_set == gold_set)
        else:
          return int(normalize_answer(a_pred) == normalize_answer(a_gold))

    def compute_f1(a_pred, a_gold):
        pred_tokens = normalize_answer(a_pred).split()
        gold_tokens = normalize_answer(a_gold).split()
        common = Counter(pred_tokens) & Counter(gold_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(gold_tokens)
        return 2 * precision * recall / (precision + recall)

    new_data = []
    f1_scores = []
    em_scores = []

    for item in dataset:
        preds = item.get(pred_col, [])
        golds = item.get(ref_col, [])
        # 转为 list
        if not isinstance(preds, list):
            preds = [preds] if preds else []
        if not isinstance(golds, list):
            golds = [golds] if golds else []

        # 多对多最大匹配
        if not preds or not golds:
            em = 0.0
            f1 = 0.0
        else:
            em = max(compute_exact(p, g) for p in preds for g in golds)
            f1 = max(compute_f1(p, g) for p in preds for g in golds)

        new_item = deepcopy(item)
        new_item["new_em"] = em
        new_item["new_f1"] = f1
        new_data.append(new_item)
        em_scores.append(em)
        f1_scores.append(f1)

    return Dataset.from_list(new_data), f1_scores, em_scores

In [27]:
squad_scored_modified, modified_f1_list, modified_em_list = evaluate_squad_per_sample_multi_ref_pred(qa_modified)
squad_scored_modified.to_json("BASELINE_Gemini_modified_GPT_qa_squad_scores.jsonl", orient="records", lines=True)

df = pd.read_json("BASELINE_Gemini_modified_GPT_qa_squad_scores.jsonl", lines=True)
df.to_csv('BASELINE_Gemini_modified_GPT_qa_squad_scores.csv')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [28]:
modified_mean_em = np.mean(modified_em_list)  # em_scores: EM list per sample
modified_mean_f1 = np.mean(modified_f1_list)  # f1_scores F1 list per sample
print(f"New answers after modification Exact Match (avg): {modified_mean_em * 100:.2f}")
print(f"New answers after modification F1 Score (avg): {modified_mean_f1 * 100:.2f}")

original_em_list = qa_modified['original_em']
original_f1_list = qa_modified['original_f1']

original_mean_em = np.mean(original_em_list)  # em_scores: EM list per sample
original_mean_f1 = np.mean(original_f1_list)  # f1_scores F1 list per sample
print(f"Original answers Exact Match (avg): {original_mean_em * 100:.2f}")
print(f"Original answers F1 Score (avg): {original_mean_f1 * 100:.2f}")

f1_tstat, f1_pval = ttest_ind(modified_f1_list, original_f1_list, equal_var=False)
print(f"F1: t={f1_tstat:.3f}, p={f1_pval:.4f}")

em_tstat, em_pval = ttest_ind(modified_em_list, original_em_list, equal_var=False)
print(f"EM: t={em_tstat:.3f}, p={em_pval:.4f}")

New answers after modification Exact Match (avg): 19.11
New answers after modification F1 Score (avg): 44.54
Original answers Exact Match (avg): 0.00
Original answers F1 Score (avg): 12.67
F1: t=12.188, p=0.0000
EM: t=7.607, p=0.0000


In [80]:
def find_failed_rows_simple(input_file, output_file):
    """
    简单方法：通过比较原问题和修改后问题是否相同来找出失败的行
    """
    print("=== 查找失败的行（简单方法）===")
    
    # 读取输入和输出文件
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = [json.loads(line.strip()) for line in f]
    
    with open(output_file, 'r', encoding='utf-8') as f:
        output_data = [json.loads(line.strip()) for line in f]
    
    failed_rows = []
    
    for i, (input_row, output_row) in enumerate(zip(input_data, output_data)):
        original_question = input_row.get('question', '')
        modified_question = output_row.get('modified_question', '')
        
        # 如果原问题和修改后问题相同，说明失败了
        if original_question == modified_question:
            failed_rows.append({
                'row_number': i + 1,
                'original_question': original_question,
                'short_answer': input_row.get('short_answers', ''),
                'reasoning': input_row.get('reasoning', '')
            })
    
    print(f"发现 {len(failed_rows)} 个失败的行:")
    for row in failed_rows:
        print(f"\n第 {row['row_number']} 行:")
        print(f"  问题: {row['original_question'][:100]}...")
        print(f"  短答案: {row['short_answer']}")
        print(f"  推理: {row['reasoning'][:100]}...")
    
    return failed_rows

# 使用简单方法查找失败的行
failed_rows = find_failed_rows_simple(
    "produced_files/GoogleNQ_UND_gpt_low_f1_samples.jsonl",
    "produced_files/GoogleNQ_UND_gpt_low_f1_samples_modified.jsonl"
)


=== 查找失败的行（简单方法）===
发现 1 个失败的行:

第 130 行:
  问题: when will the next episode of flash be aired...
  短答案: ['May 15, 2018']
  推理: The request is underspecified because the phrase “next episode” can refer to multiple possible refer...


In [2]:
evaluator_llm = LangchainLLMWrapper(ChatDeepSeek(model="deepseek-chat", verbose=True, temperature=0))

In [5]:
async def answer_accuracy_modified(input_dataset, evaluator=evaluator_llm):
    # 在函数开始时创建一次 scorer
    scorer = AnswerAccuracy(llm=evaluator)
    

    score_list = []
        
    for i, row in enumerate(tqdm(input_dataset, desc="Calculating short answer accuracy")):
        try:
            # 短答案评分 - 处理列表情况
            if 'short_answer' in row and 'model_new_answer' in row:
                model_answers = row['model_new_answer'] if isinstance(row['model_new_answer'], list) else [row['model_new_answer']]
                reference_answers = row['short_answer'] if isinstance(row['short_answer'], list) else [row['short_answer']]
                    
                # 计算所有组合的分数，取最高分
                max_score = 0.0
                for model_ans in model_answers:
                    for ref_ans in reference_answers:
                        sample = SingleTurnSample(
                                user_input=row['modified_question'],
                                response=model_ans,
                                reference=ref_ans
                            )
                        score = await scorer.single_turn_ascore(sample)
                        max_score = max(max_score, score)
                        if max_score == 1.0:
                            break  # 跳出内层循环
                    if max_score == 1.0:
                        break  # 跳出外层循环
                
                score_list.append(max_score)
            else:
                score_list.append(0.0)
                
        except Exception as e:
            print(f"处理第 {i+1} 个样本时出错: {e}")
            score_list.append(0.0)

    ragas_scored_dataset = input_dataset.add_column("new_AA", score_list)

    return ragas_scored_dataset

In [6]:
squad_scored_modified = load_dataset("json",
    data_files="BASELINE_Gemini_modified_GPT_qa_squad_scores.jsonl",
    split="train")
result_with_AA = await answer_accuracy_modified(squad_scored_modified)
result_with_AA.to_csv("BASELINE_Gemini_modified_GPT_qa_all_scores.csv")

Calculating short answer accuracy: 100%|██████████| 246/246 [40:17<00:00,  9.83s/it]


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

221862

In [7]:
original_AA = list(result_with_AA["original_AA"])
modified_AA = list(result_with_AA["new_AA"])

original_mean_AA = np.mean(original_AA)
print(f"original AA (avg): {original_mean_AA * 100:.2f}")


modified_mean_AA = np.mean(modified_AA)
print(f"modified AA (avg): {modified_mean_AA * 100:.2f}")

AA_tstat, AA_pval = ttest_ind(modified_AA, original_AA, equal_var=False)
print(f"AA: t={AA_tstat:.3f}, p={AA_pval:.4f}")

original AA (avg): 21.75
modified AA (avg): 55.79
AA: t=10.280, p=0.0000


### Including RAGAS

In [2]:
df = pd.read_csv("../baseline_classifier_for_paper/GoogleNQ_UND_gpt4o_Ragas.csv")

In [9]:
df.to_json("BASELINE_GoogleNQ_UND_gpt4o_Ragas.jsonl", orient="records", lines=True)
dataset = load_dataset("json",
    data_files="BASELINE_GoogleNQ_UND_gpt4o_Ragas.jsonl",
    split="train"  # 必须指定 split，否则默认返回 DatasetDict
)

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
dataset

Dataset({
    features: ['question', 'short_answers', 'long_answer', 'qwen3_thinking', 'qwen3_model_response', 'qwen3_model_pred', 'model_short_answer', 'model_long_answer', 'em', 'f1', 'bleu', 'meteor', 'rouge', 'bertscore', 'ragas_AA_short'],
    num_rows: 458
})

In [14]:
json.loads(dataset[0]['qwen3_model_response'])['reasoning']

"The query asks about the 'modern view of history' but fails to specify critical parameters such as temporal boundaries (e.g., 18th-century Enlightenment vs. 20th-century postcolonial theory), geographic scope (e.g., Western Europe vs. global histories), disciplinary frameworks (e.g., Marxist historiography vs. traditional narrative), or key theoretical developments (e.g., source criticism vs. oral history). Additionally, 'modern view' is inherently subjective and contested among historians, further complicating the definition of an origin point."

In [16]:
def retrieve_all_low_AA_samples(input_file, output_file, threshold):
    """
    Randomly extract specified number of samples with low f1 score from JSONL file
    
    Args:
        input_file: Input JSONL file path
        output_file: Output JSONL file path
    """
    target_samples = []
    
    
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            if data.get('ragas_AA_short') < threshold:
                target_samples.append(data)
    
    print(f"Found {len(target_samples)} samples satisfying the defined condition")
    
    # Write to new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in target_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    
    print(f"Successfully extracted {len(target_samples)} samples with Ragas AA <= {threshold} to {output_file}")
    return target_samples

In [17]:
input_file = "BASELINE_GoogleNQ_UND_gpt4o_Ragas.jsonl"
output_file = "BASELINE_GoogleNQ_UND_gpt4o_Ragas_low_AA.jsonl"
target_samples = retrieve_all_low_AA_samples(input_file, output_file, 1)

Found 246 samples satisfying the defined condition
Successfully extracted 246 samples with Ragas AA <= 1 to BASELINE_GoogleNQ_UND_gpt4o_Ragas_low_AA.jsonl


In [18]:
client = OpenAI(
    api_key=os.environ.get("GOOGLE_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

def modify_question_with_gemini(question, short_answer, reasoning, model="gemini-2.5-flash", temperature=0, max_retries=5, sleep_time=2.0):

    system_prompt = (
        "You are a professional question optimization expert. Please modify the underspecified question to a fully specified version based on the provided clues.\n\n"
        "Requirements:\n"
        "1. Keep the core intent of the question unchanged\n"
        "2. Add necessary contextual information\n"
        "3. Eliminate underspecified elements and make the question clear\n"
        "4. Ensure the modified question can be directly answered with the provided short answer without dispute\n\n"
        "Please only return the modified question, do not include any other explanations."
    )
    
    user_prompt = f"""
The original question: {question}
Short answer: {short_answer}
Reasoning: {reasoning}

Please analyze the underspecified elements in the original question, then modify the question to a fully specified version based on the short answer and reasoning.
"""
    retries = 0
    while retries < max_retries:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=temperature
            )
            content = response.choices[0].message.content
            modified_question = content.strip()
            return modified_question
        except Exception as e:
            retries += 1
            print(f"Attempt {retries} failed: {str(e)}")
            if retries < max_retries:
                print(f"Waiting {sleep_time * retries} seconds before retry...")
                time.sleep(sleep_time * retries)
            else:
                print(f"All retries failed, returning original question")
                return question  # If error occurs, return original question

def modification_in_batch_alt(input_file, output_file, batch_size=5):
    """
    按批次处理所有样本，提高处理效率
    
    Args:
        input_file: 输入JSONL文件路径
        output_file: 输出JSONL文件路径
        batch_size: 每批处理的样本数量
    
    Returns:
        list: 所有处理过的样本
    """
    
    all_processed_samples = []
    
    # Loading all the data from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    total_samples = len(lines)
    print(f"Total samples to process: {total_samples}")
    print(f"Batch size: {batch_size}")
    
    # Process all samples in batches
    for batch_start in tqdm(range(0, total_samples, batch_size), desc="Processing batches"):
        batch_end = min(batch_start + batch_size, total_samples)
        batch_lines = lines[batch_start:batch_end]
        
        batch_processed_samples = []
        
        # Process each sample in the current batch
        for i, line in enumerate(batch_lines):
            try:
                data = json.loads(line.strip())
                
                # Extract necessary fields
                question = data['question']
                short_answer = data['short_answers']
                classifier_response = data['qwen3_model_response']
                classifier_reasoning = json.loads(classifier_response)['reasoning']
                
                # Modify questions
                modified_question = modify_question_with_gemini(question, short_answer, classifier_reasoning)
                
                # Create new data structure
                new_sample = {
                    'original_question': question,
                    'modified_question': modified_question,
                    'short_answer': short_answer,
                    'model_original_answer': data.get('model_short_answer', 'undefined'),
                    'classifier_reasoning': classifier_reasoning,
                    'original_f1': data.get('f1', 'undefined'),
                    'original_em': data.get('em', 'undefined'),
                    'original_AA': data.get('ragas_AA_short', 'undefined')
                }
                
                batch_processed_samples.append(new_sample)
                
                # Add delay to avoid API rate limits
                time.sleep(1)
                
            except Exception as e:
                print(f"Error processing sample {batch_start + i + 1}: {e}")
                # Create error sample to maintain consistency
                error_sample = {
                    'original_question': question,
                    'modified_question': modified_question,
                    'short_answer': short_answer,
                    'model_original_answer': data.get('model_short_answer', 'error'),
                    'classifier_reasoning': classifier_reasoning,
                    'original_f1': data.get('f1', 'error'),
                    'original_em': data.get('em', 'error'),
                    'original_AA': data.get('ragas_AA_short', 'error')
                }
                batch_processed_samples.append(error_sample)
        
        # Add batch results to all processed samples
        all_processed_samples.extend(batch_processed_samples)
        
        # Write intermediate results to file (append mode)
        with open(output_file, 'a', encoding='utf-8') as f:
            for sample in batch_processed_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        
    
    print(f"\nAll batch processing completed! Total processed: {len(all_processed_samples)} samples")
    print(f"Results saved to: {output_file}")
    
    return all_processed_samples

In [19]:
# 清空输出文件（如果存在）
modified_output_file = "BASELINE_GoogleNQ_UND_gpt_low_AA_samples_modified.jsonl"
if os.path.exists(modified_output_file):
    os.remove(modified_output_file)
    print(f"Cleared existing output file: {modified_output_file}")

# 处理所有样本（按批次）
question_modification = modification_in_batch_alt(output_file, modified_output_file, batch_size=3)


Total samples to process: 246
Batch size: 3


Processing batches: 100%|██████████| 82/82 [1:07:34<00:00, 49.44s/it] 


All batch processing completed! Total processed: 246 samples
Results saved to: BASELINE_GoogleNQ_UND_gpt_low_AA_samples_modified.jsonl



