# HaluBench dataset

In [15]:
# import pandas as pd

# # Load inputs
# df_halu_hybrid = pd.read_csv('input_files/halubench_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85.csv')
# df_halu_cont = pd.read_csv('input_files/contriever_results.csv')
# df = pd.read_parquet("hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet")

# # Keep only the needed column from contriever results
# df_halu_cont = df_halu_cont[['contriever_ret_docs']]

# # Keep only the needed column from the parquet df
# df_source = df[['source_ds']]

# # Concatenate columns side-by-side with df_halu_hybrid
# # This assumes the rows align in order across the three inputs.
# # If they don't align, merge on a key instead of concat.
# df_out = pd.concat([df_halu_hybrid.reset_index(drop=True),
#                     df_halu_cont.reset_index(drop=True),
#                     df_source.reset_index(drop=True)], axis=1)

# # Reorder/select final columns
# final_cols = ['question', 'answer', 'groundtruth_with_ids',
#               'hybrid_reranked_docs', 'contriever_ret_docs', 'source_ds']
# df_out = df_out[final_cols]

# # Save to CSV
# output_path = 'output_files/halubench_hybrid_contriever_combined_results.csv'
# df_out.to_csv(output_path, index=False)
# print(f"Saved: {output_path}")

Saved: output_files/halubench_hybrid_contriever_combined_results.csv


## Preparing the test set for generation (all in one cell)

In [7]:
import os

import pandas as pd

# Build the HaluBench generation test set in one pass:
#   * join the hybrid-retrieval results, the contriever results and the
#     `source_ds` label of the HaluBench parquet file column-wise,
#   * keep the rows whose source is one of the expected datasets,
#   * draw `per_class` rows from every source with a fixed seed,
#   * write the sample to CSV.
# NOTE: the three inputs are joined purely by row position; only their lengths
# are verified, so they must already be in the same row order.

# 1) Rebuild the combined dataset cleanly with aligned indices
df_halu_hybrid = pd.read_csv('input_files/halubench_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85.csv')
df_halu_cont   = pd.read_csv('input_files/halubench_contriever_results.csv')
df_parquet     = pd.read_parquet("hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet")

# Keep only needed columns from the two side frames
df_halu_cont = df_halu_cont[['contriever_ret_docs']].reset_index(drop=True)
df_source    = df_parquet[['source_ds']].reset_index(drop=True)

# Ensure the main df has a clean index
df_halu_hybrid = df_halu_hybrid.reset_index(drop=True)

# Sanity check same length; if not, fail fast to prevent misalignment
if not (len(df_halu_hybrid) == len(df_halu_cont) == len(df_source)):
    raise ValueError(f"Row count mismatch: hybrid={len(df_halu_hybrid)}, contriever={len(df_halu_cont)}, source={len(df_source)}. "
                     "Align your inputs (same order/length) before concatenation.")

# Concatenate side-by-side with aligned indices
combined = pd.concat([df_halu_hybrid, df_halu_cont, df_source], axis=1)

# Keep only the final columns you want in output
final_cols = ['question', 'answer', 'groundtruth_with_ids',
              'hybrid_reranked_docs', 'contriever_ret_docs', 'source_ds']
missing = [c for c in final_cols if c not in combined.columns]
if missing:
    raise KeyError(f"Missing expected columns: {missing}")

combined = combined[final_cols].copy()

# 2) Normalize source_ds to lowercase/trim
combined['source_ds'] = combined['source_ds'].astype(str).str.strip().str.lower()

# 3) Filter to the expected categories
expected_cats = ['halueval', 'drop', 'pubmedqa', 'financebench', 'covidqa', 'ragtruth']
filtered = combined[combined['source_ds'].isin(expected_cats)].copy()

# 4) Stratified sampling: the same number of rows from every category
seed = 42
per_class = 20

# Verify counts up front so a short category fails with a readable message
counts = filtered['source_ds'].value_counts()
too_small = [c for c in expected_cats if counts.get(c, 0) < per_class]
if too_small:
    raise ValueError(f"Not enough rows for categories {too_small}. Required {per_class} each. Actual counts: {counts.to_dict()}")

# Sample every category on its own, re-using the same seed for each group.
# This picks the same rows, in the same order, as the earlier
# `groupby(...).apply(lambda g: g.sample(...).index)` + explode approach, but
# avoids `GroupBy.apply` (which triggered a warning in the recorded output)
# and the index-flattening fallback. Groups are visited in sorted key order.
sampled = pd.concat(
    [group.sample(n=per_class, random_state=seed) for _, group in filtered.groupby('source_ds')]
)

# Kept for backward compatibility: original row labels of the sampled rows
sampled_index = sampled.index.tolist()

# Optional: sort within category or keep random order. Here we keep as-sampled order.
# If you prefer a stable order, you can do: sampled = sampled.sort_values(['source_ds']).reset_index(drop=True)

# 5) Save to CSV with all selected columns intact
output_path = 'output_files/halubench/halubench_combined_results_stratified_120.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # to_csv does not create missing folders
sampled.to_csv(output_path, index=False)
print(f"Saved: {output_path}")

# Quick sanity checks
print("Final shape:", sampled.shape)
print("Per-category counts:\n", sampled['source_ds'].value_counts())

  from .autonotebook import tqdm as notebook_tqdm


Saved: output_files/halubench/halubench_combined_results_stratified_120.csv
Final shape: (120, 6)
Per-category counts:
 source_ds
covidqa         20
drop            20
financebench    20
halueval        20
pubmedqa        20
ragtruth        20
Name: count, dtype: int64


  .apply(lambda g: g.sample(n=per_class, random_state=seed).index)


In [20]:
# #!/usr/bin/env python3
# import ast
# import json
# import os
# import sys
# import pandas as pd

# def prompt_path(prompt_text, must_exist=False, default=None):
#     while True:
#         raw = input(f"{prompt_text}{' [' + default + ']' if default else ''}: ").strip()
#         path = raw or (default or "")
#         if not path:
#             print("Please provide a path.")
#             continue
#         if must_exist and not os.path.exists(path):
#             print(f"Path not found: {path}")
#             continue
#         return path

# def maybe_parse_listlike_column(series):
#     """
#     Best-effort convert stringified list-of-dicts to actual Python list objects.
#     Leaves non-string values unchanged.
#     """
#     def parse_cell(x):
#         if isinstance(x, str):
#             xs = x.strip()
#             # Heuristics: only try to parse if it looks like a Python/JSON list
#             if (xs.startswith("[") and xs.endswith("]")) or (xs.startswith("{") and xs.endswith("}")):
#                 try:
#                     # literal_eval handles Python-like repr (single quotes) safely
#                     return ast.literal_eval(xs)
#                 except Exception:
#                     # Fallback: try json if it looks like JSON
#                     try:
#                         return json.loads(xs)
#                     except Exception:
#                         return x
#         return x
#     return series.apply(parse_cell)

# def top5_full_text(items):
#     """
#     Extract up to the first 5 'full_text' strings from a list of dicts/objects.
#     Returns a list of strings.
#     """
#     if not isinstance(items, (list, tuple)):
#         return []
#     out = []
#     for d in items[:5]:
#         if isinstance(d, dict):
#             val = d.get("full_text", None)
#             if val is not None:
#                 out.append(val)
#         else:
#             # Try attribute access if not a dict
#             try:
#                 val = getattr(d, "full_text", None)
#                 if val is not None:
#                     out.append(val)
#             except Exception:
#                 pass
#     return out

# def main():
#     print("=== Top-5 full_text extractor ===")
#     in_csv = prompt_path("output_files/halubench_combined_results_stratified_120.csv", must_exist=True)

#     # Load CSV
#     try:
#         df = pd.read_csv(in_csv)
#     except Exception as e:
#         print(f"Failed to read CSV: {e}")
#         sys.exit(1)

#     # Verify required columns
#     required_cols = ["hybrid_reranked_docs", "contriever_ret_docs"]
#     missing = [c for c in required_cols if c not in df.columns]
#     if missing:
#         print(f"Missing expected columns: {missing}")
#         print("Columns present:", list(df.columns))
#         sys.exit(1)

#     # Attempt to parse list-like cells if they are stringified
#     for col in required_cols:
#         df[col] = maybe_parse_listlike_column(df[col])

#     # Create new columns
#     df["hybrid_full_text"] = df["hybrid_reranked_docs"].apply(top5_full_text)
#     df["contriever_full_text"] = df["contriever_ret_docs"].apply(top5_full_text)

#     # Ask how to save list columns in CSV
#     print("\nHow should list columns be saved?")
#     print("1) As JSON strings (recommended; preserves structure)")
#     print("2) As Python repr strings")
#     choice = input("Enter 1 or 2 [1]: ").strip() or "1"

#     df_to_save = df.copy()
#     list_cols = ["hybrid_full_text", "contriever_full_text"]
#     if choice == "1":
#         for col in list_cols:
#             df_to_save[col] = df_to_save[col].apply(json.dumps)
#     else:
#         # Leave as Python repr, which pandas will write as strings
#         df_to_save[list_cols] = df_to_save[list_cols].astype(str)

#     default_out = os.path.splitext(in_csv)[0] + "_with_texts.csv"
#     out_csv = prompt_path("Enter path to output CSV file", default=default_out)

#     try:
#         df_to_save.to_csv(out_csv, index=False)
#         print(f"\nSaved modified DataFrame to: {out_csv}")
#     except Exception as e:
#         print(f"Failed to write CSV: {e}")
#         sys.exit(1)

# if __name__ == "__main__":
#     main()

=== Top-5 full_text extractor ===
Please provide a path.
Please provide a path.
Please provide a path.
Please provide a path.
Please provide a path.


In [8]:
import pandas as pd
import ast

# Load the stratified sample written by the HaluBench sampling cell above.
df = pd.read_csv('output_files/halubench/halubench_combined_results_stratified_120.csv')


# Function to extract top 5 full_text from string representation of list
def extract_top5_full_text(docs_string):
    """
    Return the 'full_text' values of the first five retrieved documents.

    Parameters
    ----------
    docs_string : str | list | Any
        Usually the string form of a list of dicts as read back from CSV
        (Python repr or JSON). An already-parsed list is accepted as-is.
        Anything else (NaN, None, numbers, ...) is treated as "no documents".

    Returns
    -------
    list
        One entry per dict among the first five items; a dict without a
        'full_text' key contributes ''. Non-dict items are skipped. An empty
        list is returned when the input is missing, unparsable, or does not
        decode to a list.
    """
    import json  # local import: the enclosing cell only imports pandas and ast

    if isinstance(docs_string, list):
        # Already parsed (e.g. the column was built in memory, not read from CSV)
        docs_list = docs_string
    elif isinstance(docs_string, str):
        if docs_string == '':
            return []
        try:
            # Python-repr style cells (single quotes, None/True/False)
            docs_list = ast.literal_eval(docs_string)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # JSON style cells (null/true/false) are not valid Python literals
            try:
                docs_list = json.loads(docs_string)
            except (ValueError, RecursionError):
                return []
    else:
        # NaN / None / any other non-text cell
        return []

    if not isinstance(docs_list, list):
        return []

    # Take only the first 5 documents and pull out their text
    return [doc.get('full_text', '') for doc in docs_list[:5] if isinstance(doc, dict)]

# Build each top-5 text column from its retrieved-documents column
for src_col, dst_col in (('hybrid_reranked_docs', 'hybrid_full_text'),
                         ('contriever_ret_docs', 'contriever_full_text')):
    df[dst_col] = df[src_col].apply(extract_top5_full_text)

# Spot-check the first row: how many texts were extracted per retriever
first_row = df.iloc[0]
print(f"Number of items in first row hybrid_full_text: {len(first_row['hybrid_full_text'])}")
print(f"Number of items in first row contriever_full_text: {len(first_row['contriever_full_text'])}")
df.columns

Number of items in first row hybrid_full_text: 5
Number of items in first row contriever_full_text: 5


Index(['question', 'answer', 'groundtruth_with_ids', 'hybrid_reranked_docs',
       'contriever_ret_docs', 'source_ds', 'hybrid_full_text',
       'contriever_full_text'],
      dtype='object')

In [2]:
# NOTE(review): this writes back to the same path the frame was read from, so the
# stratified CSV is overwritten in place and now also carries the two *_full_text
# columns. Those columns hold Python lists; presumably they are stored as their
# string form and need parsing again when the file is re-read — confirm downstream.
df.to_csv('output_files/halubench/halubench_combined_results_stratified_120.csv', index=False)

# HotpotQA dataset

In [2]:
import pandas as pd
# Load the HotpotQA hybrid (rerank) retrieval results and list the available columns.
df_hybrid = pd.read_csv('input_files/hotpotqa_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85.csv')  
df_hybrid.columns    

Index(['question', 'answer', 'mpnet_ret_docs', 'splade_ret_docs',
       'hybrid_ret_docs', 'hybrid_reranked_docs', 'passages_with_ids',
       'groundtruth_with_ids', 'FUSED_MAP', 'FUSED_NDCG', 'FUSED_MAP@3',
       'FUSED_NDCG@3', 'FUSED_MAP@5', 'FUSED_NDCG@5', 'FUSED_MAP@10',
       'FUSED_NDCG@10', 'RERANK_MAP', 'RERANK_NDCG', 'RERANK_MAP@3',
       'RERANK_NDCG@3', 'RERANK_MAP@5', 'RERANK_NDCG@5', 'RERANK_MAP@10',
       'RERANK_NDCG@10'],
      dtype='object')

In [3]:
# Load the HotpotQA contriever retrieval results and list the available columns.
df_cont = pd.read_csv('input_files/hotpotqa_contriever.csv')  
df_cont.columns 

Index(['question', 'answer', 'passage', 'groundtruth_docs',
       'contriever_ret_docs', 'MAP@3', 'MAP@5', 'MAP@10', 'NDCG@3', 'NDCG@5',
       'NDCG@10'],
      dtype='object')

In [5]:
#!/usr/bin/env python3
import ast
import json
import os
import sys
import pandas as pd

# --- Configuration for the HotpotQA sampling script -------------------------
# Input CSVs: hybrid (reranked) retrieval results and contriever retrieval results.
INPUT_HYBRID = "input_files/hotpotqa_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85.csv"
INPUT_CONT   = "input_files/hotpotqa_contriever.csv"
# Destination of the sampled frame; main() creates the parent folder if needed.
OUTPUT_CSV   = "output_files/hotpot/hotpotqa_sampled_120_with_full_texts.csv"  # change as needed
# Number of rows to draw and the seed passed to DataFrame.sample in main().
SAMPLE_SIZE  = 120
RANDOM_SEED  = 42

# Columns that must be present in the hybrid CSV before processing starts.
REQ_COLS_HYBRID = ["question", "answer", "groundtruth_with_ids", "hybrid_reranked_docs"]
# Columns (and their order) of the frame that main() writes out.
REQ_COLS_FINAL  = ["question", "answer", "groundtruth_with_ids",
                   "hybrid_reranked_docs", "contriever_ret_docs",
                   "hybrid_full_text", "contriever_full_text"]

def maybe_parse_listlike(series):
    """
    Decode cells that hold a stringified list/dict into real Python objects.

    A cell is decoded only when it is a string whose stripped form is wrapped
    in ``[...]`` or ``{...}``. Decoding tries ``ast.literal_eval`` first and
    ``json.loads`` second; if both fail the original cell is returned
    untouched. Non-string cells pass through unchanged.

    Returns a new Series produced by ``series.apply``.
    """
    wrappers = (("[", "]"), ("{", "}"))

    def _decode(cell):
        # Guard: only text cells are candidates for decoding
        if not isinstance(cell, str):
            return cell
        trimmed = cell.strip()
        # Guard: must look like a list or dict literal
        if (trimmed[:1], trimmed[-1:]) not in wrappers:
            return cell
        # Python literal syntax first (single quotes), JSON as the fallback
        for parser in (ast.literal_eval, json.loads):
            try:
                return parser(trimmed)
            except Exception:
                continue
        return cell

    return series.apply(_decode)

def top5_full_text(items):
    """
    Collect the 'full_text' value of each of the first five entries of `items`.

    `items` must be a list or tuple; anything else yields an empty list.
    Dict entries are read by key, other entries by attribute. Entries whose
    value is missing or None are skipped, so the result may be shorter than 5.
    """
    if not isinstance(items, (list, tuple)):
        return []

    def _read(entry):
        # Mapping-style document
        if isinstance(entry, dict):
            return entry.get("full_text", None)
        # Object-style document; a failing attribute lookup counts as "no text"
        try:
            return getattr(entry, "full_text", None)
        except Exception:
            return None

    candidates = (_read(entry) for entry in items[:5])
    return [text for text in candidates if text is not None]

def main():
    """
    Build the sampled HotpotQA file with top-5 full texts for both retrievers.

    Steps:
      1. Read the hybrid and contriever CSVs (INPUT_HYBRID / INPUT_CONT).
      2. Validate required columns and attach `contriever_ret_docs` to the
         hybrid frame by row position.
      3. Draw SAMPLE_SIZE rows with RANDOM_SEED (all rows if fewer exist).
      4. Parse the stringified document lists and derive `hybrid_full_text`
         and `contriever_full_text` (top-5 `full_text` values).
      5. Write the result to OUTPUT_CSV, list columns stored as JSON strings.

    Any read/validation/write failure prints a message and calls sys.exit(1).
    Row-count or question-text mismatches between the two inputs only print a
    warning, because the join is positional and best-effort.
    """
    # Load dataframes
    try:
        df_hybrid = pd.read_csv(INPUT_HYBRID)
    except Exception as e:
        print(f"Failed to read hybrid CSV: {INPUT_HYBRID}\nError: {e}")
        sys.exit(1)

    try:
        df_cont = pd.read_csv(INPUT_CONT)
    except Exception as e:
        print(f"Failed to read contriever CSV: {INPUT_CONT}\nError: {e}")
        sys.exit(1)

    # Validate required columns in df_hybrid
    missing_h = [c for c in REQ_COLS_HYBRID if c not in df_hybrid.columns]
    if missing_h:
        print(f"df_hybrid missing required columns: {missing_h}")
        print("df_hybrid columns:", list(df_hybrid.columns))
        sys.exit(1)

    # Keep only 'contriever_ret_docs' in df_cont
    if "contriever_ret_docs" not in df_cont.columns:
        print("df_cont missing 'contriever_ret_docs' column.")
        print("df_cont columns:", list(df_cont.columns))
        sys.exit(1)
    # Remember the question text (when available) so the positional join can be checked
    cont_questions = df_cont["question"] if "question" in df_cont.columns else None
    df_cont = df_cont[["contriever_ret_docs"]]

    # Align lengths (assumes same row order alignment by index)
    if len(df_cont) != len(df_hybrid):
        print(f"Warning: Row count mismatch. df_hybrid={len(df_hybrid)}, df_cont={len(df_cont)}")
        # If you need to align by an id, merge on that key instead of concatenating by index.
        # For now, we will align by index and truncate to the min length.
        min_len = min(len(df_hybrid), len(df_cont))
        df_hybrid = df_hybrid.iloc[:min_len].reset_index(drop=True)
        df_cont   = df_cont.iloc[:min_len].reset_index(drop=True)
        if cont_questions is not None:
            cont_questions = cont_questions.iloc[:min_len]

    # The join below is purely positional; warn when the two files disagree on
    # the question text so a silent row shift does not go unnoticed.
    if cont_questions is not None:
        left_q = df_hybrid["question"].astype(str).str.strip().str.lower().to_numpy()
        right_q = cont_questions.astype(str).str.strip().str.lower().to_numpy()
        n_mismatch = int((left_q != right_q).sum())
        if n_mismatch:
            print(f"Warning: {n_mismatch} of {len(left_q)} rows have different 'question' text in the "
                  "hybrid and contriever files; positional alignment may be wrong.")

    # Concatenate contriever_ret_docs into df_hybrid
    df = pd.concat([df_hybrid.reset_index(drop=True), df_cont.reset_index(drop=True)], axis=1)

    # Keep only the specified columns before sampling
    keep_cols = ["question", "answer", "groundtruth_with_ids", "hybrid_reranked_docs", "contriever_ret_docs"]
    missing_keep = [c for c in keep_cols if c not in df.columns]
    if missing_keep:
        print(f"Missing columns before sampling: {missing_keep}")
        print("Available columns:", list(df.columns))
        sys.exit(1)
    df = df[keep_cols]

    # Randomly sample SAMPLE_SIZE rows using RANDOM_SEED
    if len(df) < SAMPLE_SIZE:
        print(f"Warning: Requested sample size {SAMPLE_SIZE} > available rows {len(df)}. Taking all rows.")
        df_sampled = df.copy()
    else:
        df_sampled = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)

    # Parse list-like columns if they are stringified
    for col in ["hybrid_reranked_docs", "contriever_ret_docs"]:
        df_sampled[col] = maybe_parse_listlike(df_sampled[col])

    # Create the two new columns from top-5 full_text
    df_sampled["hybrid_full_text"] = df_sampled["hybrid_reranked_docs"].apply(top5_full_text)
    df_sampled["contriever_full_text"] = df_sampled["contriever_ret_docs"].apply(top5_full_text)

    # Ensure final column order
    df_out = df_sampled[REQ_COLS_FINAL].copy()

    # Save: store list columns as JSON strings to preserve structure in CSV
    df_save = df_out.copy()
    for col in ["hybrid_reranked_docs", "contriever_ret_docs", "hybrid_full_text", "contriever_full_text"]:
        df_save[col] = df_save[col].apply(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)

    # os.makedirs('') raises, so only create a folder when the path has one
    out_dir = os.path.dirname(OUTPUT_CSV)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    try:
        df_save.to_csv(OUTPUT_CSV, index=False)
        print(f"Saved: {OUTPUT_CSV}")
    except Exception as e:
        print(f"Failed to write CSV: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

Saved: output_files/hotpot/hotpotqa_sampled_120_with_full_texts.csv


# Final generation output files

In [66]:
import pandas as pd

# Load one generation-output file; swap the commented line in to inspect the other model's run.
df = pd.read_csv('output_files/hotpot/hotpotqa_answers_hybrid_qwen3_4b_1200rows.csv')
# df = pd.read_csv('output_files/hotpot/hotpot_1200_answers_hybrid_llama3.1_8b.csv')

df.columns

Index(['question', 'answer', 'groundtruth_with_ids', 'hybrid_reranked_docs',
       'contriever_ret_docs', 'hybrid_full_text', 'contriever_full_text',
       'answer_gemma2b_hybrid'],
      dtype='object')

In [67]:
# Remove duplicate rows from the dataset (rows identical in every column), then check the size
df = df.drop_duplicates()
df.shape

(1200, 8)

In [68]:
# Drop the retrieval/context columns so only the question, the reference answer
# and the generated answer remain.
df_new = df.drop(columns=[ 'groundtruth_with_ids', 'hybrid_reranked_docs',
       'contriever_ret_docs', 'hybrid_full_text', 'contriever_full_text']) 
df_new.columns

Index(['question', 'answer', 'answer_gemma2b_hybrid'], dtype='object')

In [69]:
# Rename the generated-answer column to answer_qwen4b_hybrid: the file loaded above is the
# qwen3_4b run, but its answer column is labelled answer_gemma2b_hybrid.
df_new = df_new.rename(columns={"answer_gemma2b_hybrid": "answer_qwen4b_hybrid"})
df_new.columns

Index(['question', 'answer', 'answer_qwen4b_hybrid'], dtype='object')

In [70]:
# Write the trimmed frame; the path has no folder part, so the file lands in the
# current working directory rather than under output_files/.
df_new.to_csv("hotpotqa_answers_hybrid_qwen3_4b_1200rows_final_cut.csv", index=False)


GT
[{'doc_id': '64_0', 'full_text': 'iqaluit airport (iata: yfb, icao: cyfb) serves iqaluit, nunavut, canada and is located adjacent to the town.'}, {'doc_id': '64_5', 'full_text': 'canadian north inc. is an airline headquartered in calgary, alberta, canada.'}]

MPNet
[{'doc_id': '39763_1', 'score': 0.744383, 'full_text': 'it is located in iqaluit, nunavut.'}, 
{'doc_id': '64_0', 'score': 0.743217, 'full_text': 'iqaluit airport (iata: yfb, icao: cyfb) serves iqaluit, nunavut, canada and is located adjacent to the town.'}, 
{'doc_id': '64_8', 'score': 0.680723, 'full_text': 'its main base is edmonton airport.'}]

Splade 
[{'doc_id': '64_0', 'score': 22.4232, 'full_text': 'iqaluit airport (iata: yfb, icao: cyfb) serves iqaluit, nunavut, canada and is located adjacent to the town.' }, 
{'doc_id': '39763_2', 'score': 19.4198, 'full_text': 'iqaluit (inuktitut ), meaning place of fish, is the capital of the canadian territory of nunavut; its largest community, and its only city.'}, 
{'doc_id': '64_5', 'score': 17.4174, 'full_text': 'canadian north inc. is an airline headquartered in calgary, alberta, canada.'}]

Hybrid
[{'doc_id': '64_0', 'score': 0.9969582344194297, 'rank': 1, 'full_text': 'iqaluit airport (iata: yfb, icao: cyfb) serves iqaluit, nunavut, canada and is located adjacent to the town.'}, 
{'doc_id': '64_5', 'score': 0.5659488834121346, 'rank': 2, 'full_text': 'canadian north inc. is an airline headquartered in calgary, alberta, canada.'}, 
{'doc_id': '39763_2', 'score': 0.3417903555643511, 'rank': 3, 'full_text': 'iqaluit (inuktitut), meaning place of fish, is the capital of the canadian territory of nunavut; its largest community, and its only city.'}]