# Aggregate multiple datasets to one

This section defines the imports and global configuration (models, prompt methods, datasets, and paths) used by the rest of the notebook.

In [1]:
import os
import glob
import zipfile
import itertools
import pandas as pd

# -----------------------------
# Global variables
# -----------------------------
from preferences.prefs import MMLU_PREFS, TQA_PREFS, CQA_PREFS

# Models whose results are aggregated, written as "<org>/<name>".
MODELS_FULL = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "kaist-ai/janus-7b",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # "meta-llama/Llama-3.3-70B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
]
# Short model names: everything after the last "/" (used in result file names).
MODELS = [full_name.rsplit("/", 1)[-1] for full_name in MODELS_FULL]

# Prompt methods to process
PROMPT_METHODS = ["cot", "direct", "icl"]

# Input datasets, written as "<org>/<name>"; each result directory should hold one CSV per model.
DATASETS_FULL = [
    "tau/commonsense_qa",
    "cais/mmlu",
    "truthfulqa/truthful_qa",
]
# Short dataset names: everything after the last "/" (used as result directory names).
DATASETS = [full_name.rsplit("/", 1)[-1] for full_name in DATASETS_FULL]

# Presumably the expected row count per dataset -- not referenced by the cells shown here.
DATASET_LENS = dict(
    commonsense_qa=1221,
    mmlu=5170,
    truthful_qa=817,
)

# The target output subdirectory (under each prompt method) where aggregated results are saved.
TARGET_DATASET = "full"
# The robust dataset to use for aggregation
ROBUST_PATH = os.path.join("data", "robuset.csv")

# Preference library per dataset, reachable by both the full and the short dataset name.
_PREFS_BY_SOURCE = (
    ("tau/commonsense_qa", CQA_PREFS),
    ("cais/mmlu", MMLU_PREFS),
    ("truthfulqa/truthful_qa", TQA_PREFS),
)
DATASET_PREFERENCES = dict(_PREFS_BY_SOURCE)
DATASET_PREFERENCES.update(
    (source.rsplit("/", 1)[-1], prefs) for source, prefs in _PREFS_BY_SOURCE
)

## Check if for all models/method, all files exist

This part checks that, for each (prompt method, model) pair, every dataset configured above has exactly one result CSV file. A warning is printed for each missing or ambiguous file, so you know which runs are still missing. (Row counts are checked against the robust dataset later, during aggregation.)

In [2]:
# (method, model, files) triples that have a result file for every dataset.
valid_combinations = []

# First, verify the robust dataset exists.
# Raise instead of calling exit(0): in a notebook, exit() ends the kernel session
# (and with status 0 it would look like success), whereas an exception stops
# "Run All" at this cell with a readable error and keeps the kernel usable.
if not os.path.exists(ROBUST_PATH):
    raise FileNotFoundError(f"Robust dataset not found at {ROBUST_PATH}.")
robust_df = pd.read_csv(ROBUST_PATH)
# Total row count; later cells compare aggregated outputs against this number.
robust_rows = len(robust_df)

def detect_and_add(method, dataset, model, combo_files):
    """Find the single evaluation CSV for one (method, dataset, model) run.

    Looks inside ``results/mcq_results/relevant/<method>/<dataset>/`` for files
    matching ``<model>*_eval.csv`` or ``<model>_eval.csv``. If exactly one
    distinct path matches, it is appended to ``combo_files``.

    Args:
        method: Prompt method directory name.
        dataset: Dataset directory name.
        model: Model name used as the file-name prefix.
        combo_files: List that receives the matched path (mutated in place).

    Returns:
        True when exactly one file was found and appended. False when nothing
        matched, or when several files matched (a warning is printed then).
    """
    run_dir = os.path.join("results", "mcq_results", "relevant", method, dataset)
    # print(f"Checking {run_dir} for {model}*_eval.csv and {model}_eval.csv")

    # Collect into a set so a file matched by both patterns is counted once.
    matches = set()
    for suffix in ("*_eval.csv", "_eval.csv"):
        matches.update(glob.glob(os.path.join(run_dir, model + suffix)))

    if not matches:
        return False
    if len(matches) > 1:
        print(f"Warning: Multiple files found for [{method}, {model}, {dataset}]. Check combination")
        return False

    combo_files.append(next(iter(matches)))
    return True

# Check that every (prompt method, model) pair has one eval CSV per dataset.
for method, model in itertools.product(PROMPT_METHODS, MODELS):
    # print(f"\nChecking {method}, {model}")
    print()
    combo_files = []
    for dataset in DATASETS:
        dataset_dir = os.path.join("results", "mcq_results", "relevant", method, dataset)
        found = detect_and_add(method, dataset, model, combo_files)
        if not found:
            # No CSV on disk yet: unpack the zipped results (if any) and look again.
            zip_file = os.path.join(dataset_dir, model+".zip")
            if os.path.exists(zip_file):
                with zipfile.ZipFile(zip_file, "r") as zip_ref:
                    zip_ref.extractall(dataset_dir)
                found = detect_and_add(method, dataset, model, combo_files)
        if not found:
            print(f"Warning: No files found for [{method}, {model}, {dataset}]")

    # Only pairs with a file for every dataset can be aggregated.
    if len(combo_files) == len(DATASETS):
        valid_combinations.append((method, model, combo_files))
        print(f"Completed files for {method}, {model}")
    else:
        print(f"Incomplete files for [{method}, {model}]")

# Summary of the pairs that are ready for aggregation.
print(f"Found {len(valid_combinations)}/{len(PROMPT_METHODS)*len(MODELS)} valid combinations:")
for method, model, combo_files in valid_combinations:
    print(f" - {method}, {model}")
        


Incomplete files for [cot, Llama-3.1-8B-Instruct]

Incomplete files for [cot, janus-7b]

Incomplete files for [cot, Mixtral-8x7B-Instruct-v0.1]

Incomplete files for [cot, Mistral-7B-Instruct-v0.3]

Incomplete files for [direct, Llama-3.1-8B-Instruct]

Incomplete files for [direct, janus-7b]

Incomplete files for [direct, Mixtral-8x7B-Instruct-v0.1]

Incomplete files for [direct, Mistral-7B-Instruct-v0.3]

Incomplete files for [icl, Llama-3.1-8B-Instruct]

Incomplete files for [icl, janus-7b]

Incomplete files for [icl, Mixtral-8x7B-Instruct-v0.1]

Incomplete files for [icl, Mistral-7B-Instruct-v0.3]
Found 0/12 valid combinations:


# Remove existing aggregations, if in conflict

Set `SKIP_EXISTING` to `True` to skip combinations whose aggregated "full" file already exists with the expected number of rows.

In [3]:
# When True, pairs whose aggregated file already has the expected row count are not re-processed.
SKIP_EXISTING = True

if SKIP_EXISTING:
    still_valid_combinations = []
    for method, model, combo_files in valid_combinations:
        goal_dataset_path = os.path.join(
            "results", "mcq_results", "relevant", method, "full",
            f"{model}-{method}-full_eval.csv",
        )
        if not os.path.exists(goal_dataset_path):
            print(f"Processing {method}, {model}")
        else:
            exist_df = pd.read_csv(goal_dataset_path)
            exist_rows = len(exist_df)
            if exist_rows == robust_rows:
                # Complete aggregation already on disk: drop this pair from the work list.
                print(f"Skipping {method}, {model} as it already exists.")
                continue
            # A partial file is treated as stale and will be regenerated.
            print(f"Warning: {method}, {model} exists but has {exist_rows}/{robust_rows} rows. Overwriting.")
        still_valid_combinations.append((method, model, combo_files))
    valid_combinations = still_valid_combinations

# Report what is left to aggregate.
print(f"\nTo be prrocessed: {len(valid_combinations)}/{len(PROMPT_METHODS)*len(MODELS)} combinations")
for method, model, combo_files in valid_combinations:
    print(f" - {method}, {model}")


To be prrocessed: 0/12 combinations


For each combination, process the data and match with existing robust_df.

In [4]:
# For every complete (method, model) pair: align each per-dataset result file with
# the matching slice of robust_df, pick the response for the preference named in the
# robust dataset plus the no-preference (profile 0) response, and save one combined CSV.
for (method, model, combo_files) in valid_combinations:
    processed_dfs = []
    goal_dataset_path = os.path.join("results", "mcq_results", "relevant", method, "full", f"{model}-{method}-full_eval.csv")
    # combo_files was collected in DATASETS order, so zipping keeps file and dataset paired.
    for combo_file, dataset, full_dataset in zip(combo_files, DATASETS, DATASETS_FULL):
        print(f"Processing {method}, {model}, {dataset}")
        data_df = pd.read_csv(combo_file)
        # Get chunk from robust_df
        robust_chunk = robust_df[robust_df["source"] == full_dataset].copy()
        # Check size
        assert len(data_df)==len(robust_chunk), "Sizes don't match!"

        # Sort both frames by the same keys and reset the index so that row i of
        # data_df and row i of robust_chunk are meant to describe the same question.
        data_df = data_df.sort_values(by=["question", "options"]).reset_index(drop=True)
        robust_chunk = robust_chunk.sort_values(by=["question", "options"]).reset_index(drop=True)

        # Check correct answers: the rows are paired purely by position, so differing
        # gold options indicate misaligned rows. Reported as a warning rather than an
        # assertion because the two files may format the column differently.
        gold_mismatches = int(data_df["gold_option"].ne(robust_chunk["gold_option"]).sum())
        if gold_mismatches:
            print(f"Warning: {gold_mismatches}/{len(data_df)} gold options differ from the robust dataset for [{method}, {model}, {dataset}]. Check row alignment.")

        pref_library = DATASET_PREFERENCES.get(full_dataset, {})
        # Reverse the preference dictionary: keys become values and vice versa.
        rev_prefs = {v: k for k, v in pref_library.items()}

        # create a new column "pref_index" that maps the text to its corresponding index.
        robust_chunk["pref_index"] = robust_chunk.loc[:,"preference"].map(rev_prefs)
        # Unmapped preferences become NaN and would otherwise fail below with an
        # opaque "cannot convert float NaN to integer" error.
        assert robust_chunk["pref_index"].notna().all(), f"Unknown preference text found for {full_dataset}!"

        pref_text = robust_chunk.apply(
            lambda row: pref_library[int(row['pref_index'])],
            axis=1
        )
        # Each row reads a different profile_<k>_* column, so these lookups stay row-wise.
        pref_res = robust_chunk.apply(
            lambda row: data_df.loc[row.name, f"profile_{int(row['pref_index'])}_res"],
            axis=1
        )
        pref_answer = robust_chunk.apply(
            lambda row: data_df.loc[row.name, f"profile_{int(row['pref_index'])}_answer"],
            axis=1
        )
        pref_correct = pref_answer.eq(data_df["gold_option"])
        # The no-preference baseline always lives in the profile_0 columns; both frames
        # share the same 0..n-1 index, so the columns can be taken directly.
        nopref_res = data_df["profile_0_res"]
        nopref_answer = data_df["profile_0_answer"]
        nopref_correct = nopref_answer.eq(data_df["gold_option"])
        # Start building a new dataframe
        new_df_dict = {
            "question": data_df["question"],
            "options": data_df["options"],
            "gold_option": data_df["gold_option"],
            "source": robust_chunk["source"],
            "profile_idx": robust_chunk["pref_index"],
            "preference": pref_text,
            "pref_res": pref_res,
            "pref_answer": pref_answer,
            "pref_correct": pref_correct,
            "nopref_res": nopref_res,
            "nopref_answer": nopref_answer,
            "nopref_correct": nopref_correct,
        }
        processed_dfs.append(pd.DataFrame(new_df_dict))

    # Concatenate and save df
    combined_df = pd.concat(processed_dfs, ignore_index=True)
    assert len(combined_df)==robust_rows, "Sizes don't match!"
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before the first save.
    os.makedirs(os.path.dirname(goal_dataset_path), exist_ok=True)
    combined_df.to_csv(goal_dataset_path, index=False)
    print(f"Saved {method}, {model} to full")
            

## Add correctness columns if they do not exist

In [5]:
# Create the stats folder (no error if it is already there).
stats_folder = os.path.join("results", "mcq_results", "relevant","stats")
os.makedirs(stats_folder, exist_ok=True)

# For each method and model, try loading the full dataset and make sure it carries
# the nopref_correct / pref_correct columns.

# Rebuild valid combinations. From here on each entry is
# (method, model, path to the aggregated CSV) rather than a list of input files.
valid_combinations = []
for method, model in itertools.product(PROMPT_METHODS, MODELS):
    full_dataset_path = os.path.join("results", "mcq_results", "relevant", method, "full", f"{model}-{method}-full_eval.csv")
    if not os.path.exists(full_dataset_path):
        print(f"Warning: No full dataset found for [{method}, {model}]")
        # valid_combinations.append((method, model, None))
        continue
    # Add to valid combinations
    valid_combinations.append((method, model, full_dataset_path))
    print(f"Added files for {method}, {model}")

    full_df = pd.read_csv(full_dataset_path)
    added_column = False
    # Each correctness column can be derived from an answer column. Accept both the
    # older spelling (no_pref_ans / pref_ans) and the names written by the
    # aggregation cell (nopref_answer / pref_answer).
    for correct_col, answer_candidates in (
        ("nopref_correct", ("no_pref_ans", "nopref_answer")),
        ("pref_correct", ("pref_ans", "pref_answer")),
    ):
        if correct_col in full_df.columns:
            continue
        answer_col = next((col for col in answer_candidates if col in full_df.columns), None)
        if answer_col is None:
            raise KeyError(
                f"Cannot derive '{correct_col}' for [{method}, {model}]: "
                f"none of {answer_candidates} found in {full_dataset_path}"
            )
        full_df[correct_col] = full_df["gold_option"] == full_df[answer_col]
        added_column = True
    # Rewrite the file only when something changed; a CSV round-trip of an
    # unchanged frame is wasted I/O and can silently alter value formatting.
    if added_column:
        full_df.to_csv(full_dataset_path, index=False)

print(f"Found {len(valid_combinations)}/{len(PROMPT_METHODS)*len(MODELS)} datasets to stats")

Added files for cot, Llama-3.1-8B-Instruct
Added files for cot, janus-7b
Added files for cot, Mixtral-8x7B-Instruct-v0.1
Added files for cot, Mistral-7B-Instruct-v0.3
Added files for direct, Llama-3.1-8B-Instruct
Added files for direct, janus-7b
Added files for direct, Mixtral-8x7B-Instruct-v0.1
Added files for direct, Mistral-7B-Instruct-v0.3
Added files for icl, Llama-3.1-8B-Instruct
Added files for icl, janus-7b
Added files for icl, Mixtral-8x7B-Instruct-v0.1
Added files for icl, Mistral-7B-Instruct-v0.3
Found 12/12 datasets to stats
