In [11]:
import pandas as pd
import glob
import pickle
import argparse
import re
import os

In [12]:
# One or more answer letters, each optionally wrapped in brackets, parentheses
# or markdown asterisks, separated by an optional comma and whitespace.
# * The WHOLE run is captured: a capture group placed inside a repeated group
#   only keeps its last repetition, which would drop all but the final letter.
# * (?-i:...) keeps the letter itself case-sensitive even though the patterns
#   are compiled with IGNORECASE, so the run cannot continue into the lowercase
#   words that follow (e.g. "A because ...").
# * (?![A-Za-z]) rejects the first letter of a capitalised word ("Both").
_ANSWER_LETTER_RUN = r"((?:\[*\(*\**(?-i:[A-J])(?![A-Za-z])\**\)*\]*,?\s*)+)"

# "... answer is A, B" -- skipped when "not" appears later on the same line.
_ANSWER_IS_PATTERN = re.compile(
    r"answer is(?!.*\bnot\b)\s*" + _ANSWER_LETTER_RUN, re.IGNORECASE
)

# "... Answer: A, B" -- the greedy ".*" selects the last usable "answer:" on a line.
_ANSWER_COLON_PATTERN = re.compile(
    r".*[aA]nswer:\s*" + _ANSWER_LETTER_RUN, re.IGNORECASE
)

# The whole (compacted) answer is nothing but option letters.
_ONLY_LETTERS_PATTERN = re.compile(r'^[A-J]+$')

# Picks the individual option letters out of a captured run.
_LETTER_PATTERN = re.compile(r"[A-J]")


def extract_answer(answer):
    """Extracts the selected option letters (A-J) from an answer string.

    Processing steps:
      1. Non-string input (e.g. None or NaN from an empty cell) yields [].
      2. Reasoning enclosed in <think>...</think> tags is removed.
      3. If what remains consists solely of the letters A-J once whitespace,
         periods and commas are dropped, those letters are returned in the
         order written (duplicates are kept).
      4. Otherwise the text is searched for "answer is <letters>" (ignored when
         "not" follows on the same line) and "answer: <letters>". Every letter
         of every matched run is collected.

    Args:
        answer: The answer string to extract the selected letters from.

    Returns:
        A list of single-character strings. For step 4 the list holds unique
        uppercase letters in alphabetical order, so the result is the same on
        every run. An empty list means nothing could be extracted.
    """
    if not isinstance(answer, str):
        return []

    # Remove reasoning sections enclosed in <think> tags
    visible = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

    # Drop whitespace, periods and commas to test for a bare letter answer
    compact = re.sub(r'[\s\n.,]', '', visible)
    if _ONLY_LETTERS_PATTERN.match(compact):
        return list(compact)

    letters = set()
    for pattern in (_ANSWER_IS_PATTERN, _ANSWER_COLON_PATTERN):
        # findall returns the captured letter run of each match
        for run in pattern.findall(visible):
            letters.update(_LETTER_PATTERN.findall(run))

    # Sorted so the output does not depend on set iteration order
    return sorted(letters)


In [13]:
def process_value(value):
    """
    Placeholder transformation meant to be replaced with project-specific logic.

    As written it returns ``value * 2``. The visible part of this notebook
    never calls it.
    """
    doubled = value * 2  # Example only: swap in the real processing here
    return doubled

def process_pkl(input_file, output_file, input_column, output_column, temperature, exam, prompt_engineering):
    """Add extracted answers and run metadata to one pickled DataFrame and save it.

    Args:
        input_file: Path of the pickle file to read.
        output_file: Path the trimmed DataFrame is pickled to.
        input_column: Column whose values are passed to ``extract_answer``.
        output_column: Column that receives the extraction results.
        temperature: Value written to every row of the "Temperature" column.
        exam: Value written to every row of the "Exam" column.
        prompt_engineering: Value written to every row of "Prompt_Engineering".

    Raises:
        ValueError: If ``input_column`` is not a column of the loaded DataFrame.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(input_file, "rb") as source:
        frame = pickle.load(source)

    # Fail early with a clear message when the source column is missing
    if input_column not in frame.columns:
        raise ValueError(f"Column '{input_column}' not found in the DataFrame.")

    # Run the extraction over every row of the source column
    frame[output_column] = frame[input_column].apply(extract_answer)

    # Stamp each row with the constants describing this run
    run_metadata = {
        "Temperature": temperature,
        "Exam": exam,
        "Prompt_Engineering": prompt_engineering,
    }
    for column_name, constant in run_metadata.items():
        frame[column_name] = constant

    # Columns written to the output, in this order; the selection below
    # assumes all of them are present in the DataFrame
    leading_columns = [
        "Exam",
        "QuestionIndex",
        "NumberOfChoices",
        "Model",
        "Temperature",
        "SamplingIndex",
        "LLM_Answer",
        "Exam_Answers",
    ]
    kept_columns = [*leading_columns, output_column, "Prompt_Engineering"]
    trimmed = frame[kept_columns]

    # Persist the trimmed DataFrame as a pickle
    with open(output_file, "wb") as sink:
        pickle.dump(trimmed, sink)

    print(f"Processed file saved as: {output_file}")

In [14]:
def batch_process_pkl(input_dir, output_dir, input_column, output_column, temperature, exam, prompt_engineering):
    """Run ``process_pkl`` on every ``*.pkl`` file directly inside ``input_dir``.

    ``output_dir`` is created if needed. Each result is written there under the
    source file's name prefixed with ``processed_``. When no ``.pkl`` file is
    found, a message is printed and the function returns without processing.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pkl`` files.
        output_dir: Directory that receives the processed files.
        input_column: Forwarded to ``process_pkl``.
        output_column: Forwarded to ``process_pkl``.
        temperature: Forwarded to ``process_pkl``.
        exam: Forwarded to ``process_pkl``.
        prompt_engineering: Forwarded to ``process_pkl``.
    """
    # Create the destination up front so every write below has a target
    os.makedirs(output_dir, exist_ok=True)

    # Collect the pickle files sitting directly in the input directory
    search_pattern = os.path.join(input_dir, "*.pkl")
    source_paths = glob.glob(search_pattern)
    file_count = len(source_paths)

    if file_count == 0:
        print(f"No .pkl files found in {input_dir}.")
        return

    print(f"Found {file_count} .pkl files in {input_dir}. Processing...")

    for source_path in source_paths:
        # Same base name as the source, prefixed, placed in the output directory
        target_path = os.path.join(output_dir, f"processed_{os.path.basename(source_path)}")
        process_pkl(source_path, target_path, input_column, output_column, temperature, exam, prompt_engineering)

In [15]:
#batch_process_pkl(
#     input_dir="./exam/201-301-ccna/0_shot/t00/",
#     output_dir="./processed_results/",
#     input_column="LLM_Answer",
#     output_column="Extracted_Answers",
#     temperature=0.0,
#     exam="CCNA-201-301",
#     prompt_engineering="0_shot"
# )

Found 10 .pkl files in ./exam/201-301-ccna/0_shot/t00/. Processing...
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0853_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1528_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_1111_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_1027_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1525_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0938_shuffled_2.pkl
Processed file saved as: ./process