In [1]:
import pandas as pd
import glob
import pickle
import re
import os

In [2]:
def improved_extract_answer(answer):
    """Extract the selected option letters (A-J) from an LLM answer string.

    Strategies are tried in order; the first one that applies decides the result:

    1. A closing statement such as "the answer is B" or "answer is A, C"
       located at the very end of the text.
    2. Two or more single bracketed letters anywhere in the text
       ("[A], [B] and [C]").
    3. A bracketed list that directly follows an "Answer:" label
       ("Answer: [A, B, C]").
    4. The text after the LAST "answer is" / "answer:" / "the correct answer
       is" indicator, up to the next period, colon, newline or end of text.
       Letters that directly follow the word "not" are removed.

    Args:
        answer: Raw LLM output. ``<think>...</think>`` sections are stripped
            before matching. Non-string values (for example ``None`` or a NaN
            coming from a pandas column) are treated as "no answer".

    Returns:
        A sorted list of unique uppercase letters, or an empty list when no
        answer could be extracted.
    """
    # A DataFrame column may hold None/NaN for failed generations; re.sub would
    # raise TypeError on those, so report "no answer" instead.
    if not isinstance(answer, str):
        return []

    # Remove reasoning sections enclosed in <think> tags
    answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

    # Strategy 1: final answer statement at the end of the text.
    # The search is case-insensitive, but only uppercase letters are kept below.
    final_answer = re.search(
        r"(?:the\s+)?answer\s+is\s+([A-J](?:\s*,\s*[A-J])*)\s*\.?\s*$",
        answer,
        re.IGNORECASE,
    )
    if final_answer:
        letters = re.findall(r"[A-J]", final_answer.group(1))
        if letters:
            return sorted(set(letters))

    # Strategy 2: several "[X]" patterns anywhere in the text
    bracketed_letters = re.findall(r"\[([A-J])\]", answer)
    if len(bracketed_letters) > 1:
        return sorted(set(bracketed_letters))

    # Strategy 3: "Answer :[A, B, C]" format.
    # The bracket is captured together with the label so that an unrelated
    # bracket earlier in the text (e.g. a citation "[C]") cannot be mistaken
    # for the answer list.
    labelled_list = re.search(r"Answer\s*:\s*\[(.*?)\]", answer, re.IGNORECASE)
    if labelled_list:
        letters = re.findall(r"([A-J])", labelled_list.group(1))
        if letters:
            return sorted(set(letters))

    # Strategy 4: generic answer indicator; the last occurrence wins because
    # earlier ones usually belong to intermediate reasoning.
    answer_sections = list(re.finditer(
        r"(?:answer is|answer:|the correct answer is|therefore,\s+the\s+correct\s+answer\s+is)\s*"
        r"(.+?)"
        r"(?:\.|\:|\n|$)",
        answer, re.IGNORECASE
    ))
    if not answer_sections:
        return []

    answer_section = answer_sections[-1].group(1).strip()
    all_letters = re.findall(r"([A-J])", answer_section)

    # "A, and not B" -> drop the letters that are explicitly negated
    if "not" in answer_section.lower():
        not_matches = re.findall(r"not\s+[\[\(\*]?([A-J])[\]\)\*]?", answer_section, re.IGNORECASE)
        all_letters = [letter for letter in all_letters if letter not in not_matches]

    return sorted(set(all_letters))

In [3]:
def improved_extract_answer_PREV_2(answer):
    """Earlier extractor revision, kept as a comparison baseline.

    Works on the text with ``<think>...</think>`` sections removed and tries,
    in order: several single bracketed letters, a bracketed list when an
    "Answer: [" label is present, and finally the text after the FIRST
    "answer is" / "answer:" indicator.

    Args:
        answer: The answer string to extract the correct answers from.

    Returns:
        A sorted list of unique letters in the range A-J (possibly empty).
    """
    text = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

    # Several "[X]" tokens anywhere in the text are taken as the answer set.
    single_brackets = re.findall(r"\[([A-J])\]", text)
    if len(single_brackets) >= 2:
        return sorted(set(single_brackets))

    # "Answer: [A, B, C]" - the first bracket pair of the text is inspected.
    if re.search(r"Answer\s*:\s*\[", text, re.IGNORECASE):
        first_bracket = re.search(r"\[(.*?)\]", text)
        listed = re.findall(r"([A-J])", first_bracket.group(1)) if first_bracket else []
        if listed:
            return sorted(set(listed))

    # Generic indicator: capture up to a period, colon, newline or end of text.
    indicator = re.search(
        r"(?:answer is|answer:)\s*"
        r"(.+?)"
        r"(?:\.|\:|\n|$)",
        text,
        re.IGNORECASE,
    )
    if indicator is None:
        return []

    section = indicator.group(1).strip()
    candidates = set(re.findall(r"([A-J])", section))

    # Letters introduced by "not" are excluded from the result.
    if "not" in section.lower():
        negated = re.findall(r"not\s+[\[\(\*]?([A-J])[\]\)\*]?", section, re.IGNORECASE)
        candidates = {letter for letter in candidates if letter not in negated}

    return sorted(candidates)



In [4]:
def improved_extract_answer_PREV(answer):
    """Oldest extractor revision, kept as a comparison baseline.

    After removing ``<think>...</think>`` sections it tries, in order: a direct
    "answer is X[, Y...]" statement, a run of two or more consecutive letters
    after an indicator, several single bracketed letters, a bracketed list
    when an "Answer: [" label is present, and finally the text after the FIRST
    "answer is" / "answer:" indicator.

    Args:
        answer: The answer string to extract the correct answers from.

    Returns:
        A sorted list of unique extracted letters (possibly empty).
    """
    text = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

    # 1) "answer is X" / "the answer is X, Y"
    direct = re.search(r"(?:the\s+)?answer\s+is\s+([A-J](?:\s*,\s*[A-J])*)", text, re.IGNORECASE)
    if direct is not None:
        picked = re.findall(r"[A-J]", direct.group(1))
        if picked:
            return sorted(set(picked))

    # 2) "answer: ABCDE" - consecutive letters; every captured character is kept
    letter_run = re.search(r"(?:answer is|answer:)\s*([A-J]{2,})", text, re.IGNORECASE)
    if letter_run is not None:
        return sorted(set(letter_run.group(1)))

    # 3) several "[X]" tokens anywhere in the text
    single_brackets = re.findall(r"\[([A-J])\]", text)
    if len(single_brackets) >= 2:
        return sorted(set(single_brackets))

    # 4) "Answer: [A, B, C]" - the first bracket pair of the text is inspected
    if re.search(r"Answer\s*:\s*\[", text, re.IGNORECASE):
        first_bracket = re.search(r"\[(.*?)\]", text)
        listed = re.findall(r"([A-J])", first_bracket.group(1)) if first_bracket else []
        if listed:
            return sorted(set(listed))

    # 5) generic indicator: capture up to a period, colon, newline, end of
    #    string, or four consecutive double quotes
    indicator = re.search(
        r"(?:answer is|answer:)\s*"
        r"(.+?)"
        r"(?:\.|\:|\n|$|\"\"\"\")",
        text,
        re.IGNORECASE,
    )
    if indicator is None:
        return []

    section = indicator.group(1).strip()
    candidates = set(re.findall(r"([A-J])", section))

    # Letters introduced by "not" are excluded from the result.
    if "not" in section.lower():
        negated = re.findall(r"not\s+[\[\(\*]?([A-J])[\]\)\*]?", section, re.IGNORECASE)
        candidates = {letter for letter in candidates if letter not in negated}

    return sorted(candidates)

In [5]:
def their_extract_answer(answer):
    """Reference extractor used as the comparison baseline.

    If the input, once whitespace, periods and commas are removed, consists
    only of the letters A-J, each character is returned in input order.
    Otherwise the letter runs following "answer is " (optionally wrapped in
    square brackets) and "answer:" are collected and de-duplicated.

    Args:
        answer: The answer string to extract the correct answers from.

    Returns:
        A list of single characters. In the de-duplicated case the order is
        whatever ``set`` iteration yields.
    """
    # Strip separators so that "A, B." collapses to "AB".
    compact = re.sub(r'[\s\n.,]', '', answer)
    if re.match(r'^[A-J]+$', compact):
        return list(compact)

    # Both patterns run against the untouched input, not the compacted form.
    hits = re.findall(r"answer is \[?([A-J]+)\]?", answer, re.IGNORECASE)
    hits = hits + re.findall(r'.*[aA]nswer:\s*([A-J]+)', answer, re.IGNORECASE)

    # Split every captured run into characters, then drop duplicates.
    characters = [char for run in hits for char in run]
    return list(set(characters))

In [6]:
def test_function(function_to_test):
    """Test the improved_extract_answer function with various cases."""
    test_cases = [
        # Previously correct cases
        ("The answer is ABCDE.", ['A', 'B', 'C', 'D', 'E']),
        ("The best answer is C, E.", ['C', 'E']),
        ("The best answer is [DE]. The access and distribution layers must be on the same device", ['D', 'E']),
        ("Answer:[A, B, C]", ['A', 'B', 'C']),
        ("The best answer is A, and not B.", ['A']),
        ("Answer: A and B:", ['A', 'B']),
        ("The best answer is *A*", ['A']),
        ("The best answer is (A)", ['A']),

        # Previously incorrect cases
        ("The best answer is A, and B.", ['A', 'B']),
        ("The best answer is (A), (B), and (C)", ['A', 'B', 'C']),
        ("Answer: [A], [B], and [C]:", ['A', 'B', 'C']),
        ("Answer: A, B, and C:", ['A', 'B', 'C']),
        ("Answer: A, B, C:", ['A', 'B', 'C']),

        # New test case
        ("Some text\n\nthe answer is B", ['B']),

        # Think case
        ("""
<think>
Okay, so I have this question about the ip route command in Cisco IOS. I'm a bit new to this, but I'll try to work through it step by step. The question is asking which two statements are true about the command: ip route 172.16.3.0 255.255.255.0 192.168.2.4.
So the true statements are A and E.
</think>

The best answer is A and E.
""", ['A','E']),
        ("""
Looking at the options, the question is which one they do NOT provide. So the answer is the one that's not a function of encryption.

Integrity is about data integrity, which is separate. So B is not provided by encryption.

Wait, but the question is about what they do NOT provide. So the correct answer is B. Integrity.

Wait, but I'm a bit confused. Let me think again. Encryption ensures that only authorized people can read the data, so it's about confidentiality. It doesn't ensure that the data hasn't been altered (integrity) or that the system is reliable. So the answer is B. Integrity.
</think>

The question asks which aspect encryption and decryption do not provide. Encryption focuses on confidentiality, not integrity. Therefore, the correct answer is:

B. Integrity

the answer is B""", ['B']),

        ("""<think>
Looking at the options, the question is which one they do NOT provide. So the answer is the one that's not a function of encryption.

Integrity is about data integrity, which is separate. So B is not provided by encryption.

Wait, but the question is about what they do NOT provide. So the correct answer is B. Integrity.

Wait, but I'm a bit confused. Let me think again. Encryption ensures that only authorized people can read the data, so it's about confidentiality. It doesn't ensure that the data hasn't been altered (integrity) or that the system is reliable. So the answer is B. Integrity.
</think>

The question asks which aspect encryption and decryption do not provide. Encryption focuses on confidentiality, not integrity. Therefore, the correct answer is:

B. Integrity

the answer is B""", ['B'])
    ]

    passed_all_tests = True
    for input_text, expected_output in test_cases:
        result = function_to_test(input_text)
        print(f"{input_text} : {result} {'✓' if result == expected_output else '✗'}")
        if result != expected_output:
            passed_all_tests = False

    if passed_all_tests:
        print("All tests passed")
    else:
        print("Some tests failed")

In [7]:
# Run the shared regression cases against improved_extract_answer.
# NOTE(review): the banner says "Previous", but the function exercised here is
# improved_extract_answer (no _PREV suffix) - confirm the intended label.
print("Previous Improved extracted answer function")
test_function(improved_extract_answer)


Previous Improved extracted answer function
The answer is ABCDE. : ['A', 'B', 'C', 'D', 'E'] ✓
The best answer is C, E. : ['C', 'E'] ✓
The best answer is [DE]. The access and distribution layers must be on the same device : ['D', 'E'] ✓
Answer:[A, B, C] : ['A', 'B', 'C'] ✓
The best answer is A, and not B. : ['A'] ✓
Answer: A and B: : ['A', 'B'] ✓
The best answer is *A* : ['A'] ✓
The best answer is (A) : ['A'] ✓
The best answer is A, and B. : ['A', 'B'] ✓
The best answer is (A), (B), and (C) : ['A', 'B', 'C'] ✓
Answer: [A], [B], and [C]: : ['A', 'B', 'C'] ✓
Answer: A, B, and C: : ['A', 'B', 'C'] ✓
Answer: A, B, C: : ['A', 'B', 'C'] ✓
Some text

the answer is B : ['B'] ✓

<think>
Okay, so I have this question about the ip route command in Cisco IOS. I'm a bit new to this, but I'll try to work through it step by step. The question is asking which two statements are true about the command: ip route 172.16.3.0 255.255.255.0 192.168.2.4.
So the true statements are A and E.
</think>

The best

In [8]:
# Run the shared regression cases against the oldest revision
# (improved_extract_answer_PREV) so its results can be compared with the others.
print("Prev Improved extracted answer function")
test_function(improved_extract_answer_PREV)

Prev Improved extracted answer function
The answer is ABCDE. : ['A'] ✗
The best answer is C, E. : ['C', 'E'] ✓
The best answer is [DE]. The access and distribution layers must be on the same device : ['D', 'E'] ✓
Answer:[A, B, C] : ['A', 'B', 'C'] ✓
The best answer is A, and not B. : ['A'] ✓
Answer: A and B: : ['A', 'B'] ✓
The best answer is *A* : ['A'] ✓
The best answer is (A) : ['A'] ✓
The best answer is A, and B. : ['A'] ✗
The best answer is (A), (B), and (C) : ['A', 'B', 'C'] ✓
Answer: [A], [B], and [C]: : ['A', 'B', 'C'] ✓
Answer: A, B, and C: : ['A', 'B', 'C'] ✓
Answer: A, B, C: : ['A', 'B', 'C'] ✓
Some text

the answer is B : ['B'] ✓

<think>
Okay, so I have this question about the ip route command in Cisco IOS. I'm a bit new to this, but I'll try to work through it step by step. The question is asking which two statements are true about the command: ip route 172.16.3.0 255.255.255.0 192.168.2.4.
So the true statements are A and E.
</think>

The best answer is A and E.
 : ['A'] 

In [9]:
# Run the shared regression cases against the intermediate revision
# (improved_extract_answer_PREV_2).
print("Prev 2 Improved extracted answer function")
test_function(improved_extract_answer_PREV_2)


Prev 2 Improved extracted answer function
The answer is ABCDE. : ['A', 'B', 'C', 'D', 'E'] ✓
The best answer is C, E. : ['C', 'E'] ✓
The best answer is [DE]. The access and distribution layers must be on the same device : ['D', 'E'] ✓
Answer:[A, B, C] : ['A', 'B', 'C'] ✓
The best answer is A, and not B. : ['A'] ✓
Answer: A and B: : ['A', 'B'] ✓
The best answer is *A* : ['A'] ✓
The best answer is (A) : ['A'] ✓
The best answer is A, and B. : ['A', 'B'] ✓
The best answer is (A), (B), and (C) : ['A', 'B', 'C'] ✓
Answer: [A], [B], and [C]: : ['A', 'B', 'C'] ✓
Answer: A, B, and C: : ['A', 'B', 'C'] ✓
Answer: A, B, C: : ['A', 'B', 'C'] ✓
Some text

the answer is B : ['B'] ✓

<think>
Okay, so I have this question about the ip route command in Cisco IOS. I'm a bit new to this, but I'll try to work through it step by step. The question is asking which two statements are true about the command: ip route 172.16.3.0 255.255.255.0 192.168.2.4.
So the true statements are A and E.
</think>

The best a

In [10]:
# Run the shared regression cases against the reference extractor
# (their_extract_answer) to see where it disagrees with the expected answers.
print("Their extracted answer function")
test_function(their_extract_answer)

Their extracted answer function
The answer is ABCDE. : ['D', 'E', 'A', 'B', 'C'] ✗
The best answer is C, E. : ['C'] ✗
The best answer is [DE]. The access and distribution layers must be on the same device : ['D', 'E'] ✓
Answer:[A, B, C] : [] ✗
The best answer is A, and not B. : ['A'] ✓
Answer: A and B: : ['A'] ✗
The best answer is *A* : [] ✗
The best answer is (A) : [] ✗
The best answer is A, and B. : ['A'] ✗
The best answer is (A), (B), and (C) : [] ✗
Answer: [A], [B], and [C]: : [] ✗
Answer: A, B, and C: : ['A'] ✗
Answer: A, B, C: : ['A'] ✗
Some text

the answer is B : ['B'] ✓

<think>
Okay, so I have this question about the ip route command in Cisco IOS. I'm a bit new to this, but I'll try to work through it step by step. The question is asking which two statements are true about the command: ip route 172.16.3.0 255.255.255.0 192.168.2.4.
So the true statements are A and E.
</think>

The best answer is A and E.
 : ['A'] ✗

Looking at the options, the question is which one they do NO

In [11]:
def process_pkl_mcqa(input_file, output_file, llm_answer_column, temperature, exam, prompt_engineering):
    """Annotate one pickled result DataFrame with extracted answers and save it.

    Both extractors (``improved_extract_answer`` and ``their_extract_answer``)
    are applied to ``llm_answer_column``. Their results are stored in the
    columns "Improved_Extracted_Answer_Column" and
    "Their_Extracted_Answer_Column", and "Differ" marks the rows where the two
    extractors selected different sets of letters.

    Args:
        input_file: Path of the pickled pandas DataFrame to read.
        output_file: Path the reduced DataFrame is pickled to.
        llm_answer_column: Name of the column holding the raw LLM answers.
        temperature: Value written to the constant "Temperature" column.
        exam: Value written to the constant "Exam" column.
        prompt_engineering: Value written to the constant
            "Prompt_Engineering" column.

    Raises:
        ValueError: If ``llm_answer_column`` is not a column of the DataFrame.
        KeyError: If one of the other selected columns ("QuestionIndex",
            "NumberOfChoices", "Model", "SamplingIndex", "Exam_Answers") is
            missing from the DataFrame.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this on result
    # files produced by a trusted pipeline.
    with open(input_file, "rb") as f:
        df = pickle.load(f)

    # Ensure input column exists before processing
    if llm_answer_column not in df.columns:
        raise ValueError(f"Column '{llm_answer_column}' not found in the DataFrame.")

    improved_extracted_answer_column = "Improved_Extracted_Answer_Column"
    their_extracted_answer_column = "Their_Extracted_Answer_Column"

    # Apply functions to the llm answer column and save results in the new columns
    df[improved_extracted_answer_column] = df[llm_answer_column].apply(improved_extract_answer)
    df[their_extracted_answer_column] = df[llm_answer_column].apply(their_extract_answer)

    # Add new columns with constant values
    df["Temperature"] = temperature
    df["Exam"] = exam
    df["Prompt_Engineering"] = prompt_engineering

    # Compare as sets: their_extract_answer returns its letters in arbitrary
    # (set iteration) order, so a plain list comparison would flag rows as
    # different even when both extractors picked exactly the same letters.
    df["Differ"] = [
        set(their_letters) != set(improved_letters)
        for their_letters, improved_letters in zip(
            df[their_extracted_answer_column], df[improved_extracted_answer_column]
        )
    ]

    # Define columns to keep
    selected_columns = ["Exam",
                        "QuestionIndex",
                        "NumberOfChoices",
                        "Model",
                        "SamplingIndex",
                        "Temperature",
                        "Prompt_Engineering",
                        llm_answer_column,
                        "Exam_Answers",
                        improved_extracted_answer_column,
                        their_extracted_answer_column,
                        "Differ"]

    # Select only the specified columns
    df_selected = df[selected_columns]

    # Save the modified DataFrame as a .pkl file
    with open(output_file, "wb") as f:
        pickle.dump(df_selected, f)

    print(f"Processed file saved as: {output_file}")

In [12]:
def batch_process_pkl_mcqa(input_dir, output_dir, llm_answer_column, temperature, exam, prompt_engineering):
    """Run ``process_pkl_mcqa`` on every ``*.pkl`` file directly inside a folder.

    The output folder is created if needed. Each result is written there under
    the input file's name prefixed with "processed_". When the input folder
    holds no ``.pkl`` file a message is printed and nothing else happens.

    Args:
        input_dir: Folder that is searched (non-recursively) for ``*.pkl``.
        output_dir: Folder receiving the processed files.
        llm_answer_column: Forwarded to ``process_pkl_mcqa``.
        temperature: Forwarded to ``process_pkl_mcqa``.
        exam: Forwarded to ``process_pkl_mcqa``.
        prompt_engineering: Forwarded to ``process_pkl_mcqa``.
    """
    os.makedirs(output_dir, exist_ok=True)

    search_pattern = os.path.join(input_dir, "*.pkl")
    pkl_files = glob.glob(search_pattern)

    # Guard clause: nothing to do for an empty folder.
    if len(pkl_files) == 0:
        print(f"No .pkl files found in {input_dir}.")
        return

    print(f"Found {len(pkl_files)} .pkl files in {input_dir}. Processing...")

    for input_file in pkl_files:
        filename = os.path.basename(input_file)
        process_pkl_mcqa(
            input_file,
            os.path.join(output_dir, f"processed_{filename}"),
            llm_answer_column,
            temperature,
            exam,
            prompt_engineering,
        )

In [13]:
# CCNA - one batch run per (result folder, temperature, prompt style)
for _input_dir, _temperature, _prompt_engineering in [
    ("./exam/201-301-ccna/0_shot/t00/", 0.0, "0_shot"),
    ("./exam/201-301-ccna/0_shot/t07/", 0.7, "0_shot"),
    ("./exam/201-301-ccna/5_shot/t00/", 0.0, "5_shot"),
    ("./exam/201-301-ccna/5_shot/t07/", 0.7, "5_shot"),
]:
    batch_process_pkl_mcqa(
        input_dir=_input_dir,
        output_dir="./processed_results/",
        llm_answer_column="LLM_Answer",
        temperature=_temperature,
        exam="CCNA-201-301",
        prompt_engineering=_prompt_engineering,
    )



Found 10 .pkl files in ./exam/201-301-ccna/0_shot/t00/. Processing...
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0853_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1528_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_1111_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_1027_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1525_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_100_questions_201-301-CCNA_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0938_shuffled_2.pkl
Processed file saved as: ./process

In [14]:
# CCNP - one batch run per (result folder, temperature, prompt style)
for _input_dir, _temperature, _prompt_engineering in [
    ("./exam/350-701-ccnp/0_shot/t00/", 0.0, "0_shot"),
    ("./exam/350-701-ccnp/0_shot/t07/", 0.7, "0_shot"),
    ("./exam/350-701-ccnp/5_shot/t00/", 0.0, "5_shot"),
    ("./exam/350-701-ccnp/5_shot/t07/", 0.7, "5_shot"),
]:
    batch_process_pkl_mcqa(
        input_dir=_input_dir,
        output_dir="./processed_results/",
        llm_answer_column="LLM_Answer",
        temperature=_temperature,
        exam="CCNP-350-701",
        prompt_engineering=_prompt_engineering,
    )


Found 10 .pkl files in ./exam/350-701-ccnp/0_shot/t00/. Processing...
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_meta-llama_Llama-3.1-8B-Instruct_20250211_1518_shuffled_0.pkl
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_meta-llama_Llama-3.1-8B-Instruct_20250211_1519_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0109_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0133_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250205_2330_shuffled_0.pkl
Processed file saved as: ./processed_results/processed_100_questions_350-701-CCNP_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250206_0033_shuffled_2.pkl
Processed file saved as: ./process

In [15]:
# MMLU Computer Security - one batch run per (result folder, temperature, prompt style)
for _input_dir, _temperature, _prompt_engineering in [
    ("./exam/mmlu_computer_security/0_shot/t00/", 0.0, "0_shot"),
    ("./exam/mmlu_computer_security/0_shot/t07/", 0.7, "0_shot"),
    ("./exam/mmlu_computer_security/5_shot/t00/", 0.0, "5_shot"),
    ("./exam/mmlu_computer_security/5_shot/t07/", 0.7, "5_shot"),
]:
    batch_process_pkl_mcqa(
        input_dir=_input_dir,
        output_dir="./processed_results/",
        llm_answer_column="LLM_Answer",
        temperature=_temperature,
        exam="MMLU-Computer-Security",
        prompt_engineering=_prompt_engineering,
    )


Found 10 .pkl files in ./exam/mmlu_computer_security/0_shot/t00/. Processing...
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_meta-llama_Llama-3.1-8B-Instruct_20250211_1506_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_meta-llama_Llama-3.1-8B-Instruct_20250211_1450_shuffled_2.pkl
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250207_0042_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250207_0249_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250207_0203_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_100_questions_mmlu_Computer_Security_meta-llama_Llama-3.1-8B-Instruct_2

In [16]:
# MMLU-Pro Computer Security - one batch run per (result folder, temperature, prompt style)
for _input_dir, _temperature, _prompt_engineering in [
    ("./exam/mmlu_pro/0_shot/t00/", 0.0, "0_shot"),
    ("./exam/mmlu_pro/0_shot/t07/", 0.7, "0_shot"),
    ("./exam/mmlu_pro/1_shot_cot/t00/", 0.0, "1_shot_cot"),
    ("./exam/mmlu_pro/1_shot_cot/t07/", 0.7, "1_shot_cot"),
    ("./exam/mmlu_pro/5_shot_cot/t00/", 0.0, "5_shot_cot"),
    ("./exam/mmlu_pro/5_shot_cot/t07/", 0.7, "5_shot_cot"),
]:
    batch_process_pkl_mcqa(
        input_dir=_input_dir,
        output_dir="./processed_results/",
        llm_answer_column="LLM_Answer",
        temperature=_temperature,
        exam="MMLU-PRO-Computer-Security",
        prompt_engineering=_prompt_engineering,
    )


Found 10 .pkl files in ./exam/mmlu_pro/0_shot/t00/. Processing...
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250316_0417_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250316_0330_shuffled_1.pkl
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_meta-llama_Llama-3.1-8B-Instruct_20250315_2009_shuffled_4.pkl
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_meta-llama_Llama-3.1-8B-Instruct_20250315_2008_shuffled_3.pkl
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_deepseek-ai_DeepSeek-R1-Distill-Llama-8B_20250316_0309_shuffled_0.pkl
Processed file saved as: ./processed_results/processed_46_questions_mmlu_pro_Computer_Security_46_deepseek-ai_

In [17]:
def convert_pkl_to_parquet(folder_path):
    """Convert every ``.pkl`` file directly inside a folder to ``.parquet``.

    Each pickle is loaded with ``pd.read_pickle`` and written next to the
    source file with the ``.pkl`` suffix replaced by ``.parquet`` (the index is
    not written). A file that cannot be converted is reported and skipped so
    the remaining files are still processed.

    Args:
        folder_path: Folder to scan (non-recursively) for ``.pkl`` files.

    Returns:
        None. Progress and a final summary are printed.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    pkl_filenames = [name for name in os.listdir(folder_path) if name.endswith(".pkl")]
    if not pkl_filenames:
        print("No .pkl files found in the folder.")
        return

    converted_files = 0
    failed_files = 0
    for filename in pkl_filenames:
        pkl_path = os.path.join(folder_path, filename)
        # Swap only the trailing suffix; str.replace would also rewrite a
        # ".pkl" occurring in a directory name or in the middle of the file name.
        parquet_path = pkl_path[: -len(".pkl")] + ".parquet"

        try:
            # SECURITY: unpickling can execute arbitrary code - trusted files only.
            df = pd.read_pickle(pkl_path)
            df.to_parquet(parquet_path, index=False)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Failed to convert {filename}: {e}")
            failed_files += 1
        else:
            print(f"Converted: {filename} → {os.path.basename(parquet_path)}")
            converted_files += 1

    # Failures are reported separately instead of being presented as
    # "no .pkl files found".
    summary = f"Conversion complete: {converted_files} file(s) converted."
    if failed_files:
        summary += f" {failed_files} file(s) failed."
    print(summary)


In [18]:
# Convert the .pkl files in ./processed_results/ (the output folder of the
# batch runs above) to Parquet.
convert_pkl_to_parquet("./processed_results/")


Converted: processed_100_questions_350-701-CCNP_meta-llama_Llama-3.1-8B-Instruct_20250211_1519_shuffled_4.pkl → processed_100_questions_350-701-CCNP_meta-llama_Llama-3.1-8B-Instruct_20250211_1519_shuffled_4.parquet
Converted: processed_46_questions_mmlu_pro_Computer_Security_46_meta-llama_Llama-3.1-8B-Instruct_20250315_2008_shuffled_3.pkl → processed_46_questions_mmlu_pro_Computer_Security_46_meta-llama_Llama-3.1-8B-Instruct_20250315_2008_shuffled_3.parquet
Converted: processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1530_shuffled_1.pkl → processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250211_1530_shuffled_1.parquet
Converted: processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250214_1508_shuffled_4.pkl → processed_100_questions_201-301-CCNA_meta-llama_Llama-3.1-8B-Instruct_20250214_1508_shuffled_4.parquet
Converted: processed_46_questions_mmlu_pro_Computer_Security_46_meta-llama_Llama-3.1-8B-Instruct_20250319_10

In [19]:
def compare_answers(answer_llm, answer_exam):
    """Score the letters chosen by the LLM against the exam's answer indices.

    Keyword arguments:
    answer_llm -- single-character answers extracted from the LLM output; each
                  is mapped to a number via ``ord(char) - 65`` ('A' -> 0)
    answer_exam -- the exam's correct answers, compared against those numbers

    Returns a 3-tuple:
    (number of LLM answers that are correct,
     True if the LLM gave more distinct answers than the exam has entries,
     number of LLM answers that are not correct)
    """
    # Letters -> numeric indices, de-duplicated.
    llm_choices = {ord(letter) - ord("A") for letter in answer_llm}

    # Counted on the raw exam list, before de-duplication.
    expected_count = len(answer_exam)
    exam_choices = set(answer_exam)

    hits = len(llm_choices & exam_choices)
    misses = len(llm_choices - exam_choices)
    too_many_answ_given = len(llm_choices) > expected_count

    return hits, too_many_answ_given, misses

In [20]:
def evaluation_sampling(fn_extractor, full_llm_answer, exam_answers):
    """Extract the LLM's answers and grade them against the exam answers.

    Keyword arguments:
    fn_extractor -- function that extracts the answer(s) from the LLM output
    full_llm_answer -- the complete answer string given by the LLM
    exam_answers -- the list of correct answers from the exam

    Returns -1 when the extractor returns None. Otherwise returns the tuple
    (number of correct LLM answers, extracted answers, too-many-answers flag,
     answered-fully-correctly flag, number of incorrect LLM answers).
    """
    expected_count = len(exam_answers)

    llm_answers = fn_extractor(full_llm_answer)
    if llm_answers is None:
        return -1

    correct_count, too_many_answ, incorrect_count = compare_answers(llm_answers, exam_answers)

    # Fully correct means every exam answer was hit and no surplus answers were given.
    answered_correctly = (correct_count == expected_count) and not too_many_answ

    return correct_count, llm_answers, too_many_answ, answered_correctly, incorrect_count

In [21]:
def evaluation(llm_output_dataframe):
    """Aggregate per-question results into accuracy metrics per configuration.

    Rows are grouped by (Model, Prompt_Engineering, Temperature, Exam).

    Args:
    llm_output_dataframe (DataFrame): One row per evaluated question. Reads the columns
        'QuestionIndex', 'Answered_Correctly', 'NumberOfCorrectLLMAnswers',
        'NumberOfIncorrectLLMAnswers' and 'NumberOfCorrectExamAnswers' in addition to the
        four grouping columns.

    Returns:
    DataFrame: One row per group with the grouping columns followed by
        'Number of Questions', 'Correctly Answered', 'Incorrectly Answered',
        'Accuracy', 'Accuracy Partial' and 'Total Partial Credit'.

    Side effects:
    Adds (or overwrites) the column 'Partial_Credit' on llm_output_dataframe.
    """
    group_keys = ['Model', 'Prompt_Engineering', 'Temperature', 'Exam']

    # Partial credit per question: share of exam answers found minus share of wrong
    # answers given, floored at 0. Computed column-wise instead of with a per-row apply.
    # A row with zero exam answers ends up with NaN credit, which the grouped sum skips.
    exam_answer_count = llm_output_dataframe['NumberOfCorrectExamAnswers'].astype(float)
    correct_share = llm_output_dataframe['NumberOfCorrectLLMAnswers'].astype(float) / exam_answer_count
    incorrect_share = llm_output_dataframe['NumberOfIncorrectLLMAnswers'].astype(float) / exam_answer_count
    llm_output_dataframe['Partial_Credit'] = (correct_share - incorrect_share).clip(lower=0)

    # Build the grouping once and reuse it for every aggregate.
    grouped = llm_output_dataframe.groupby(group_keys)

    # Number of total questions for each group
    number_of_questions = grouped['QuestionIndex'].count()

    # Number of fully correct answers given by the LLM
    correctly_answered = grouped['Answered_Correctly'].sum()

    # Everything that is not fully correct counts as incorrect
    incorrectly_answered = number_of_questions - correctly_answered

    # Sum of the partial credits for each group
    partial_credit_sum = grouped['Partial_Credit'].sum()

    results_df = pd.DataFrame({
        'Number of Questions': number_of_questions,
        'Correctly Answered': correctly_answered,
        'Incorrectly Answered': incorrectly_answered,
        'Accuracy': correctly_answered / number_of_questions,
        'Accuracy Partial': partial_credit_sum / number_of_questions,
        'Total Partial Credit': partial_credit_sum
    })

    # Turn the group keys back into ordinary columns
    return results_df.reset_index()

In [22]:
def calculate_model_statistics_their(df):
    """
    Calculates statistics for each model in the DataFrame.

    Args:
    df (DataFrame): Input DataFrame containing evaluation metrics for different models.
        Must provide the columns 'Model', 'Accuracy' and 'Accuracy Partial'.

    Returns:
    DataFrame: New DataFrame with one row per model holding the mean, max, min and
        standard deviation of 'Accuracy' and of 'Accuracy Partial'. The standard
        deviation is NaN for a model with a single row.
    """
    model_stats = []
    # Group by the column label itself, not by a one-element list: with a list key,
    # pandas >= 2.0 yields 1-tuples such as ('model-name',) while iterating, and that
    # tuple would be stored in the 'Model' column instead of the plain model name.
    for model, group_df in df.groupby('Model'):
        accuracy = group_df['Accuracy']
        accuracy_partial = group_df['Accuracy Partial']
        model_stats.append({
            'Model': model,
            'Accuracy Mean': accuracy.mean(),
            'Accuracy Max': accuracy.max(),
            'Accuracy Min': accuracy.min(),
            'Accuracy STD': accuracy.std(),
            'Accuracy Partial Mean': accuracy_partial.mean(),
            'Accuracy Partial Max': accuracy_partial.max(),
            'Accuracy Partial Min': accuracy_partial.min(),
            'Accuracy Partial STD': accuracy_partial.std()
        })

    return pd.DataFrame(model_stats)

In [52]:
def calculate_model_statistics_mcqa(df, group_by=['Model', 'Prompt_Engineering', 'Temperature', 'Exam'], filters=None):
    """
    Summarises 'Accuracy' and 'Accuracy Partial' per group, optionally on a filtered subset.

    Args:
    df (DataFrame): Evaluation metrics, one row per evaluated run.
    group_by (list): Columns to group by.
    filters (dict): Maps a column name to the value its rows must equal. Filter columns
        that do not exist in df are not applied. Every filter key is removed from the
        grouping columns.

    Returns:
    DataFrame: One row per group with mean, max, min and standard deviation of both
        metrics. All NaN values in the result are replaced by 0.

    Raises:
    ValueError: If no grouping column is left.
    """
    selected = df.copy()
    remaining_keys = group_by

    if filters:
        # Narrow the rows down one equality filter at a time.
        for column, wanted in filters.items():
            if column not in df.columns:
                continue
            selected = selected[selected[column] == wanted]

        # A filtered column is constant, so it is dropped from the grouping.
        remaining_keys = [key for key in group_by if key not in filters]

    if not remaining_keys:
        raise ValueError("Grouping columns cannot be empty after filtering. Check your 'group_by' and 'filters' values.")

    # Output column name -> (source column, aggregation)
    named_aggregations = {
        'Accuracy_Mean': ('Accuracy', 'mean'),
        'Accuracy_Max': ('Accuracy', 'max'),
        'Accuracy_Min': ('Accuracy', 'min'),
        'Accuracy_STD': ('Accuracy', 'std'),
        'Accuracy_Partial_Mean': ('Accuracy Partial', 'mean'),
        'Accuracy_Partial_Max': ('Accuracy Partial', 'max'),
        'Accuracy_Partial_Min': ('Accuracy Partial', 'min'),
        'Accuracy_Partial_STD': ('Accuracy Partial', 'std'),
    }
    summary = selected.groupby(remaining_keys).agg(**named_aggregations).reset_index()

    # Groups with a single value have a NaN standard deviation; report 0 instead.
    # Note that this replaces every NaN in the summary, not only the STD columns.
    return summary.fillna(0)


In [24]:
def produce_statistics_from_batch_mcqa(input_dir ="./processed_results/"):
    """Evaluate every pickled result file in a directory and stack the per-file metrics.

    Each .pkl file is loaded as a DataFrame with one row per sampled LLM answer. Every
    row is scored with evaluation_sampling (using improved_extract_answer), the scored
    rows of one file are aggregated with evaluation, and the aggregates of all files
    are concatenated.

    Args:
    input_dir (str): Directory that is searched (non-recursively) for *.pkl files.

    Returns:
    DataFrame: The concatenated evaluation results of all files, or None when the
        directory contains no .pkl files.
    """
    # Sorted so the order of the result rows does not depend on the directory listing.
    pkl_files = sorted(glob.glob(os.path.join(input_dir, "*.pkl")))

    if not pkl_files:
        print(f"No .pkl files found in {input_dir}.")
        return

    print(f"Found {len(pkl_files)} .pkl files in {input_dir}. Processing...")

    result_columns = [
        "Model",
        "Prompt_Engineering",
        "Temperature",
        "Exam",
        "QuestionIndex",
        "SamplingIndex",
        "NumberOfChoices",
        "NumberOfCorrectLLMAnswers",
        "NumberOfIncorrectLLMAnswers",
        "NumberOfCorrectExamAnswers",
        "Ratio",
        "LLM_Answer",
        "Exam_Answers",
        "Answered_Correctly",
        "Too_Many_answers"
    ]

    # One evaluation frame per file; concatenated once at the end.
    evaluation_frames = []

    for pkl_path in pkl_files:
        # NOTE(security): unpickling can execute arbitrary code. Only point this
        # function at result files that were produced by a trusted run.
        result_from_exam = pd.read_pickle(pkl_path)

        # Collect plain records and build the frame once instead of concatenating per row.
        records = []
        for _, row in result_from_exam.iterrows():
            num_of_correct_answer = len(row["Exam_Answers"])

            outcome = evaluation_sampling(improved_extract_answer, row["LLM_Answer"], row["Exam_Answers"])
            if isinstance(outcome, tuple):
                num_of_correct_llm_answer, _, too_many_answers, answered_correctly, number_of_incorrect_llm_answers = outcome
            else:
                # evaluation_sampling returns -1 when no answer could be extracted.
                # Score the question as unanswered (and therefore incorrect) instead
                # of failing while unpacking the sentinel.
                num_of_correct_llm_answer = 0
                too_many_answers = False
                answered_correctly = False
                number_of_incorrect_llm_answers = 0

            records.append({
                "Model": row["Model"],
                "Prompt_Engineering": row["Prompt_Engineering"],
                "Temperature": row["Temperature"],
                "Exam": row["Exam"],
                "QuestionIndex": row["QuestionIndex"],
                "SamplingIndex": row["SamplingIndex"],
                "NumberOfChoices": row["NumberOfChoices"],
                "NumberOfCorrectLLMAnswers": num_of_correct_llm_answer,
                "NumberOfIncorrectLLMAnswers": number_of_incorrect_llm_answers,
                "NumberOfCorrectExamAnswers": num_of_correct_answer,
                "Ratio": num_of_correct_llm_answer / num_of_correct_answer,
                "LLM_Answer": row["LLM_Answer"],
                "Exam_Answers": row["Exam_Answers"],
                "Answered_Correctly": answered_correctly,
                "Too_Many_answers": too_many_answers
            })

        if not records:
            # Nothing to aggregate for a file without rows.
            print(f"No rows found in {pkl_path}. Skipping...")
            continue

        llm_exam_result = pd.DataFrame(records, columns=result_columns)
        evaluation_frames.append(evaluation(llm_exam_result))

    if not evaluation_frames:
        return pd.DataFrame(columns=['Number of Questions', 'Correctly Answered', 'Incorrectly Answered', 'Accuracy', 'Accuracy Partial'])

    return pd.concat(evaluation_frames, ignore_index=True)

In [25]:
# Evaluate every .pkl file in the default directory ./processed_results/.
# The result is None if that directory contains no .pkl files.
evaluation_df = produce_statistics_from_batch_mcqa()

Found 180 .pkl files in ./processed_results/. Processing...


In [53]:
# Check that the batch run returned a DataFrame, then aggregate with the default
# grouping (Model, Prompt_Engineering, Temperature, Exam) and without filters.
print(type(evaluation_df))
model_statistics = calculate_model_statistics_mcqa(evaluation_df)


<class 'pandas.core.frame.DataFrame'>


In [54]:
# Show the statistics per Model / Prompt_Engineering / Temperature / Exam group.
model_statistics

Unnamed: 0,Model,Prompt_Engineering,Temperature,Exam,Accuracy_Mean,Accuracy_Max,Accuracy_Min,Accuracy_STD,Accuracy_Partial_Mean,Accuracy_Partial_Max,Accuracy_Partial_Min,Accuracy_Partial_STD
0,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.0,CCNA-201-301,0.556,0.59,0.52,0.02881,0.565,0.6,0.53,0.027839
1,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.0,CCNP-350-701,0.544,0.57,0.52,0.018166,0.549,0.575,0.53,0.016733
2,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.0,MMLU-Computer-Security,0.714,0.77,0.69,0.033615,0.714,0.77,0.69,0.033615
3,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.0,MMLU-PRO-Computer-Security,0.46087,0.5,0.413043,0.038888,0.46087,0.5,0.413043,0.038888
4,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.7,CCNA-201-301,0.564,0.59,0.52,0.029665,0.576,0.605,0.53,0.029665
5,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.7,CCNP-350-701,0.538,0.56,0.52,0.016432,0.552,0.57,0.54,0.01255
6,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.7,MMLU-Computer-Security,0.726,0.76,0.7,0.021909,0.726,0.76,0.7,0.021909
7,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.7,MMLU-PRO-Computer-Security,0.513043,0.543478,0.478261,0.024786,0.513043,0.543478,0.478261,0.024786
8,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,1_shot_cot,0.0,MMLU-PRO-Computer-Security,0.46087,0.478261,0.434783,0.023814,0.46087,0.478261,0.434783,0.023814
9,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,1_shot_cot,0.7,MMLU-PRO-Computer-Security,0.452174,0.565217,0.347826,0.077472,0.452174,0.565217,0.347826,0.077472


In [55]:
# Keep only the rows with Temperature 0.0 on the MMLU-PRO-Computer-Security exam.
# Filtered columns are dropped from the grouping, so the statistics are grouped by
# Model and Prompt_Engineering.
filtered_stats = calculate_model_statistics_mcqa(evaluation_df, filters={'Temperature': 0.0, 'Exam': 'MMLU-PRO-Computer-Security'})


In [56]:
# Show the filtered statistics per Model / Prompt_Engineering group.
filtered_stats

Unnamed: 0,Model,Prompt_Engineering,Accuracy_Mean,Accuracy_Max,Accuracy_Min,Accuracy_STD,Accuracy_Partial_Mean,Accuracy_Partial_Max,Accuracy_Partial_Min,Accuracy_Partial_STD
0,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0_shot,0.46087,0.5,0.413043,0.038888,0.46087,0.5,0.413043,0.038888
1,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,1_shot_cot,0.46087,0.478261,0.434783,0.023814,0.46087,0.478261,0.434783,0.023814
2,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,5_shot_cot,0.456522,0.5,0.413043,0.034373,0.456522,0.5,0.413043,0.034373
3,meta-llama/Llama-3.1-8B-Instruct,0_shot,0.452174,0.521739,0.391304,0.05627,0.452174,0.521739,0.391304,0.05627
4,meta-llama/Llama-3.1-8B-Instruct,1_shot_cot,0.478261,0.543478,0.413043,0.055424,0.478261,0.543478,0.413043,0.055424
5,meta-llama/Llama-3.1-8B-Instruct,5_shot_cot,0.430435,0.543478,0.369565,0.084755,0.430435,0.543478,0.369565,0.084755
