In [9]:
import pandas as pd
import glob
import pickle
import os

import json


In [10]:
def json_to_pickle(json_file_path, pickle_file_path):
    """Load a JSON file into a pandas DataFrame and pickle that DataFrame.

    Args:
        json_file_path: Path of the JSON file to read. The parsed content is
            handed to ``pd.DataFrame`` unchanged.
        pickle_file_path: Path of the pickle file to write; an existing file
            is overwritten.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Convert to pandas DataFrame
    df = pd.DataFrame(data)

    # Write to pickle file
    with open(pickle_file_path, 'wb') as pickle_file:
        pickle.dump(df, pickle_file)


In [11]:
def batch_convert_json_to_pkl(folder_path):
    """
    Converts all .json files in the specified folder to .pkl format.

    Each ``<name>.json`` directly inside ``folder_path`` is converted with
    ``json_to_pickle`` and written next to it as ``<name>.pkl``. A failure on
    one file is reported and does not stop the remaining conversions.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Sorted so the processing order (and the log) is deterministic.
    json_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".json")
    )
    if not json_names:
        print("No .json files found in the folder.")
        return

    suffix = ".json"
    converted_files = 0
    for filename in json_names:
        json_path = os.path.join(folder_path, filename)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".json" occurring earlier in the path (e.g. in a folder name).
        pkl_path = json_path[:-len(suffix)] + ".pkl"

        try:
            json_to_pickle(json_path, pkl_path)

            print(f"Converted: {filename} → {os.path.basename(pkl_path)}")
            converted_files += 1
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Failed to convert {filename}: {e}")

    print(f"Conversion complete: {converted_files} file(s) converted.")
    failed_files = len(json_names) - converted_files
    if failed_files:
        print(f"{failed_files} file(s) failed to convert.")


In [12]:
def convert_json_to_parquet(json_file_path, parquet_file_path):
    """Load a JSON file into a pandas DataFrame and write it as Parquet.

    Args:
        json_file_path: Path of the JSON file to read. The parsed content is
            handed to ``pd.DataFrame`` unchanged.
        parquet_file_path: Destination path, written with the ``pyarrow``
            engine.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Step 1: Read the JSON file. Decode explicitly as UTF-8 so the result
    # does not depend on the platform's default text encoding.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Step 2: Convert to pandas DataFrame
    df = pd.DataFrame(data)

    # Step 3: Write to parquet file
    df.to_parquet(parquet_file_path, engine='pyarrow')

    print(f"Successfully converted {json_file_path} to {parquet_file_path}")

In [13]:
def convert_pkl_to_parquet(folder_path):
    """
    Converts all .pkl files in the specified folder to .parquet format.

    Each ``<name>.pkl`` directly inside ``folder_path`` is loaded with
    ``pd.read_pickle`` and written next to it as ``<name>.parquet`` without
    the index. A failure on one file is reported and does not stop the
    remaining conversions.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Sorted so the processing order (and the log) is deterministic.
    pkl_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".pkl")
    )
    if not pkl_names:
        print("No .pkl files found in the folder.")
        return

    suffix = ".pkl"
    converted_files = 0
    for filename in pkl_names:
        pkl_path = os.path.join(folder_path, filename)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".pkl" occurring earlier in the path (e.g. in a folder name).
        parquet_path = pkl_path[:-len(suffix)] + ".parquet"

        try:
            # Load the pickle file.
            # NOTE: unpickling executes arbitrary code; only use on trusted files.
            df = pd.read_pickle(pkl_path)

            # Convert to Parquet
            df.to_parquet(parquet_path, index=False)

            print(f"Converted: {filename} → {os.path.basename(parquet_path)}")
            converted_files += 1
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Failed to convert {filename}: {e}")

    print(f"Conversion complete: {converted_files} file(s) converted.")
    failed_files = len(pkl_names) - converted_files
    if failed_files:
        print(f"{failed_files} file(s) failed to convert.")


In [14]:
def process_pkl_mitre(input_file, output_file, subject_model, temperature):
    """Normalise one pickled MITRE result frame and save it as a new pickle.

    The frame's ``model`` column becomes ``judge_model``, constant
    ``temperature`` and ``subject_model`` columns are attached, and the
    ``expansion_response`` / ``judge_response`` columns are created (filled
    with ``None``) when absent. Only a fixed set of columns is kept.

    Args:
        input_file: Path of the pickle holding the source DataFrame.
        output_file: Path the trimmed DataFrame is pickled to.
        subject_model: Value stored in every row of ``subject_model``.
        temperature: Value stored in every row of ``temperature``.

    Raises:
        ValueError: If the source frame has no ``model`` column.
        KeyError: If any of the remaining required columns is missing.
    """
    with open(input_file, 'rb') as source:
        frame = pickle.load(source)

    # The judge's name arrives under "model"; refuse frames that lack it.
    judge_source_column = "model"
    if judge_source_column not in frame.columns:
        raise ValueError(f"Column '{judge_source_column}' not found in the DataFrame.")
    frame = frame.rename(columns={judge_source_column: "judge_model"})

    # Constant-valued columns describing the run.
    frame["temperature"] = temperature
    frame["subject_model"] = subject_model

    # Optional columns are back-filled so the column selection below succeeds.
    for optional_column in ("expansion_response", "judge_response"):
        if optional_column not in frame.columns:
            frame[optional_column] = None

    output_columns = [
        "subject_model",
        "temperature",
        "prompt_index",
        "mitre_category",
        "test_case_prompt",
        "think_response",
        "initial_response",  # Same as "response"
        "expansion_response",
        "judge_response",
        "answered",
        "judge_model",
    ]
    trimmed = frame[output_columns]

    with open(output_file, "wb") as sink:
        pickle.dump(trimmed, sink)

    print(f"Processed file saved as: {output_file}")

In [15]:
def batch_process_pkl_mitre(input_dir, output_dir, subject_model, temperature):
    """Run ``process_pkl_mitre`` on every ``*.pkl`` file found in ``input_dir``.

    Each result is written to ``output_dir`` (created if needed) under the
    source file's name prefixed with ``processed_``.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pkl`` files.
        output_dir: Directory receiving the processed pickles.
        subject_model: Forwarded to ``process_pkl_mitre``.
        temperature: Forwarded to ``process_pkl_mitre``.
    """
    # The destination is created up front, even when nothing ends up written.
    os.makedirs(output_dir, exist_ok=True)

    sources = glob.glob(os.path.join(input_dir, "*.pkl"))
    if len(sources) == 0:
        print(f"No .pkl files found in {input_dir}.")
        return

    print(f"Found {len(sources)} .pkl files in {input_dir}. Processing...")

    for source_path in sources:
        target_name = f"processed_{os.path.basename(source_path)}"
        target_path = os.path.join(output_dir, target_name)
        process_pkl_mitre(source_path, target_path, subject_model, temperature)


In [16]:
# MITRE
# Convert every raw JSON result to a pickle in place, then normalise each
# pickle into ./processed_results/. One entry per (subject, judge, temperature)
# combination: (input directory, subject model name, temperature).
deepseek_subject = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
llama_subject = "meta-llama/Llama-3.1-8B-Instruct"

mitre_runs = [
    # DS Subject Model, DS Judge Model, T 0.0
    ("./data/deepseek/judge/ds/t00/", deepseek_subject, 0.0),
    # DS Subject Model, DS Judge Model, T 0.7
    ("./data/deepseek/judge/ds/t07/", deepseek_subject, 0.7),
    # DS Subject Model, Llama Judge Model, T 0.0
    ("./data/deepseek/judge/llama/t00/", deepseek_subject, 0.0),
    # DS Subject Model, Llama Judge Model, T 0.7
    ("./data/deepseek/judge/llama/t07/", deepseek_subject, 0.7),
    # Llama Subject Model, DS Judge Model, T 0.0
    ("./data/llama/judge/ds/t00/", llama_subject, 0.0),
    # Llama Subject Model, DS Judge Model, T 0.7
    ("./data/llama/judge/ds/t07/", llama_subject, 0.7),
    # NOTE(review): the two Llama-judge runs below previously pointed at the
    # ".../judge/ds/..." folders again, re-processing the DS-judge data. The
    # ".../judge/llama/..." paths are inferred from the DeepSeek-subject
    # layout above -- confirm these directories exist.
    # Llama Subject Model, Llama Judge Model, T 0.0
    ("./data/llama/judge/llama/t00/", llama_subject, 0.0),
    # Llama Subject Model, Llama Judge Model, T 0.7
    ("./data/llama/judge/llama/t07/", llama_subject, 0.7),
]

for run_dir, run_subject_model, run_temperature in mitre_runs:
    batch_convert_json_to_pkl(run_dir)
    batch_process_pkl_mitre(
        input_dir=run_dir,
        output_dir="./processed_results/",
        subject_model=run_subject_model,
        temperature=run_temperature,
    )

Converted: mitre_judge_t0-0 - Original.json → mitre_judge_t0-0 - Original.pkl
Conversion complete: 1 file(s) converted.
Found 1 .pkl files in ./data/deepseek/judge/ds/t00/. Processing...
Processed file saved as: ./processed_results/processed_mitre_judge_t0-0 - Original.pkl
Converted: mitre_judge_t0-7 - Original.json → mitre_judge_t0-7 - Original.pkl
Conversion complete: 1 file(s) converted.
Found 1 .pkl files in ./data/deepseek/judge/ds/t07/. Processing...
Processed file saved as: ./processed_results/processed_mitre_judge_t0-7 - Original.pkl
Converted: mitre_llama_judge_deepseek_t0-0.json → mitre_llama_judge_deepseek_t0-0.pkl
Conversion complete: 1 file(s) converted.
Found 1 .pkl files in ./data/deepseek/judge/llama/t00/. Processing...
Processed file saved as: ./processed_results/processed_mitre_llama_judge_deepseek_t0-0.pkl
Converted: mitre_llama_judge_deepseek_t0-7.json → mitre_llama_judge_deepseek_t0-7.pkl
Conversion complete: 1 file(s) converted.
Found 1 .pkl files in ./data/deepse

In [17]:
# Write a .parquet copy next to every processed pickle.
convert_pkl_to_parquet(folder_path="./processed_results/")

Converted: processed_mitre_judge_t0-0 - Original.pkl → processed_mitre_judge_t0-0 - Original.parquet
Converted: processed_mitre_t0-0_judge.pkl → processed_mitre_t0-0_judge.parquet
Converted: processed_mitre_t0-7_deepseek_judge_llama.pkl → processed_mitre_t0-7_deepseek_judge_llama.parquet
Converted: processed_mitre_judge_t0-7 - Original.pkl → processed_mitre_judge_t0-7 - Original.parquet
Converted: processed_mitre_llama_judge_deepseek_t0-7.pkl → processed_mitre_llama_judge_deepseek_t0-7.parquet
Converted: processed_mitre_llama_judge_deepseek_t0-0.pkl → processed_mitre_llama_judge_deepseek_t0-0.parquet
Conversion complete: 6 file(s) converted.


In [18]:
def produce_statistics_from_batch_mitre(input_dir="./processed_results/"):
    """Unfinished stub: locate the ``*.pkl`` files in ``input_dir``.

    At present the function only reports how many pickles it found and builds
    an empty, discarded DataFrame per file; no statistics are computed and
    nothing is returned.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pkl`` files.
    """
    matches = glob.glob(os.path.join(input_dir, "*.pkl"))
    if len(matches) == 0:
        print(f"No .pkl files found in {input_dir}.")
        return

    print(f"Found {len(matches)} .pkl files in {input_dir}. Processing...")

    for _pickle_path in matches:
        # TODO: load the pickle and fill in the statistics; the frame below
        # is an empty placeholder with no columns yet.
        result = pd.DataFrame(columns=[])
# subject_model, temperature, prompt_index, mitre_category, answered, judge_model, judge_response

