#### Number of unique solutions

In [6]:
import os
import pickle
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import numpy as np

# Create the OpenAI client that call_gpt uses for chat-completion requests.
# NOTE(review): OPEN_API_KEY is not defined in this cell — presumably it is set
# in an earlier cell; confirm before a Restart & Run All.
client = OpenAI(api_key=OPEN_API_KEY)  # the API key is passed explicitly here

def create_prompt(problem, responses):
    """Build the judge prompt that asks for the number of unique responses.

    Args:
        problem: Problem statement, interpolated with default string formatting.
        responses: Responses for the problem, interpolated as-is (a list is
            rendered with its default string representation).

    Returns:
        The prompt text. Every non-blank line after the first carries a
        four-space indent, and the text ends with a newline followed by four
        spaces.
    """
    # Each entry is one line of the prompt; leading and trailing spaces are
    # significant and kept explicit inside the quotes.
    prompt_lines = [
        "You are given a math reasoning problem and a list of responses generated by a model.",
        "",
        "    Your task is to count how many **unique** responses there are.",
        "",
        "    Two responses are considered different if they use different:",
        "    - Mathematical strategies",
        "    - Solution steps",
        "    - Logical approach",
        "    - Structure of reasoning",
        "",
        "    ### Problem:",
        f"    {problem}",
        "",
        "    ### Responses:",
        f"    {responses}",
        "",
        "    ### Instruction:",
        "    Output **only** the number of unique responses as an integer.  ",
        "    Do **not** include any explanation, text, or symbols — just the number.",
        "",
        "    Example output: `3`",
        "    ",
    ]
    return "\n".join(prompt_lines)



def call_gpt(prompt, api_client=None):
    """Send ``prompt`` to the chat model and parse the reply as an integer count.

    Args:
        prompt: Full prompt text, sent as a single user message.
        api_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used. This lets the
            function be exercised without network access.

    Returns:
        The integer found in the first whitespace-separated token of the
        reply, or ``None`` when the request fails, the reply is empty, or the
        token is not an integer.
    """
    try:
        api = client if api_client is None else api_client
        response = api.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        # content may be None (no text returned); treat that as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Deliberate best effort: one failed request must not abort a long run.
        print(f"Error processing prompt: {e}")
        return None

    tokens = reply.split()
    if not tokens:
        print("Error processing prompt: empty reply")
        return None

    # The prompt's own example output is wrapped in backticks (`3`), so strip
    # common markdown/punctuation wrappers before parsing the integer.
    token = tokens[0].strip("`*_\"'()[]:;,").rstrip(".")
    try:
        return int(token)
    except ValueError as e:
        print(f"Error processing prompt: {e}")
        return None

# Path setup
dir_path = "/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed"
# Collect every .pkl file found under dir_path, including sub-directories.
pkl_files = []
for root, dirs, files in os.walk(dir_path):
    for filename in files:
        if filename.endswith(".pkl"):
            pkl_files.append(os.path.join(root, filename))

# Main loop: score every row of every pickle, save the per-row results as a
# CSV, and collect one summary entry per file.
overall_scores = []

for file_path in tqdm(pkl_files, desc="Processing files"):
    print(file_path)
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(file_path, "rb") as f:
        data = pickle.load(f)
    rows = data.get("results", [])
    results = []

    for row in tqdm(rows, desc="Processing rows", total=len(rows)):
        problem = row.get("problem", "")
        response = row.get("responses", "")
        prompt = create_prompt(problem, response)
        score = call_gpt(prompt)

        results.append({
            "file": os.path.basename(file_path),
            "problem": problem,
            "response": response,
            "score": score,
        })

    filename = os.path.splitext(os.path.basename(file_path))[0]

    # Persist the per-row results before computing statistics, so the API
    # output is kept even if summarising fails.
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(dir_path, f"unique_responses_{filename}.csv"), index=False)

    # call_gpt returns None for rows it could not score; np.mean/np.std raise
    # TypeError on None, so those rows are excluded from the statistics.
    scores = [r["score"] for r in results if r["score"] is not None]
    n_missing = len(results) - len(scores)
    if n_missing:
        print(f"Warning: {n_missing} of {len(results)} rows have no score and are excluded from the statistics")
    if scores:
        mean_score = np.mean(scores)
        std_score = np.std(scores)  # population standard deviation (NumPy default ddof=0)
    else:
        mean_score = std_score = float("nan")

    summary = f"\nOverall average unique responses per problem: {mean_score:.2f} (±{std_score:.2f})"
    print(summary)
    overall_scores.append(f"\n{filename}")
    overall_scores.append(summary)

# The summary contains the non-ASCII "±", so the encoding is set explicitly.
with open(os.path.join(dir_path, "unique_responses.txt"), "w", encoding="utf-8") as f:
    f.writelines(overall_scores)

Processing files:   0%|          | 0/10 [00:00<?, ?it/s]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.8-after_raw.pkl




Processing rows: 100%|██████████| 675/675 [09:34<00:00,  1.18it/s]



Overall average unique responses per problem: 4.36 (±2.32)


Processing files:  10%|█         | 1/10 [09:34<1:26:12, 574.73s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k1-temp0.4-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:01<00:00,  1.25it/s]



Overall average unique responses per problem: 3.99 (±2.19)


Processing files:  20%|██        | 2/10 [18:36<1:14:03, 555.50s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.2-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:13<00:00,  1.22it/s]



Overall average unique responses per problem: 3.65 (±2.04)


Processing files:  30%|███       | 3/10 [27:50<1:04:42, 554.60s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k1-temp0.8-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:02<00:00,  1.24it/s]



Overall average unique responses per problem: 4.28 (±2.31)


Processing files:  40%|████      | 4/10 [36:53<55:00, 550.09s/it]  

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k1-temp0.2-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [10:09<00:00,  1.11it/s]



Overall average unique responses per problem: 3.67 (±2.04)


Processing files:  50%|█████     | 5/10 [47:03<47:38, 571.69s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k1-temp1.0-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [10:34<00:00,  1.06it/s]



Overall average unique responses per problem: 4.59 (±2.34)


Processing files:  60%|██████    | 6/10 [57:38<39:32, 593.12s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp1.0-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:39<00:00,  1.16it/s]



Overall average unique responses per problem: 4.53 (±2.35)


Processing files:  70%|███████   | 7/10 [1:07:18<29:26, 588.91s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.6-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:08<00:00,  1.23it/s]



Overall average unique responses per problem: 4.16 (±2.26)


Processing files:  80%|████████  | 8/10 [1:16:27<19:12, 576.25s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k1-temp0.6-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [09:30<00:00,  1.18it/s]



Overall average unique responses per problem: 4.10 (±2.24)


Processing files:  90%|█████████ | 9/10 [1:25:58<09:34, 574.54s/it]

/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$/olympiadbench/temperature-sampling-fixed/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.4-after_raw.pkl


Processing rows: 100%|██████████| 675/675 [08:55<00:00,  1.26it/s]



Overall average unique responses per problem: 3.92 (±2.19)


Processing files: 100%|██████████| 10/10 [1:34:54<00:00, 569.45s/it]


#### Diversity

In [None]:
import os
import pickle
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import numpy as np

# Create the OpenAI client that call_gpt uses for chat-completion requests.
# NOTE(review): OPEN_API_KEY is not defined in this cell — presumably it is set
# in an earlier cell; confirm before a Restart & Run All.
client = OpenAI(api_key=OPEN_API_KEY)  # the API key is passed explicitly here

def create_prompt(problem, responses):
    """Build the judge prompt that asks for a 0-1 diversity score.

    Args:
        problem: Problem statement, interpolated with default string formatting.
        responses: Responses for the problem, interpolated as-is (a list is
            rendered with its default string representation).

    Returns:
        The prompt text. Every non-blank line after the first carries an
        eight-space indent, and the text ends with a newline followed by eight
        spaces.
    """
    # Each entry is one line of the prompt; leading and trailing spaces are
    # significant and kept explicit inside the quotes.
    prompt_lines = [
        "You are given a math reasoning problem and a list of different responses (solutions) generated by a model.",
        "        Your task is to assign a float score from 0 to 1.0 that reflects the **diversity of reasoning and approaches** among the responses.",
        "        Consider differences in:",
        "        - Mathematical strategies",
        "        - Solution steps",
        "        - Structural approach",
        "        - Logical of reasoning",
        "",
        "        ### Problem:",
        f"        {problem}",
        "",
        "        ### Responses:",
        f"        {responses}",
        "",
        "        ### Instruction:",
        "        Output **only** a float number between 0 and 1.0 (inclusive), rounded to two decimal places.  ",
        "        Do **not** include any explanation, symbols, or text — only the score.",
        "",
        "        Example output: `0.75`",
        "        ",
    ]
    return "\n".join(prompt_lines)

def call_gpt(prompt, api_client=None):
    """Send ``prompt`` to the chat model and parse the reply as a 0-1 float score.

    Args:
        prompt: Full prompt text, sent as a single user message.
        api_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used. This lets the
            function be exercised without network access.

    Returns:
        The float found in the first whitespace-separated token of the reply,
        or ``None`` when the request fails, the reply is empty, the token is
        not a number, or the value lies outside ``[0, 1]`` (this also rejects
        NaN and infinities).
    """
    try:
        api = client if api_client is None else api_client
        response = api.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        # content may be None (no text returned); treat that as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Deliberate best effort: one failed request must not abort a long run.
        print(f"Error processing prompt: {e}")
        return None

    tokens = reply.split()
    if not tokens:
        print("Error processing prompt: empty reply")
        return None

    # The prompt's own example output is wrapped in backticks (`0.75`), so
    # strip common markdown/punctuation wrappers before parsing the float.
    token = tokens[0].strip("`*_\"'()[]:;,").rstrip(".")
    try:
        score = float(token)
    except ValueError as e:
        print(f"Error processing prompt: {e}")
        return None

    # The prompt requires a score in [0, 1]; anything else (including NaN)
    # would silently distort the per-file mean, so it is reported as missing.
    if not (0.0 <= score <= 1.0):
        print(f"Error processing prompt: score out of range: {token}")
        return None
    return score

# Path setup
dir_path = "/home/ly/DataDistillation/results/steering/Qwen2.5-1.5B$/olympiadbench/temperature-sampling-fixed/new"
# Collect every .pkl file found under dir_path, including sub-directories.
pkl_files = []
for root, dirs, files in os.walk(dir_path):
    for filename in files:
        if filename.endswith(".pkl"):
            pkl_files.append(os.path.join(root, filename))

# Main loop: score every row of every pickle, save the per-row results as a
# CSV, and collect one summary entry per file.
overall_scores = []
for file_path in tqdm(pkl_files, desc="Processing files"):
    print(file_path)
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(file_path, "rb") as f:
        data = pickle.load(f)
    rows = data.get("results", [])

    results = []
    for row in tqdm(rows, desc="Processing rows", total=len(rows)):
        problem = row.get("problem", "")
        response = row.get("responses", "")
        prompt = create_prompt(problem, response)
        score = call_gpt(prompt)

        results.append({
            "file": os.path.basename(file_path),
            "problem": problem,
            "response": response,
            "score": score,
        })

    filename = os.path.splitext(os.path.basename(file_path))[0]

    # Persist the per-row results before computing statistics, so the API
    # output is kept even if summarising fails.
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(dir_path, f"diversity_score_{filename}.csv"), index=False)

    # call_gpt returns None for rows it could not score; np.mean/np.std raise
    # TypeError on None, so those rows are excluded from the statistics.
    scores = [r["score"] for r in results if r["score"] is not None]
    n_missing = len(results) - len(scores)
    if n_missing:
        print(f"Warning: {n_missing} of {len(results)} rows have no score and are excluded from the statistics")
    if scores:
        mean_score = np.mean(scores)
        std_score = np.std(scores)  # population standard deviation (NumPy default ddof=0)
    else:
        mean_score = std_score = float("nan")

    summary = f"\nOverall average diversity score per problem: {mean_score:.2f} (±{std_score:.2f})"
    print(summary)
    overall_scores.append(f"\n{filename}")
    overall_scores.append(summary)

# The summary contains the non-ASCII "±", so the encoding is set explicitly.
with open(os.path.join(dir_path, "diversity_score.txt"), "w", encoding="utf-8") as f:
    f.writelines(overall_scores)


Processing files:   0%|          | 0/4 [00:00<?, ?it/s]

/home/ly/DataDistillation/results/steering/Qwen2.5-1.5B$/olympiadbench/temperature-sampling-fixed/new/steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp1.0-after_raw.pkl




In [134]:
import pandas as pd

# Variance of the 'score' column for one saved score CSV.
# pandas' Series.var uses ddof=1 by default (sample variance).
file_path = (
    "/home/ly/DataDistillation/results/steering/Qwen2.5-Math-1.5B-Instruct$"
    "/olympiadbench/temperature-sampling-fixed/"
    "diversity_score_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.2-after_raw.csv"
)
df = pd.read_csv(file_path)
df["score"].var()

np.float64(0.05674323552038686)

In [136]:
import os
import pandas as pd

# Path to the folder containing the CSV files.
folder_path = "/home/ly/DataDistillation/results/steering/Qwen2.5-1.5B$/olympiadbench/temperature-sampling/before"

# Print the variance of the 'score' column for every CSV file in the folder.
for filename in os.listdir(folder_path):
    if not filename.endswith(".csv"):
        continue  # skip anything that is not a CSV file

    file_path = os.path.join(folder_path, filename)
    try:
        df = pd.read_csv(file_path)

        # Coerce 'score' to numeric (unparseable values become NaN), then
        # drop NaN before taking the variance.
        df["score"] = pd.to_numeric(df["score"], errors="coerce")
        variance = df["score"].dropna().var()

        print(f"{filename}: Variance = {variance:.4f}")
    except Exception as e:
        # Report the failing file and keep going with the remaining ones.
        print(f"❌ Lỗi khi xử lý {filename}: {e}")


diversity_score_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.0001-temp0.6-before_raw.csv: Variance = 0.0567
diversity_score_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.0001-temp0.4-before_raw.csv: Variance = 0.0549
diversity_score_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.2-before_raw.csv: Variance = 0.0477
unique_responses_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k10-temp0.2-before_raw.csv: Variance = 2.2064
unique_responses_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.00001-temp1.0-before_raw.csv: Variance = 1.6107
unique_responses_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.0001-temp0.6-before_raw.csv: Variance = 1.6748
unique_responses_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.0001-temp0.4-before_raw.csv: Variance = 1.9897
diversity_score_steer-algor1-ver3_n8_olympiadbench_steern500-calpha-k0.00001-temp0.8-before_raw.csv: Variance = 0.0611
diversity_score_steer-algor1-ver3_n8_olympiadbench_steern5