In [1]:
import math
import numpy as np
import google.protobuf
import sentencepiece

In [2]:
import pandas as pd
from collections import Counter
from collections import defaultdict
import copy
from tqdm import tqdm

import re

import ast
from itertools import combinations
from numpy.linalg import det

pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', None)  
pd.set_option('display.max_colwidth', None) 

In [None]:
temp_df_oqwen = pd.read_csv('../Data/arc_policy_df_oqwen_7B.csv')
temp_df_llama2 = pd.read_csv('../Data/arc_policy_df_deepseek_llm_7b.csv')
temp_df_qwen = pd.read_csv('../Data/arc_policy_df_deepseek_qwen_7B.csv')
temp_df_gemma_7B = pd.read_csv('../Data/arc_policy_df_gemma-7b-it.csv') 
temp_df_mistral_7B = pd.read_csv('../Data/arc_policy_df_Mistral-7B-Instruct.csv') 
temp_df_ai_Yi_9B = pd.read_csv('../Data/arc_policy_df_ai_Yi_9B.csv')
temp_df_openchat_7B= pd.read_csv("../Data/arc_policy_df_openchat_7B.csv")


In [None]:

temp_df_qwen["answer_letter"] = temp_df_qwen["answerKey"]
temp_df_oqwen["answer_letter"] = temp_df_oqwen["answerKey"]
temp_df_llama2["answer_letter"] = temp_df_llama2["answerKey"]
temp_df_gemma_7B["answer_letter"] = temp_df_gemma_7B["answerKey"]
temp_df_mistral_7B["answer_letter"] = temp_df_mistral_7B["answerKey"]
temp_df_ai_Yi_9B["answer_letter"] = temp_df_ai_Yi_9B["answerKey"]
temp_df_openchat_7B["answer_letter"] = temp_df_openchat_7B["answerKey"]

In [5]:
def evaluate_answer_alignment(df_qwen, df_oqwen, df_llama2, answer_col='disc_init_answer', label_col='answer_letter'):
    """
    Compare predicted answers and ground truth across three DataFrames.
    
    Args:
        df_qwen (DataFrame): First model's dataframe (e.g., Qwen).
        df_oqwen (DataFrame): Second model's dataframe (e.g., Oqwen).
        df_llama2 (DataFrame): Third model's dataframe (e.g., Llama2).
        answer_col (str): Name of the model prediction column (default 'disc_init_answer').
        label_col (str): Name of the ground truth column (default 'answer_letter').
        
    Returns:
        DataFrame: Table summarizing counts and accuracies.
    """
    
    total_n = df_oqwen.shape[0]
    
    qwen_corr = (df_qwen[answer_col] == df_qwen[label_col]).sum()
    oqwen_corr = (df_oqwen[answer_col] == df_oqwen[label_col]).sum()
    llama2_corr = (df_llama2[answer_col] == df_llama2[label_col]).sum()

    qwen_llama2_oqwen = ((df_oqwen[answer_col] == df_llama2[answer_col]) & (df_llama2[answer_col] == df_llama2[label_col]) & (df_qwen[answer_col] == df_oqwen[answer_col])).sum()
    
    results_d = {
        "Metric": [
            "Total Question #",
            "oqwen correct answers",
            "qwen correct answers",
            "llama2 correct answers",
            "qwen llama2 oqwen", 
        ],
        "Count": [
            total_n,
            oqwen_corr,
            qwen_corr,
            llama2_corr,
            qwen_llama2_oqwen,
        ]  
    }
    
    results_df = pd.DataFrame(results_d)
    results_df["Accuracy"] = (results_df["Count"] / total_n).apply(lambda x: f"{x:.0%}")
    
    return results_df


In [6]:

COMMON_COLUMNS = ["disc_init_policy", "gen_init_policy", "answer_letter", "disc_answer", "gen_answer"]
RENAME_MAPPING = {"disc_answer": "ED_consensus", "gen_answer": "EG_consensus"}

def process_dataframe(temp_df):
    """Process a dataframe by selecting columns and renaming them."""
    df = temp_df[COMMON_COLUMNS].copy()
    df.rename(columns=RENAME_MAPPING, inplace=True)
    return df

# ARC Challenge datasets
df_oqwen = process_dataframe(temp_df_oqwen)
df_deepseekqwen = process_dataframe(temp_df_qwen)
df_deepseekllama = process_dataframe(temp_df_llama2)
df_gemma_7B = process_dataframe(temp_df_gemma_7B)
df_llama3_8B = process_dataframe(temp_df_llama3_8b)
df_mistral_7B = process_dataframe(temp_df_mistral_7B)
df_ai_Yi_9B = process_dataframe(temp_df_ai_Yi_9B)
df_openchat_7B = process_dataframe(temp_df_openchat_7B)
df_zephyer = process_dataframe(temp_df_zephyer)
df_phi2 = process_dataframe(temp_df_phi2)

In [7]:


def parse_np_float_string(s):
    if isinstance(s, dict):
        return s
    if not isinstance(s, str):
        return {}

    try:
        # 将 np.float32(0.12345) 替换为 0.12345
        s_clean = re.sub(r'np\.float32\((.*?)\)', r'\1', s)
        return ast.literal_eval(s_clean)
    except Exception as e:
        print("Still failed to parse:", s)
        return {}


for df in [
    df_oqwen, df_deepseekqwen, df_deepseekllama,
    df_gemma_7B , df_llama3_8B, df_mistral_7B,
    df_ai_Yi_9B, df_openchat_7B, df_zephyer, df_phi2
]:
    df["disc_init_policy"] = df["disc_init_policy"].apply(parse_np_float_string)
    df["gen_init_policy"] = df["gen_init_policy"].apply(parse_np_float_string)

In [8]:
for df in [
    df_oqwen, df_deepseekqwen, df_deepseekllama,
    df_gemma_7B , df_llama3_8B, df_mistral_7B,
    df_ai_Yi_9B, df_openchat_7B, df_zephyer, df_phi2
]:
    for col in ["disc_init_policy", "gen_init_policy"]:
        if df[col].dtype == object and isinstance(df[col].iloc[0], str):
            df[col] = df[col].apply(ast.literal_eval)


In [9]:
#Normalized the incorrect and correct policy in determinator
def apply_softmax(choice_dict):
    # Extract correct and incorrect values for all choices
    
    correct_values = np.array([v['correct'] for v in choice_dict.values()])
    incorrect_values = np.array([v['incorrect'] for v in choice_dict.values()])
    
    # Define softmax function
    def softmax(x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()
    
    softmax_correct = softmax(correct_values)
    softmax_incorrect = softmax(incorrect_values)
    
    result = {}
    for i, choice in enumerate(choice_dict.keys()):
        result[choice] = {
            'correct': softmax_correct[i],
            'incorrect': softmax_incorrect[i]
        }
    return result



for df in [
    df_oqwen, df_deepseekqwen, df_deepseekllama,
    df_gemma_7B , df_llama3_8B, df_mistral_7B,
    df_ai_Yi_9B, df_openchat_7B, df_zephyer, df_phi2
]:
    df["disc_init_policy_refine"] = df["disc_init_policy"].apply(apply_softmax)

    

In [10]:
def get_most_probable_letter(answer_dict):
    
    if answer_dict is None:
        return None 
    
    letter, _ = max(answer_dict.items(), key=lambda item: item[1]['correct'])
    return letter

for df in [
    df_oqwen, df_deepseekqwen, df_deepseekllama,
    df_gemma_7B , df_llama3_8B, df_mistral_7B,
    df_ai_Yi_9B, df_openchat_7B, df_zephyer, df_phi2
]:
    df["disc_init_answer"] = df["disc_init_policy"].apply(get_most_probable_letter)
    df["disc_init_refine_answer"] = df["disc_init_policy_refine"].apply(get_most_probable_letter)
    


In [11]:
# for df in [
#     df_oqwen, df_deepseekqwen, df_deepseekllama,
#     df_gemma_7B , df_llama3_8B, df_mistral_7B
# ]:
#     display(df.head(8))
    

In [12]:
def reweight_joint(row):
    plm_y_given_xl = row['gen_init_policy'] 
    plm_l_given_xy = row['disc_init_policy']  

    new_joint = {}
    for y, label_probs in plm_y_given_xl.items():
        new_joint[y] = {}
        for label, p1 in label_probs.items():
            p2 = plm_l_given_xy.get(label, {}).get(y, 0.0)
            new_joint[y][label] = p1 * p2
    return new_joint


def get_max_correct_label(mi_dict):
    return max(mi_dict['correct'], key=mi_dict['correct'].get)


for df in [
    df_oqwen, df_deepseekqwen, df_deepseekllama,
    df_gemma_7B , df_llama3_8B, df_mistral_7B, df_zephyer, df_phi2
]:
    df["MI"] = df.apply(reweight_joint, axis=1)
    df["MI_answer"] = df["MI"].apply(get_max_correct_label)

    

In [13]:


def evaluate_majority_vote_fixed(df_qwen, df_oqwen, df_llama2, answer_col='disc_init_refine_answer', label_col='answer_letter'):
    """
    Compare three models' predictions with ground truth, and compute majority vote accuracy.
    This function recalculates model correctness based on combined_df only.
    
    Args:
        df_qwen (DataFrame): First model's dataframe (e.g., Qwen).
        df_oqwen (DataFrame): Second model's dataframe (e.g., OQwen).
        df_llama2 (DataFrame): Third model's dataframe (e.g., Llama2).
        answer_col (str): Name of the model prediction column (default 'disc_init_refine_answer').
        label_col (str): Name of the ground truth column (default 'answer_letter').
        
    Returns:
        combined_df (DataFrame): DataFrame containing all predictions and majority votes.
        results_df (DataFrame): Summary table of counts and accuracies.
    """

    # Combine predictions
    combined_df = pd.DataFrame({
        "question_id": df_oqwen.index,
        "correct_answer": df_oqwen[label_col],   # Use OQwen's label (or consistent label)
        "qwen_pred": df_qwen[answer_col],
        "oqwen_pred": df_oqwen[answer_col],
        "llama2_pred": df_llama2[answer_col]
    })
    
    # Majority vote
    def get_majority_vote(row):
        votes = [row["qwen_pred"], row["oqwen_pred"], row["llama2_pred"]]
        vote_counts = Counter(votes)
        majority_answers = [ans for ans, count in vote_counts.items() if count >= 2]
        if not majority_answers:
            return None
        return majority_answers[0]
    
    combined_df["majority_vote"] = combined_df.apply(get_majority_vote, axis=1)
    
    # Check correctness
    combined_df["qwen_correct"] = combined_df["qwen_pred"] == combined_df["correct_answer"]
    combined_df["oqwen_correct"] = combined_df["oqwen_pred"] == combined_df["correct_answer"]
    combined_df["llama2_correct"] = combined_df["llama2_pred"] == combined_df["correct_answer"]
    combined_df["is_majority_correct"] = combined_df["majority_vote"] == combined_df["correct_answer"]

    # Summarize
    total_questions = len(combined_df)
    qwen_corr = combined_df["qwen_correct"].sum()
    oqwen_corr = combined_df["oqwen_correct"].sum()
    llama2_corr = combined_df["llama2_correct"].sum()
    majority_correct = combined_df["is_majority_correct"].sum()

    results_d = {
        "Metric": [
            "Total Questions",
            "Qwen Accuracy",
            "OQwen Accuracy",
            "Llama2 Accuracy",
            "Majority Vote (≥2 models) Accuracy",
        ],
        "Count": [
            total_questions,
            qwen_corr,
            oqwen_corr,
            llama2_corr,
            majority_correct,
        ],
        "Accuracy": [
            "100%",
            f"{(qwen_corr / total_questions) * 100:.2f}%",
            f"{(oqwen_corr / total_questions) * 100:.2f}%",
            f"{(llama2_corr / total_questions) * 100:.2f}%",
            f"{(majority_correct / total_questions) * 100:.2f}%",
        ]
    }

    results_df = pd.DataFrame(results_d)
    
    return combined_df, results_df


In [14]:

def evaluate_majority_vote(
    model_dfs: dict[str, pd.DataFrame],
    answer_col: str = 'disc_init_refine_answer',
    label_col: str = 'answer_letter'
):
    """
    Compare multiple models' predictions, compute majority vote accuracy, and summarize results.

    Args:
        model_dfs (dict): A dictionary where keys are model names (str) and
                          values are their corresponding DataFrames (pd.DataFrame).
        answer_col (str): The name of the column containing model predictions.
        label_col (str): The name of the column containing the ground truth answer.

    Returns:
        tuple: A tuple containing:
            - combined_df (pd.DataFrame): DataFrame with all predictions and votes.
            - results_df (pd.DataFrame): A summary table of accuracies.
    """
    model_names = list(model_dfs.keys())
    if not model_names:
        raise ValueError("The model_dfs dictionary cannot be empty.")

    num_models = len(model_names)
    
    # --- 1. Combine Predictions ---
    # Use the first model's DataFrame to set up the base
    base_df_name = model_names[0]
    combined_df = pd.DataFrame({
        "question_id": model_dfs[base_df_name].index,
        "correct_answer": model_dfs[base_df_name][label_col]
    })

    # Add each model's prediction as a new column
    pred_cols = []
    for name, df in model_dfs.items():
        pred_col_name = f"{name}_pred"
        combined_df[pred_col_name] = df[answer_col]
        pred_cols.append(pred_col_name)

    # --- 2. Calculate Majority Vote ---
    def get_majority_vote(row):
        # A true majority means a count greater than half the number of models
        majority_threshold = math.floor(num_models / 2) + 1
        
        votes = [row[col] for col in pred_cols if pd.notna(row[col])]
        if not votes:
            return None
        
        vote_counts = Counter(votes)
        most_common_ans, highest_count = vote_counts.most_common(1)[0]
        
        # Return the answer only if it meets the majority threshold
        return most_common_ans if highest_count >= majority_threshold else None

    combined_df["majority_vote"] = combined_df.apply(get_majority_vote, axis=1)

    # --- 3. Check Correctness ---
    for name in model_names:
        combined_df[f"{name}_correct"] = combined_df[f"{name}_pred"] == combined_df["correct_answer"]
    
    combined_df["is_majority_correct"] = combined_df["majority_vote"] == combined_df["correct_answer"]

    # --- 4. Summarize Results ---
    total_questions = len(combined_df)
    
    metrics = ["Total Questions"]
    counts = [total_questions]
    accuracies = [f"100%"]

    # Add results for each individual model
    for name in model_names:
        correct_count = combined_df[f"{name}_correct"].sum()
        metrics.append(f"{name} Accuracy")
        counts.append(correct_count)
        accuracies.append(f"{(correct_count / total_questions) * 100:.2f}%")
        
    # Add final majority vote result
    majority_correct = combined_df["is_majority_correct"].sum()
    majority_threshold_display = math.floor(num_models / 2)
    metrics.append(f"Majority Vote (>{majority_threshold_display} models) Accuracy")
    counts.append(majority_correct)
    accuracies.append(f"{(majority_correct / total_questions) * 100:.2f}%")

    results_df = pd.DataFrame({"Metric": metrics, "Count": counts, "Accuracy": accuracies})
    
    return combined_df, results_df

In [15]:
np.random.seed(42)

class Discriminator:
    def __init__(self, init_policy):
        """Initialize with given probability dictionary."""
        self.policy = init_policy 

    def respond(self, choice, max_choice):
        """Sample a response ('correct' or 'incorrect') for a given answer choice."""
        prob_correct = self.policy[choice]['correct']
        res = "correct" if choice == max_choice else "incorrect"    
#         if choice == max_choice:
#             res = "correct"
#         else:
#             res = np.random.choice(["correct", "incorrect"], p=[prob_correct, 1- prob_correct])
        
        return res

    def log_gradient(self, choice, response):

        if response == "correct":
            grad_correct = 1 / self.policy[choice]['correct']
            grad_incorrect = -1 / self.policy[choice]['correct']
        else:
            grad_correct = -1 / self.policy[choice]['incorrect']
            grad_incorrect = 1 / self.policy[choice]['incorrect']

        return {'correct': grad_correct, 'incorrect': grad_incorrect}

    def get_max_correct_choice(self):
        """Return the choice with highest 'correct' probability"""
        return max(self.policy.items(), key=lambda x: x[1]['correct'])[0]
    
    def update_policy(self, choice, response, reward, learning_rate=0.1):
        grads = self.log_gradient(choice, response)
        prob_correct = self.policy[choice]['correct']
        prob_incorrect = self.policy[choice]['incorrect']
        
        log_prob_correct = np.log(prob_correct) + learning_rate * reward * grads['correct']
        log_prob_incorrect = np.log(prob_incorrect) + learning_rate * reward * grads['incorrect']

        max_log = max(log_prob_correct, log_prob_incorrect)
        log_prob_correct -= max_log
        log_prob_incorrect -= max_log

        exp_correct = np.exp(log_prob_correct)
        exp_incorrect = np.exp(log_prob_incorrect)
        total = exp_correct + exp_incorrect
        self.policy[choice]['correct'] = exp_correct / total
        self.policy[choice]['incorrect'] = exp_incorrect / total
        
        self.policy[choice]['correct'] = np.clip(self.policy[choice]['correct'], 1e-6, 1 - 1e-6)
        self.policy[choice]['incorrect'] = 1 - self.policy[choice]['correct']




label_to_index = {'correct': 0, 'incorrect': 1}

def normalize_rewards(dmi_dict):
    values = np.array(list(dmi_dict.values()))
    mean = np.mean(values)
    std = np.std(values) + 1e-8
    return {k: (v - mean) / std for k, v in dmi_dict.items()}


        
def generate_matrix(part_data, i, j):
    matrix = np.zeros((2, 2), dtype=int)
    for row in part_data:
        row_i = label_to_index[row[i]]
        row_j = label_to_index[row[j]]
        matrix[row_i][row_j] += 1
    return matrix


def calculate_dmi_score(part1_matrix, part2_matrix):
    det_part1 = np.linalg.det(part1_matrix)
    det_part2 = np.linalg.det(part2_matrix)
    dmi_score = det_part1 * det_part2
    return dmi_score

def compute_dmi_payments(responses):
    """Compute DMI payments using determinants of agreement matrices."""
    n = len(responses)
    split1 = responses[ : (n + 1)//2]  
    split2 = responses[(n + 1)//2 : ]  
    splits = {'Part1': split1, 'Part2': split2}
    
    n_d = len(responses[0])
    d_dmi_dict = {i : 0 for i in range(n_d)}
    for i, j in combinations(range(n_d), 2):
        part1_matrix = generate_matrix(splits["Part1"], i, j)
        part2_matrix = generate_matrix(splits["Part2"], i, j)
        dmi_score = calculate_dmi_score(part1_matrix, part2_matrix)
        d_dmi_dict[i] += dmi_score
        d_dmi_dict[j] += dmi_score
        
    return d_dmi_dict

def batch_update_discriminators(
    discriminator_dfs: dict, # Pass a dictionary of DataFrames
    choice_df,
    T_steps,
    batch_size=8,
    learning_rate=0.1
):
    """
    Batch update discriminator policies for a variable number of discriminators.
    
    Args:
        discriminator_dfs (dict): A dictionary where keys are model names (str) and
                                  values are their corresponding DataFrames.
        choice_df (DataFrame): DataFrame used for selecting initial choices.
        batch_size (int): Number of rows to update in each batch.
        T_steps (int): Number of response-update iterations per batch.
        learning_rate (float): Learning rate for policy update.
        
    Returns:
        dict: A dictionary containing the updated DataFrames, with the same keys.
    """
    if batch_size < 4:
        print("Warning: batch size should be >= 4 for DMI calculation.")
    
    # Extract model names and a list of DataFrames for indexed processing
    model_names = list(discriminator_dfs.keys())
    df_list = [df.copy() for df in discriminator_dfs.values()] # Work on copies to avoid modifying original input dict


    if not df_list:
        return {} # Return empty dict if input is empty

    num_discriminators = len(df_list)
    n_rows = df_list[0].shape[0]

    # Initialize the column for updated policies in each DataFrame
    for df in df_list:
        df["updated_disc_policy"] = None

    for i in range(0, n_rows, batch_size):
        current_batch_size = min(batch_size, n_rows - i)
        if current_batch_size < 4: continue

        # Initialize discriminators from the list of DataFrames
        discriminators = [
            [Discriminator(copy.deepcopy(df.loc[i + j, 'disc_init_policy_refine'])) for j in range(current_batch_size)]
            for df in df_list
        ]

        # Initialize choices for the batch
        choices = [
            max(choice_df.loc[i + j, "gen_init_policy"]["correct"],
                key=choice_df.loc[i + j, "gen_init_policy"]["correct"].get)
            for j in range(batch_size)
        ]
        
        for _ in range(T_steps):
            batch_responses = []
            for j in range(current_batch_size):
                task_responses = []
                for d in range(num_discriminators):
                    max_choice = discriminators[d][j].get_max_correct_choice()
                    response = discriminators[d][j].respond(choices[j], max_choice)
                    task_responses.append(response)
                batch_responses.append(task_responses)

            dmi_scores = compute_dmi_payments(batch_responses)
            dmi_scores = normalize_rewards(dmi_scores)

            for d in range(num_discriminators):
                for j in range(current_batch_size):
                    choice = choices[j]
                    response = batch_responses[j][d]
                    reward = dmi_scores.get(d, 0) # Use .get for safety
                    discriminators[d][j].update_policy(choice, response, reward, learning_rate)

        # Save the final updated policies back to the DataFrames in the list
        for d in range(num_discriminators):
            for j in range(current_batch_size):
                df_list[d].at[i + j, "updated_disc_policy"] = discriminators[d][j].policy
    
    # Add the final answer letter based on the updated policy
    for df in df_list:
        df['updated_answer_letter'] = df['updated_disc_policy'].apply(get_most_probable_letter)

    # Reconstruct the dictionary with the original names and updated DataFrames
    updated_dfs_dict = {name: df for name, df in zip(model_names, df_list)}
    
    return updated_dfs_dict

In [16]:

temp_discriminator_dict = {
    "OQwen": temp_df_oqwen,
    "DeepseekQwen": temp_df_qwen,
    "DeepseekLlama": temp_df_llama2,
    # "Gemma_7B":  temp_df_gemma_7B,
    # "Llama3_8B": temp_df_llama3_8b,
    # "Mistral_7B":  temp_df_mistral_7B,
}

discriminator_dict = {
    "OQwen": df_oqwen,
    "DeepseekQwen": df_deepseekqwen,
    "DeepseekLlama": df_deepseekllama,
    # "Gemma_7B": df_gemma_7B,
    # "Llama3_8B": df_llama3_8B,
    # "Mistral_7B": df_mistral_7B,
}


discriminator_dict_update =  batch_update_discriminators(
    discriminator_dict,
    choice_df = df_oqwen, 
    batch_size=8,
    T_steps=5,
    learning_rate=0.1
)
    
combined_df_refine_update, results_df_refine_update = evaluate_majority_vote(
    discriminator_dict_update,  
    answer_col='updated_answer_letter', 
    label_col='answer_letter')

results_df_refine_update

Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,995,85.04%
2,DeepseekQwen Accuracy,1008,86.15%
3,DeepseekLlama Accuracy,1006,85.98%
4,Majority Vote (>1 models) Accuracy,1018,87.01%


In [19]:
## ARC Challenger Iteration Test


temp_discriminator_dict = {
    "OQwen": temp_df_oqwen,
    "DeepseekQwen": temp_df_qwen,
    "DeepseekLlama": temp_df_llama2
}

discriminator_dict = {
    "OQwen": df_oqwen,
    "DeepseekQwen": df_deepseekqwen,
    "DeepseekLlama": df_deepseekllama
}




majority_vote_accuracies = []
for t_step_val in range(20):
    discriminator_dict_update =  batch_update_discriminators(
        discriminator_dict,
        choice_df = df_oqwen, 
        batch_size=8,
        T_steps=t_step_val,
        learning_rate=0.1
    )
        
    combined_df_refine_update, results_df_refine_update = evaluate_majority_vote(
        discriminator_dict_update,  
        answer_col='updated_answer_letter', 
        label_col='answer_letter')



    last_row = results_df_refine_update.iloc[-1]
    majority_vote_accuracy = last_row['Accuracy']
    # print(majority_vote_accuracy)
    majority_vote_accuracies.append(majority_vote_accuracy)

iteration_numbers = list(range(0, 20))
plot_df = pd.DataFrame({
    'Iteration Number (T_steps)': iteration_numbers,
    'Majority Vote Accuracy': majority_vote_accuracies
})
plot_df


Unnamed: 0,Iteration Number (T_steps),Majority Vote Accuracy
0,0,70.51%
1,1,87.01%
2,2,87.01%
3,3,87.01%
4,4,87.01%
5,5,87.01%
6,6,87.01%
7,7,87.01%
8,8,87.01%
9,9,87.01%


## 5 discriminator

In [35]:
temp_discriminator_dict = {
    "OQwen": temp_df_oqwen,
    "DeepseekQwen": temp_df_qwen,
    "Llama3_8B": temp_df_llama3_8b,
    "Gemma_7B":  temp_df_gemma_7B,
    "Mistral_7B":  temp_df_mistral_7B,
}

discriminator_dict = {
    "OQwen": df_oqwen,
    "DeepseekQwen": df_deepseekqwen,
    "Llama3_8B": df_llama3_8B,
    "Gemma_7B": df_gemma_7B,
    "Mistral_7B": df_mistral_7B,
}



In [36]:

combined_df_inital_results_dis, results_df_inital_dis = evaluate_majority_vote(
    model_dfs=temp_discriminator_dict,
    answer_col='disc_init_answer', 
    label_col='answer_letter'
)
print("\nInitial Discriminator Policy")
display(results_df_inital_dis)

combined_df_inital_results_ED, results_df_inital_ED  = evaluate_majority_vote(
    model_dfs=discriminator_dict,
     answer_col='ED_consensus', 
     label_col='answer_letter')

print("\nInitial Consensus Game Policy")
display(results_df_inital_ED)

## ARC Challenger Original Qwen as problem selector

discriminator_dict_update =  batch_update_discriminators(
    discriminator_dict,
    choice_df = df_oqwen, 
    batch_size=8,
    T_steps=11,
    learning_rate=0.1
)
       
combined_df_refine_update, results_df_refine_update = evaluate_majority_vote(
    discriminator_dict_update,  
    answer_col='updated_answer_letter', 
    label_col='answer_letter')

print("\n Updated PEG Policy")
display(results_df_refine_update)



Initial Discriminator Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,970,82.91%
2,DeepseekQwen Accuracy,669,57.18%
3,Llama3_8B Accuracy,890,76.07%
4,Gemma_7B Accuracy,811,69.32%
5,Mistral_7B Accuracy,822,70.26%
6,Majority Vote (>2 models) Accuracy,891,76.15%



Initial Consensus Game Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,1025,87.61%
2,DeepseekQwen Accuracy,822,70.26%
3,Llama3_8B Accuracy,948,81.03%
4,Gemma_7B Accuracy,796,68.03%
5,Mistral_7B Accuracy,854,72.99%
6,Majority Vote (>2 models) Accuracy,939,80.26%



 Updated PEG Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,1013,86.58%
2,DeepseekQwen Accuracy,968,82.74%
3,Llama3_8B Accuracy,1002,85.64%
4,Gemma_7B Accuracy,976,83.42%
5,Mistral_7B Accuracy,963,82.31%
6,Majority Vote (>2 models) Accuracy,1011,86.41%


In [37]:

majority_vote_accuracies = []
for t_step_val in range(50):
    discriminator_dict_update =  batch_update_discriminators(
        discriminator_dict,
        choice_df = df_oqwen, 
        batch_size=8,
        T_steps=t_step_val,
        learning_rate=0.1
    )
        
    combined_df_refine_update, results_df_refine_update = evaluate_majority_vote(
        discriminator_dict_update,  
        answer_col='updated_answer_letter', 
        label_col='answer_letter')



    last_row = results_df_refine_update.iloc[-1]
    majority_vote_accuracy = last_row['Accuracy']
    # print(majority_vote_accuracy)
    majority_vote_accuracies.append(majority_vote_accuracy)

iteration_numbers = list(range(0, 50))
plot_df = pd.DataFrame({
    'Iteration Number (T_steps)': iteration_numbers,
    'Majority Vote Accuracy': majority_vote_accuracies
})
plot_df

Unnamed: 0,Iteration Number (T_steps),Majority Vote Accuracy
0,0,75.98%
1,1,86.92%
2,2,86.75%
3,3,86.58%
4,4,86.58%
5,5,86.58%
6,6,86.32%
7,7,86.32%
8,8,86.58%
9,9,86.07%


## 7 discriminator

In [58]:
temp_discriminator_7dict = {
    "OQwen": temp_df_oqwen,
    "DeepseekQwen": temp_df_qwen,
    "DeepseekLlama": temp_df_llama2,
    "Gemma_7B":  temp_df_gemma_7B,
    "Mistral_7B":  temp_df_mistral_7B,
    "Ai_Yi_9B": temp_df_ai_Yi_9B,
    "Openchat_7B": temp_df_openchat_7B,


}

discriminator_7dict = {
    "OQwen": df_oqwen,
    "DeepseekQwen": df_deepseekqwen,
    "DeepseekLlama": df_deepseekllama,
    "Gemma_7B": df_gemma_7B,
    "Mistral_7B": df_mistral_7B,
    "Ai_Yi_9B": df_ai_Yi_9B,
    "Openchat_7B": df_openchat_7B,
}


In [59]:
combined_df_inital_results_7dis, results_df_inital_7dis = evaluate_majority_vote(
    model_dfs=temp_discriminator_7dict,
    answer_col='disc_init_answer', 
    label_col='answer_letter'
)
print("\nInitial Discriminator Policy")
display(results_df_inital_7dis)

combined_df_inital_results_7ED, results_df_inital_7ED  = evaluate_majority_vote(
    model_dfs=discriminator_7dict,
     answer_col='ED_consensus', 
     label_col='answer_letter')

print("\nInitial Consensus Game Policy")
display(results_df_inital_7ED)

## ARC Challenger Original Qwen as problem selector

discriminator_7dict_update =  batch_update_discriminators(
    discriminator_7dict,
    choice_df = df_oqwen, 
    batch_size=8,
    T_steps=8,
    learning_rate=0.1
)
       
combined_df_refine_7update, results_df_refine_7update = evaluate_majority_vote(
    discriminator_7dict_update,  
    answer_col='updated_answer_letter', 
    label_col='answer_letter')

print("\n Updated PEG Policy")
display(results_df_refine_7update)



Initial Discriminator Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,970,82.91%
2,DeepseekQwen Accuracy,669,57.18%
3,DeepseekLlama Accuracy,700,59.83%
4,Gemma_7B Accuracy,811,69.32%
5,Mistral_7B Accuracy,822,70.26%
6,Ai_Yi_9B Accuracy,951,81.28%
7,Openchat_7B Accuracy,925,79.06%
8,Majority Vote (>3 models) Accuracy,895,76.50%



Initial Consensus Game Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,1025,87.61%
2,DeepseekQwen Accuracy,822,70.26%
3,DeepseekLlama Accuracy,741,63.33%
4,Gemma_7B Accuracy,796,68.03%
5,Mistral_7B Accuracy,854,72.99%
6,Ai_Yi_9B Accuracy,1022,87.35%
7,Openchat_7B Accuracy,984,84.10%
8,Majority Vote (>3 models) Accuracy,954,81.54%



 Updated PEG Policy


Unnamed: 0,Metric,Count,Accuracy
0,Total Questions,1170,100%
1,OQwen Accuracy,944,80.68%
2,DeepseekQwen Accuracy,871,74.44%
3,DeepseekLlama Accuracy,865,73.93%
4,Gemma_7B Accuracy,884,75.56%
5,Mistral_7B Accuracy,925,79.06%
6,Ai_Yi_9B Accuracy,943,80.60%
7,Openchat_7B Accuracy,934,79.83%
8,Majority Vote (>3 models) Accuracy,957,81.79%
