# Analysis for experiments

In [2]:
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def load_results(file_path):
    with open(file_path, "r") as f:
        results = json.load(f)
    return results

def calculate_mach_iv_score(results):
    mach_iv_score = 20
    for question_id, result in results.items():
        selected_score = result["selected_score"]
        mach_iv_score += selected_score
    return mach_iv_score

def get_all_results(base_dir, models, personas, iterations):
    all_results = {}
    for model_dir in models:
        for persona_dir in personas:
            persona_name = os.path.basename(persona_dir)
            for iteration in range(1, iterations + 1):
                results_file = os.path.join(base_dir, model_dir, persona_name, f"mach_iv_scores_v{iteration}.json")
                if os.path.isfile(results_file):
                    results = load_results(results_file)
                    all_results[f"{model_dir}_{persona_name}_{iteration}"] = results
    return all_results

def create_scores_dataframe(all_results):
    data = []
    for model_persona_iteration, results in all_results.items():
        model, persona, iteration = model_persona_iteration.split("_")
        mach_iv_score = calculate_mach_iv_score(results)
        for question_id, result in results.items():
            selected_score = result["selected_score"]
            data.append({"Model": model, "Persona": persona, "Iteration": iteration, "Question": question_id, "Score": selected_score, "MACH-IV Score": mach_iv_score})
    df = pd.DataFrame(data)
    return df

# Set parameters
models = ["gpt-3.5-turbo-0125", "gemini-pro"]
personas = ['data/personas/mach_persona_0', 'data/personas/mach_persona_1', 'data/personas/mach_persona_2', 'data/personas/mach_persona_3', 'data/personas/mach_persona_4']
iterations = 5

# Load all results
base_dir = "../results"
all_results = get_all_results(base_dir, models, personas, iterations)

# Create scores dataframe
df = create_scores_dataframe(all_results)

# Plot MACH-IV scores for each model, persona, and iteration
plt.figure(figsize=(16, 8))
sns.barplot(x="Model", y="MACH-IV Score", hue="Persona", data=df, ci=None)
plt.title("MACH-IV Scores for Each Model, Persona, and Iteration")
plt.xlabel("Model")
plt.ylabel("MACH-IV Score")
plt.xticks(rotation=45)
plt.legend(title="Persona")
plt.tight_layout()
plt.show()

# Plot score distribution for each model, persona, and iteration
plt.figure(figsize=(16, 10))
sns.boxplot(x="Model", y="Score", hue="Persona", data=df)
plt.title("Score Distribution for Each Model, Persona, and Iteration")
plt.xlabel("Model")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.legend(title="Persona")
plt.tight_layout()
plt.show()

# Print MACH-IV scores for each model, persona, and iteration
print("MACH-IV Scores:")
for model_persona_iteration, results in all_results.items():
    mach_iv_score = calculate_mach_iv_score(results)
    print(f"{model_persona_iteration}: {mach_iv_score}")

ValueError: too many values to unpack (expected 3)

Run the log probs of the distribution 
temp = 1 and yse that variance 
The most interested is self reinforcing 
if a model's behavior, does it become more machivelian 
I want to see htat if you just sample questions and get the model answers, would it become machievelian 
