In [None]:
import os
import re
import json
import numpy as np
import pandas as pd

In [None]:
# Results folder for a single model (gemma-2-9b-it). This is a hard-coded,
# machine-specific absolute path; edit it before running on another machine.
# NOTE(review): a later cell rebinds `model_path` inside its per-model loop,
# so this value only holds until that cell runs.
model_path = r"C:\Users\user\Desktop\temp\rolevectors_results\gemma-2-9b-it"

In [None]:
# Mapping from dataset category to the list of roles assigned to it.
# NOTE(review): this dict is defined again in a later cell, where the key is
# spelled "natural_science" (underscore) instead of "natural science" (space).
# Whichever cell ran last wins; the two spellings should be reconciled.
ROLE_DATASET_MAPPING = {
    "econ": ["economic researcher", "economist", "financial analyst"],
    "eecs": ["electronics technician", "data scientist", "electrical engineer", "software engineer", "web developer"],
    "law": ["bailiff", "lawyer"],
    "math": ["data analyst", "mathematician", "statistician"],
    "medicine": ["nurse", "doctor", "physician", "dentist", "surgeon"],
    "natural science": ["geneticist", "biologist", "physicist", "teacher", "chemist", "ecologist"],
    "politics": ["politician", "sheriff", "governor", "enthusiast", "partisan"],
    "psychology": ["psychologist"]
}

In [None]:
import pandas as pd
import numpy as np

def format_baseline(df):
    """
    Return a copy of ``df`` whose ``'baseline'`` sub-columns are rendered as
    strings with exactly two decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level MultiIndex columns (dataset, version). Every cell
        of a ``(dataset, 'baseline')`` column must be convertible by ``float()``.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``df`` itself is left untouched.

    Raises
    ------
    ValueError, TypeError
        If a baseline cell cannot be converted with ``float()``.
    """
    # Work on a copy so the caller's frame is not modified.
    df_formatted = df.copy()

    # Iterate over the top-level labels that are actually present in the
    # columns. ``columns.levels[0]`` may still list labels that were sliced
    # away, and indexing the frame with such a label raises KeyError.
    for dataset in df.columns.get_level_values(0).unique():
        if 'baseline' in df[dataset].columns:
            # Always show two decimals, e.g. 0.5 -> "0.50".
            df_formatted[(dataset, 'baseline')] = df[dataset]['baseline'].apply(
                lambda x: f"{float(x):.2f}"
            )

    return df_formatted

def color_columns(df):
    """
    Build a DataFrame of CSS strings that colours score cells relative to the
    ``'baseline'`` column of the same dataset.

    For every top-level dataset that has a ``'baseline'`` column and at least
    one of ``'1.0'`` / ``'ablation'``, the ``'1.0'``, ``'ablation'`` and
    ``'3.0'`` columns (whichever exist) are coloured: green when the value is
    above the baseline, red/pink otherwise. The stronger colour is used when
    the relative difference exceeds 5 %. Every other cell only gets a border.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level MultiIndex columns (dataset, version). Cells may
        be numbers or strings such as ``"0.61"`` or ``"0.61 ± 0.02"``.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as ``df``; each cell is a CSS declaration string.
        NOTE(review): presumably meant for ``df.style.apply(color_columns,
        axis=None)`` - confirm against the caller.
    """
    border = 'border: 1px solid #000000'

    def as_number(cell):
        # Only the part before "±" (the mean) takes part in the comparison.
        if isinstance(cell, str):
            return float(cell.split('±')[0].strip())
        return cell

    def color_value(val, baseline):
        val = as_number(val)
        baseline = as_number(baseline)

        # Missing scores cannot be compared. Without this guard the CSS would
        # contain "rgba(..., nan)", which is not a valid colour.
        if pd.isna(val) or pd.isna(baseline):
            return border

        val = round(val, 2)
        baseline = round(baseline, 2)
        diff = val - baseline

        # Relative difference in percent; fall back to the absolute
        # difference (scaled by 100) when the baseline is exactly zero.
        rel_diff = (diff / baseline) * 100 if baseline != 0 else diff * 100

        # Logarithmic scaling keeps large differences from saturating the
        # colour; the alpha value is clipped to [0.1, 0.5].
        intensity = float(np.clip(np.log1p(abs(rel_diff)) / 10, 0.1, 0.5))

        if diff > 0:
            rgb = '0, 200, 0' if rel_diff > 5 else '144, 238, 144'
        else:
            # A zero difference also lands here and gets the pale-red shade.
            rgb = '255, 0, 0' if rel_diff < -5 else '255, 182, 193'
        return f'background-color: rgba({rgb}, {intensity}); color: black; {border}'

    def style_df(x):
        # Start with the border on every cell so unstyled cells still get it.
        df_styled = pd.DataFrame(border, index=x.index, columns=x.columns)

        # Use the labels actually present; ``levels[0]`` can contain labels
        # that no column uses, which would raise KeyError on ``x[dataset]``.
        for dataset in x.columns.get_level_values(0).unique():
            sub_columns = x[dataset].columns
            if 'baseline' not in sub_columns:
                continue
            # '3.0' is only coloured together with '1.0' or 'ablation'.
            if '1.0' not in sub_columns and 'ablation' not in sub_columns:
                continue

            targets = [c for c in ('1.0', 'ablation', '3.0') if c in sub_columns]
            for idx in x.index:
                base = x.loc[idx, (dataset, 'baseline')]
                for column in targets:
                    df_styled.loc[idx, (dataset, column)] = color_value(
                        x.loc[idx, (dataset, column)], base
                    )

        return df_styled

    return style_df(df)


In [None]:
import os
import re
import json
import numpy as np
import pandas as pd

# Mapping from dataset category to list of roles.
# get_dataset_category() searches these lists to find a role's category, and
# main() joins the returned key onto the role folder to locate the role's
# primary dataset directory, so each key must match a folder name on disk.
# NOTE(review): an earlier cell defines the same name with the key
# "natural science" (space) instead of "natural_science"; keep only one.
ROLE_DATASET_MAPPING = {
    "econ": ["economic researcher", "economist", "financial analyst"],
    "eecs": ["electronics technician", "data scientist", "electrical engineer", "software engineer", "web developer"],
    "law": ["bailiff", "lawyer"],
    "math": ["data analyst", "mathematician", "statistician"],
    "medicine": ["nurse", "doctor", "physician", "dentist", "surgeon"],
    "natural_science": ["geneticist", "biologist", "physicist", "teacher", "chemist", "ecologist"],
    "politics": ["politician", "sheriff", "governor", "enthusiast", "partisan"],
    "psychology": ["psychologist"]
}

def get_dataset_category(role):
    """
    Look up which dataset category a role belongs to.

    Scans ROLE_DATASET_MAPPING in insertion order and returns the key of the
    first category whose role list contains ``role``; returns None when the
    role is not listed anywhere.
    """
    matches = (
        category
        for category, members in ROLE_DATASET_MAPPING.items()
        if role in members
    )
    return next(matches, None)

def compute_mean_score(filepath):
    """
    Load a JSON file holding a list of dicts and average their 'score' values.

    Entries without a 'score' key are ignored. The mean is rounded to two
    decimals. NaN is returned when no entry carries a score, or when anything
    goes wrong while reading/processing the file (the error is printed).
    """
    try:
        with open(filepath, "r") as handle:
            records = json.load(handle)

        values = []
        for record in records:
            if "score" in record:
                values.append(record["score"])

        if not values:
            return np.nan
        return round(np.mean(values), 2)
    except Exception as e:
        # Best effort: report the problem and let the caller see a NaN.
        print(f"Error reading {filepath}: {e}")
        return np.nan

def get_baseline(select_dir):
    """
    Mean baseline score for one run-version folder.

    Looks for ``results_baseline.json`` directly inside ``select_dir``
    (e.g. ".../1.0/select_direction") and returns its mean score via
    compute_mean_score(); returns NaN when the file is absent.
    """
    baseline_file = os.path.join(select_dir, "results_baseline.json")
    if not os.path.exists(baseline_file):
        return np.nan
    return compute_mean_score(baseline_file)

def get_addition_scores(select_dir, layer_range=False, start_percent=50):
    """
    Aggregate the addition scores found below ``select_dir``.

    ``select_dir`` is expected to contain sub-folders named with an (optionally
    negative) integer, each holding sub-folders named with a non-negative
    integer. The inner folders are sorted numerically and a subset is kept:

    * ``layer_range=True``: from ``start_percent`` % up to 80 % of them
      (always at least one folder);
    * ``layer_range=False``: the first 80 % of them (at least one).

    Every ``results_addition*.json`` file in the kept folders contributes its
    mean score (NaN results are dropped).

    Returns
    -------
    tuple
        ``(mean, std)`` over the collected file means, or ``(nan, nan)`` when
        nothing was collected.
    """
    collected = []
    for outer_name in os.listdir(select_dir):
        outer_path = os.path.join(select_dir, outer_name)
        if not (os.path.isdir(outer_path) and re.match(r"^-?\d+$", outer_name)):
            continue

        numeric_children = [
            name for name in os.listdir(outer_path)
            if os.path.isdir(os.path.join(outer_path, name)) and re.match(r"^\d+$", name)
        ]
        numeric_children.sort(key=int)
        total = len(numeric_children)
        if total == 0:
            continue

        # Upper cut-off shared by both modes: 80 % of the folders, rounded up.
        upper = int(np.ceil(total * 0.8))
        if layer_range:
            lower = int(np.floor(total * (start_percent / 100)))
            # Guarantee a non-empty slice even for large start_percent values.
            lower = min(lower, upper - 1)
            chosen = numeric_children[lower:upper]
        else:
            chosen = numeric_children[:max(1, upper)]

        for child in chosen:
            child_path = os.path.join(outer_path, child)
            for filename in os.listdir(child_path):
                if not (filename.startswith("results_addition") and filename.endswith(".json")):
                    continue
                file_mean = compute_mean_score(os.path.join(child_path, filename))
                if not np.isnan(file_mean):
                    collected.append(file_mean)

    if not collected:
        return np.nan, np.nan
    return np.mean(collected), np.std(collected)

def get_select_best_addition_score(version_path):
    """
    Addition score of the single direction named in the run's metadata.

    Reads ``{version_path}/direction_metadata.json`` for the keys 'pos' and
    'layer', then scores
    ``{version_path}/select_direction/{pos}/{layer}/results_addition_{pos}_{layer}.json``.

    Returns
    -------
    tuple
        ``(score, nan)``; the second slot is always NaN because a single file
        has no spread. ``(nan, nan)`` is returned (after printing a message)
        when the metadata is missing, unreadable or incomplete, or when the
        results file does not exist.
    """
    nothing = (np.nan, np.nan)

    metadata_path = os.path.join(version_path, "direction_metadata.json")
    if not os.path.exists(metadata_path):
        print(f"Warning: metadata file not found in {metadata_path}")
        return nothing

    try:
        with open(metadata_path, "r") as handle:
            metadata = json.load(handle)
    except Exception as e:
        print(f"Error reading metadata file {metadata_path}: {e}")
        return nothing

    if not ("pos" in metadata and "layer" in metadata):
        print(f"Warning: metadata file {metadata_path} missing 'pos' or 'layer'")
        return nothing

    pos, layer = str(metadata["pos"]), str(metadata["layer"])
    addition_file = os.path.join(
        version_path, "select_direction", pos, layer,
        f"results_addition_{pos}_{layer}.json",
    )
    if not os.path.exists(addition_file):
        print(f"Warning: addition file not found: {addition_file}")
        return nothing

    # One file only, so there is a mean but no standard deviation.
    return compute_mean_score(addition_file), np.nan

def process_run_version(version_path, layer_range=False, start_percent=50, select_best=False):
    """
    Compute the baseline and addition scores of one run-version folder.

    Returns ``(baseline, (addition_mean, addition_std))``. Everything is NaN
    when ``{version_path}/select_direction`` does not exist. The baseline is
    only read when the folder is literally named "1.0". With
    ``select_best=True`` the addition score comes from the direction named in
    the metadata file; otherwise it is aggregated over the layer folders.
    """
    select_dir = os.path.join(version_path, "select_direction")
    if not os.path.exists(select_dir):
        return (np.nan, (np.nan, np.nan))

    # Only version "1.0" is expected to have a baseline.
    if os.path.basename(version_path) == "1.0":
        base_val = get_baseline(select_dir)
    else:
        base_val = np.nan

    if select_best:
        addition = get_select_best_addition_score(version_path)
    else:
        addition = get_addition_scores(select_dir, layer_range, start_percent)
    addition_mean, addition_std = addition

    return base_val, (addition_mean, addition_std)

def _read_json(path):
    """Load and return the parsed content of the JSON file at ``path``."""
    with open(path, "r") as f:
        return json.load(f)


def _has_keys(entry, keys):
    """Return True when every name in ``keys`` is contained in ``entry``."""
    return all(key in entry for key in keys)


def _top_steering_entry(entries):
    """Return the first entry with the highest steering score, or None if no entry qualifies."""
    best_entry = None
    best_score = -float("inf")
    for entry in entries:
        if _has_keys(entry, ("steering_performance_score", "position", "layer")):
            score = entry["steering_performance_score"]
            if score > best_score:
                best_score = score
                best_entry = entry
    return best_entry


def get_best_direction(role, primary_dataset_path, version, baseline):
    """
    Pick the (position, layer) of the best direction for one role and version.

    The evaluations are read from
       {primary_dataset_path}/{version}/select_direction/direction_evaluations_filtered.json

    For version "3.0" the ablation scores are not taken from that file but
    looked up, per (position, layer), in the version "1.0" file:
       {primary_dataset_path}/1.0/select_direction/direction_evaluations_filtered.json
    Entries of the 3.0 file whose (position, layer) is absent from the 1.0
    file are skipped in the first pass.

    Selection:
      1. The first entry (in file order) with
         steering_performance_score >= baseline and
         ablation_performance_score < baseline.
      2. If none qualifies and version is "3.0" or "1.0": the entry with the
         highest steering_performance_score in the version's own file.
      3. Any other version without a qualifying entry yields no result.

    Parameters
    ----------
    role : str
        Only used in the printed messages.
    primary_dataset_path : str
        Folder that contains the version sub-folders.
    version : str
        Version folder name, e.g. "1.0" or "3.0".
    baseline : float
        Threshold for both comparisons. With a NaN baseline no entry passes
        step 1, so only the fallback of step 2 can produce a result.

    Returns
    -------
    tuple
        ``(position, layer)`` as strings, or ``(None, None)`` when a file is
        missing, cannot be processed, or no entry can be selected. A message
        is printed in each of those cases; no exception is propagated.
    """
    filename = "direction_evaluations_filtered.json"
    eval_file_path = os.path.join(primary_dataset_path, version, "select_direction", filename)

    if not os.path.exists(eval_file_path):
        print(f"Warning: filtered evaluations file not found for role '{role}' in {eval_file_path}")
        return None, None

    if version == "3.0":
        # Ablation values come from the version "1.0" file.
        baseline_eval_file_path = os.path.join(primary_dataset_path, "1.0", "select_direction", filename)
        if not os.path.exists(baseline_eval_file_path):
            print(f"Warning: baseline filtered evaluations file not found for role '{role}' in {baseline_eval_file_path}")
            return None, None

        try:
            eval_entries = _read_json(eval_file_path)
            baseline_entries = _read_json(baseline_eval_file_path)

            # (position, layer) -> ablation score from the 1.0 file.
            baseline_ablation = {
                (str(entry["position"]), str(entry["layer"])): entry["ablation_performance_score"]
                for entry in baseline_entries
                if _has_keys(entry, ("position", "layer", "ablation_performance_score"))
            }

            # First pass: steering helps and ablation hurts.
            for entry in eval_entries:
                if not _has_keys(entry, ("steering_performance_score", "position", "layer")):
                    continue
                pos = str(entry["position"])
                layer = str(entry["layer"])
                if (pos, layer) not in baseline_ablation:
                    continue
                if (entry["steering_performance_score"] >= baseline
                        and baseline_ablation[(pos, layer)] < baseline):
                    return pos, layer

            # Fallback: highest steering score, regardless of ablation.
            best_entry = _top_steering_entry(eval_entries)
            if best_entry is not None:
                return str(best_entry["position"]), str(best_entry["layer"])

            print(f"Warning: No valid entries found in {eval_file_path}.")
            return None, None
        except Exception as e:
            print(f"Error reading evaluations for role '{role}' in version '3.0': {e}")
            return None, None

    # Versions other than "3.0": steering and ablation live in the same file.
    try:
        evaluations = _read_json(eval_file_path)

        for entry in evaluations:
            if not _has_keys(entry, ("steering_performance_score", "ablation_performance_score",
                                     "position", "layer")):
                continue
            if (entry["steering_performance_score"] >= baseline
                    and entry["ablation_performance_score"] < baseline):
                return str(entry["position"]), str(entry["layer"])

        # Only version "1.0" relaxes the ablation requirement.
        if version == "1.0":
            best_entry = _top_steering_entry(evaluations)
            if best_entry is not None:
                return str(best_entry["position"]), str(best_entry["layer"])

        print(f"Warning: No evaluation entry in {eval_file_path} satisfies the conditions.")
        return None, None
    except Exception as e:
        print(f"Error reading evaluations for role '{role}' in {eval_file_path}: {e}")
        return None, None


def get_score_for_dataset_with_direction(model_name, role, dataset, version, pos, layer):
    """
    Mean addition score of one direction on one dataset.

    Scores the file
       {model_name}/{role}/{dataset}/{version}/select_direction/{pos}/{layer}/results_addition_{pos}_{layer}.json
    with compute_mean_score(). When the file does not exist a warning is
    printed and NaN is returned.
    """
    path_parts = (
        model_name, role, dataset, version, "select_direction", pos, layer,
        f"results_addition_{pos}_{layer}.json",
    )
    file_path = os.path.join(*path_parts)
    if os.path.exists(file_path):
        return compute_mean_score(file_path)
    print(f"Warning: score file not found: {file_path}")
    return np.nan

def get_score_for_dataset_with_direction_ablation(model_name, role, dataset, version, pos, layer):
    """
    Mean ablation score of one direction on one dataset.

    Scores the file
       {model_name}/{role}/{dataset}/{version}/select_direction/{pos}/{layer}/results_ablation_{pos}_{layer}.json
    with compute_mean_score(). When the file does not exist a warning is
    printed and NaN is returned.
    """
    path_parts = (
        model_name, role, dataset, version, "select_direction", pos, layer,
        f"results_ablation_{pos}_{layer}.json",
    )
    file_path = os.path.join(*path_parts)
    if os.path.exists(file_path):
        return compute_mean_score(file_path)
    print(f"Warning: score file not found: {file_path}")
    return np.nan

def _version_sort_key(label):
    """Sort key for column labels: "baseline" first, numeric versions ascending, other labels last."""
    if label == "baseline":
        return (0, 0)
    try:
        return (1, float(label))
    except (TypeError, ValueError):
        # Separate bucket so text labels are never compared against floats.
        return (2, label)


def _collect_role_results(model_name, ablation):
    """Gather, per role and dataset folder, the scores of the role's best direction(s)."""
    roles = [d for d in os.listdir(model_name) if os.path.isdir(os.path.join(model_name, d))]
    results = {}
    for role in roles:
        role_path = os.path.join(model_name, role)
        primary_dataset = get_dataset_category(role)
        if primary_dataset is None:
            print(f"Warning: No primary dataset mapping for role '{role}'. Skipping best-direction evaluation for this role.")
            continue
        primary_dataset_path = os.path.join(role_path, primary_dataset)
        if not os.path.exists(primary_dataset_path):
            print(f"Warning: Primary dataset folder '{primary_dataset_path}' not found for role '{role}'. Skipping.")
            continue

        # Baseline of the primary dataset (version "1.0"); used as the
        # threshold when choosing the best direction.
        baseline_value = get_baseline(os.path.join(primary_dataset_path, "1.0", "select_direction"))

        # Version folders such as "1.0", "3.0", in numeric order.
        primary_versions = sorted(
            (v for v in os.listdir(primary_dataset_path) if re.match(r"^\d+\.\d+$", v)),
            key=float,
        )
        # Every dataset folder of the role, minus known non-performance folders.
        all_datasets = [
            d for d in os.listdir(role_path)
            if os.path.isdir(os.path.join(role_path, d))
            and d not in ["generate_directions", "test_direction"]
        ]

        ds_result = {ds: {} for ds in all_datasets}
        for ver in primary_versions:
            # Ablation mode never evaluates version "3.0".
            if ablation and ver == "3.0":
                continue
            pos, layer = get_best_direction(role, primary_dataset_path, ver, baseline_value)
            if pos is None or layer is None:
                print(f"Skipping version {ver} for role '{role}' due to missing best direction.")
                continue

            # Ablation mode writes every version into the single "ablation"
            # column, so a later version overwrites an earlier one.
            column = "ablation" if ablation else ver
            for ds in all_datasets:
                if os.path.exists(os.path.join(role_path, ds, ver)):
                    if ablation:
                        score = get_score_for_dataset_with_direction_ablation(
                            model_name, role, ds, ver, pos, layer)
                    else:
                        score = get_score_for_dataset_with_direction(
                            model_name, role, ds, ver, pos, layer)
                    ds_result[ds][column] = score
                else:
                    ds_result[ds][column] = "nan"

                # Each dataset's own baseline is recorded alongside version "1.0".
                if ver == "1.0":
                    select_dir = os.path.join(model_name, role, ds, ver, "select_direction")
                    baseline_score = get_baseline(select_dir)
                    ds_result[ds]["baseline"] = (
                        f"{baseline_score:.2f}" if not np.isnan(baseline_score) else "nan"
                    )
        results[role] = ds_result
    return results


def _results_to_frame(results):
    """Turn ``{role: {dataset: {label: value}}}`` into a frame with (Dataset, Version) columns."""
    all_dataset_set = set()
    all_version_set = set()
    for ds_dict in results.values():
        for ds, ver_dict in ds_dict.items():
            all_dataset_set.add(ds)
            all_version_set.update(ver_dict.keys())

    cols = pd.MultiIndex.from_product(
        [sorted(all_dataset_set), sorted(all_version_set, key=_version_sort_key)],
        names=["Dataset", "Version"],
    )
    df = pd.DataFrame(index=sorted(results.keys()), columns=cols)
    for role, ds_dict in results.items():
        for ds, ver_dict in ds_dict.items():
            for ver, value in ver_dict.items():
                df.loc[role, (ds, ver)] = value
    return df


def main(model_name, layer_range=False, start_percent=50, select_best=False, select_best_role=False, select_best_role_ablation=False):
    """
    Build a MultiIndex DataFrame reporting performance scores per role.

    Only two modes are implemented:

    ``select_best_role=True``
      For each role folder under ``model_name``:
        1. The "primary" dataset is looked up with get_dataset_category().
        2. The version folders of {model}/{role}/{primary_dataset} are listed.
        3. For each version the best direction is chosen with
           get_best_direction(), using the primary dataset's baseline.
        4. For every dataset folder under {model}/{role} (except
           "generate_directions" and "test_direction") that contains the
           version folder, the score is read from
              {model}/{role}/{dataset}/{version}/select_direction/{pos}/{layer}/results_addition_{pos}_{layer}.json
        5. While processing version "1.0", each dataset's own baseline is
           stored under the label "baseline".

    ``select_best_role_ablation=True``
      Same procedure, but version "3.0" is skipped, scores are read from the
      ``results_ablation_{pos}_{layer}.json`` files and stored under the
      single label "ablation".

    Parameters
    ----------
    model_name : str
        Path of the model folder that contains one sub-folder per role.
    layer_range, select_best : bool
        Accepted for interface compatibility; these modes are not implemented.
    start_percent : int
        Unused by the implemented modes.
    select_best_role, select_best_role_ablation : bool
        Mode switches described above.

    Returns
    -------
    pandas.DataFrame
        Rows are roles (sorted). Columns are a MultiIndex with level
        "Dataset" (dataset folder name) and level "Version" ("baseline"
        first, then the other labels). Missing scores appear as NaN or as
        the string "nan".

    Raises
    ------
    ValueError
        If more than one mode flag is set.
    NotImplementedError
        If neither ``select_best_role`` nor ``select_best_role_ablation`` is set.
    """
    if sum([bool(layer_range), bool(select_best), bool(select_best_role), bool(select_best_role_ablation)]) > 1:
        raise ValueError("Only one of layer_range, select_best, select_best_role, or select_best_role_ablation can be True.")

    if select_best_role:
        return _results_to_frame(_collect_role_results(model_name, ablation=False))
    if select_best_role_ablation:
        return _results_to_frame(_collect_role_results(model_name, ablation=True))

    raise NotImplementedError("Other modes (non-select_best_role) are not implemented in this snippet.")


In [None]:
import os
import numpy as np
import pandas as pd


def transform_multiindex_df(df):
    """
    Flatten the relative score changes of a (Dataset, Version) frame.

    For every dataset that has a "baseline" column, each other column of that
    dataset (e.g. "1.0", "3.0", "ablation") is turned into

         (version_score - baseline) / baseline * 100

    after converting both columns to numbers (unparseable cells become NaN).
    Datasets without a "baseline" column are skipped with a printed message.

    Returns
    -------
    pandas.Series
        All computed differences, row by row (for each row, the difference
        columns in creation order), with a fresh integer index.
    """
    diff_df = pd.DataFrame(index=df.index)

    for ds in df.columns.get_level_values(0).unique():
        version_labels = df[ds].columns
        if "baseline" not in version_labels:
            print(f"Dataset {ds} does not have a baseline column; skipping difference computation for it.")
            continue

        # Scores are stored as strings; convert before doing arithmetic.
        baseline = pd.to_numeric(df[(ds, "baseline")], errors="coerce")
        for ver in version_labels:
            if ver != "baseline":
                scores = pd.to_numeric(df[(ds, ver)], errors="coerce")
                diff_df[f"{ds}_{ver}"] = (scores - baseline) / baseline * 100

    # Row-major flattening: all difference columns of row 1, then row 2, ...
    flat_values = [
        diff_df.loc[row, col]
        for row in diff_df.index
        for col in diff_df.columns
    ]
    return pd.Series(flat_values)



def create_models_matrix(models_series):
    """
    Assemble per-model difference vectors into one DataFrame.

    Parameters
    ----------
    models_series : dict
        Maps a model name to its flattened Series of differences.

    Returns
    -------
    pd.DataFrame
        One column per model (in the mapping's order) and one row per position
        in the flattened vectors. Shorter vectors are padded with NaN at the end.
        An empty mapping gives an empty DataFrame instead of raising.
    """
    # Drop each Series' own index so values line up by position; pandas then
    # aligns the columns and fills the missing tail of shorter ones with NaN
    # (this also works for integer data, which cannot hold NaN in a numpy pad).
    columns = {
        model: series.reset_index(drop=True)
        for model, series in models_series.items()
    }
    return pd.DataFrame(columns)


# Replace with the parent folder that contains your models
# (one sub-directory of results per model).
models_parent_dir = r"C:\Users\user\Desktop\temp\rolevectors_results"
# Model names are the sub-directory names. os.listdir returns entries in
# arbitrary order, so sort them to make the run order reproducible.
models = sorted(
    d for d in os.listdir(models_parent_dir)
    if os.path.isdir(os.path.join(models_parent_dir, d))
)

all_model_series = {}

for model in models:
    # Deliberately not called `model_path`: an earlier cell defines a global of
    # that name, and overwriting it here would silently change what that cell's
    # dependants see when they are re-run.
    model_dir = os.path.join(models_parent_dir, model)
    try:
        # main() is assumed to return a multi-index (dataset, version) DataFrame
        # when called with select_best_role=True.
        df_multi = main(model_dir, select_best_role=True)
    except Exception as e:
        # Best effort: a model whose results cannot be loaded is left out.
        print(f"Skipping model {model} due to error: {e}")
        continue

    # Percentage differences relative to the baseline, flattened to one vector.
    diff_series = transform_multiindex_df(df_multi)
    all_model_series[model] = diff_series

if not all_model_series:
    # Without this guard the failure surfaces as an opaque max() error.
    raise RuntimeError(
        f"No model under {models_parent_dir!r} produced results; nothing to compare."
    )

# Final matrix: each column is the flattened differences for one model.
final_matrix = create_models_matrix(all_model_series)

# For example, print or save the matrix:
print(final_matrix)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Assume that final_matrix is a DataFrame where each column represents a model.
# final_matrix = pd.DataFrame({...})

# 1. Select the models, fix their order, and shorten their names for display.
# Keys are the names used as final_matrix columns, values are the labels shown
# in the figure; the dict order is the plotting order.
display_names = {
    'gemma-2-2b-it': 'gemma-2-2b',
    'gemma-2-9b-it': 'gemma-2-9b',
    'Llama-3.2-1B-Instruct': 'Llama-3.2-1B',
    'Llama-3.2-3B-Instruct': 'Llama-3.2-3B',
    'Llama-3.1-8B-Instruct': 'Llama-3.1-8B',
    'Qwen-1_8B-Chat': 'Qwen-1.8B',
    'Qwen-7B-Chat': 'Qwen-7B',
}
new_order = list(display_names)
# Rename first, then select by display name: rename() leaves columns that already
# carry their display name untouched, so re-running this cell gives the same
# result instead of raising a KeyError on the original names.
final_matrix = final_matrix.rename(columns=display_names)[list(display_names.values())]

# Update the list of models
models = final_matrix.columns

# 2. Compute the correlation matrix.
corr_matrix = final_matrix.corr()

# 3. Compute p-values for each correlation pair.
p_values = pd.DataFrame(index=models, columns=models, dtype=float)

for i in models:
    for j in models:
        # Keep only the positions where both models have a value.
        df_pair = pd.concat([final_matrix[i], final_matrix[j]], axis=1).dropna()
        if df_pair.shape[0] > 1:
            # Only the p-value is needed; the coefficient comes from corr_matrix.
            _, p = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])
        else:
            # pearsonr needs at least two observations.
            p = np.nan
        p_values.loc[i, j] = p

# 4. Create an annotation matrix.
def significance_stars(p):
    """
    Translate a p-value into star notation.

    Returns "***" for p < 0.001, "**" for p < 0.01, "*" for p < 0.05,
    and an empty string otherwise or when ``p`` is missing (NaN/None).
    """
    if pd.isna(p):
        return ""
    # Thresholds are checked from strictest to loosest; the first match wins.
    for threshold, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p < threshold:
            return stars
    return ""

# Annotation text per cell: the correlation with two decimals followed by its
# significance stars.
annot = pd.DataFrame(
    [
        [
            f"{corr_matrix.loc[i, j]:.2f}{significance_stars(p_values.loc[i, j])}"
            for j in models
        ]
        for i in models
    ],
    index=corr_matrix.index,
    columns=corr_matrix.columns,
)

# 5. Define the color map (a linear ramp from white to blue).
cmap_sym = mcolors.LinearSegmentedColormap.from_list('red_white_blue', ['white', 'blue'])

# Hide the upper triangle (diagonal included) in both the colors and the labels.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
annot_masked = annot.mask(mask, "")

# Set the font scale and seaborn style (white background).
sns.set(font_scale=1.5)
sns.set_style("white")

# 6. Display the heatmap without grid and with a white background.
heatmap_options = dict(
    cmap=cmap_sym,
    vmin=-0.1,
    vmax=1,
    center=0.5,
    annot=annot_masked,
    fmt="",  # annotations are already formatted strings
    mask=mask,
    cbar=False,
    annot_kws={"size": 16},
    linewidths=0,  # Remove cell lines
)
plt.figure(figsize=(10, 8))
ax = sns.heatmap(corr_matrix, **heatmap_options)

# Ensure the axes background is white.
ax.set_facecolor("white")
ax.grid(False)

plt.xticks(rotation=45, ha='right', fontsize=16)
plt.yticks(rotation=0, fontsize=16)

plt.tight_layout()
plt.show()
