# Test preparation: handle imports and load files

In [None]:
# --- Install basic libraries
!pip install pandas

# --- Import libraries
import math
import os
import io
import re
import json
from google.colab import files
import joblib

from scipy import stats
import pandas as pd
from pandas.api.types import is_float_dtype
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

from collections import defaultdict
from collections import Counter
from scipy.stats import entropy
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Upload one or more JSON files (opens the file picker in Colab)
uploaded = files.upload()

# Collect the tasks of every uploaded JSON file into one flat list
all_tasks = []

for filename in uploaded:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others
        print(f"Failed to load {filename}: {e}")
    else:
        if isinstance(data, list):
            all_tasks.extend(data)
        else:
            print(f"Skipped {filename} (not a list at top level)")

# Drop tasks that carry no event at all
all_tasks = [task for task in all_tasks if len(task.get("events", [])) > 0]

print(f"Loaded {len(all_tasks)} total tasks from {len(uploaded)} files.")

# Helper functions for plotting metrics, plus functions reused in other sections

In [None]:
def sort_columns(df, tools_order=["WhatIf", "HypotheX"]):
    """
    Return a copy of ``df`` with its columns arranged in a canonical layout.

    Layout: 'participant_id' and 'task_number' first (when present), then every
    column whose name contains no underscore (original relative order), then the
    '<metric>_<tool>' columns grouped by metric name (alphabetical) with the
    tools of each metric in ``tools_order``.

    Columns named '<metric>_<tool>' whose tool suffix is not listed in
    ``tools_order`` are not part of the returned frame.
    """
    frame = df.copy()
    names = frame.columns.tolist()
    id_names = ("participant_id", "task_number")
    suffix_pattern = re.compile(r"(.+)_([A-Za-z0-9]+)$")

    # Identifier columns always lead, in this fixed order.
    layout = [name for name in id_names if name in frame.columns]

    # Underscore-free columns follow, keeping their original relative order.
    layout += [name for name in names if "_" not in name and name not in id_names]

    # metric name -> {tool name: full column name}
    by_metric = {}
    for name in names:
        if "_" not in name:
            continue
        parsed = suffix_pattern.match(name)
        if parsed is None:
            continue
        metric, tool = parsed.groups()
        by_metric.setdefault(metric, {})[tool] = name

    for metric in sorted(by_metric):
        tool_columns = by_metric[metric]
        for tool in tools_order:
            column = tool_columns.get(tool)
            if column:
                layout.append(column)

    return frame[[name for name in layout if name in frame.columns]]


def get_safe_aggfuncs(df, default_func="mean"):
    """
    Build a ``{column: default_func}`` mapping covering only the numeric
    columns of ``df``; columns of any other dtype are left out.
    """
    aggfuncs = {}
    for name in df.columns:
        if not pd.api.types.is_numeric_dtype(df[name]):
            continue
        aggfuncs[name] = default_func
    return aggfuncs

def get_metric_aggfunc_map(columns, default_func="mean"):
    """
    Map every name in ``columns`` to an aggregation function name:
    'sum' for the count-like metrics 'hypothesis_count' and 'task_count',
    ``default_func`` for everything else.
    """
    summed_metrics = ("hypothesis_count", "task_count")
    mapping = {}
    for name in columns:
        if name in summed_metrics:
            mapping[name] = "sum"
        else:
            mapping[name] = default_func
    return mapping

def aggregate_precomputed_task_metrics(df, aggfuncs=None, tools_order=["WhatIf", "HypotheX"]):
    """
    Summarise a task-level metrics DataFrame at several granularities.

    ``df`` is expected to hold the identifier columns participant_id,
    task_number and tool_name plus numeric metric columns.

    Side effect: missing values in the numeric metric columns of ``df`` are
    replaced IN PLACE by the column mean, so the caller's frame is modified.

    NOTE(review): ``aggfuncs`` is accepted (and defaulted) but never consulted;
    the per-metric aggregation comes from ``get_metric_aggfunc_map``.

    Returns a dict with the views:
    - "participant_task": the (imputed) rows sorted by participant, task, tool
    - "tool_task": one row per task, one '<metric>_<tool>' column per pair
    - "per_task": one row per task
    - "per_participant": one row per participant, '<metric>_<tool>' columns
    - "per_tool": one row per tool

    Raises ValueError when ``df`` has no numeric metric column.
    """
    if aggfuncs is None:
        aggfuncs = {
            "tool_task": "sum",
            "per_task": "mean",
            "per_participant": "mean",
            "per_tool": "mean"
        }

    identifier_names = ("participant_id", "task_number", "tool_name")
    metric_cols = [
        name for name in df.columns
        if name not in identifier_names and pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(metric_cols) == 0:
        raise ValueError("No numeric metric columns found for aggregation.")

    # Mean-impute the numeric metrics (writes back into the caller's frame)
    column_means = df[metric_cols].mean()
    df[metric_cols] = df[metric_cols].fillna(column_means)

    per_metric_agg = get_metric_aggfunc_map(metric_cols, default_func="mean")

    def _long_view(key):
        # One row per value of `key`, metrics aggregated over everything else
        aggregated = df.groupby(key)[metric_cols].agg(per_metric_agg)
        return aggregated.reset_index().sort_values(by=key)

    def _wide_view(index_name):
        # One row per `index_name`, one '<metric>_<tool>' column per tool
        aggregated = df.groupby([index_name, "tool_name"])[metric_cols].agg(per_metric_agg)
        wide = aggregated.reset_index().pivot(
            index=index_name, columns="tool_name", values=metric_cols
        )
        wide.columns = [f"{metric}_{tool}" for metric, tool in wide.columns]
        return sort_columns(wide.reset_index(), tools_order)

    return {
        "participant_task": df.sort_values(by=["participant_id", "task_number", "tool_name"]),
        "tool_task": _wide_view("task_number"),
        "per_task": _long_view("task_number"),
        "per_participant": _wide_view("participant_id"),
        "per_tool": _long_view("tool_name")
    }

def format_integer_columns(df):
    """
    Turn float columns that contain only whole numbers (ignoring missing
    values) into the nullable "Int64" dtype for cleaner display.

    ``df`` is modified in place and also returned.
    """
    float_names = list(df.select_dtypes(include="float").columns)
    for name in float_names:
        present = df[name].dropna()
        only_whole_numbers = present.apply(float.is_integer).all()
        if only_whole_numbers:
            # Nullable integer dtype keeps missing entries as <NA>
            df[name] = df[name].astype("Int64")
    return df


def format_summary_columns(summary_dict):
    """
    Format every summary table for display.

    For each DataFrame in ``summary_dict`` a copy is formatted (the input
    frames are left untouched):
    - MultiIndex columns are flattened by joining the non-empty levels with "_".
    - Float columns become strings: columns holding only whole numbers are
      printed without ".0", all other float columns are rounded to 2 decimals.
    - Missing values are shown as "-".
    - Metric/tool names are abbreviated
      (e.g. hypothesis_success_rate_WhatIf -> HSR_wit).

    Returns a dict with the same keys mapping to the formatted DataFrames.
    """
    from pandas.api.types import is_float_dtype

    # NOTE(review): "CSE" and "FC" are each assigned to two different metrics,
    # so a table holding both metrics of a pair ends up with duplicate column
    # names after renaming -- confirm the intended abbreviations.
    METRIC_ABBREVIATIONS = {
        "contradiction_ignoring_index": "CII",
        "contradictory_evidence_ratio": "CER",
        "confirmation_strategy_entropy": "CSE",
        "confirmed_hypothesis_action_agreement_ratio": "CHAAR",
        "disproved_hypothesis_action_agreement_ratio": "DHAAR",
        "main_profile_reached": "MPR",
        "subgroup_coverage": "SubC",
        "multiclass_selection_entropy": "MSelE",
        "class_strategy_entropy": "CSE",
        "feature_coverage": "FC",
        "feature_test_balance": "FTB",
        "goal_class_feature_range_coverage": "FC",
        "early_evaluation_consistency": "EEC",
        "late_evaluation_influence": "LEC",
        "average_hypothesis_complexity": "HC",
        "hypothesis_success_rate": "HSR",
        "task_duration_minutes": "AvTaskTime",
        "inspect_before_modify_ratio": "IBM_R",
    }

    TOOL_ABBREVIATIONS = {
        "WhatIf": "wit",
        "HypotheX": "HX"
    }

    formatted = {}

    for name, df in summary_dict.items():
        df = df.copy()

        # Flatten MultiIndex if necessary
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ['_'.join([str(level) for level in col if level]) for col in df.columns]

        # Format numerical columns
        for col in df.columns:
            if is_float_dtype(df[col]):
                # Remember where values are missing BEFORE converting to text:
                # the textual form of a missing value depends on the dtype
                # ("nan" for floats, "<NA>" for nullable integers).
                missing = df[col].isna()
                col_data = df[col].dropna()

                if col_data.apply(float.is_integer).all():
                    as_text = df[col].astype("Int64").astype(str)
                else:
                    as_text = df[col].round(2).astype(str)

                df[col] = as_text.mask(missing, "-")

            # Non-float columns: textual NaN markers and real missing values -> "-"
            df[col] = df[col].replace(["nan", "NaN", "None", "nan%"], "-").fillna("-")

        # --- Build rename map ---
        rename_map = {}
        for col in df.columns:
            if col in METRIC_ABBREVIATIONS:
                rename_map[col] = METRIC_ABBREVIATIONS[col]
            elif "_" in col:
                # '<metric>_<tool>': abbreviate both parts when the metric is known
                metric, tool = col.rsplit("_", 1)
                if metric in METRIC_ABBREVIATIONS:
                    abbrev_metric = METRIC_ABBREVIATIONS[metric]
                    abbrev_tool = TOOL_ABBREVIATIONS.get(tool, tool)
                    rename_map[col] = f"{abbrev_metric}_{abbrev_tool}"

        df = df.rename(columns=rename_map)
        formatted[name] = df

    return formatted



def compute_entropy(values):
    """
    Shannon entropy (base 2, rounded to 6 decimals) of the empirical
    distribution of the discrete items in ``values``.

    Returns None when ``values`` is empty.
    """
    if not values:
        return None

    frequencies = Counter(values)
    n_items = sum(frequencies.values())

    weighted_log_sum = 0
    for occurrences in frequencies.values():
        share = occurrences / n_items
        weighted_log_sum += share * math.log2(share)

    return round(-weighted_log_sum, 6)

def extract_declared_features(text):
    """
    Return every feature ID of the form 'f<digits>' (e.g. f1, f12) found as a
    whole word in ``text``, lower-cased, in order of appearance (duplicates kept).

    An empty or missing ``text`` yields an empty list.
    """
    if text:
        lowered = text.lower()
        return [hit.group(0) for hit in re.finditer(r"\bf\d+\b", lowered)]
    return []

def perform_one_way_anova(df, metrics=["average_hypothesis_complexity"], factor="tool_name"):
    """
    Perform a One-Way ANOVA of each metric against ``factor``.

    Parameters:
    df (pd.DataFrame): DataFrame containing one row per observation. It is not
        modified; column names are stripped of whitespace on an internal copy.
    metrics (list): List of metric column names to test
        (e.g., ['TC', 'EC', 'average_hypothesis_complexity']).
    factor (str): Column used as the grouping factor (e.g., "tool_name").
        NOTE: a metric is only tested when the factor column contains at least
        two rows for each of the values 'WhatIf' and 'HypotheX' (compared
        case-insensitively); with any other factor the metric is skipped with
        a warning.

    Returns:
    pd.DataFrame: One row per tested metric with the columns "Metric Name",
        "F-statistic", "Statistical Significance", "P-value" and
        "P-value Meaning" ("yes" when p < 0.05, otherwise "no"). Metrics that
        are missing, constant or lack data are skipped with a printed warning.
    """
    # Store results for each metric
    anova_results = []

    # Strip whitespace from column names on a copy, so the caller's frame
    # keeps its original column index.
    df = df.copy()
    df.columns = df.columns.str.strip()

    # Loop through each metric in the metrics list
    for metric in metrics:
        metric = metric.strip()  # Ensure the metric name is clean of spaces

        if metric not in df.columns:
            print(f"Warning: Metric '{metric}' not found in the DataFrame.")
            continue

        # Check if there is variation in the metric (all values are not the same)
        if df[metric].nunique() < 2:
            print(f"Warning: Not enough variation in the metric '{metric}' to perform ANOVA.")
            continue

        # Case-insensitive group lookup; astype(str) keeps this from crashing
        # when the factor column is not a string column.
        factor_values = df[factor].astype(str).str.lower()
        whatif_data = df[factor_values == "whatif"][metric]
        hypothex_data = df[factor_values == "hypothex"][metric]

        # Ensure both groups have at least 2 samples for the ANOVA
        if len(whatif_data) < 2 or len(hypothex_data) < 2:
            print(f"Warning: Not enough data for ANOVA for {metric} between WhatIf and HypotheX.")
            continue

        # Perform One-Way ANOVA for each metric
        formula = f"{metric} ~ C({factor})"
        model = ols(formula, data=df).fit()
        anova_table = anova_lm(model)

        # Extract the F-statistic and P-value of the factor (first row).
        # .iloc is used because the table is indexed by term labels; plain
        # [0] on such a Series is positional fallback, which pandas deprecates.
        f_statistic = anova_table["F"].iloc[0] if "F" in anova_table else None
        p_value = anova_table["PR(>F)"].iloc[0] if "PR(>F)" in anova_table else None

        # Handle case for missing values (e.g., NaN)
        if pd.isna(f_statistic) or pd.isna(p_value):
            f_statistic = "NaN"
            p_value = "NaN"
            print(f"Warning: ANOVA returned NaN for {metric}.")

        # Determine the meaning of the P-value
        if isinstance(p_value, (int, float)) and p_value < 0.05:
            significance = "yes"
        else:
            significance = "no"

        # Add the results to the list
        anova_results.append({
            "Metric Name": metric,
            "F-statistic": f_statistic,
            "Statistical Significance": significance,
            "P-value": p_value,
            "P-value Meaning": significance
        })

    # Convert the results into a DataFrame
    anova_results_df = pd.DataFrame(anova_results)

    return anova_results_df


def perform_two_way_anova(df, metrics=["average_hypothesis_complexity"], factors=["tool_name", "task_number"]):
    """
    Run a Two-Way ANOVA (two main effects plus their interaction) per metric.

    Parameters:
    df (pd.DataFrame): Observations, one row each.
    metrics (list): Metric column names to analyse.
    factors (list): Exactly two factor column names, e.g.
        ['tool_name', 'task_number'] or ['tool_name', 'participant_id'].

    Returns:
    pd.DataFrame: One row per analysed metric. The reported F-statistic and
        P-value are those of the third row of the ANOVA table, i.e. the
        interaction term of the two factors. Metrics that are missing or
        constant are skipped with a printed warning.

    Raises:
    ValueError: if ``factors`` does not contain exactly two entries.
    """
    if len(factors) != 2:
        raise ValueError("Two factors are required for Two-Way ANOVA. Use factors=['factor1', 'factor2'].")

    factor_1, factor_2 = factors
    first_term = f"C({factor_1})"
    second_term = f"C({factor_2})"

    result_rows = []

    for raw_name in metrics:
        metric = raw_name.strip()  # tolerate padded metric names

        if metric not in df.columns:
            print(f"Warning: Metric '{metric}' not found in the DataFrame.")
            continue

        # A constant metric cannot be analysed
        if df[metric].nunique() < 2:
            print(f"Warning: Not enough variation in the metric '{metric}' to perform ANOVA.")
            continue

        formula = f"{metric} ~ {first_term} + {second_term} + {first_term}:{second_term}"
        fitted = ols(formula, data=df).fit()
        table = anova_lm(fitted)

        # Third row of the table = interaction term
        f_statistic = table["F"].iloc[2]
        p_value = table["PR(>F)"].iloc[2]

        if pd.isna(f_statistic) or pd.isna(p_value):
            f_statistic = "NaN"
            p_value = "NaN"
            print(f"Warning: ANOVA returned NaN for {metric}.")

        is_significant = isinstance(p_value, (int, float)) and p_value < 0.05
        significance = "yes" if is_significant else "no"

        result_rows.append({
            "Metric Name": metric,
            "F-statistic": f_statistic,
            "Statistical Significance": significance,
            "P-value": p_value,
            "P-value Meaning": significance
        })

    return pd.DataFrame(result_rows)


def compute_all_task_metrics(all_tasks, all_metric_functions):
    """
    Computes all specified metrics per task.

    Parameters:
    all_tasks (list): Task dicts; 'participant_id', 'task_number' and
        'tool_name' are copied into each row (None when absent).
    all_metric_functions (dict): Maps a metric name to a callable that takes
        one task dict and returns the metric value.

    Returns a flat DataFrame: one row per task, one column per metric.
    A metric function that raises does not abort the run: the value falls back
    to None and a warning naming the metric and the task is printed, so broken
    metrics do not go unnoticed.
    """
    import pandas as pd

    rows = []

    for task in all_tasks:
        row = {
            "participant_id": task.get("participant_id"),
            "task_number": task.get("task_number"),
            "tool_name": task.get("tool_name")
        }

        for metric_name, metric_func in all_metric_functions.items():
            try:
                row[metric_name] = metric_func(task)
            except Exception as exc:
                row[metric_name] = None  # fallback for errors
                print(
                    f"Warning: metric '{metric_name}' failed for participant "
                    f"{row['participant_id']}, task {row['task_number']}, "
                    f"tool {row['tool_name']}: {exc!r}"
                )

        rows.append(row)

    return pd.DataFrame(rows)

# Preparing models and dataset information for each tool

In [None]:
def load_and_prepare_tool_data():
    """
    Prompts for a file upload, loads the uploaded files, performs scaling and
    inference.

    Expected uploads (recognised by file name):
    - a ``.csv`` dataset containing a "Class" label column
    - a ``.pkl`` whose name contains "model"
    - a ``.pkl`` whose name contains "scaler"
    - a ``.pkl`` whose name contains "label_encoder"
    When several files match one role, the last one wins.

    Returns a tuple, in this order:
    - df: original dataset
    - df_scaled: scaled features (same index as df, label column excluded)
    - predictions: output of model.predict on the scaled features
    - probabilities: output of model.predict_proba on the scaled features
    - label_encoder: loaded encoder

    Raises:
    ValueError: if one of the four required files was not uploaded.
    """
    label_column = "Class"
    uploaded_files = files.upload()

    dataset_filename = model_filename = scaler_filename = label_encoder_filename = None

    # The checks run in this order, so e.g. a .pkl whose name contains both
    # "model" and "scaler" is treated as the model.
    for filename in uploaded_files:
        if filename.endswith(".csv"):
            dataset_filename = filename
        elif "model" in filename and filename.endswith(".pkl"):
            model_filename = filename
        elif "scaler" in filename and filename.endswith(".pkl"):
            scaler_filename = filename
        elif "label_encoder" in filename and filename.endswith(".pkl"):
            label_encoder_filename = filename

    # Fail early with a readable message instead of passing None to a loader
    required = {
        "dataset (.csv)": dataset_filename,
        "model (.pkl containing 'model')": model_filename,
        "scaler (.pkl containing 'scaler')": scaler_filename,
        "label encoder (.pkl containing 'label_encoder')": label_encoder_filename,
    }
    missing = [role for role, found in required.items() if found is None]
    if missing:
        raise ValueError("Missing required upload(s): " + ", ".join(missing))

    # Load files
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only upload .pkl files from a trusted source.
    df = pd.read_csv(dataset_filename)
    print(f"âœ… Loaded dataset: {dataset_filename}")

    model = joblib.load(model_filename)
    print(f"âœ… Loaded model: {model_filename}")

    scaler = joblib.load(scaler_filename)
    print(f"âœ… Loaded scaler: {scaler_filename}")

    label_encoder = joblib.load(label_encoder_filename)
    print(f"âœ… Loaded label encoder: {label_encoder_filename}")

    # Apply scaler to every column except the label
    feature_columns = [col for col in df.columns if col != label_column]
    X_scaled = scaler.transform(df[feature_columns])
    df_scaled = pd.DataFrame(X_scaled, columns=feature_columns, index=df.index)

    # Inference
    predictions = model.predict(X_scaled)
    probabilities = model.predict_proba(X_scaled)

    print("âœ… Scaling, encoding, and inference complete.")

    return df, df_scaled, predictions, probabilities, label_encoder


def setup_tool_model_structure(df_raw, df_scaled, pred_classes, probs, label_encoder, tool_name, max_k=5, label_column = "Class"):
    """
    Build the lookup structure ("profiles") describing one tool's model.

    For a given tool:
    1. Maps point_id (the index of ``df_raw``) to prediction and probability
    2. Runs k-means clustering per predicted class on ``df_scaled``
    3. Stores cluster profiles, main and high probability clusters

    Parameters:
    df_raw: original dataset; it is copied, the caller's frame is not modified.
    df_scaled: scaled features, row-aligned with ``df_raw``.
    pred_classes: predicted class per row. NOTE(review): each class value is
        also used as a column index into ``probs``, so predictions are assumed
        to be integer-encoded class indices -- confirm against the model.
    probs: per-class probabilities, one row per data point.
    label_encoder: fitted encoder providing ``classes_`` and ``transform``.
        A fitted encoder is used as-is (it is NOT refitted, so its label->index
        mapping stays the one the model was trained with); only an encoder
        without ``classes_`` is fitted on ``df_raw[label_column]``.
    tool_name: "WhatIf" or "HypotheX"; selects the label -> c1/c2/c3 mapping.
    max_k: largest number of clusters tried per class.
    label_column: name of the label column, excluded from the features.

    Returns a dict with the keys point_id_to_class, point_id_to_prob,
    point_id_to_full_probs, clusters, main_profile, high_profile,
    class_feature_spaces, label_conversion_map and point_id_to_features.
    Classes with fewer than 3 predicted points get no cluster entries.
    """

    # Prepare augmented DataFrame
    df_raw = df_raw.copy()
    df_raw["predicted_class"] = pred_classes
    df_raw["point_id"] = df_raw.index
    df_raw["predicted_prob"] = probs.max(axis=1)
    prob_columns = [f"prob_class_{i}" for i in range(probs.shape[1])]
    prob_df = pd.DataFrame(probs, columns=prob_columns)
    df_full = pd.concat([df_raw.reset_index(drop=True), prob_df], axis=1)

    # Only fit an encoder that has not been fitted yet; refitting a loaded
    # encoder could silently change the label -> index mapping.
    if not hasattr(label_encoder, "classes_"):
        label_encoder.fit(df_raw[label_column])

    # Everything that is neither the label nor a helper column is a feature
    non_feature_columns = {label_column, "predicted_class", "predicted_prob", "point_id"}
    feature_columns = [
        col for col in df_raw.columns
        if col not in non_feature_columns and not col.startswith("prob_class_")
    ]

    # Prepare return structure
    profiles = {
        "point_id_to_class": df_full.set_index("point_id")["predicted_class"].to_dict(),
        "point_id_to_prob": df_full.set_index("point_id")["predicted_prob"].to_dict(),
        "point_id_to_full_probs": df_full[["point_id"] + prob_columns].set_index("point_id").to_dict("index"),
        "clusters": {},         # cluster_id per class
        "main_profile": {},    # largest cluster
        "high_profile": {},     # cluster with highest avg probability
        "class_feature_spaces": {},  # max feature ranges for one class
        "label_conversion_map": {},
        "point_id_to_features": {},
    }

    # Store per-class feature values (one dict per point predicted as that class)
    class_feature_spaces = {}
    for class_idx in np.unique(pred_classes):
        point_ids = [i for i, pred in enumerate(pred_classes) if pred == class_idx]
        class_feature_spaces[class_idx] = [df_raw.iloc[i][feature_columns].to_dict() for i in point_ids]
    profiles["class_feature_spaces"] = class_feature_spaces

    profiles["point_id_to_features"] = df_raw[feature_columns].to_dict("index")

    # This is fixed knowledge about the semantic meaning of c1, c2, c3
    tool_semantic_c_map = {
        "WhatIf": {
            "duck": "c1",
            "cat": "c2",
            "dog": "c3"
        },
        "HypotheX": {
            "zarnak": "c1",
            "bliptor": "c2",
            "quorvian": "c3"
        }
    }

    semantic_map = tool_semantic_c_map.get(tool_name, {})
    label_conversion_map = {}

    for label in label_encoder.classes_:
        index = int(label_encoder.transform([label])[0])
        label_low = label.lower()
        # Assign also c1/c2/c3 mapping
        if label_low in semantic_map:
            cx = semantic_map[label_low]
            label_conversion_map[cx] = index
    profiles["label_conversion_map"] = label_conversion_map

    # Run KMeans per predicted class
    for c in np.unique(pred_classes):
        class_df = df_scaled[pred_classes == c]
        if len(class_df) < 3:
            continue

        max_diff = -np.inf
        best_labels = None

        # Try k=2 to max_k clusters to maximize probability separation
        for k in range(2, min(max_k, len(class_df)) + 1):
            kmeans = KMeans(n_clusters=k, random_state=42).fit(class_df)
            labels = kmeans.labels_

            temp_df = pd.DataFrame({
                "cluster": labels,
                "predicted_prob": probs[pred_classes == c, c]
            })

            mean_probs = temp_df.groupby("cluster")["predicted_prob"].mean()
            diff = mean_probs.max() - mean_probs.min()

            if diff > max_diff:
                max_diff = diff
                best_labels = labels

        # No clustering was tried (max_k < 2): nothing to store for this class
        if best_labels is None:
            continue

        # Assign cluster labels to point_ids
        class_point_ids = df_raw[pred_classes == c].index
        cluster_map = dict(zip(class_point_ids, best_labels))
        profiles["clusters"][int(c)] = cluster_map

        # Main profile: largest cluster
        largest_cluster = pd.Series(best_labels).value_counts().idxmax()
        profiles["main_profile"][int(c)] = [pid for pid, cl in cluster_map.items() if cl == largest_cluster]

        # High profile: cluster with highest avg predicted prob
        temp_df = pd.DataFrame({
            "point_id": list(class_point_ids),
            "cluster": best_labels,
            "prob": probs[pred_classes == c, c]
        })
        high_cluster = temp_df.groupby("cluster")["prob"].mean().idxmax()
        profiles["high_profile"][int(c)] = temp_df[temp_df["cluster"] == high_cluster]["point_id"].tolist()
    return profiles

In [None]:
# Upload all files for the WhatIf tool: the dataset CSV (Animal Hobbies Dataset),
# the RF model, the scaler and the label encoder (.pkl files)

tool_profiles = {}
wit_df, wit_scaled, wit_preds, wit_probs, wit_encoder = load_and_prepare_tool_data()

In [None]:
# Upload all files for the HypotheX tool: dataset CSV, model, scaler and label encoder (.pkl files)
# NOTE(review): the original comment named the Animal Hobbies Dataset and RF model here as well
# (same text as the WhatIf cell) -- confirm which dataset/model belongs to HypotheX.

hyp_df, hyp_scaled, hyp_preds, hyp_probs, hyp_encoder = load_and_prepare_tool_data()

In [None]:
# Build the model structure (predictions, probabilities, clusters, label maps)
# of each tool from the files uploaded in the two previous cells.
tool_profiles["WhatIf"] = setup_tool_model_structure(
    wit_df, wit_scaled, wit_preds, wit_probs, wit_encoder, "WhatIf"
)
tool_profiles["HypotheX"] = setup_tool_model_structure(
    hyp_df, hyp_scaled, hyp_preds, hyp_probs, hyp_encoder, "HypotheX"
)

# Quick sanity check of the c1/c2/c3 -> encoded-class mapping for WhatIf
print(tool_profiles["WhatIf"]['label_conversion_map'])

# Basic metrics

In [None]:
#################################
#     BASIC STATS FUNCTIONS     #
#################################
def average_hypothesis_complexity(task):
    """
    Returns average complexity score of all hypotheses in a task.

    Each top-level event with event_class "Hypothesis" is scored from its
    'description' text. The score is the mean of four components, each capped
    at 1.0: distinct feature IDs (f1, f2, ...) / 5, range cues / 5,
    boundary cues / 5 and mentioned class keywords / 3, rounded to 3 decimals.
    A missing, null or non-text description scores 0.0.

    Returns the mean score rounded to 3 decimals, or None when the task has no
    Hypothesis event.
    """
    # Numeric range expressions
    range_patterns = [
        r"between\s+\d+\s+(and|to)\s+\d+",
        r"from\s+\d+\s+(to|-)\s+\d+",
        r"\b\d+\s*-\s*\d+\b"
    ]
    # Qualitative words that are counted as range cues as well
    qual_range_words = [
        "high", "higher", "like", "increase", "increasing",
        "low", "lower", "don't like", "decrease", "decreasing", "avoid", "dislike",
        "medium", "middle", "moderate", "smaller"
    ]
    # Explicit thresholds
    boundary_patterns = [
        r"f\d+\s*(>|<|>=|<=)\s*\d+",
        r"more than\s+\d+",
        r"less than\s+\d+"
    ]
    class_keywords = ["c1", "c2", "c3", "duck", "dog", "cat", "bliptor", "zarnak", "quorvian"]

    def compute_complexity(desc):
        # A null / non-text description is treated like an empty one, so one
        # malformed hypothesis does not make the whole task metric fail.
        desc = desc.lower() if isinstance(desc, str) else ""

        # Features: distinct f<digits> IDs
        num_features = len(set(re.findall(r"\bf\d+\b", desc)))

        # Ranges
        range_count = sum(len(re.findall(p, desc)) for p in range_patterns)
        range_count += sum(len(re.findall(rf"\b{word}\b", desc)) for word in qual_range_words)

        # Boundaries
        boundary_count = sum(len(re.findall(p, desc)) for p in boundary_patterns)

        # Class mentions (each keyword counted once)
        class_count = sum(1 for c in class_keywords if re.search(rf"\b{c}\b", desc))

        # Normalized components
        feature_score = min(num_features / 5, 1.0)
        range_score = min(range_count / 5, 1.0)
        boundary_score = min(boundary_count / 5, 1.0)
        class_score = min(class_count / 3, 1.0)

        return round((feature_score + range_score + boundary_score + class_score) / 4, 3)

    scores = [
        compute_complexity(h.get("description", ""))
        for h in task.get("events", [])
        if h.get("event_class") == "Hypothesis"
    ]

    return round(sum(scores) / len(scores), 3) if scores else None

def second_order_event_count(task):
    """
    Counts the second-order events of a task: the combined length of the
    nested 'events' lists of all top-level events.

    Nested 'events' values that are not lists are ignored.
    NOTE(review): the count covers every top-level event, whatever its
    event_class (the original docstring spoke of Hypothesis events only).
    """
    return sum(
        len(event.get("events", []))
        for event in task.get("events", [])
        if isinstance(event.get("events", []), list)
    )

def hypothesis_count(task):
    """
    Returns how many top-level events of the task have event_class "Hypothesis".
    """
    event_classes = [event.get("event_class") for event in task.get("events", [])]
    return event_classes.count("Hypothesis")


In [None]:
# Compute and display basic metrics

# Metric name -> function(task); the commented-out entries are currently disabled
basic_metrics = {
    #"task_count": lambda task: 1,
    #"event_count": second_order_event_count,
    "average_hypothesis_complexity": average_hypothesis_complexity,
    #"hypothesis_count": hypothesis_count,
}

# One row per task, one column per enabled metric
basic_metrics_df = compute_all_task_metrics(all_tasks, basic_metrics)

# NOTE(review): aggregate_precomputed_task_metrics, as defined in this notebook,
# accepts this dict but never reads it.
aggfuncs = {
    "tool_task": "sum",  #
    "per_task": "var" # var, sum, mean
}

# NOTE: aggregate_precomputed_task_metrics fills missing metric values of
# basic_metrics_df in place (column mean), so the ANOVA calls further down
# run on the imputed data.
summary = aggregate_precomputed_task_metrics(basic_metrics_df, aggfuncs)
summary = format_summary_columns(summary)

#print("ðŸ“Š Participant-Task Summary")
#display(summary["participant_task"])

print("ðŸ“Š Tool-Task Summary")
display(summary["tool_task"])

print("ðŸ“Š Mean per Task")
display(summary["per_task"])

print("ðŸ“Š Mean per Participant")
display(summary["per_participant"])

print("ðŸ“Š Mean per Tool")
display(summary["per_tool"])


# Metrics to test with ANOVA
metrics_list=["average_hypothesis_complexity"]

# Effect of the tool alone
one_way_anova_df = perform_one_way_anova(df=basic_metrics_df, metrics=metrics_list, factor="tool_name")
print("One way anova per tool")
display(one_way_anova_df)
# Tool x task (the reported statistics are those of the interaction term)
two_way_anova_tasks = perform_two_way_anova(df=basic_metrics_df, metrics=metrics_list, factors=["tool_name", "task_number"])
print("Two way anova per tool and task")
display(two_way_anova_tasks)
# Tool x participant (the reported statistics are those of the interaction term)
two_way_anova_participants = perform_two_way_anova(df=basic_metrics_df, metrics=metrics_list, factors=["tool_name", "participant_id"])
print("Two way anova per tool and paticipant")
display(two_way_anova_participants)

# Confirmation metrics

In [None]:
#################################
#  CONFIRMATION BIAS FUNCTIONS  #
#################################

def confirmation_action_ratio(task):
    """
    Computes the ratio of confirming actions over total (confirm + disprove)
    from second-order Action events inside top-level Hypothesis events.

    The 'goal' of an action is compared case-insensitively. Hypotheses whose
    nested 'events' value is missing or not a list, and actions whose goal is
    missing or not text, are ignored instead of raising.

    Returns None when no confirming or disproving action exists.
    """
    confirm = 0
    disprove = 0

    for event in task.get("events", []):
        # Only search inside Hypothesis-type top-level events
        if event.get("event_class") != "Hypothesis":
            continue

        nested = event.get("events")
        if not isinstance(nested, list):
            continue  # missing or null nested events

        for subevent in nested:
            if subevent.get("event_class") != "Action":
                continue

            goal = subevent.get("goal")
            if not isinstance(goal, str):
                continue  # missing or null goal

            goal = goal.lower()
            if goal == "confirm":
                confirm += 1
            elif goal == "disprove":
                disprove += 1

    total = confirm + disprove
    return confirm / total if total > 0 else None

def disconfirmation_action_ratio(task):
    """
    Computes the ratio of disproving actions over total (confirm + disprove)
    from second-order Action events inside top-level Hypothesis events.

    The 'goal' of an action is compared case-insensitively. Hypotheses whose
    nested 'events' value is missing or not a list, and actions whose goal is
    missing or not text, are ignored instead of raising.

    Returns None when no confirming or disproving action exists.
    """
    confirm = 0
    disprove = 0

    for event in task.get("events", []):
        if event.get("event_class") != "Hypothesis":
            continue

        nested = event.get("events")
        if not isinstance(nested, list):
            continue  # missing or null nested events

        for subevent in nested:
            if subevent.get("event_class") != "Action":
                continue

            goal = subevent.get("goal")
            if not isinstance(goal, str):
                continue  # missing or null goal

            goal = goal.lower()
            if goal == "confirm":
                confirm += 1
            elif goal == "disprove":
                disprove += 1

    total = confirm + disprove
    return disprove / total if total > 0 else None


def hypothesis_confirmation_score_strict(task):
    """
    Strict confirmation score of a task:
    hypotheses evaluated as "confirmed" / all evaluated hypotheses.

    A hypothesis counts as evaluated when its (case-insensitive) evaluation is
    confirmed, weakly_confirmed, disproved or weakly_disproved.
    Returns None when no hypothesis was evaluated.
    """
    evaluated_labels = ("confirmed", "weakly_confirmed", "disproved", "weakly_disproved")

    verdicts = [
        event.get("evaluation", "").lower()
        for event in task.get("events", [])
        if event.get("event_class") == "Hypothesis"
    ]
    verdicts = [verdict for verdict in verdicts if verdict in evaluated_labels]

    if not verdicts:
        return None
    return verdicts.count("confirmed") / len(verdicts)


def hypothesis_confirmation_score_lenient(task):
    """
    Lenient confirmation score of a task:
    (confirmed + weakly_confirmed) / all evaluated hypotheses.

    A hypothesis counts as evaluated when its (case-insensitive) evaluation is
    confirmed, weakly_confirmed, disproved or weakly_disproved.
    Returns None when no hypothesis was evaluated.
    """
    confirming_labels = ("confirmed", "weakly_confirmed")
    evaluated_labels = confirming_labels + ("disproved", "weakly_disproved")

    verdicts = [
        event.get("evaluation", "").lower()
        for event in task.get("events", [])
        if event.get("event_class") == "Hypothesis"
    ]
    verdicts = [verdict for verdict in verdicts if verdict in evaluated_labels]

    if not verdicts:
        return None
    n_confirming = len([verdict for verdict in verdicts if verdict in confirming_labels])
    return n_confirming / len(verdicts)
# Measuring inconsistency in hypothesis evaluation

# Numeric score per evaluation label on a 0..1 scale, in steps of 0.25:
# 0.0 = confirmed ... 1.0 = disproved. Labels not listed here have no score.
LABEL_SCORES = {
    "confirmed": 0.0,
    "weakly_confirmed": 0.25,
    "partially_confirmed_or_disproved": 0.5,
    "weakly_disproved": 0.75,
    "disproved": 1.0
}

def is_close_to_label_or_midpoint(hyp_score, avg_score, label_scores=LABEL_SCORES, tolerance=0.01):
    """
    Tell whether a hypothesis score agrees with an average action score.

    They agree when
    - the two scores differ by at most ``tolerance``, or
    - ``avg_score`` lies within ``tolerance`` of the midpoint between two
      neighbouring label scores and ``hyp_score`` equals one of those two.
    """
    if abs(hyp_score - avg_score) <= tolerance:
        return True  # scores match within tolerance

    ordered = sorted(label_scores.values())
    for position in range(len(ordered) - 1):
        lower, upper = ordered[position], ordered[position + 1]
        halfway = round((lower + upper) / 2, 5)
        average_is_halfway = abs(avg_score - halfway) <= tolerance
        if average_is_halfway and hyp_score in (lower, upper):
            return True

    return False

def contradiction_ignoring_index(task):
    """
    Mean gap between each hypothesis evaluation and the mean score of its actions.

    For every "Hypothesis" event whose evaluation is a key of LABEL_SCORES,
    the scores of its nested "Action" events are averaged (the score of an
    action whose goal is "disprove" is mirrored as 1 - score). If
    is_close_to_label_or_midpoint accepts the pair the gap is 0.0, otherwise
    it is the absolute difference rounded to 2 decimals.

    Returns:
        The mean gap over all hypotheses with at least one scored action,
        or None when there is no such hypothesis.
    """
    def _oriented_score(label, goal):
        base = LABEL_SCORES.get(label, None)
        if base is None:
            return None
        return 1.0 - base if goal == "disprove" else base

    gaps = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue

        own_score = LABEL_SCORES.get(hypothesis.get("evaluation", "").lower())
        if own_score is None:
            continue

        scored_actions = []
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            value = _oriented_score(
                action.get("evaluation", "").lower(),
                action.get("goal", "").lower(),
            )
            if value is not None:
                scored_actions.append(value)

        if not scored_actions:
            continue

        mean_action = sum(scored_actions) / len(scored_actions)
        if is_close_to_label_or_midpoint(own_score, mean_action):
            gaps.append(0.0)
        else:
            gaps.append(round(abs(own_score - mean_action), 2))

    return sum(gaps) / len(gaps) if gaps else None

def extract_hypothesis_evaluation_differences(all_tasks):
    """
    Build a flat DataFrame with one row per scored hypothesis.

    A hypothesis is kept when its evaluation is a key of LABEL_SCORES and at
    least one nested "Action" has a scoreable evaluation. Each row holds the
    task identifiers, the position of the hypothesis in the task's event
    list, its evaluation label, the mean action score (3 decimals) and the
    difference (0.0 when is_close_to_label_or_midpoint accepts the pair,
    otherwise the absolute gap rounded to 2 decimals).

    Args:
        all_tasks: iterable of task dicts.

    Returns:
        pandas.DataFrame with columns participant_id, task_number, tool_name,
        hypothesis_index, hypothesis_eval, avg_action_score, difference.
    """
    def _oriented_score(label, goal):
        base = LABEL_SCORES.get(label, None)
        if base is None:
            return None
        if goal == "disprove":
            return 1.0 - base
        return base

    records = []

    for task in all_tasks:
        identity = {
            "participant_id": task.get("participant_id"),
            "task_number": task.get("task_number"),
            "tool_name": task.get("tool_name"),
        }

        for position, hypothesis in enumerate(task.get("events", [])):
            if hypothesis.get("event_class") != "Hypothesis":
                continue

            label = hypothesis.get("evaluation", "").lower()
            own_score = LABEL_SCORES.get(label)
            if own_score is None:
                continue

            scored_actions = []
            for action in hypothesis.get("events", []):
                if action.get("event_class") != "Action":
                    continue
                value = _oriented_score(
                    action.get("evaluation", "").lower(),
                    action.get("goal", "").lower(),
                )
                if value is not None:
                    scored_actions.append(value)

            if scored_actions:
                mean_action = sum(scored_actions) / len(scored_actions)
                if is_close_to_label_or_midpoint(own_score, mean_action):
                    gap = 0.0
                else:
                    gap = round(abs(own_score - mean_action), 2)

                records.append({
                    **identity,
                    "hypothesis_index": position,
                    "hypothesis_eval": label,
                    "avg_action_score": round(mean_action, 3),
                    "difference": gap,
                })

    return pd.DataFrame(records)

def contradictory_evidence_ratio(task):
    """
    Calculate the mean ratio of contradictory actions to total actions
    per hypothesis in the task.

    An action is contradictory when its evaluation belongs to the family
    opposite to the hypothesis evaluation (confirmed/weakly_confirmed versus
    disproved/weakly_disproved). Hypotheses evaluated as
    "partially_confirmed_or_disproved" are skipped. Actions without an
    evaluation, or evaluated as "partially_confirmed_or_disproved", are not
    counted at all.

    Args:
        task: dict with an optional "events" list of event dicts.

    Returns:
        Mean of the per-hypothesis ratios, or 0.0 when no hypothesis has a
        countable action.
    """
    confirm_family = {"confirmed", "weakly_confirmed"}
    disprove_family = {"disproved", "weakly_disproved"}
    neutral = "partially_confirmed_or_disproved"

    ratios = []

    for hyp in task.get("events", []):
        if hyp.get("event_class") != "Hypothesis":
            continue

        # `or ""` covers an explicit null evaluation, which the .get()
        # default does not and which would crash .lower().
        hyp_eval = (hyp.get("evaluation") or "").lower()
        if hyp_eval == neutral:
            continue

        # Labels that contradict this hypothesis; empty when the hypothesis
        # evaluation belongs to neither family, so such a hypothesis still
        # contributes its actions with zero contradictions.
        if hyp_eval in confirm_family:
            opposing = disprove_family
        elif hyp_eval in disprove_family:
            opposing = confirm_family
        else:
            opposing = frozenset()

        contradictory_count = 0
        action_count = 0

        for act in hyp.get("events", []):
            if act.get("event_class") != "Action":
                continue

            label = (act.get("evaluation") or "").lower()
            if not label or label == neutral:
                continue

            action_count += 1
            if label in opposing:
                contradictory_count += 1

        if action_count > 0:
            ratios.append(contradictory_count / action_count)

    return sum(ratios) / len(ratios) if ratios else 0.0



def extract_contradictory_evidence(all_tasks):
    """
    Return a flat DataFrame with one row per hypothesis, holding the number
    of contradictory action labels and the hypothesis description.

    An action is contradictory when its evaluation belongs to the family
    opposite to the hypothesis evaluation (confirmed/weakly_confirmed versus
    disproved/weakly_disproved). Hypotheses whose evaluation belongs to
    neither family (including "partially_confirmed_or_disproved" and missing
    evaluations) get a count of 0.

    Args:
        all_tasks: iterable of task dicts.

    Returns:
        pandas.DataFrame with columns participant_id, task_number, tool_name,
        hypothesis_index, hypothesis_eval, hypothesis_description,
        contradictory_count.
    """
    confirm_family = {"confirmed", "weakly_confirmed"}
    disprove_family = {"disproved", "weakly_disproved"}

    rows = []

    for task in all_tasks:
        pid = task.get("participant_id")
        tid = task.get("task_number")
        tool = task.get("tool_name")

        for i, hyp in enumerate(task.get("events", [])):
            if hyp.get("event_class") != "Hypothesis":
                continue

            # `or ""` covers explicit nulls in the log, which the .get()
            # default does not and which would crash .lower() / .strip().
            hyp_eval = (hyp.get("evaluation") or "").lower()
            description = (hyp.get("description") or "").strip()

            if hyp_eval in confirm_family:
                opposing = disprove_family
            elif hyp_eval in disprove_family:
                opposing = confirm_family
            else:
                # Neutral or unknown evaluation: nothing can contradict it.
                opposing = frozenset()

            contradictory_count = 0
            for act in hyp.get("events", []):
                if act.get("event_class") != "Action":
                    continue
                label = (act.get("evaluation") or "").lower()
                if label in opposing:
                    contradictory_count += 1

            rows.append({
                "participant_id": pid,
                "task_number": tid,
                "tool_name": tool,
                "hypothesis_index": i,
                "hypothesis_eval": hyp_eval,
                "hypothesis_description": description,
                "contradictory_count": contradictory_count
            })

    # pandas is imported as `pd` at the top of the notebook; the former
    # function-local re-import was redundant.
    return pd.DataFrame(rows)

def goal_strategy_entropy(task):
    """
    Entropy (via compute_entropy) of the action goals used in a task.

    Collects the lower-cased, non-empty "goal" of every "Action" nested in a
    "Hypothesis" event and passes the resulting list to compute_entropy.
    """
    collected = []
    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") == "Hypothesis":
            for action in hypothesis.get("events", []):
                if action.get("event_class") != "Action":
                    continue
                lowered = action.get("goal", "").lower()
                if lowered:
                    collected.append(lowered)
    return compute_entropy(collected)

def class_strategy_entropy(task):
    """
    Entropy (via compute_entropy) of the class strategies used in a task.

    For every "Action" nested in a "Hypothesis" event, reads "class_strategy"
    (falling back to "class_strategy_type" when the former is missing or
    falsy), lower-cases it and passes the collected list to compute_entropy.
    """
    collected = []
    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") == "Hypothesis":
            for action in hypothesis.get("events", []):
                if action.get("event_class") != "Action":
                    continue
                chosen = action.get("class_strategy") or action.get("class_strategy_type")
                if chosen:
                    collected.append(chosen.lower())
    return compute_entropy(collected)


def hypothesis_evaluation_entropy(task):
    """
    Entropy (via compute_entropy) over the evaluation labels of a task's hypotheses.

    Gathers the lower-cased, non-empty "evaluation" of every top-level
    "Hypothesis" event; a more varied set of labels (confirmed, disproved,
    ...) means the participant judged the hypotheses more diversely.
    """
    lowered_labels = [
        hypothesis.get("evaluation", "").lower()
        for hypothesis in task.get("events", [])
        if hypothesis.get("event_class") == "Hypothesis"
    ]
    # Drop hypotheses that carry no evaluation label.
    return compute_entropy([label for label in lowered_labels if label])


def confirmed_hypothesis_action_agreement_ratio(task):
    """
    Share of confirmed hypotheses whose actions also point towards confirmation.

    Considers "Hypothesis" events evaluated as confirmed or weakly_confirmed.
    Such a hypothesis is consistent when the mean LABEL_SCORES value of its
    nested "Action" evaluations (mirrored as 1 - score for actions whose goal
    is "disprove") is strictly below 0.5, the score of
    "partially_confirmed_or_disproved".

    Hypotheses without any scoreable action still count in the denominator
    but never as consistent.

    Returns:
        consistent / total, or None when the task has no confirmed or
        weakly confirmed hypothesis.
    """
    def _oriented_score(label, goal):
        base = LABEL_SCORES.get(label)
        if base is None:
            return None
        if goal == "disprove":
            return 1.0 - base
        return base

    n_agreeing = 0
    n_confirmed = 0

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue
        if hypothesis.get("evaluation", "").lower() not in {"confirmed", "weakly_confirmed"}:
            continue

        n_confirmed += 1

        values = []
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            value = _oriented_score(
                action.get("evaluation", "").lower(),
                action.get("goal", "").lower(),
            )
            if value is not None:
                values.append(value)

        # Below 0.5 means the averaged evidence sits on the confirmation side.
        if values and sum(values) / len(values) < 0.5:
            n_agreeing += 1

    if n_confirmed > 0:
        return n_agreeing / n_confirmed
    return None

def disproved_hypothesis_action_agreement_ratio(task):
    """
    Share of disproved hypotheses whose actions also point towards disproval.

    Considers "Hypothesis" events evaluated as disproved or weakly_disproved.
    Such a hypothesis is consistent when the mean LABEL_SCORES value of its
    nested "Action" evaluations (mirrored as 1 - score for actions whose goal
    is "disprove") is strictly above 0.5, the score of
    "partially_confirmed_or_disproved".

    Hypotheses without any scoreable action still count in the denominator
    but never as consistent.

    Returns:
        consistent / total, or None when the task has no disproved or
        weakly disproved hypothesis.
    """
    def _oriented_score(label, goal):
        base = LABEL_SCORES.get(label)
        if base is None:
            return None
        if goal == "disprove":
            return 1.0 - base
        return base

    n_agreeing = 0
    n_disproved = 0

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue
        if hypothesis.get("evaluation", "").lower() not in {"disproved", "weakly_disproved"}:
            continue

        n_disproved += 1

        values = []
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            value = _oriented_score(
                action.get("evaluation", "").lower(),
                action.get("goal", "").lower(),
            )
            if value is not None:
                values.append(value)

        # Above 0.5 means the averaged evidence sits on the disproval side.
        if values and sum(values) / len(values) > 0.5:
            n_agreeing += 1

    if n_disproved > 0:
        return n_agreeing / n_disproved
    return None



In [None]:
# Confirmation-bias metrics: reported metric name -> per-task function.
# Commented-out entries are metrics that are currently switched off.
all_metrics = {
    #"hypothesis_count": hypothesis_count,
    #"confirmation_ratio": confirmation_action_ratio,
    #"disconfirmation_ratio": disconfirmation_action_ratio,
    #"hypo_conf_score_strict": hypothesis_confirmation_score_strict,
    #"hypo_conf_score_lenient": hypothesis_confirmation_score_lenient,
    #"hypothesis_evaluation_entropy": hypothesis_evaluation_entropy
    "contradiction_ignoring_index": contradiction_ignoring_index,
    "contradictory_evidence_ratio": contradictory_evidence_ratio,
    "confirmed_hypothesis_action_agreement_ratio": confirmed_hypothesis_action_agreement_ratio,
    "disproved_hypothesis_action_agreement_ratio": disproved_hypothesis_action_agreement_ratio,
    "confirmation_strategy_entropy": goal_strategy_entropy,
}
all_metrics_df = compute_all_task_metrics(all_tasks, all_metrics)

summary = aggregate_precomputed_task_metrics(all_metrics_df)
# Format numeric outputs before display
summary = format_summary_columns(summary)

# Per-tool column pairs shown in both the tool/task and the per-participant table.
paired_metric_columns = ['CII_wit', 'CII_HX', 'CSE_wit', 'CSE_HX', 'CHAAR_wit', 'CHAAR_HX',
                         'DHAAR_wit', 'DHAAR_HX', 'CER_wit', 'CER_HX']

print("📊 Tool-Task Summary")
tool_task_df = summary["tool_task"]
tool_task_df = tool_task_df[['task_number'] + paired_metric_columns]
display(tool_task_df)

print("📊 Mean per Participant")
participant_df = summary["per_participant"][['participant_id'] + paired_metric_columns]
display(participant_df)

print("📊 Mean per Tool")
tool_df = summary["per_tool"][['tool_name', 'CII', 'CER', 'CHAAR', 'DHAAR', 'CSE']]
display(tool_df)

# Derive the ANOVA metric list from the dict above so the two cannot drift apart.
metrics_list = list(all_metrics)
one_way_anova_df = perform_one_way_anova(df=all_metrics_df, metrics=metrics_list, factor="tool_name")
print("One way anova per tool")
display(one_way_anova_df)
two_way_anova_tasks = perform_two_way_anova(df=all_metrics_df, metrics=metrics_list, factors=["tool_name", "task_number"])
print("Two way anova per tool and task")
display(two_way_anova_tasks)
two_way_anova_participants = perform_two_way_anova(df=all_metrics_df, metrics=metrics_list, factors=["tool_name", "participant_id"])
print("Two way anova per tool and participant")
display(two_way_anova_participants)

In [None]:
# Show the per-hypothesis table returned by extract_hypothesis_evaluation_differences.
df = extract_hypothesis_evaluation_differences(all_tasks)
# Alternative view: enable the next line to show the table from
# extract_contradictory_evidence instead.
#df = extract_contradictory_evidence(all_tasks)
display(df)

# Anchoring metrics

In [None]:
#################################
#  ANCHORING BIAS FUNCTIONS     #
#################################

def strategy_entropy(task):
    """
    Entropy of the selection strategies and of the class strategies of a task.

    For every "Action" nested in a "Hypothesis" event, reads
    "selection_strategy" (fallback "selectionStrategy") and "class_strategy"
    (fallback "class_strategy_type"); truthy values are collected as-is and
    each collection is passed to compute_entropy.

    Returns:
        dict with the keys "selection_strategy_entropy" and
        "class_strategy_entropy".
    """
    selections = []
    classes = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") == "Hypothesis":
            for action in hypothesis.get("events", []):
                if action.get("event_class") == "Action":
                    chosen_selection = action.get("selection_strategy") or action.get("selectionStrategy")
                    chosen_class = action.get("class_strategy") or action.get("class_strategy_type")

                    if chosen_selection:
                        selections.append(chosen_selection)
                    if chosen_class:
                        classes.append(chosen_class)

    selection_entropy = compute_entropy(selections)
    class_entropy = compute_entropy(classes)
    return {
        "selection_strategy_entropy": selection_entropy,
        "class_strategy_entropy": class_entropy,
    }

def strategy_entropy_selection(task):
    """Return only the "selection_strategy_entropy" entry of strategy_entropy(task)."""
    entropies = strategy_entropy(task)
    return entropies.get("selection_strategy_entropy")

def strategy_entropy_class(task):
    """Return only the "class_strategy_entropy" entry of strategy_entropy(task)."""
    entropies = strategy_entropy(task)
    return entropies.get("class_strategy_entropy")

def hypothesis_feature_coverage(task):
    """
    Average feature coverage across the hypotheses of a task.

    Per hypothesis: |tested ∩ declared| / |declared|, where *declared* comes
    from extract_declared_features(description) and *tested* is the set of
    lower-cased "axis_on" entries plus every feature id matched at the start
    of an operation string (pattern ``f<digits> =``) over the nested Actions.
    Hypotheses with no declared feature are ignored.

    Returns:
        The mean coverage rounded to 3 decimals, or None when no hypothesis
        declares a feature.
    """
    per_hypothesis = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue

        declared = set(extract_declared_features(hypothesis.get("description", "")))
        if not declared:
            continue

        tested = set()
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            for axis_feature in action.get("axis_on", []):
                tested.add(axis_feature.lower())
            for operation in action.get("operations", []):
                found = re.match(r"(f\d+)\s*=", operation.lower())
                if found is not None:
                    tested.add(found.group(1))

        per_hypothesis.append(len(declared & tested) / len(declared))

    if per_hypothesis:
        return round(sum(per_hypothesis) / len(per_hypothesis), 3)
    return None

def feature_test_balance(task):
    """
    Entropy (via compute_entropy) over the features touched by a task's actions.

    A feature is touched when it appears in an action's "axis_on" list
    (lower-cased) or at the start of one of its operation strings (pattern
    ``f<digits> =``). Lower values indicate a focus on few features
    (anchoring), higher values a more balanced use.
    """
    touched = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") == "Hypothesis":
            for action in hypothesis.get("events", []):
                if action.get("event_class") == "Action":
                    touched.extend(name.lower() for name in action.get("axis_on", []))
                    for operation in action.get("operations", []):
                        found = re.match(r"(f\d+)\s*=", operation.lower())
                        if found is not None:
                            touched.append(found.group(1))

    return compute_entropy(touched)

def feature_drift_score(task):
    """
    Proportion of features used in actions that no hypothesis of the task declared.

    *declared* is the union of extract_declared_features(description) over
    all hypotheses; *used* is the union of lower-cased "axis_on" entries and
    feature ids matched at the start of operation strings (pattern
    ``f<digits> =``) over all nested Actions.

    Returns:
        |used - declared| / |used| rounded to 3 decimals, or None when no
        feature was used.
    """
    declared_features = set()
    used_features = set()

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue
        declared_features |= set(extract_declared_features(hypothesis.get("description", "")))

        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            for axis_feature in action.get("axis_on", []):
                used_features.add(axis_feature.lower())
            for operation in action.get("operations", []):
                found = re.match(r"(f\d+)\s*=", operation.lower())
                if found is not None:
                    used_features.add(found.group(1))

    if used_features:
        undeclared = used_features - declared_features
        return round(len(undeclared) / len(used_features), 3)
    return None

def main_profile_reached_score(task):
    """
    Fraction of hypotheses whose selected points touch the main profile of
    their goal class.

    The goal class of a hypothesis is the most common predicted class (from
    the tool profile's "point_id_to_class") among the points selected by its
    Actions. The hypothesis scores 1.0 when at least one selected point
    belongs to that class's "main_profile" entry, otherwise 0.0.

    Returns:
        Mean score over the hypotheses that could be scored, or None when
        the tool has no profile or no hypothesis could be scored.
    """
    from collections import Counter

    profile = tool_profiles.get(task.get("tool_name"))
    if profile is None:
        return None

    main_profiles = profile["main_profile"]
    point_to_class = profile["point_id_to_class"]

    hits = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis" or not hypothesis.get("events"):
            continue

        # Point ids are kept only when their string form is all digits.
        chosen_ids = []
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            for raw_id in action.get("point_ids", []):
                if str(raw_id).isdigit():
                    chosen_ids.append(int(raw_id))
        if not chosen_ids:
            continue

        known_classes = [point_to_class.get(pid) for pid in chosen_ids if pid in point_to_class]
        if not known_classes:
            continue

        (goal_class_index, _count), = Counter(known_classes).most_common(1)
        main_points = set(main_profiles.get(goal_class_index, []))

        touched = not main_points.isdisjoint(set(chosen_ids))
        hits.append(1.0 if touched else 0.0)

    if not hits:
        return None
    return sum(hits) / len(hits)

def subgroup_coverage_score(task):
    """
    Average share of goal-class sub-clusters touched per hypothesis.

    The goal class of a hypothesis is the most common predicted class (from
    the tool profile's "point_id_to_class") among the points selected by its
    Actions. Coverage is the number of distinct clusters of that class hit by
    the selected points divided by the number of distinct clusters listed for
    the class in the profile's "clusters" entry.

    Returns:
        Mean coverage (0.0-1.0) over the hypotheses that could be scored, or
        None when the tool has no profile or no hypothesis could be scored.
    """
    from collections import Counter

    profile = tool_profiles.get(task.get("tool_name"))
    if profile is None:
        return None

    clusters_by_class = profile["clusters"]
    point_to_class = profile["point_id_to_class"]

    coverages = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis" or not hypothesis.get("events"):
            continue

        # Point ids are kept only when their string form is all digits.
        chosen_ids = []
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            for raw_id in action.get("point_ids", []):
                if str(raw_id).isdigit():
                    chosen_ids.append(int(raw_id))
        if not chosen_ids:
            continue

        known_classes = [point_to_class.get(pid) for pid in chosen_ids if pid in point_to_class]
        if not known_classes:
            continue

        (goal_class_index, _count), = Counter(known_classes).most_common(1)
        cluster_of_point = clusters_by_class.get(goal_class_index)
        if not cluster_of_point:
            continue

        n_clusters = len(set(cluster_of_point.values()))
        if n_clusters == 0:
            continue

        visited = {cluster_of_point[pid] for pid in chosen_ids if pid in cluster_of_point}
        coverages.append(len(visited) / n_clusters)

    if not coverages:
        return None
    return sum(coverages) / len(coverages)

def multiclass_selection_entropy(task):
    """
    Base-2 entropy of the predicted-class distribution of the selected points.

    Every digit-only point id selected in an "Action" nested in a
    "Hypothesis" is mapped to its class through the tool profile's
    "point_id_to_class"; ids unknown to that mapping are ignored.

    Returns:
        Entropy as a float (0.0 when all points belong to one class, larger
        for a more diverse selection), or None when the tool has no profile
        or no known point was selected.
    """
    profile = tool_profiles.get(task.get("tool_name"))
    if profile is None:
        return None

    point_to_class = profile["point_id_to_class"]
    picked_classes = []

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            numeric_ids = [int(raw) for raw in action.get("point_ids", []) if str(raw).isdigit()]
            for point_id in numeric_ids:
                if point_id in point_to_class:
                    picked_classes.append(point_to_class[point_id])

    if not picked_classes:
        return None

    frequencies = np.array(list(Counter(picked_classes).values()))
    return float(entropy(frequencies / len(picked_classes), base=2))  # base 2 entropy

def goal_class_feature_range_coverage(task):
    """
    For each hypothesis in a task, compute how well the selected test points
    cover the feature range of the goal class.

    The goal class is read from the "goal_class" of the first Action of the
    hypothesis and translated through the tool profile's
    "label_conversion_map". For every non-constant feature of that class
    (ranges taken from "class_feature_spaces"), the coverage is the overlap
    between the class range and the range spanned by the selected points,
    divided by the class range. A hypothesis scores the mean over its
    features.

    Selected point ids that have no entry in "point_id_to_features" are
    ignored instead of raising KeyError.

    Returns:
        The average per-hypothesis score rounded to 3 decimals, or None when
        the tool has no profile or no hypothesis could be scored.
    """
    tool_name = task.get("tool_name")
    profile = tool_profiles.get(tool_name)
    if not profile:
        return None

    label_map = profile.get("label_conversion_map", {})
    point_to_features = profile.get("point_id_to_features", {})
    class_feature_space = profile.get("class_feature_spaces", {})

    hypothesis_scores = []

    for hyp in task.get("events", []):
        if hyp.get("event_class") != "Hypothesis":
            continue

        actions = [act for act in hyp.get("events", []) if act.get("event_class") == "Action"]
        if not actions:
            continue

        # Only the first action defines the goal class; `or ""` covers an
        # explicit null value that would otherwise crash .strip().
        goal_class = (actions[0].get("goal_class") or "").strip().lower()
        goal_idx = label_map.get(goal_class)
        if goal_idx is None or goal_idx not in class_feature_space:
            continue

        # Values observed per feature for the goal class
        feature_values = {}
        for feature_dict in class_feature_space[goal_idx]:
            for feat, val in feature_dict.items():
                feature_values.setdefault(feat, []).append(val)

        # Selected points used in the hypothesis (digit-only ids)
        selected_points = [
            int(pid)
            for act in actions
            for pid in act.get("point_ids", [])
            if str(pid).isdigit()
        ]
        if not selected_points:
            continue

        # Feature records of the selected points; ids unknown to the profile
        # are skipped, mirroring the membership guard used by the sibling
        # metrics on point_id_to_class.
        selected_rows = [point_to_features[pid] for pid in selected_points if pid in point_to_features]

        # Compute coverage per feature
        coverage_ratios = []
        for feat, observed in feature_values.items():
            goal_min, goal_max = np.min(observed), np.max(observed)
            if goal_min == goal_max:
                continue  # Skip constant features
            values = [row[feat] for row in selected_rows if feat in row]
            if not values:
                continue

            covered_range = max(0, min(goal_max, max(values)) - max(goal_min, min(values)))
            full_range = goal_max - goal_min
            coverage_ratios.append(covered_range / full_range if full_range > 0 else 0)

        if coverage_ratios:
            hypothesis_scores.append(np.mean(coverage_ratios))

    if not hypothesis_scores:
        return None
    return round(np.mean(hypothesis_scores), 3)


In [None]:
# Anchoring-bias metrics: reported metric name -> per-task function.
# Commented-out entries are metrics that are currently switched off.
anchoring_metrics = {
    "main_profile_reached": main_profile_reached_score,
    "subgroup_coverage": subgroup_coverage_score,
    "class_strategy_entropy": strategy_entropy_class,
    "multiclass_selection_entropy": multiclass_selection_entropy,
    "goal_class_feature_range_coverage": goal_class_feature_range_coverage,
    "feature_test_balance": feature_test_balance,
    #"feature_drift_score": feature_drift_score,
    #"feature_coverage": hypothesis_feature_coverage,
    #"selection_entropy": strategy_entropy_selection,
}
df = compute_all_task_metrics(all_tasks, anchoring_metrics)
summary = aggregate_precomputed_task_metrics(df)
formatted = format_summary_columns(summary)

# Per-tool column pairs shown in both the tool/task and the per-participant table.
paired_anchoring_columns = ['MPR_wit', 'MPR_HX', 'SubC_wit',
                            'SubC_HX', 'CSE_wit', 'CSE_HX', 'MSelE_wit', 'MSelE_HX', 'FC_wit',
                            'FC_HX', 'FTB_wit', 'FTB_HX']

print("📊 Tool-Task Summary")
tool_task_df = formatted["tool_task"]
tool_task_df = tool_task_df[['task_number'] + paired_anchoring_columns]
print(tool_task_df.columns)
display(tool_task_df)

print("📊 Mean per Participant")
participant_df = formatted["per_participant"]
participant_df = participant_df[['participant_id'] + paired_anchoring_columns]
print(participant_df.columns)
display(participant_df)

print("📊 Mean per Tool")
tool_df = formatted["per_tool"]
tool_df = tool_df[['tool_name', 'MPR', 'SubC', 'CSE', 'MSelE', 'FC', 'FTB']]
print(tool_df.columns)
display(tool_df)

# Run the ANOVAs on exactly the metrics computed above. The previous
# hand-written list also named "selection_entropy", "feature_coverage" and
# "feature_drift_score", which are switched off in anchoring_metrics and so
# have no column in df.
metrics_list = list(anchoring_metrics)

one_way_anova_df = perform_one_way_anova(df=df, metrics=metrics_list, factor="tool_name")
print("One way anova per tool")
display(one_way_anova_df)
two_way_anova_tasks = perform_two_way_anova(df=df, metrics=metrics_list, factors=["tool_name", "task_number"])
print("Two way anova per tool and task")
display(two_way_anova_tasks)
two_way_anova_participants = perform_two_way_anova(df=df, metrics=metrics_list, factors=["tool_name", "participant_id"])
print("Two way anova per tool and participant")
display(two_way_anova_participants)


# Availability metrics

In [None]:
#################################
#  AVAILABILITY BIAS FUNCTIONS  #
#################################
def early_evaluation_consistency(task):
    """
    Measure how closely each hypothesis evaluation matches its earliest actions.

    For every "Hypothesis" with a LABEL_SCORES evaluation and at least two
    nested Actions, the first 30% of the actions (at least one) are scored
    and averaged; the hypothesis contributes ``1 - |hyp_score - avg_early|``.
    The score of an action whose goal is "disprove" is mirrored as 1 - score.

    Returns:
        Mean contribution rounded to 3 decimals, or None when no hypothesis
        qualifies.
    """
    def get_score(label, goal):
        # `or ""` covers explicit nulls in the log. The goal is lower-cased
        # so that it is matched the same way as in the other
        # LABEL_SCORES-based metrics of this notebook.
        score = LABEL_SCORES.get((label or "").lower())
        if score is None:
            return None
        return 1.0 - score if (goal or "").lower() == "disprove" else score

    scores = []

    for hyp in task.get("events", []):
        if hyp.get("event_class") != "Hypothesis":
            continue

        hyp_eval = (hyp.get("evaluation") or "").lower()
        hyp_score = LABEL_SCORES.get(hyp_eval)
        if hyp_score is None:
            continue

        actions = [a for a in hyp.get("events", []) if a.get("event_class") == "Action"]
        if len(actions) < 2:
            continue

        # Early phase: first 30% of the actions, but never fewer than one.
        cutoff = max(1, int(0.3 * len(actions)))
        early_actions = actions[:cutoff]

        early_scores = [
            get_score(a.get("evaluation", ""), a.get("goal", ""))
            for a in early_actions
        ]
        early_scores = [s for s in early_scores if s is not None]

        if not early_scores:
            continue

        avg_early = sum(early_scores) / len(early_scores)
        scores.append(1 - abs(hyp_score - avg_early))

    return round(sum(scores) / len(scores), 3) if scores else None

def late_evaluation_influence(task):
    """
    Measure how closely each hypothesis evaluation matches its latest actions.

    For every "Hypothesis" with a LABEL_SCORES evaluation and at least two
    nested Actions, the actions from index ``int(0.8 * n)`` onwards (at least
    one) are scored and averaged; the hypothesis contributes
    ``1 - |hyp_score - avg_late|``. The score of an action whose goal is
    "disprove" is mirrored as 1 - score.

    Returns:
        Mean contribution rounded to 3 decimals, or None when no hypothesis
        qualifies.
    """
    def get_score(label, goal):
        # `or ""` covers explicit nulls in the log. The goal is lower-cased
        # so that it is matched the same way as in the other
        # LABEL_SCORES-based metrics of this notebook.
        score = LABEL_SCORES.get((label or "").lower())
        if score is None:
            return None
        return 1.0 - score if (goal or "").lower() == "disprove" else score

    scores = []

    for hyp in task.get("events", []):
        if hyp.get("event_class") != "Hypothesis":
            continue

        hyp_eval = (hyp.get("evaluation") or "").lower()
        hyp_score = LABEL_SCORES.get(hyp_eval)
        if hyp_score is None:
            continue

        actions = [a for a in hyp.get("events", []) if a.get("event_class") == "Action"]
        if len(actions) < 2:
            continue  # Skip if not enough actions for a late phase

        # Late phase starts at 80% of the actions; the min() keeps at least
        # one late action (int(0.8 * n) is already below n, so this is a
        # safety net replacing the former unreachable branch).
        cutoff = min(int(0.8 * len(actions)), len(actions) - 1)

        late_actions = actions[cutoff:]
        late_scores = [
            get_score(a.get("evaluation", ""), a.get("goal", ""))
            for a in late_actions
        ]
        late_scores = [s for s in late_scores if s is not None]

        if not late_scores:
            continue

        avg_late = sum(late_scores) / len(late_scores)
        scores.append(1 - abs(hyp_score - avg_late))

    return round(sum(scores) / len(scores), 3) if scores else None

def feature_revisiting_rate(task):
    """
    Share of "axis_on" features that occur more than once in a task.

    Counts every occurrence of a feature in the "axis_on" list of an "Action"
    nested in a "Hypothesis" event (feature names are compared as-is).

    Returns:
        (# features seen at least twice) / (# distinct features), or None
        when no feature was seen.
    """
    occurrences = {}

    for hypothesis in task.get("events", []):
        if hypothesis.get("event_class") != "Hypothesis":
            continue
        for action in hypothesis.get("events", []):
            if action.get("event_class") != "Action":
                continue
            for feature in action.get("axis_on", []):
                occurrences[feature] = occurrences.get(feature, 0) + 1

    if not occurrences:
        return None

    revisited = sum(1 for seen in occurrences.values() if seen >= 2)
    return revisited / len(occurrences)



In [None]:
# Availability-bias metrics: reported metric name -> per-task function.
# Commented-out entries are metrics that are currently switched off.
all_metrics = {
    "early_evaluation_consistency": early_evaluation_consistency,
    "late_evaluation_influence": late_evaluation_influence,
    "average_hypothesis_complexity": average_hypothesis_complexity,
    #"feature_revisiting_rate": feature_revisiting_rate,
}

availability_metrics_df = compute_all_task_metrics(all_tasks, all_metrics)
print(availability_metrics_df)

summary = aggregate_precomputed_task_metrics(availability_metrics_df)
# Format numeric outputs before display
summary = format_summary_columns(summary)
print("📊 Tool-Task Summary")
display(summary["tool_task"])

print("📊 Mean per Participant")
display(summary["per_participant"])

print("📊 Mean per Tool")
display(summary["per_tool"])

# Derive the ANOVA metric list from the dict above so the two cannot drift apart.
metrics_list = list(all_metrics)
one_way_anova_df = perform_one_way_anova(df=availability_metrics_df, metrics=metrics_list, factor="tool_name")
print("One way anova per tool")
display(one_way_anova_df)
two_way_anova_tasks = perform_two_way_anova(df=availability_metrics_df, metrics=metrics_list, factors=["tool_name", "task_number"])
print("Two way anova per tool and task")
display(two_way_anova_tasks)
two_way_anova_participants = perform_two_way_anova(df=availability_metrics_df, metrics=metrics_list, factors=["tool_name", "participant_id"])
print("Two way anova per tool and participant")
display(two_way_anova_participants)

# Other metrics

In [None]:
# Upload the CSV with the scored text answers; only the first uploaded file is read.
uploaded = files.upload()
if not uploaded:
    # Fail with an explicit message instead of a bare IndexError when the
    # upload dialog was cancelled.
    raise ValueError("No file was uploaded; a CSV file with the text answer scores is required.")
file_name = next(iter(uploaded))  # Get the first uploaded file name
text_answer_df = pd.read_csv(io.BytesIO(uploaded[file_name]))

display(text_answer_df)

metrics_list = ["importance_score", "value_score", "profile_score", "overall_score"]


one_way_anova_df = perform_one_way_anova(df=text_answer_df, metrics=metrics_list, factor="participant_id")
print("One way anova per participant")
display(one_way_anova_df)
two_way_anova_participants = perform_two_way_anova(df=text_answer_df, metrics=metrics_list, factors=["tool_name", "participant_id"])
print("Two way anova per tool and participant")
display(two_way_anova_participants)


In [None]:
# Check for sufficient variation in metrics (more than 1 unique value)
def check_metric_variation(df, metrics):
    """Count the distinct values of each metric column and warn about constant ones.

    Args:
        df: Table containing one column per entry of ``metrics``.
        metrics: Names of the columns to examine.

    Returns:
        Dict mapping each metric name to ``df[name].nunique()``.

    A warning line is printed for every metric with fewer than two distinct values.
    """
    distinct_counts = {}
    for name in metrics:
        n_distinct = distinct_counts[name] = df[name].nunique()
        if n_distinct >= 2:
            continue
        print(f"Warning: Metric '{name}' has insufficient variation (only {n_distinct} unique values).")
    return distinct_counts

# Check for sufficient variation in factors (more than 1 unique value for each factor)
def check_factor_variation(df, factors):
    """Count the distinct levels of each factor column and warn about single-level ones.

    Args:
        df: Table containing one column per entry of ``factors``.
        factors: Names of the grouping columns to examine.

    Returns:
        Dict mapping each factor name to ``df[name].nunique()``.

    A warning line is printed for every factor with fewer than two distinct values.
    """
    levels_per_factor = {}
    for column in factors:
        level_count = df[column].nunique()
        levels_per_factor[column] = level_count
        lacks_variation = level_count < 2
        if lacks_variation:
            print(f"Warning: Factor '{column}' has insufficient variation (only {level_count} unique values).")
    return levels_per_factor

# Score columns and grouping columns of the text-answer table to sanity-check
metrics = [
    "importance_score",
    "value_score",
    "profile_score",
    "overall_score",
]
factors = [
    "tool_name",
    "participant_id",
]

# Count distinct values per column; the helpers print a warning for any column
# with fewer than two distinct values
metric_variation = check_metric_variation(text_answer_df, metrics)
factor_variation = check_factor_variation(text_answer_df, factors)

# Report both result dicts, each under its own heading
for _heading, _variation in (
    ("\nMetric Variation:", metric_variation),
    ("\nFactor Variation:", factor_variation),
):
    print(_heading, _variation)


In [None]:
def hypothesis_success_rate(task):
    """Return the mean evaluation weight over the actions of a task's hypotheses.

    Every action nested in a top-level event whose ``event_class`` is
    ``"Hypothesis"`` contributes the weight of its ``evaluation`` label
    (compared after trimming whitespace and lower-casing). Actions whose label
    is missing, null, not a string, or not one of the known labels are ignored.

    Args:
        task: Mapping with an optional ``"events"`` list of event mappings.

    Returns:
        The average weight in ``[0.0, 1.0]``, or ``None`` when no action
        carries a recognised evaluation.
    """
    evaluation_weights = {
        "confirmed": 1.0,
        "weakly_confirmed": 0.5,
        "disproved": 0.0,
        "weakly_disproved": 0.0,
    }

    scores = []
    # `or []` also covers keys that are present but null in the JSON export.
    for event in task.get("events") or []:
        if event.get("event_class") != "Hypothesis":
            continue
        for action in event.get("events") or []:
            label = action.get("evaluation")
            if not isinstance(label, str):
                # A null evaluation used to crash on `.strip()`; treat it as unrated.
                continue
            weight = evaluation_weights.get(label.strip().lower())
            if weight is not None:
                scores.append(weight)

    return sum(scores) / len(scores) if scores else None


def task_duration_minutes(task):
    """Return the task's ``"duration"`` entry unchanged, or ``None`` if it is absent.

    NOTE(review): no unit conversion happens here; the name implies the stored
    value is already in minutes — confirm against the data export.
    """
    return task.get("duration")

from datetime import datetime

def inspect_before_modify_ratio(task):
    """Return the fraction of ``Modify_Feature_Value`` actions that follow an inspect.

    Top-level events and the sub-events nested inside ``Hypothesis`` events are
    merged into one chronological timeline. Only nested sub-events count as
    modify actions. An action is credited when at least one ``DataExploration``
    event whose type contains "inspect" (case-insensitive) has a strictly
    earlier timestamp.

    Timestamps are ISO-8601 strings read from ``created_at`` (a ``Z`` suffix is
    rewritten to ``+00:00``). They are parsed *before* sorting, because sorting
    the raw strings misorders mixed precisions (``...00Z`` sorts after
    ``...00.500Z``). A sub-event without its own ``created_at`` uses its
    parent's; events with no timestamp at all cannot be ordered and are left
    out. The input ``task`` is not modified.

    Args:
        task: Mapping with an optional ``"events"`` list of event mappings.

    Returns:
        ``with_inspect / modifies`` as a float, or ``None`` when the task has
        no events or no modify actions.
    """
    def parse_time(t):
        return datetime.fromisoformat(t.replace("Z", "+00:00"))

    top_level_events = task.get("events") or []
    if not top_level_events:
        return None

    # Flatten into (timestamp, is_action, event) triples instead of tagging the
    # caller's dicts with private marker keys.
    timeline = []
    for event in top_level_events:
        parent_stamp = event.get("created_at")
        if parent_stamp:
            timeline.append((parse_time(parent_stamp), False, event))
        if event.get("event_class") == "Hypothesis":
            for subevent in event.get("events") or []:
                stamp = subevent.get("created_at") or parent_stamp
                if stamp:
                    timeline.append((parse_time(stamp), True, subevent))

    # Stable sort on the parsed time keeps insertion order for equal timestamps.
    timeline.sort(key=lambda entry: entry[0])

    modifies = 0
    with_inspect = 0
    # The timeline is chronological, so the first inspect seen is the earliest;
    # comparing against it is equivalent to scanning all earlier inspects.
    earliest_inspect = None

    for event_time, is_action, event in timeline:
        event_type = event.get("type") or ""
        event_class = event.get("event_class") or ""

        # Record inspect-type explorations
        if event_class == "DataExploration" and "inspect" in event_type.lower():
            if earliest_inspect is None:
                earliest_inspect = event_time

        # For Modify_Feature_Value, check whether any inspect is strictly earlier
        elif is_action and event_type == "Modify_Feature_Value":
            modifies += 1
            if earliest_inspect is not None and earliest_inspect < event_time:
                with_inspect += 1

    return with_inspect / modifies if modifies > 0 else None

def feature_exploration_counts(task, num_features=5):
    """Count how often each feature appears in scatterplot-axis changes.

    Only top-level events with ``event_class == "DataExploration"`` and
    ``type == "Change_Scatterplot_Axis"`` are considered. Each id listed in
    such an event's ``feature_ids`` adds one to that feature's count; ids
    outside ``f1`` .. ``f<num_features>`` are ignored.

    Args:
        task: Mapping with an optional ``"events"`` list of event mappings.
        num_features: Number of tracked features, named ``f1`` .. ``fN``.
            Defaults to 5, the previously hard-coded value.

    Returns:
        Dict with one ``"<feature>_explorations"`` key per tracked feature
        (zero when never explored), in ``f1`` .. ``fN`` order.
    """
    counts = {f"f{i}": 0 for i in range(1, num_features + 1)}

    # `or []` also covers keys that are present but null in the JSON export.
    for event in task.get("events") or []:
        if event.get("event_class") != "DataExploration":
            continue
        if event.get("type") != "Change_Scatterplot_Axis":
            continue
        for fid in event.get("feature_ids") or []:
            if fid in counts:
                counts[fid] += 1

    # Rename keys to be column-safe
    return {f"{fid}_explorations": total for fid, total in counts.items()}



In [None]:
# Metric registry for this section: name -> function taking one task.
# NOTE(review): presumably compute_all_task_metrics uses the keys as column
# names — confirm against its definition.
task_metrics = {
    "hypothesis_success_rate": hypothesis_success_rate,
    "task_duration_minutes": task_duration_minutes,
    "inspect_before_modify_ratio": inspect_before_modify_ratio,
    # Disabled metrics (uncomment to include them in this run):
    #"hypothesis_count": hypothesis_count,
    #"average_hypothesis_complexity": average_hypothesis_complexity,
    #"feature_exploration_counts": feature_exploration_counts
}

print("Number of tasks loaded:", len(all_tasks))
metrics_df = compute_all_task_metrics(all_tasks, task_metrics)
display(metrics_df)

# Aggregate the per-task values, then format numeric outputs before display.
summary = aggregate_precomputed_task_metrics(metrics_df)
summary = format_summary_columns(summary)

# The headings previously printed the chart emoji as mis-decoded bytes.
print("📊 Tool-Task Summary")
display(summary["tool_task"])

print("📊 Mean per Participant")
display(summary["per_participant"])

print("📊 Mean per Tool")
display(summary["per_tool"])

# Test exactly the metrics registered above. The previous hard-coded list asked
# for "average_hypothesis_complexity", which is disabled in task_metrics, and
# left out "hypothesis_success_rate".
metrics_list = list(task_metrics)
one_way_anova_df = perform_one_way_anova(df=metrics_df, metrics=metrics_list, factor="tool_name")
print("One way anova per tool")
display(one_way_anova_df)

two_way_anova_tasks = perform_two_way_anova(
    df=metrics_df, metrics=metrics_list, factors=["tool_name", "task_number"]
)
print("Two way anova per tool and task")
display(two_way_anova_tasks)

two_way_anova_participants = perform_two_way_anova(
    df=metrics_df, metrics=metrics_list, factors=["tool_name", "participant_id"]
)
print("Two way anova per tool and participant")
display(two_way_anova_participants)