# Check logs

In [1]:
pip install seaborn inspect




ERROR: Could not find a version that satisfies the requirement inspect (from versions: none)
ERROR: No matching distribution found for inspect


In [2]:
from inspect_ai.log import read_eval_log

log = read_eval_log(log_file=r"logs\2025-03-26T23-43-33+01-00_nested-dirs-challenge_DxYcXZSDzKP9aKqDKAoYhq.eval")

ModuleNotFoundError: No module named 'inspect_ai'

# Check logs raw JSONs 
- change .eval to .zip 
- unzip 
- analyze headers.json and 

In [2]:
pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from pathlib import Path
import re

def parse_component_scores(explanation_text):
    """
    Parse component scores from explanation text using regex.
    Looks for patterns like "Depth: ✓", "Breadth: ✗", etc.

    Args:
        explanation_text: The string containing the scorer explanation.

    Returns:
        A dictionary with binary scores for each component (0 or 1).
        Defaults to 0 if a component is not found or parsing fails.
    """
    results = {
        "depth_score": 0,
        "breadth_score": 0,
        "uniqueness_score": 0
    }

    if not explanation_text:
        return results

    try:
        # Search for "Depth: [symbol]" pattern
        depth_match = re.search(r"Depth:\s*(✓|✗)", explanation_text)
        if depth_match:
            results["depth_score"] = 1 if depth_match.group(1) == "✓" else 0

        # Search for "Breadth: [symbol]" pattern
        breadth_match = re.search(r"Breadth:\s*(✓|✗)", explanation_text)
        if breadth_match:
            results["breadth_score"] = 1 if breadth_match.group(1) == "✓" else 0

        # Search for "Uniqueness: [symbol]" pattern
        uniqueness_match = re.search(r"Uniqueness:\s*(✓|✗)", explanation_text)
        if uniqueness_match:
            results["uniqueness_score"] = 1 if uniqueness_match.group(1) == "✓" else 0

        return results

    except Exception as e:
        # Log the error but return the default scores
        print(f"Error parsing component scores with regex: {e}")
        # Ensure default values are returned on error
        return {
            "depth_score": 0,
            "breadth_score": 0,
            "uniqueness_score": 0
        }

# # --- Example Test ---
# test_explanation_1 = "[Py Direct /directory_task] Depth: ✓ (4/4), Breadth: ✓, Uniqueness: ✓Found 340..."
# test_explanation_2 = "Depth: ✗, Breadth: ✓, Uniqueness: ✗"
# test_explanation_3 = "Some other text, Breadth: ✓, Uniqueness: ✓ maybe depth is missing"
# test_explanation_4 = "[Direct] Depth: ✓ (2/2), Breadth: ✓, Uniqueness: ✓ Found 6" # From exp1 perhaps?

# print(f"Parsing: '{test_explanation_1[:50]}...' -> {parse_component_scores(test_explanation_1)}")
# print(f"Parsing: '{test_explanation_2}' -> {parse_component_scores(test_explanation_2)}")
# print(f"Parsing: '{test_explanation_3}' -> {parse_component_scores(test_explanation_3)}")
# print(f"Parsing: '{test_explanation_4}' -> {parse_component_scores(test_explanation_4)}")

def extract_experiment_data(exp_dir: str, scorer_name: str = "nested_dirs"):
    """
    Extract data from an experiment directory
    
    Args:
        exp_dir: Path to the experiment directory
        
    Returns:
        Dictionary with experiment metadata and results
    """
    results = []
    metadata = {}
    print(f"Processing {exp_dir}")
    # Load header.json for metadata
    header_path = os.path.join(exp_dir, "header.json")
    print(f"Header path: {header_path}")
    if os.path.exists(header_path):
        try:
            with open(header_path, 'r', encoding='utf-8') as f:
                header_data = json.load(f)
                if "eval" in header_data:
                    metadata = header_data["eval"]
        except Exception as e:
            print(f"Error loading header.json from {exp_dir}: {e}")
    
    # Load summaries.json for results
    summaries_path = os.path.join(exp_dir, "summaries.json")
    print(f"Summaries path: {summaries_path}")
    if os.path.exists(summaries_path):
        try:
            with open(summaries_path, 'r', encoding='utf-8') as f:
                summaries_data = json.load(f)
                
                for sample in summaries_data:
                    if "scores" in sample and scorer_name in sample["scores"]:
                        score_data = sample["scores"][scorer_name]
                        
                        # Extract score value
                        score_value = score_data.get("value", 0)
                        
                        # Extract explanation and parse component scores
                        explanation = score_data.get("explanation", "")
                        component_scores = parse_component_scores(explanation)
                        
                        # Create result entry
                        result = {
                            "id": sample.get("id", ""),
                            "epoch": sample.get("epoch", 0),
                            "score_value": score_value,
                            "explanation": explanation,
                            **component_scores
                        }
                        
                        results.append(result)
        except Exception as e:
            print(f"Error loading summaries.json from {exp_dir}: {e}")
    
    return {
        "metadata": metadata,
        "results": results
    }

def find_experiment_folders(base_dir):
    """
    Find all experiment folders with 'nested-dirs-challenge' in the name
    
    Args:
        base_dir: Base directory to search in
        
    Returns:
        List of experiment directory paths
    """
    pattern = os.path.join(base_dir, "*nested-dirs-challenge*")
    return glob.glob(pattern)

def extract_all_experiments_data(base_dir):
    """
    Extract data from all experiment folders
    
    Args:
        base_dir: Base directory containing experiment folders
        
    Returns:
        DataFrame with combined data from all experiments
    """
    all_data = []
    
    # Find all experiment folders
    exp_folders = find_experiment_folders(base_dir)
    print(f"Found {len(exp_folders)} experiment folders")
    
    for folder in exp_folders:
        try:
            exp_data = extract_experiment_data(folder, scorer_name="check_nested_dirs_py")
            
            # Extract model name from metadata
            model_name = exp_data["metadata"].get("model", "unknown")
            
            for result in exp_data["results"]:
                result["model"] = model_name
                result["experiment_folder"] = os.path.basename(folder)
                
                # Convert id to numerical value (task complexity)
                # and potentially sample number
                result_id_str = str(result["id"]) # Ensure id is a string
                result["sample_num"] = None # Default sample number
                result["task_complexity_val"] = None # Default complexity value

                try:
                    # Try to interpret as a single integer complexity
                    result["task_complexity_val"] = int(result_id_str)
                    result["task_complexity"] = result["task_complexity_val"] # Keep original field for now
                except ValueError:
                    # If not a single int, try to parse as "sample_num, complexity_val"
                    if ',' in result_id_str:
                        try:
                            parts = result_id_str.split(',')
                            if len(parts) == 2:
                                sample_num_str = parts[0].strip()
                                complexity_str = parts[1].strip()
                                
                                # Attempt to convert both parts to int
                                sample_num = int(sample_num_str)
                                complexity_val = int(complexity_str)
                                
                                result["sample_num"] = sample_num
                                result["task_complexity_val"] = complexity_val
                                # For compatibility or specific use, you might store the parsed string or individual parts
                                result["task_complexity"] = f"Sample {sample_num}, Complexity {complexity_val}"
                            else:
                                # If not two parts, keep original id
                                result["task_complexity"] = result_id_str
                                print(f"Warning: ID '{result_id_str}' contains a comma but not in 'sample, complexity' format. Storing as is.")

                        except ValueError:
                            # If parts are not convertible to int, keep original id
                            result["task_complexity"] = result_id_str
                            print(f"Warning: Could not parse '{result_id_str}' as 'sample_num, complexity_val'. Storing as is.")
                    else:
                        # If no comma and not an int, keep original id
                        result["task_complexity"] = result_id_str
                        print(f"Warning: ID '{result_id_str}' is not an integer and does not contain a comma. Storing as is.")
                
                all_data.append(result)
                
        except Exception as e:
            print(f"Error processing folder {folder}: {e}")
    
    # Convert to DataFrame
    if all_data:
        return pd.DataFrame(all_data)
    else:
        return pd.DataFrame()

In [12]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from pathlib import Path
import re

def parse_component_scores(explanation_text):
    """
    Parse component scores from explanation text using regex.
    Looks for patterns like "Depth: ✓", "Breadth: ✗", etc.

    Args:
        explanation_text: The string containing the scorer explanation.

    Returns:
        A dictionary with binary scores for each component (0 or 1).
        Defaults to 0 if a component is not found or parsing fails.
    """
    results = {
        "depth_score": 0,
        "breadth_score": 0,
        "uniqueness_score": 0
    }

    if not explanation_text:
        # Return default scores if explanation_text is None or empty
        return results
    
    if not isinstance(explanation_text, str):
        # Ensure explanation_text is a string
        print(f"Warning: explanation_text is not a string: {type(explanation_text)}. Returning default scores.")
        return results

    try:
        # Search for "Depth: [symbol]" pattern
        depth_match = re.search(r"Depth:\s*(✓|✗)", explanation_text)
        if depth_match:
            results["depth_score"] = 1 if depth_match.group(1) == "✓" else 0

        # Search for "Breadth: [symbol]" pattern
        breadth_match = re.search(r"Breadth:\s*(✓|✗)", explanation_text)
        if breadth_match:
            results["breadth_score"] = 1 if breadth_match.group(1) == "✓" else 0

        # Search for "Uniqueness: [symbol]" pattern
        uniqueness_match = re.search(r"Uniqueness:\s*(✓|✗)", explanation_text)
        if uniqueness_match:
            results["uniqueness_score"] = 1 if uniqueness_match.group(1) == "✓" else 0

        return results

    except Exception as e:
        # Log the error but return the default scores
        print(f"Error parsing component scores with regex from '{explanation_text[:50]}...': {e}")
        # Ensure default values are returned on error
        return {
            "depth_score": 0,
            "breadth_score": 0,
            "uniqueness_score": 0
        }

def extract_experiment_data(exp_dir: str, scorer_name: str = "nested_dirs"):
    """
    Extract data from an experiment directory.
    Validates parameters and checks return values.
    
    Args:
        exp_dir: Path to the experiment directory. Must be a non-empty string.
        scorer_name: Name of the scorer. Must be a non-empty string.
        
    Returns:
        Dictionary with experiment metadata and results.
        Returns empty dicts if critical files are missing or errors occur.
    """
    if not exp_dir or not isinstance(exp_dir, str):
        print("Error: exp_dir must be a non-empty string.")
        return {"metadata": {}, "results": []}
    if not scorer_name or not isinstance(scorer_name, str):
        print("Error: scorer_name must be a non-empty string.")
        return {"metadata": {}, "results": []}

    results_list = []
    metadata = {}
    print(f"Processing {exp_dir}")

    # Load header.json for metadata
    header_path = os.path.join(exp_dir, "header.json")
    print(f"Header path: {header_path}")
    if os.path.exists(header_path):
        try:
            with open(header_path, 'r', encoding='utf-8') as f:
                header_data = json.load(f)
                if "eval" in header_data and isinstance(header_data["eval"], dict):
                    metadata = header_data["eval"]
                else:
                    print(f"Warning: 'eval' key missing or not a dict in header.json from {exp_dir}")
        except Exception as e:
            print(f"Error loading header.json from {exp_dir}: {e}")
            # metadata remains {}
    else:
        print(f"Warning: header.json not found in {exp_dir}")

    # Load summaries.json for results
    summaries_path = os.path.join(exp_dir, "summaries.json")
    print(f"Summaries path: {summaries_path}")
    if os.path.exists(summaries_path):
        try:
            with open(summaries_path, 'r', encoding='utf-8') as f:
                summaries_data = json.load(f)
                if not isinstance(summaries_data, list):
                    print(f"Error: summaries.json in {exp_dir} does not contain a list.")
                    summaries_data = [] # Process as empty list

                for sample in summaries_data:
                    if not isinstance(sample, dict):
                        print(f"Warning: Skipping non-dictionary item in summaries.json from {exp_dir}")
                        continue

                    if "scores" in sample and isinstance(sample["scores"], dict) and \
                       scorer_name in sample["scores"] and isinstance(sample["scores"][scorer_name], dict):
                        
                        score_data = sample["scores"][scorer_name]
                        
                        score_value = score_data.get("value", 0) # Default to 0 if missing
                        if not isinstance(score_value, (int, float)):
                            print(f"Warning: Score value '{score_value}' is not numeric. Defaulting to 0 for sample ID {sample.get('id', 'N/A')}.")
                            score_value = 0

                        explanation = score_data.get("explanation", "") # Default to empty string
                        component_scores_dict = parse_component_scores(explanation)
                        
                        result_entry = {
                            "id": sample.get("id", ""), # Default to empty string
                            "epoch": sample.get("epoch", 0), # Default to 0
                            "score_value": score_value,
                            "explanation": explanation,
                            **component_scores_dict
                        }
                        results_list.append(result_entry)
                    else:
                        print(f"Warning: Scorer '{scorer_name}' data missing or malformed for sample ID {sample.get('id', 'N/A')} in {exp_dir}")
        except Exception as e:
            print(f"Error loading or parsing summaries.json from {exp_dir}: {e}")
            # results_list remains as is (potentially partially filled or empty)
    else:
        print(f"Warning: summaries.json not found in {exp_dir}")
    
    return {
        "metadata": metadata,
        "results": results_list
    }

def find_experiment_folders(base_dir: str):
    """
    Find all experiment folders with 'nested-dirs-challenge' in the name.
    Validates parameters.
    
    Args:
        base_dir: Base directory to search in. Must be a non-empty string.
        
    Returns:
        List of experiment directory paths. Returns empty list on error.
    """
    if not base_dir or not isinstance(base_dir, str):
        print("Error: base_dir must be a non-empty string.")
        return []
    
    pattern = os.path.join(base_dir, "*nested-dirs-challenge*")
    try:
        folders = glob.glob(pattern)
        return folders
    except Exception as e:
        print(f"Error during glob.glob with pattern {pattern}: {e}")
        return []


def extract_all_experiments_data(base_dir: str):
    """
    Extract data from all experiment folders.
    Validates parameters.
    
    Args:
        base_dir: Base directory containing experiment folders. Must be a non-empty string.
        
    Returns:
        DataFrame with combined data from all experiments. Returns empty DataFrame on error or if no data.
    """
    if not base_dir or not isinstance(base_dir, str):
        print("Error: base_dir must be a non-empty string.")
        return pd.DataFrame()

    all_data_list = []
    
    exp_folders = find_experiment_folders(base_dir)
    if not exp_folders: # find_experiment_folders now returns list or handles its errors
        print(f"No experiment folders found in {base_dir} matching the pattern.")
        # Continue to return empty DataFrame if no folders
    else:
        print(f"Found {len(exp_folders)} experiment folders")
    
    for folder in exp_folders:
        try:
            # Specify the correct scorer name as used in your logs
            exp_data_dict = extract_experiment_data(folder, scorer_name="check_nested_dirs_py")
            
            # Validate return from extract_experiment_data
            if not exp_data_dict or not isinstance(exp_data_dict, dict) or \
               "metadata" not in exp_data_dict or "results" not in exp_data_dict or \
               not isinstance(exp_data_dict["results"], list):
                print(f"Warning: Invalid data structure returned from extract_experiment_data for folder {folder}. Skipping.")
                continue

            model_name = exp_data_dict["metadata"].get("model", "unknown")
            if not isinstance(model_name, str):
                 print(f"Warning: Model name is not a string: {model_name}. Using 'unknown_type'.")
                 model_name = "unknown_type"
            
            for result_item in exp_data_dict["results"]:
                if not isinstance(result_item, dict):
                    print(f"Warning: Skipping non-dictionary result item in folder {folder}.")
                    continue

                result_item["model"] = model_name
                result_item["experiment_folder"] = os.path.basename(folder)
                
                result_id_str = str(result_item.get("id", "")) # Ensure id is string, default to empty
                result_item["sample_num"] = None 
                result_item["task_complexity_val"] = None

                try:
                    result_item["task_complexity_val"] = int(result_id_str)
                    result_item["task_complexity"] = result_item["task_complexity_val"]
                except ValueError:
                    if ',' in result_id_str:
                        try:
                            parts = result_id_str.split(',')
                            if len(parts) == 2:
                                sample_num_str = parts[0].strip()
                                complexity_str = parts[1].strip()
                                
                                sample_num = int(sample_num_str)
                                complexity_val = int(complexity_str)
                                
                                result_item["sample_num"] = sample_num
                                result_item["task_complexity_val"] = complexity_val
                                result_item["task_complexity"] = f"Sample {sample_num}, Complexity {complexity_val}"
                            else:
                                result_item["task_complexity"] = result_id_str
                                print(f"Warning: ID '{result_id_str}' in {folder} contains a comma but not in 'sample, complexity' format. Storing as is.")
                        except ValueError:
                            result_item["task_complexity"] = result_id_str
                            print(f"Warning: Could not parse '{result_id_str}' in {folder} as 'sample_num, complexity_val'. Storing as is.")
                    else:
                        result_item["task_complexity"] = result_id_str
                        if result_id_str: # Only print warning if id_str is not empty
                            print(f"Warning: ID '{result_id_str}' in {folder} is not an integer and does not contain a comma. Storing as is.")
                
                all_data_list.append(result_item)
                
        except Exception as e:
            print(f"Error processing folder {folder}: {e}")
            # Continue to next folder
    
    if all_data_list:
        try:
            df = pd.DataFrame(all_data_list)
            return df
        except Exception as e:
            print(f"Error creating DataFrame from all_data_list: {e}")
            return pd.DataFrame() # Return empty DataFrame on error
    else:
        print("No data collected from any experiment folder.")
        return pd.DataFrame()


def analyze_llm_evaluations(base_dir: str):
    """
    Analyze LLM evaluation data, create visualizations, and save summary tables.
    Validates parameters.
    
    Args:
        base_dir: Base directory containing experiment folders. Must be a non-empty string.
    """
    if not base_dir or not isinstance(base_dir, str):
        print("Error: base_dir must be a non-empty string.")
        return

    # Extract data from all experiments
    df = extract_all_experiments_data(base_dir)
    
    if df.empty:
        print("No data found to analyze in the specified directories.")
        return
    
    # Define output directory
    output_dir = os.path.join(base_dir, "analysis")
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory {output_dir}: {e}")
        return # Cannot proceed without output directory

    # Display basic information
    print(f"\nTotal records processed: {len(df)}")
    if 'model' in df.columns:
        print(f"Models evaluated: {df['model'].unique()}")
    else:
        print("Warning: 'model' column not found in DataFrame.")

    if 'task_complexity_val' in df.columns:
        df_numeric_complexity = df.dropna(subset=['task_complexity_val'])
        if not df_numeric_complexity.empty and pd.api.types.is_numeric_dtype(df_numeric_complexity['task_complexity_val']):
            min_comp = df_numeric_complexity['task_complexity_val'].min()
            max_comp = df_numeric_complexity['task_complexity_val'].max()
            print(f"Task complexity value range: {min_comp} to {max_comp}")
        else:
            print("No valid numeric task complexity values found for min/max range or column is not numeric.")
    else:
        print("Warning: 'task_complexity_val' column not found for range calculation.")
    
    # Extract and save amount of samples for each model
    if 'model' in df.columns:
        model_sample_counts = df.groupby('model').size().reset_index(name='number_of_samples')
        print("\nNumber of samples per model:")
        try:
            print(model_sample_counts.to_string())
            model_counts_csv_path = os.path.join(output_dir, "model_sample_counts.csv")
            model_sample_counts.to_csv(model_counts_csv_path, index=False)
            print(f"\nModel sample counts saved to {model_counts_csv_path}")
        except Exception as e:
            print(f"Error processing or saving model sample counts: {e}")
    else:
        print("Skipping model sample counts as 'model' column is missing.")

    # Save the detailed raw data to CSV
    # This llm_evaluation_data.csv contains all available columns from the DataFrame.
    csv_path = os.path.join(output_dir, "llm_evaluation_data.csv")
    try:
        df.to_csv(csv_path, index=False)
        print(f"Detailed raw data saved to {csv_path}")
    except Exception as e:
        print(f"Error saving detailed raw data to CSV: {e}")

    # --- Create visualizations (only if necessary columns exist) ---
    required_cols_for_plots = ['task_complexity_val', 'score_value', 'model', 
                               'depth_score', 'breadth_score', 'uniqueness_score']
    missing_cols = [col for col in required_cols_for_plots if col not in df.columns]

    if missing_cols:
        print(f"\nWarning: Skipping visualizations due to missing columns: {', '.join(missing_cols)}")
        print(f"Analysis (CSV exports) complete. Visualizations skipped. Output in {output_dir}")
        return

    # Ensure 'task_complexity_val' is numeric for plotting
    if not pd.api.types.is_numeric_dtype(df['task_complexity_val']):
        print("\nWarning: 'task_complexity_val' is not numeric. Skipping visualizations that require it as numeric.")
        print(f"Analysis (CSV exports) complete. Visualizations skipped. Output in {output_dir}")
        return

    component_columns = ["depth_score", "breadth_score", "uniqueness_score"]

    try:
        # 1. Overall score by model and task complexity value
        plt.figure(figsize=(12, 8))
        sns.boxplot(x="task_complexity_val", y="score_value", hue="model", data=df)
        plt.title("Overall Score by Model and Task Complexity Value")
        plt.xlabel("Task Complexity Value (N)")
        plt.ylabel("Score")
        plt.grid(linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "overall_score_by_model_complexity_value.png"))
        plt.close() # Close plot to free memory

        # 2. Component scores by model (aggregated over complexity values for this plot)
        # For this plot, we might want to average component scores per model, irrespective of task_complexity_val
        # or show it per model and task_complexity_val if that's more insightful.
        # Current component_long_df groups by model AND task_complexity_val which might be too granular for a simple bar plot by model.
        # Let's adjust to average by model only for this specific bar plot:
        component_avg_by_model_df = df.groupby("model")[component_columns].mean().reset_index()
        component_avg_by_model_long_df = pd.melt(
            component_avg_by_model_df,
            id_vars=["model"],
            value_vars=component_columns,
            var_name="Component",
            value_name="Average Success Rate"
        )
        plt.figure(figsize=(14, 8))
        sns.barplot(x="model", y="Average Success Rate", hue="Component", data=component_avg_by_model_long_df)
        plt.title("Average Component Success Rate by Model")
        plt.xlabel("Model")
        plt.ylabel("Average Success Rate (0-1)")
        plt.grid(linestyle='--', alpha=0.7)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "avg_component_scores_by_model.png"))
        plt.close()

        # 3. Success rate heatmap by task complexity value and model
        # Ensure task_complexity_val is suitable for pivot (e.g., not too many unique values)
        if df['task_complexity_val'].nunique() < 20 : # Arbitrary threshold for heatmap readability
            pivot_df = df.pivot_table(
                index="model", 
                columns="task_complexity_val",
                values="score_value",
                aggfunc="mean"
            )
            plt.figure(figsize=(12, 8))
            sns.heatmap(pivot_df, annot=True, cmap="YlGnBu", vmin=0, vmax=1, fmt=".2f")
            plt.title("Success Rate Heatmap by Model and Task Complexity Value")
            plt.xlabel("Task Complexity Value (N)")
            plt.ylabel("Model")
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, "success_rate_heatmap.png"))
            plt.close()
        else:
            print("Skipping heatmap generation as there are too many unique task_complexity_val values.")

        # 4. Performance by task complexity value for each model
        plt.figure(figsize=(12, 8))
        sns.lineplot(x="task_complexity_val", y="score_value", hue="model", data=df, marker="o", errorbar=('ci', 95))
        plt.title("Model Performance by Task Complexity Value")
        plt.xlabel("Task Complexity Value (N)")
        plt.ylabel("Average Score")
        plt.grid(linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "performance_by_complexity_value.png"))
        plt.close()
        
        # 5. Component success rates by task complexity value
        fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True) # sharey might be useful
        fig.suptitle("Component Success Rates by Task Complexity and Model", fontsize=16) # Add a main title
        
        for i, component in enumerate(component_columns):
            sns.lineplot(
                x="task_complexity_val", 
                y=component, 
                hue="model", 
                data=df, 
                marker="o",
                ax=axes[i],
                errorbar=('ci', 95)
            )
            axes[i].set_title(f"{component.replace('_score', '').capitalize()} Success Rate")
            axes[i].set_xlabel("Task Complexity Value (N)")
            axes[i].set_ylabel("Success Rate" if i == 0 else "") # Only show Y label on the first plot
            axes[i].grid(linestyle='--', alpha=0.7)
            axes[i].legend(title='Model') if i == len(component_columns) -1 else axes[i].legend().set_visible(False) # Show legend on last plot or adjust

        plt.tight_layout(rect=[0, 0, 1, 0.96]) # Adjust layout to make space for suptitle
        plt.savefig(os.path.join(output_dir, "component_success_by_complexity_value.png"))
        plt.close()
        
        print(f"\nAnalysis and visualizations complete. Output saved to {output_dir}")

    except Exception as e:
        print(f"An error occurred during visualization generation: {e}")


In [13]:
base_dir = r"C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp6_13.05"
    
# Run the analysis
analyze_llm_evaluations(base_dir)

Found 10 experiment folders
Processing C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy
Header path: C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy\header.json
Summaries path: C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy\summaries.json
Processing C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy.zip
Header path: C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy.zip\header.json
Summaries path: C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05\2025-05-11T15-32-20+02-00_nested-dirs-challenge_g9BVSLYE4CY5pQeMp8M6Gy.zip\summaries.json
Proc

In [3]:
# Debug - 
base_dir = r"C:\Users\Daniil Anisimov\git\spar_admission_evals\logs\exp5_11.05"
all_data_df = extract_all_experiments_data(base_dir)
print(all_data_df)

NameError: name 'extract_all_experiments_data' is not defined