# Table of Contents

- [Introduction](#table-of-contents)
- [Environment](#raw-data-import)
- [Projects](#dn-project-details)
- [Runs](#dn-runs-per-project)
- [Metrics](#metrics)
- [Tasks](#tasks)
- [Traces](#traces)

# Raw data import

### Set DN variables

In [None]:
import os

from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable *name*; raise RuntimeError if it is unset."""
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"{name} not set")
    return value


# All three settings are mandatory; the first missing one aborts the cell.
DREADNODE_API_KEY = _require_env("DREADNODE_API_KEY")
DREADNODE_API_TOKEN = _require_env("DREADNODE_API_TOKEN")
DREADNODE_SERVER_URL = _require_env("DREADNODE_SERVER_URL")

# Project whose runs are analysed in the rest of the notebook.
PROJECT = "AIRTBench"

### DN project details

In [None]:
import dreadnode

# Authenticate the SDK with the token that the environment cell already validated.
dreadnode.configure(token=DREADNODE_API_TOKEN)

# The API client is the entry point for the project and run queries below.
api = dreadnode.api()
projects = api.list_projects()

print(f"Found {len(projects)} projects")

### DN runs per-project

In [None]:
# Fetch the project named by PROJECT and all of its runs.
# `project` and `runs` are notebook-level variables read by later cells
# (run listing, status filtering, metrics export).
project = api.get_project(PROJECT)
runs = api.list_runs(PROJECT)

print(f"Total runs found: {len(runs)}")

In [None]:
from collections import Counter

# Map of project key -> list of runs fetched for that project.
project_runs = {}

# NOTE: the loop variables are deliberately NOT named `project` / `runs`.
# Those notebook-level names hold the PROJECT-specific objects fetched in the
# previous cell and are consumed by the filtering and export cells below;
# rebinding them here would silently swap in the last project iterated.
for proj in projects:
    print(f"Fetching runs for project: {proj.name} ({proj.key})")
    # Same client entry point as the rest of the notebook (`api.list_runs`).
    proj_runs = api.list_runs(proj.key)
    project_runs[proj.key] = proj_runs
    print(f"  Found {len(proj_runs)} runs")

print("\n# Project Runs Summary\n")
for proj in projects:
    proj_runs = project_runs[proj.key]

    print(f"## Project: {proj.name}")
    print(f"- **ID**: {proj.id}")
    print(f"- **Key**: {proj.key}")
    print(f"- **Total Runs**: {len(proj_runs)}")

    if proj_runs:
        print("\n### Run Status Summary:")
        # Counter keeps first-seen order, so statuses print in encounter order.
        status_counts = Counter(r.status for r in proj_runs)
        for status, count in status_counts.items():
            print(f"- **{status}**: {count} runs")

        # First five entries of the list as returned by the API
        # (presumably newest first — confirm against the SDK).
        print("\n### Latest 5 Runs:")
        for position, recent_run in enumerate(proj_runs[:5], start=1):
            print(
                f"- Run {position}: {recent_run.id} ({recent_run.status}) - {recent_run.timestamp}"
            )

    print("\n---\n")

# Filter the runs

In [None]:
# One line per run: id, status, timestamp and duration.
for listed_run in runs:
    print(listed_run.id, listed_run.status, listed_run.timestamp, listed_run.duration)

print(f"\nFound a total of {len(runs)} runs for project {project.name} ({project.key})")

#### Filter out failed runs (keep only completed runs)

In [None]:
# Keep only runs whose status is "completed"; every other status is dropped.
runs = [candidate for candidate in runs if candidate.status == "completed"]

#### **Optional**: Filter by run timestamps

In [None]:
# Requires `from datetime import datetime` before uncommenting.
# runs = [run for run in runs if run.timestamp > datetime(2025, 2, 28, 6, 20)]
# print(len(runs))

#### **Optional**: Filter runs by model

In [None]:
# runs = [run for run in runs if run.params.get("model") == "claude-3-7-sonnet-20250219"]

# Inspect the data

# Metrics

`dn.log_metrics` is a function that returns a dataframe with the metrics of the runs. SDK reference [here](https://vscode.dev/github/dreadnode/callisto/blob/ads/eng-1705-initial-notebook-data-analysis-for-callistodnode/__init__.py#L15)

In [None]:
# Imports live at cell level rather than inside `if runs:` because later cells
# use `pd` unconditionally: the metrics DataFrame builder calls `pd.DataFrame()`
# precisely when there are no runs.
from datetime import datetime

import pandas as pd

# Show the metrics and parameters of the most recent run.
if runs:
    latest_run = runs[0]  # assumes the API returns runs newest-first — TODO confirm

    print(f"Run ID: {latest_run.id}")
    print(f"Started: {latest_run.timestamp}")
    print(f"Status: {latest_run.status}")
    print(f"Duration: {latest_run.duration}")
    print("\nAvailable metrics:")

    # One summary row per metric that has at least one data point.
    metrics_data = []
    for metric_name, metric_points in latest_run.metrics.items():
        print(f"- {metric_name}: {len(metric_points)} data points")

        if metric_points:
            # Only the last recorded point of each series is tabulated.
            last_point = metric_points[-1]
            metrics_data.append(
                {
                    "Metric": metric_name,
                    "Last Value": last_point.value,
                    "Step": last_point.step,
                    "Timestamp": last_point.timestamp,
                }
            )

    if metrics_data:
        metrics_df = pd.DataFrame(metrics_data)
        display(metrics_df)
    else:
        print("No metric data points available")

    # Run parameters as a two-column (Parameter, Value) table.
    print("\nRun Parameters:")
    params_df = pd.DataFrame([latest_run.params]).T.reset_index()
    params_df.columns = ["Parameter", "Value"]
    display(params_df)
else:
    print("No runs found")

## Create and export a dataframe with the metrics of the runs

In [None]:
def create_metrics_dataframe_for_current_project(runs, project_key):
    """
    Flatten the metrics and scores of the given runs into one long-format DataFrame.

    Every metric data point and every score becomes one row that carries the
    run's metadata, its parameters (as ``param_<name>`` columns) and a
    ``data_type`` of ``"metric"`` or ``"score"``. Score attributes are added as
    ``score_<name>`` columns. Metric rows of all runs come first, followed by
    the score rows of all runs.

    Args:
        runs: Iterable of run objects, already filtered by the caller.
        project_key: Value stored in the ``project_key`` column of every row.

    Returns:
        pandas.DataFrame: One row per metric point / score, plus the derived
        columns ``time_from_start`` (seconds between the run timestamp and the
        metric timestamp) and ``is_flag``. A column-less empty DataFrame when
        the runs yield no rows.
    """
    metric_rows = []
    score_rows = []

    for run in runs:
        # Columns shared by every row of this run: metadata first, then parameters.
        row_base = {
            "project_key": project_key,
            "run_id": str(run.id),
            "run_name": run.name,
            "timestamp": run.timestamp,
            "status": run.status,
            "duration": run.duration,
            "tags": ", ".join(run.tags) if run.tags else "",
        }
        row_base.update({f"param_{key}": val for key, val in run.params.items()})

        # One row per recorded point of every metric series.
        metric_rows.extend(
            {
                **row_base,
                "data_type": "metric",
                "metric_name": series_name,
                "value": point.value,
                "step": point.step,
                "metric_timestamp": point.timestamp,
            }
            for series_name, points in run.metrics.items()
            for point in points
        )

        # Scores are optional on a run object; a missing or empty attribute means none.
        for score in getattr(run, "scores", None) or ():
            score_row = {
                **row_base,
                "data_type": "score",
                "metric_name": score.name,
                "value": score.value,
                "step": 0,  # scores carry no step of their own
                "metric_timestamp": run.timestamp,  # stamped with the run's timestamp
            }
            extra = getattr(score, "attributes", None) or {}
            score_row.update({f"score_{key}": val for key, val in extra.items()})
            score_rows.append(score_row)

    rows = metric_rows + score_rows
    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)

    # Every row carries "timestamp", "metric_timestamp" and "metric_name", so
    # the derived columns can be computed unconditionally.
    elapsed = df["metric_timestamp"] - df["timestamp"]
    df["time_from_start"] = elapsed.dt.total_seconds()
    df["is_flag"] = df["metric_name"].isin(["found_flag", "flag_found"])

    return df

# Build the metrics table from the already filtered runs (completed status only,
# if the filter cell above was executed).
project_metrics_df = create_metrics_dataframe_for_current_project(runs, PROJECT)

# `datasets_dir` may not have been set by an earlier cell; fall back to a local
# "datasets" folder in that case, and make sure the target directory exists
# before writing into it.
datasets_dir = globals().get("datasets_dir", "datasets")
os.makedirs(datasets_dir, exist_ok=True)

# Export to CSV and Parquet with the project name in the filename
csv_path = os.path.join(datasets_dir, f"{PROJECT}_metrics.csv")
parquet_path = os.path.join(datasets_dir, f"{PROJECT}_metrics.parquet")

project_metrics_df.to_csv(csv_path, index=False)
project_metrics_df.to_parquet(parquet_path, index=False)

print(f"Created DataFrame with {len(project_metrics_df)} data points from {len(runs)} filtered runs")
print(f"Files saved to: {datasets_dir}")
if not project_metrics_df.empty:
    print(f"Unique metrics: {project_metrics_df['metric_name'].unique()}")
    print(f"Data types: {project_metrics_df['data_type'].unique()}")
    print(f"Time span: {project_metrics_df['timestamp'].min()} to {project_metrics_df['timestamp'].max()}")

    # Show flag-related data if available; `is_flag` is a boolean column, so it
    # can be used directly as the row mask.
    flag_data = project_metrics_df[project_metrics_df["is_flag"]]
    if not flag_data.empty:
        print(f"\nFound {len(flag_data)} flag events across {flag_data['run_id'].nunique()} runs")
else:
    print("No data found in the filtered runs")

### Deep dive into the metrics dataframe flags

Based on the code in the strikes SDK `main.py`, it appears that while the flag values should theoretically be in the scores' attributes, they're truncated or only partially stored:

```python
flag_score = dn.Score(
    name="flag_found",
    value=1.0,
    attributes={
        "challenge_id": challenge.id,
        "flag": match[:10] + "...",  # Only first 10 chars are stored
    },
)
```

So while our dataframe is capturing everything available from the API, the full flag values aren't included in the data returned from the API itself.

In [None]:
print("\n### In-depth Flag Analysis")

if not project_metrics_df.empty and "is_flag" in project_metrics_df.columns:
    # Rows flagged by the `is_flag` column (a boolean mask, used directly).
    flag_metrics = project_metrics_df[project_metrics_df["is_flag"]]
    flag_runs = flag_metrics["run_id"].unique()

    if len(flag_runs) > 0:
        print(f"Found {len(flag_runs)} runs with flags")

        # `param_challenge` / `param_model` exist only when at least one run
        # logged a `challenge` / `model` parameter, so check once up front and
        # guard every use below instead of failing with a KeyError.
        has_challenge = "param_challenge" in flag_metrics.columns
        has_model = "param_model" in flag_metrics.columns

        # Per (run, metric) summary: total flag value, highest step, and the
        # first seen challenge/model/tags.
        agg_spec = {"value": "sum", "step": "max"}
        if has_challenge:
            agg_spec["param_challenge"] = "first"
        if has_model:
            agg_spec["param_model"] = "first"
        agg_spec["tags"] = "first"
        flag_summary = (
            flag_metrics.groupby(["run_id", "metric_name"]).agg(agg_spec).reset_index()
        )

        # Show flag distribution by challenge
        if has_challenge:
            challenge_counts = (
                flag_summary.groupby("param_challenge")["value"]
                .sum()
                .sort_values(ascending=False)
            )
            print("\nFlags found by challenge:")
            print(challenge_counts)

        # Show flag distribution by model
        if has_model:
            model_counts = (
                flag_summary.groupby("param_model")["value"]
                .sum()
                .sort_values(ascending=False)
            )
            print("\nFlags found by model:")
            print(model_counts)

        # Get the top 5 runs with the most flags
        top_flag_runs = (
            flag_summary.groupby("run_id")["value"]
            .sum()
            .sort_values(ascending=False)
            .head(5)
        )
        print("\nTop 5 runs with most flags:")
        print(top_flag_runs)

        # Show sample flag data for analysis (only the columns that exist)
        print("\nSample flag events (first 10):")
        sample_columns = [
            column
            for column in ["run_id", "metric_name", "value", "param_challenge", "timestamp"]
            if column in flag_metrics.columns
        ]
        display(flag_metrics[sample_columns].head(10))

        # List the distinct tags attached to runs that captured a flag.
        # Runs without tags contribute an empty string, which is skipped.
        print("\nTags from flag-successful runs:")
        unique_tags = flag_summary["tags"].str.split(", ").explode().dropna().unique()
        tag_list = [tag for tag in unique_tags if tag]
        for tag in tag_list[:20]:  # Show up to 20 unique tags
            print(f"- {tag}")

        # Flags found per challenge relative to the number of distinct runs
        # that attempted that challenge.
        print("\nFlag success rate by challenge:")
        if has_challenge:
            challenge_success = flag_metrics.groupby("param_challenge")["value"].sum()
            challenge_attempts = project_metrics_df.groupby("param_challenge")["run_id"].nunique()

            # Challenges with attempts but no flags get NaN from the index
            # alignment; fillna(0) turns that into zero flags found.
            challenge_stats = pd.DataFrame(
                {
                    "flags_found": challenge_success,
                    "run_attempts": challenge_attempts,
                }
            ).fillna(0)
            challenge_stats["success_rate"] = (
                challenge_stats["flags_found"] / challenge_stats["run_attempts"]
            )
            challenge_stats = challenge_stats.sort_values("success_rate", ascending=False)

            display(challenge_stats)
        else:
            print("No `challenge` parameter recorded for these runs.")
    else:
        print("No runs with flags found.")
else:
    print("No flag information available in the DataFrame.")