In [None]:
# Google Drive Folder-Level PageRank Analysis with REAL WWW DATA
# Enhanced version with page movement tracking (up/down/neutral)
# Multi-folder version - processes multiple comparison folders
# Uses FineWeb dataset CSV with FROM and TO columns as WWW graph
# Modified: 20 boosting rounds × 20 bridgings each = 400 simulations

# === INSTALLATION CELL (Run first) ===
!pip install networkit pandas numpy tqdm

# === MOUNT GOOGLE DRIVE ===
# from google.colab import drive
# drive.mount('/content/drive')

# === MAIN CODE ===
import pandas as pd
import numpy as np
import random
import networkit as nk
import os
import gc
from pathlib import Path
from multiprocessing import Pool, cpu_count
import time
from tqdm.auto import tqdm

# ============================================
# USER CONFIGURATION
# ============================================
# Baseline edge list (CSV with FROM and TO columns). Every comparison file is
# measured against this graph.
BASELINE_PATH = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"

# Comparison folders to process. Each folder is reported as one strategy, and
# every *.csv file found directly inside it is compared against the baseline.
COMPARISON_FOLDERS = [
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/random_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/high_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/folder_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/mixed_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/",
]

# Path to FineWeb WWW graph CSV (also read through its FROM and TO columns)
FINEWEB_WWW_PATH = "/content/drive/MyDrive/WebKnoGraph/results/fineweb_500k_pages.csv"

# Boosting configuration: each comparison file is simulated TOTAL_SIMULATIONS
# times, reported as NUM_BOOSTING_ROUNDS rounds of BRIDGINGS_PER_ROUND each.
NUM_BOOSTING_ROUNDS = 20
BRIDGINGS_PER_ROUND = 20
TOTAL_SIMULATIONS = NUM_BOOSTING_ROUNDS * BRIDGINGS_PER_ROUND

# Threshold for neutral changes (percentage): a page whose PageRank delta lies
# within [-NEUTRAL_THRESHOLD, +NEUTRAL_THRESHOLD] is counted as neutral.
NEUTRAL_THRESHOLD = 0.01  # ±0.01% considered neutral

# Simulation Parameters
# NOTE(review): MIN_CONNECTIONS is not referenced by any code visible in this
# file - confirm whether it is still needed.
MIN_CONNECTIONS = 5
# Upper bound on the number of random WWW -> site bridging edges added to each
# merged graph.
MAX_CONNECTIONS = 50
# Passed as the `tol` argument of NetworKit's PageRank.
PAGERANK_TOLERANCE = 1e-3

# Parallelization settings
USE_PARALLEL = True
# Size of the multiprocessing pool; never fewer than 2 workers.
NUM_WORKERS = max(2, cpu_count())
# Number of simulation ids handed to a worker in a single task.
BATCH_SIZE = 10

# Output paths (file names are joined onto OUTPUT_DIR when results are saved)
OUTPUT_DIR = "/content/drive/MyDrive/WebKnoGraph/results/"
STRATEGY_SUMMARY_FILE = "strategy_comparison_results.csv"
DETAILED_RESULTS_FILE = "detailed_strategy_results.csv"

# ============================================
# CORE FUNCTIONS
# ============================================

# Cache for the WWW graph: None until first load, afterwards a
# (csv_path, graph, node_labels) tuple filled in by load_www_graph_networkit.
_www_graph_cache = None


def load_graph_from_csv_networkit(file_path, graph_name="graph"):
    """Build a directed, unweighted NetworKit graph from an edge-list CSV.

    Only the ``FROM`` and ``TO`` columns are read. Rows with a missing
    endpoint are dropped and the remaining endpoints are converted to strings.
    A node id is the position of its URL in the sorted array of unique URLs.

    Args:
        file_path: Path of the CSV file to read.
        graph_name: Label used in progress messages only.

    Returns:
        ``(graph, urls, url_to_idx)`` on success, where ``urls`` is the array
        of unique URLs and ``url_to_idx`` maps each URL to its node id.
        ``(None, None, None)`` when the file contains no usable edge or when
        any exception occurs while loading (the error is printed, not raised).
    """
    failure = (None, None, None)
    try:
        print(f"  Loading {graph_name} from {os.path.basename(file_path)}...")
        edge_table = pd.read_csv(file_path, usecols=["FROM", "TO"])
        edge_table = edge_table.dropna()
        for column in ("FROM", "TO"):
            edge_table[column] = edge_table[column].astype(str)

        edge_count = len(edge_table)
        if edge_count == 0:
            print(f"  ERROR: No valid edges found in {file_path}")
            return failure

        sources = edge_table["FROM"].values
        targets = edge_table["TO"].values
        all_urls = np.unique(np.concatenate([sources, targets]))
        url_to_idx = dict(zip(all_urls, range(len(all_urls))))

        g = nk.Graph(n=len(all_urls), weighted=False, directed=True)
        # Translate both endpoint columns to node ids lazily, edge by edge.
        id_of = url_to_idx.__getitem__
        for src_id, tgt_id in zip(map(id_of, sources), map(id_of, targets)):
            g.addEdge(src_id, tgt_id)

        print(f"    Loaded: {len(all_urls):,} nodes, {edge_count:,} edges")
        return g, all_urls, url_to_idx
    except Exception as e:
        print(f"  ERROR loading {file_path}: {str(e)}")
        return failure


def load_www_graph_networkit(www_csv_path):
    """Return the WWW graph and its node labels, reusing a module-level cache.

    The first successful load for a path stores a private copy of the graph in
    ``_www_graph_cache``. Later calls with the same path skip the CSV and hand
    back a fresh copy of the cached graph, so callers never share a graph
    object with the cache.

    Args:
        www_csv_path: Path of the FineWeb edge-list CSV.

    Returns:
        ``(graph, node_labels)``.

    Raises:
        ValueError: If the CSV could not be loaded into a graph.
    """
    global _www_graph_cache

    def _duplicate(graph):
        # Rebuild the graph edge by edge into a new directed, unweighted graph.
        clone = nk.Graph(n=graph.numberOfNodes(), weighted=False, directed=True)
        for u, v in graph.iterEdges():
            clone.addEdge(u, v)
        return clone

    cache_hit = _www_graph_cache is not None and _www_graph_cache[0] == www_csv_path
    if cache_hit:
        print("  Using cached WWW graph")
        return _duplicate(_www_graph_cache[1]), _www_graph_cache[2]

    print("\nLoading REAL WWW graph from FineWeb dataset...")
    loaded = load_graph_from_csv_networkit(www_csv_path, graph_name="WWW graph")
    www_graph, www_nodes = loaded[0], loaded[1]

    if www_graph is None:
        raise ValueError(f"Failed to load WWW graph from {www_csv_path}")

    _www_graph_cache = (www_csv_path, _duplicate(www_graph), www_nodes)

    print("  WWW graph cached successfully")
    return www_graph, www_nodes


def process_configuration_networkit(
    www_graph, www_nodes, kalicube_edges, kalicube_nodes, kalicube_url_mapping
):
    """Merge a site graph into the WWW graph and return the site's PageRank.

    The merged graph holds the WWW nodes first (ids unchanged) followed by the
    site nodes, shifted by the number of WWW nodes. Random "bridging" edges
    are then added from WWW nodes to site nodes: up to ``MAX_CONNECTIONS``
    nodes are drawn without replacement on each side and paired one to one,
    so the smaller sample decides how many bridges exist. Sampling uses the
    global NumPy random state.

    Args:
        www_graph: Directed NetworKit graph of the WWW sample (not modified).
        www_nodes: Unused by this function.
        kalicube_edges: Iterable of ``(src, tgt)`` site-local node id pairs;
            may be empty or ``None``.
        kalicube_nodes: Sequence of site URLs; position ``i`` is node ``i``.
        kalicube_url_mapping: Unused by this function.

    Returns:
        Dict mapping each site URL to its PageRank score in the merged graph.
    """
    www_count = www_graph.numberOfNodes()
    site_count = len(kalicube_nodes)
    site_offset = www_count  # site node i lives at id i + site_offset

    combined = nk.Graph(n=www_count, weighted=False, directed=True)
    for u, v in www_graph.iterEdges():
        combined.addEdge(u, v)

    for _ in range(site_count):
        combined.addNode()

    for src, tgt in kalicube_edges or ():
        combined.addEdge(src + site_offset, tgt + site_offset)

    # Draw order matters for reproducibility: WWW side first, then site side.
    bridge_sources = np.random.choice(
        www_count, size=min(MAX_CONNECTIONS, www_count), replace=False
    )
    bridge_targets = np.random.choice(
        site_count, size=min(MAX_CONNECTIONS, site_count), replace=False
    )
    for www_id, site_idx in zip(bridge_sources, bridge_targets):
        combined.addEdge(www_id, site_idx + site_offset)

    ranker = nk.centrality.PageRank(combined, damp=0.85, tol=PAGERANK_TOLERANCE)
    ranker.run()
    scores = ranker.scores()

    return {
        url: scores[position + site_offset]
        for position, url in enumerate(kalicube_nodes)
    }


def run_single_simulation_networkit(
    simulation_id,
    www_graph,
    www_nodes,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """Run one bridging simulation and summarise per-page PageRank changes.

    Both RNGs are seeded with ``42 + simulation_id``. PageRank is then computed
    for the old and the new site configuration, and the relative change (in
    percent) is measured for every URL present in both.

    NOTE(review): the seed is set once, so the second configuration continues
    the random stream and receives different bridging edges than the first -
    confirm this is intended.

    Returns:
        ``None`` when the configurations share no URL or no shared URL has a
        positive baseline score. Otherwise a dict with the mean/min/max delta
        in percent, the number of pages that moved up, down or stayed within
        ``±NEUTRAL_THRESHOLD``, and ``total_pages`` (count of shared URLs).
    """
    seed = 42 + simulation_id
    np.random.seed(seed)
    random.seed(seed)

    scores_before = process_configuration_networkit(
        www_graph,
        www_nodes,
        kalicube_old_edges,
        kalicube_nodes_old,
        kalicube_url_mapping_old,
    )
    scores_after = process_configuration_networkit(
        www_graph,
        www_nodes,
        kalicube_new_edges,
        kalicube_nodes_new,
        kalicube_url_mapping_new,
    )

    shared_urls = set(scores_before.keys()) & set(scores_after.keys())
    if not shared_urls:
        return None

    # Relative change per shared page; pages with a non-positive baseline
    # score are skipped because the ratio is undefined for them.
    deltas_pct = [
        ((scores_after[url] - scores_before[url]) / scores_before[url]) * 100
        for url in shared_urls
        if scores_before[url] > 0
    ]
    if not deltas_pct:
        return None

    pages_up = sum(1 for delta in deltas_pct if delta > NEUTRAL_THRESHOLD)
    pages_down = sum(1 for delta in deltas_pct if delta < -NEUTRAL_THRESHOLD)
    pages_neutral = len(deltas_pct) - pages_up - pages_down

    return {
        "mean_delta_pct": np.mean(deltas_pct),
        "min_delta_pct": np.min(deltas_pct),
        "max_delta_pct": np.max(deltas_pct),
        "pages_up": pages_up,
        "pages_down": pages_down,
        "pages_neutral": pages_neutral,
        "total_pages": len(shared_urls),
    }


def run_simulation_batch(args):
    """Worker entry point: run a batch of simulations on a rebuilt WWW graph.

    Args:
        args: 9-tuple of ``(sim_ids, www_graph_data, www_nodes, old_edges,
            new_edges, old_nodes, new_nodes, old_url_mapping,
            new_url_mapping)``. ``www_graph_data`` is a dict with ``n_nodes``
            and ``edges`` from which the WWW graph is reconstructed.

    Returns:
        List of result dicts, one per simulation that produced a result;
        simulations returning ``None`` are left out.
    """
    (
        sim_ids,
        graph_spec,
        www_nodes,
        old_edges,
        new_edges,
        old_nodes,
        new_nodes,
        old_url_mapping,
        new_url_mapping,
    ) = args

    # The graph arrives as plain data and is rebuilt once per batch.
    www_graph = nk.Graph(n=graph_spec["n_nodes"], weighted=False, directed=True)
    for u, v in graph_spec["edges"]:
        www_graph.addEdge(u, v)

    outcomes = (
        run_single_simulation_networkit(
            sim_id,
            www_graph,
            www_nodes,
            old_edges,
            new_edges,
            old_nodes,
            new_nodes,
            old_url_mapping,
            new_url_mapping,
        )
        for sim_id in sim_ids
    )
    return [outcome for outcome in outcomes if outcome is not None]


def _summarize_round(round_num, round_results):
    """Average the per-simulation metrics of one boosting round.

    Args:
        round_num: Zero-based round index (stored one-based in the summary).
        round_results: Non-empty list of per-simulation result dicts.

    Returns:
        Dict with the round number, the number of simulations it contains and
        the mean of each tracked metric.
    """
    return {
        "round": round_num + 1,
        "num_bridgings": len(round_results),
        "avg_mean_delta": np.mean([r["mean_delta_pct"] for r in round_results]),
        "avg_min_delta": np.mean([r["min_delta_pct"] for r in round_results]),
        "avg_max_delta": np.mean([r["max_delta_pct"] for r in round_results]),
        "avg_pages_up": np.mean([r["pages_up"] for r in round_results]),
        "avg_pages_down": np.mean([r["pages_down"] for r in round_results]),
        "avg_pages_neutral": np.mean([r["pages_neutral"] for r in round_results]),
    }


def analyze_csv_pair(www_graph, www_nodes, old_csv_path, new_csv_path):
    """Compare a baseline and a comparison site graph over many simulations.

    Runs ``NUM_BOOSTING_ROUNDS`` rounds of ``BRIDGINGS_PER_ROUND`` simulations
    (simulation id = ``round * BRIDGINGS_PER_ROUND + bridging``), either in a
    process pool or sequentially depending on ``USE_PARALLEL``. In both modes
    every result is tagged with the one-based ``round`` it belongs to, so the
    per-round summaries do not depend on worker scheduling.

    Args:
        www_graph: Directed NetworKit WWW graph.
        www_nodes: Node labels of the WWW graph (passed through to workers).
        old_csv_path: Edge-list CSV of the baseline site graph.
        new_csv_path: Edge-list CSV of the comparison site graph.

    Returns:
        ``None`` if either CSV fails to load or no simulation yields a result.
        Otherwise a dict with the file name, configuration, averaged metrics,
        elapsed time, per-round summaries (``boosting_round_results``) and the
        raw per-simulation results (``sim_results``).
    """
    print(f"\nAnalyzing: {os.path.basename(new_csv_path)}")
    start_time = time.time()

    kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (
        load_graph_from_csv_networkit(old_csv_path, "baseline Kalicube")
    )
    if kalicube_graph_old is None:
        print("  Failed to load old graph")
        return None

    kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (
        load_graph_from_csv_networkit(new_csv_path, "comparison Kalicube")
    )
    if kalicube_graph_new is None:
        print("  Failed to load new graph")
        return None

    # Only the edge lists are needed from here on; drop the graph objects.
    kalicube_old_edges = list(kalicube_graph_old.iterEdges())
    kalicube_new_edges = list(kalicube_graph_new.iterEdges())

    del kalicube_graph_old, kalicube_graph_new
    gc.collect()

    print(
        f"  Running {NUM_BOOSTING_ROUNDS} boosting rounds ({BRIDGINGS_PER_ROUND} bridgings each)..."
    )
    print(f"  Total simulations: {TOTAL_SIMULATIONS}")

    all_sim_results = []
    boosting_round_results = []

    if USE_PARALLEL:
        print(
            f"  Using parallel processing with {NUM_WORKERS} workers (batch size: {BATCH_SIZE})"
        )

        # Plain-data snapshot of the WWW graph for the worker processes. Built
        # only here because the sequential path uses the graph object directly.
        www_graph_data = {
            "n_nodes": www_graph.numberOfNodes(),
            "edges": list(www_graph.iterEdges()),
        }

        # Chunk every round on its own so that no batch straddles two rounds;
        # this lets each batch result be attributed to exactly one round.
        round_batches = []
        for round_num in range(NUM_BOOSTING_ROUNDS):
            first_id = round_num * BRIDGINGS_PER_ROUND
            round_ids = list(range(first_id, first_id + BRIDGINGS_PER_ROUND))
            for i in range(0, BRIDGINGS_PER_ROUND, BATCH_SIZE):
                round_batches.append((round_num, round_ids[i : i + BATCH_SIZE]))

        print(f"  Total batches to process: {len(round_batches)}")

        batch_args = [
            (
                sim_ids,
                www_graph_data,
                www_nodes,
                kalicube_old_edges,
                kalicube_new_edges,
                kalicube_nodes_old,
                kalicube_nodes_new,
                kalicube_url_mapping_old,
                kalicube_url_mapping_new,
            )
            for _, sim_ids in round_batches
        ]

        results_by_round = [[] for _ in range(NUM_BOOSTING_ROUNDS)]
        attempted_sims = 0

        with Pool(NUM_WORKERS) as pool:
            with tqdm(
                total=len(round_batches),
                desc="  Processing batches",
                unit="batch",
                ncols=100,
            ) as pbar:
                # Ordered imap keeps results aligned with round_batches, which
                # imap_unordered would not guarantee.
                ordered_results = pool.imap(run_simulation_batch, batch_args)
                for (round_num, sim_ids), batch_results in zip(
                    round_batches, ordered_results
                ):
                    for result in batch_results:
                        result["round"] = round_num + 1
                    results_by_round[round_num].extend(batch_results)
                    all_sim_results.extend(batch_results)
                    attempted_sims += len(sim_ids)
                    pbar.update(1)

                    # Rate/ETA are based on attempted simulations so skipped
                    # (None) results do not distort the estimate.
                    elapsed = time.time() - start_time
                    rate = attempted_sims / elapsed if elapsed > 0 else 0
                    pbar.set_postfix(
                        {
                            "sims": len(all_sim_results),
                            "rate": f"{rate:.1f}/s",
                            "ETA": f"{(TOTAL_SIMULATIONS - attempted_sims) / rate / 60:.1f}m"
                            if rate > 0
                            else "N/A",
                        }
                    )

        print(f"  Organizing results into {NUM_BOOSTING_ROUNDS} rounds...")
        for round_num, round_results in enumerate(results_by_round):
            if round_results:
                boosting_round_results.append(
                    _summarize_round(round_num, round_results)
                )

    else:
        print("  Using sequential processing")

        with tqdm(
            total=NUM_BOOSTING_ROUNDS, desc="  Boosting rounds", unit="round", ncols=100
        ) as pbar_rounds:
            for round_num in range(NUM_BOOSTING_ROUNDS):
                round_results = []

                with tqdm(
                    total=BRIDGINGS_PER_ROUND,
                    desc=f"    Round {round_num + 1}",
                    unit="bridging",
                    leave=False,
                    ncols=100,
                ) as pbar_bridgings:
                    for bridging_num in range(BRIDGINGS_PER_ROUND):
                        sim_id = round_num * BRIDGINGS_PER_ROUND + bridging_num

                        result = run_single_simulation_networkit(
                            sim_id,
                            www_graph,
                            www_nodes,
                            kalicube_old_edges,
                            kalicube_new_edges,
                            kalicube_nodes_old,
                            kalicube_nodes_new,
                            kalicube_url_mapping_old,
                            kalicube_url_mapping_new,
                        )
                        if result is not None:
                            result["round"] = round_num + 1
                            result["bridging"] = bridging_num + 1
                            round_results.append(result)
                            all_sim_results.append(result)

                        pbar_bridgings.update(1)

                        elapsed = time.time() - start_time
                        completed = len(all_sim_results)
                        rate = completed / elapsed if elapsed > 0 else 0
                        pbar_bridgings.set_postfix({"rate": f"{rate:.1f}/s"})

                if round_results:
                    boosting_round_results.append(
                        _summarize_round(round_num, round_results)
                    )

                pbar_rounds.update(1)

                elapsed = time.time() - start_time
                completed = len(all_sim_results)
                rate = completed / elapsed if elapsed > 0 else 0
                remaining = TOTAL_SIMULATIONS - completed
                eta = remaining / rate if rate > 0 else 0
                pbar_rounds.set_postfix(
                    {
                        "sims": completed,
                        "rate": f"{rate:.1f}/s",
                        "ETA": f"{eta / 60:.1f}m",
                    }
                )

    if not all_sim_results:
        print("  No valid results")
        return None

    # Calculate overall statistics including page movements
    avg_mean = np.mean([r["mean_delta_pct"] for r in all_sim_results])
    avg_min = np.mean([r["min_delta_pct"] for r in all_sim_results])
    avg_max = np.mean([r["max_delta_pct"] for r in all_sim_results])
    avg_pages_up = np.mean([r["pages_up"] for r in all_sim_results])
    avg_pages_down = np.mean([r["pages_down"] for r in all_sim_results])
    avg_pages_neutral = np.mean([r["pages_neutral"] for r in all_sim_results])

    elapsed_total = time.time() - start_time
    overall_rate = len(all_sim_results) / elapsed_total if elapsed_total > 0 else 0
    print(
        f"  ✓ Completed in {elapsed_total / 60:.1f} minutes "
        f"({overall_rate:.1f} bridgings/sec)"
    )

    return {
        "filename": os.path.basename(new_csv_path),
        "num_boosting_rounds": NUM_BOOSTING_ROUNDS,
        "bridgings_per_round": BRIDGINGS_PER_ROUND,
        "total_bridgings": len(all_sim_results),
        "avg_mean_delta_pct": avg_mean,
        "avg_min_delta_pct": avg_min,
        "avg_max_delta_pct": avg_max,
        "avg_pages_up": avg_pages_up,
        "avg_pages_down": avg_pages_down,
        "avg_pages_neutral": avg_pages_neutral,
        "elapsed_time_seconds": elapsed_total,
        "boosting_round_results": boosting_round_results,
        "sim_results": all_sim_results,
    }


def validate_results(result):
    """Check that a file result's averaged deltas satisfy max >= mean >= min.

    Args:
        result: Result dict with ``avg_max_delta_pct``, ``avg_mean_delta_pct``,
            ``avg_min_delta_pct`` and ``filename`` keys, or ``None``.

    Returns:
        ``True`` when the ordering holds. ``False`` for ``None`` or when the
        ordering is violated; in the latter case a warning is printed.
    """
    if result is None:
        return False

    upper = result["avg_max_delta_pct"]
    centre = result["avg_mean_delta_pct"]
    lower = result["avg_min_delta_pct"]

    if upper >= centre >= lower:
        return True

    print(f"  WARNING: Validation failed for {result['filename']}")
    print(f"     Max: {upper:.2f}%, Mean: {centre:.2f}%, Min: {lower:.2f}%")
    return False


def process_folder(www_graph, www_nodes, baseline_path, comparison_folder):
    """Analyze every CSV file of one comparison folder against the baseline.

    Each ``*.csv`` file directly inside ``comparison_folder`` (in sorted order)
    is passed to ``analyze_csv_pair``; results that fail ``validate_results``
    are discarded. Folder-level statistics are computed over the individual
    simulations of all accepted files.

    Args:
        www_graph: Directed NetworKit WWW graph.
        www_nodes: Node labels of the WWW graph.
        baseline_path: Edge-list CSV of the baseline site graph.
        comparison_folder: Directory holding the comparison CSV files.

    Returns:
        ``None`` if the path is not an existing directory, contains no CSV
        file, or no file produced a valid result. Otherwise a summary dict
        with folder metadata, overall statistics and the per-file results
        under ``file_results``.
    """
    folder_name = os.path.basename(comparison_folder.rstrip("/"))
    print("\n" + "=" * 70)
    print(f"PROCESSING FOLDER: {folder_name}")
    print("=" * 70)

    # isdir rather than exists: a regular file at this path would otherwise
    # pass the check and make os.listdir raise NotADirectoryError.
    if not os.path.isdir(comparison_folder):
        print(f"ERROR: Folder not found: {comparison_folder}")
        return None

    csv_files = sorted(f for f in os.listdir(comparison_folder) if f.endswith(".csv"))

    if not csv_files:
        print(f"ERROR: No CSV files found in {comparison_folder}")
        return None

    print(f"Found {len(csv_files)} CSV files")

    results = []
    all_simulation_results = []

    for new_csv_filename in csv_files:
        new_csv_path = os.path.join(comparison_folder, new_csv_filename)
        result = analyze_csv_pair(www_graph, www_nodes, baseline_path, new_csv_path)

        if result is not None and validate_results(result):
            results.append(result)
            all_simulation_results.extend(result["sim_results"])
            print("  ✓ Valid results obtained")

    if not results:
        print("No valid results obtained for this folder")
        return None

    # Calculate overall statistics; the mean-delta list is reused for the
    # mean, standard deviation and median below.
    mean_deltas = [r["mean_delta_pct"] for r in all_simulation_results]
    overall_mean = np.mean(mean_deltas)
    overall_min = np.mean([r["min_delta_pct"] for r in all_simulation_results])
    overall_max = np.mean([r["max_delta_pct"] for r in all_simulation_results])
    total_avg_pages_up = np.mean([r["pages_up"] for r in all_simulation_results])
    total_avg_pages_down = np.mean([r["pages_down"] for r in all_simulation_results])
    total_avg_pages_neutral = np.mean(
        [r["pages_neutral"] for r in all_simulation_results]
    )

    print("\n" + "-" * 70)
    print(f"FOLDER SUMMARY: {folder_name}")
    print("-" * 70)
    print(f"Files analyzed: {len(results)}/{len(csv_files)}")
    print(f"Total boosting rounds per file: {NUM_BOOSTING_ROUNDS}")
    print(f"Bridgings per round: {BRIDGINGS_PER_ROUND}")
    print(f"Total simulations: {len(all_simulation_results)}")
    print(f"Overall Average Mean Delta: {overall_mean:>10.2f}%")
    print(f"Overall Average Min Delta:  {overall_min:>10.2f}%")
    print(f"Overall Average Max Delta:  {overall_max:>10.2f}%")
    print(f"Average Pages Up:           {total_avg_pages_up:>10.1f}")
    print(f"Average Pages Down:         {total_avg_pages_down:>10.1f}")
    print(f"Average Pages Neutral:      {total_avg_pages_neutral:>10.1f}")

    return {
        "folder_name": folder_name,
        "folder_path": comparison_folder,
        "files_analyzed": len(results),
        "total_files": len(csv_files),
        "num_boosting_rounds": NUM_BOOSTING_ROUNDS,
        "bridgings_per_round": BRIDGINGS_PER_ROUND,
        "total_simulations": len(all_simulation_results),
        "overall_mean_delta_pct": overall_mean,
        "overall_min_delta_pct": overall_min,
        "overall_max_delta_pct": overall_max,
        "total_avg_pages_up": total_avg_pages_up,
        "total_avg_pages_down": total_avg_pages_down,
        "total_avg_pages_neutral": total_avg_pages_neutral,
        "std_mean": np.std(mean_deltas),
        "median_mean": np.median(mean_deltas),
        "file_results": results,
    }


# ============================================
# MAIN EXECUTION
# ============================================

if __name__ == "__main__":
    # Entry point: validate the inputs, load the WWW graph once, analyze every
    # comparison folder, print a summary table and write two CSV reports.
    print("=" * 70)
    print("MULTI-FOLDER PAGERANK ANALYSIS WITH PAGE MOVEMENT TRACKING")
    print(
        f"Configuration: {NUM_BOOSTING_ROUNDS} boosting rounds × {BRIDGINGS_PER_ROUND} bridgings"
    )
    print(f"Total simulations per comparison: {TOTAL_SIMULATIONS}")
    print(f"Neutral threshold: ±{NEUTRAL_THRESHOLD}%")
    if USE_PARALLEL:
        print(f"Parallel processing: ENABLED ({NUM_WORKERS} workers)")
    else:
        print("Parallel processing: DISABLED (sequential mode)")
    print("=" * 70)

    # Abort by raising SystemExit directly: the bare `exit` helper is meant for
    # interactive shells and is not a reliable way to stop a program.
    if not os.path.exists(BASELINE_PATH):
        print(f"\nERROR: Baseline file not found: {BASELINE_PATH}")
        raise SystemExit(1)

    if not os.path.exists(FINEWEB_WWW_PATH):
        print(f"\nERROR: FineWeb WWW file not found: {FINEWEB_WWW_PATH}")
        raise SystemExit(1)

    print(f"\nWWW Graph Source: {os.path.basename(FINEWEB_WWW_PATH)}")
    print(f"Baseline: {os.path.basename(BASELINE_PATH)}")
    print(f"Number of folders to process: {len(COMPARISON_FOLDERS)}")

    www_graph, www_nodes = load_www_graph_networkit(FINEWEB_WWW_PATH)

    # One summary per folder; folders that yield nothing are left out.
    folder_summaries = []

    for folder_path in COMPARISON_FOLDERS:
        folder_summary = process_folder(
            www_graph, www_nodes, BASELINE_PATH, folder_path
        )
        if folder_summary is not None:
            folder_summaries.append(folder_summary)

    # Print final summary table
    print("\n" + "=" * 70)
    print("FINAL SUMMARY - ALL STRATEGIES")
    print("=" * 70)

    if not folder_summaries:
        print("\nNo valid results obtained from any folder")
    else:
        print(
            f"\n{'Strategy':<40} {'Mean %':<10} {'Up':<8} {'Down':<8} {'Neutral':<10}"
        )
        print("-" * 90)

        for summary in folder_summaries:
            print(
                f"{summary['folder_name']:<40} "
                f"{summary['overall_mean_delta_pct']:>8.2f}% "
                f"{summary['total_avg_pages_up']:>6.1f} "
                f"{summary['total_avg_pages_down']:>6.1f} "
                f"{summary['total_avg_pages_neutral']:>8.1f}"
            )

        print("-" * 90)
        print(
            f"\nTotal strategies analyzed: {len(folder_summaries)}/{len(COMPARISON_FOLDERS)}"
        )

        # Make sure the report directory exists before writing into it.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # Save strategy summary CSV (one row per folder)
        summary_output_path = os.path.join(OUTPUT_DIR, STRATEGY_SUMMARY_FILE)
        summary_df = pd.DataFrame(
            [
                {
                    "strategy": s["folder_name"],
                    "mean_delta_pct": s["overall_mean_delta_pct"],
                    "min_delta_pct": s["overall_min_delta_pct"],
                    "max_delta_pct": s["overall_max_delta_pct"],
                    "total_avg_pages_up": s["total_avg_pages_up"],
                    "total_avg_pages_down": s["total_avg_pages_down"],
                    "total_avg_pages_neutral": s["total_avg_pages_neutral"],
                    "files_analyzed": s["files_analyzed"],
                    "num_boosting_rounds": s["num_boosting_rounds"],
                    "bridgings_per_round": s["bridgings_per_round"],
                    "total_simulations": s["total_simulations"],
                    "std_mean": s["std_mean"],
                    "median_mean": s["median_mean"],
                }
                for s in folder_summaries
            ]
        )

        summary_df.to_csv(summary_output_path, index=False)
        print(f"\n✓ Strategy summary saved to: {summary_output_path}")

        # Save detailed per-file results CSV (one row per comparison file)
        detailed_records = [
            {
                "strategy": folder_summary["folder_name"],
                "filename": file_result["filename"],
                "total_bridgings": file_result["total_bridgings"],
                "avg_mean_delta_pct": file_result["avg_mean_delta_pct"],
                "avg_min_delta_pct": file_result["avg_min_delta_pct"],
                "avg_max_delta_pct": file_result["avg_max_delta_pct"],
                "avg_pages_up": file_result["avg_pages_up"],
                "avg_pages_down": file_result["avg_pages_down"],
                "avg_pages_neutral": file_result["avg_pages_neutral"],
                "elapsed_time_seconds": file_result["elapsed_time_seconds"],
            }
            for folder_summary in folder_summaries
            for file_result in folder_summary["file_results"]
        ]

        if detailed_records:
            detailed_output_path = os.path.join(OUTPUT_DIR, DETAILED_RESULTS_FILE)
            detailed_df = pd.DataFrame(detailed_records)
            detailed_df.to_csv(detailed_output_path, index=False)
            print(f"✓ Detailed results saved to: {detailed_output_path}")

    # Cleanup: drop the cached WWW graph so its memory can be reclaimed.
    _www_graph_cache = None
    gc.collect()

    print("\n" + "=" * 70)
    print("Analysis complete!")
    print("=" * 70)