In [None]:
# OPTIMIZED MULTI-RANGE PAGERANK ANALYSIS - EXPERT LED VERSION
# Performance improvements:
# - Binary graph caching (50-100x faster loading)
# - Worker pool with pre-loaded graphs (100x faster initialization)
# - Memory-efficient numpy arrays
# - Checkpoint recovery system
# - Enhanced error handling
# Expected time: 50-60 hours on full expert run (500 simulations per config)

# === INSTALLATION CELL ===
!pip install networkit pandas numpy tqdm matplotlib seaborn scipy psutil

# === MOUNT GOOGLE DRIVE ===
from google.colab import drive

drive.mount("/content/drive")

# === IMPORTS ===
import pandas as pd
import numpy as np
import random
import networkit as nk
import os
import gc
import pickle
import psutil
from pathlib import Path
from multiprocessing import Pool, cpu_count
import time
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde, t
import traceback
import shutil

sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 100

# ============================================
# USER CONFIGURATION - EXPERT LED ANALYSIS
# ============================================

# Edge-list CSV for the baseline (pre-change) site graph.
# NOTE(review): presumably passed as the "old" CSV to analyze_csv_pair, whose
# loader reads the FROM and TO columns - confirm against the driver code.
BASELINE_PATH = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"

# 5 Expert-Led Strategies
# Each entry is a (folder_path, strategy_name) tuple; the path verification
# step below checks that every folder exists and holds at least one .csv file.
COMPARISON_FOLDERS = [
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/high_batches/",
        "High Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/",
        "Low Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/mixed_batches/",
        "Mixed Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/random_batches/",
        "Random Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/folder_batches/",
        "Folder Candidates",
    ),
]

# Edge-list CSV of the background "WWW" graph.
FINEWEB_WWW_PATH = "/content/drive/MyDrive/WebKnoGraph/results/fineweb_500k_pages.csv"

# 3 CONNECTION RANGES
# NOTE(review): entries look like (min_connections, max_connections, range_name);
# the unpacking happens outside this section - confirm.
CONNECTION_RANGES = [
    (5, 35, "Range_5-35"),
    (35, 65, "Range_35-65"),
    (65, 95, "Range_65-95"),
]

# Simulation parameters - EXPERT LED (500 total simulations)
EXPERT_BOOSTING_ROUNDS = 20
EXPERT_BRIDGINGS_PER_ROUND = 25
# 20 * 25 = 500; analyze_csv_pair runs this many simulations per CSV pair.
TOTAL_SIMULATIONS = EXPERT_BOOSTING_ROUNDS * EXPERT_BRIDGINGS_PER_ROUND

# SEO parameters
# A page whose PageRank change (in percent) lies within +/- this threshold is
# counted as "neutral" rather than up or down.
NEUTRAL_THRESHOLD = 0.025
# Passed to nk.centrality.PageRank as `tol` and `damp` respectively.
PAGERANK_TOLERANCE = 1e-6
DAMPING_FACTOR = 0.80

# Parallelization
USE_PARALLEL = True
# cpu_count() - 2, clamped to the interval [4, 16]. Note the lower clamp means
# 4 workers are started even on machines with fewer than 6 cores.
NUM_WORKERS = max(4, min(cpu_count() - 2, 16))
# Number of simulation ids handed to a worker per task.
BATCH_SIZE = 15

# Output - EXPERT LED
OUTPUT_DIR = (
    "/content/drive/MyDrive/WebKnoGraph/results/expert_led/real_www_results_expert/"
)
# Alias of OUTPUT_DIR; both names refer to the same directory.
MULTI_RANGE_OUTPUT_DIR = OUTPUT_DIR
# Default locations used by GraphCache and CheckpointManager.
CACHE_DIR = os.path.join(OUTPUT_DIR, "graph_cache/")
CHECKPOINT_DIR = os.path.join(OUTPUT_DIR, "checkpoints/")

# Create all output directories up front (no error if they already exist).
os.makedirs(MULTI_RANGE_OUTPUT_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# ============================================
# PATH VERIFICATION
# ============================================

# Report whether every input path exists before any heavy work starts.
# This step only prints a verdict; it does not stop execution on failure.
print("=" * 70)
print("PATH VERIFICATION")
print("=" * 70)

# (label, path) pairs to check: the two input CSVs first ...
paths_to_check = [
    ("Baseline", BASELINE_PATH),
    ("WWW Graph", FINEWEB_WWW_PATH),
]

# ... then one entry per strategy folder, labelled with the strategy name.
for folder_path, strategy_name in COMPARISON_FOLDERS:
    paths_to_check.append((strategy_name, folder_path))

all_valid = True
for name, path in paths_to_check:
    exists = os.path.exists(path)
    status = "‚úì" if exists else "‚úó"
    print(f"{status} {name}: {path}")

    if not exists:
        all_valid = False
    elif os.path.isdir(path):
        # Directories must contain at least one .csv file to count as valid.
        csv_count = len([f for f in os.listdir(path) if f.endswith(".csv")])
        print(f"  ‚îî‚îÄ Contains {csv_count} CSV files")
        if csv_count == 0:
            print(f"  ‚ö†Ô∏è  WARNING: No CSV files found!")
            all_valid = False

if not all_valid:
    print("\n‚ùå VALIDATION FAILED: Some paths are invalid or empty")
    print("Please verify your paths before continuing.")
else:
    print("\n‚úì All paths validated successfully!")

# Echo the output locations created in the configuration section.
print(f"\nOutput directories:")
print(f"  Main: {OUTPUT_DIR}")
print(f"  Cache: {CACHE_DIR}")
print(f"  Checkpoints: {CHECKPOINT_DIR}")
print("=" * 70)

# ============================================
# GRAPH CACHING SYSTEM
# ============================================


class GraphCache:
    """Directory-backed store for NetworKit graphs and pickled side-car data.

    A graph with id ``X`` is stored as ``X.nkb`` (NetworKit binary format) and
    its metadata as ``X_meta.pkl``. Every operation is best-effort: failures
    are reported with a printed warning and never raised to the caller.
    """

    def __init__(self, cache_dir=CACHE_DIR):
        """Remember *cache_dir* and make sure it exists."""
        os.makedirs(cache_dir, exist_ok=True)
        self.cache_dir = cache_dir

    def _get_cache_path(self, graph_id):
        """Return the path of the binary graph file for *graph_id*."""
        filename = f"{graph_id}.nkb"
        return os.path.join(self.cache_dir, filename)

    def _get_meta_path(self, graph_id):
        """Return the path of the pickled metadata file for *graph_id*."""
        filename = f"{graph_id}_meta.pkl"
        return os.path.join(self.cache_dir, filename)

    def save_graph(self, graph, graph_id):
        """Write *graph* to the cache.

        Returns:
            The path written, or ``None`` if writing failed.
        """
        try:
            target = self._get_cache_path(graph_id)
            nk.writeGraph(graph, target, nk.Format.NetworkitBinary)
            print(f"  ‚úì Cached graph: {graph_id} ({graph.numberOfNodes():,} nodes)")
        except Exception as e:
            print(f"  WARNING: Cache save failed for {graph_id}: {e}")
            return None
        return target

    def load_graph(self, graph_id):
        """Read a cached graph.

        Returns:
            The graph, or ``None`` when no cache file exists or reading failed.
        """
        try:
            source = self._get_cache_path(graph_id)
            if not os.path.exists(source):
                return None
            return nk.readGraph(source, nk.Format.NetworkitBinary)
        except Exception as e:
            print(f"  WARNING: Cache load failed for {graph_id}: {e}")
            return None

    def save_metadata(self, graph_id, metadata):
        """Pickle *metadata* next to the graph file for *graph_id*."""
        try:
            with open(self._get_meta_path(graph_id), "wb") as sink:
                pickle.dump(metadata, sink)
        except Exception as e:
            print(f"  WARNING: Metadata save failed: {e}")

    def load_metadata(self, graph_id):
        """Unpickle the metadata for *graph_id*.

        Returns:
            The stored object, or ``None`` when the file is missing or
            unreadable.
        """
        # NOTE: pickle executes code on load - only point this cache at
        # directories whose contents you trust.
        try:
            source = self._get_meta_path(graph_id)
            if not os.path.exists(source):
                return None
            with open(source, "rb") as stream:
                return pickle.load(stream)
        except Exception as e:
            print(f"  WARNING: Metadata load failed: {e}")
            return None

    def clear_cache(self):
        """Delete the whole cache directory and recreate it empty."""
        try:
            shutil.rmtree(self.cache_dir)
            os.makedirs(self.cache_dir, exist_ok=True)
        except Exception as e:
            print(f"  WARNING: Cache clear failed: {e}")
        else:
            print("  ‚úì Cache cleared")


# ============================================
# CHECKPOINT SYSTEM
# ============================================


class CheckpointManager:
    """Persist per-(range, strategy) results as pickles for crash recovery.

    Each checkpoint lives at ``<checkpoint_dir>/<range>_<strategy>.pkl`` where
    spaces in the strategy name are replaced by underscores. Saving and
    loading are best-effort: failures print a warning instead of raising.
    """

    def __init__(self, checkpoint_dir=CHECKPOINT_DIR):
        """Remember *checkpoint_dir* and make sure it exists."""
        self.checkpoint_dir = checkpoint_dir
        os.makedirs(checkpoint_dir, exist_ok=True)

    def _checkpoint_location(self, range_name, strategy_name):
        """Return ``(checkpoint_id, checkpoint_path)`` for a range/strategy."""
        checkpoint_id = f"{range_name}_{strategy_name.replace(' ', '_')}"
        checkpoint_path = os.path.join(self.checkpoint_dir, f"{checkpoint_id}.pkl")
        return checkpoint_id, checkpoint_path

    def save_checkpoint(self, range_name, strategy_name, results):
        """Save *results* as the checkpoint for this range/strategy.

        The pickle is written to a temporary sibling file and then moved into
        place with ``os.replace``, so an interruption during the write cannot
        leave a truncated file at the real checkpoint path (which would
        otherwise clobber a previously good checkpoint).
        """
        checkpoint_id, checkpoint_path = self._checkpoint_location(
            range_name, strategy_name
        )
        # ".pkl.tmp" does not end in ".pkl", so list_checkpoints ignores
        # leftovers from an interrupted save.
        tmp_path = f"{checkpoint_path}.tmp"

        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(results, f)
            os.replace(tmp_path, checkpoint_path)
            print(f"  ‚úì Checkpoint saved: {checkpoint_id}")
        except Exception as e:
            print(f"  WARNING: Checkpoint save failed: {e}")
            # Best-effort cleanup of the partial temp file.
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    def load_checkpoint(self, range_name, strategy_name):
        """Return the stored checkpoint, or ``None`` if missing or unreadable."""
        _, checkpoint_path = self._checkpoint_location(range_name, strategy_name)

        if os.path.exists(checkpoint_path):
            # NOTE: pickle executes code on load - only use checkpoint
            # directories whose contents you trust.
            try:
                with open(checkpoint_path, "rb") as f:
                    return pickle.load(f)
            except Exception as e:
                print(f"  WARNING: Checkpoint load failed: {e}")
        return None

    def list_checkpoints(self):
        """Return the file names of all ``.pkl`` checkpoints in the directory."""
        return [f for f in os.listdir(self.checkpoint_dir) if f.endswith(".pkl")]

    def clear_checkpoints(self):
        """Delete the checkpoint directory and recreate it empty."""
        try:
            shutil.rmtree(self.checkpoint_dir)
            os.makedirs(self.checkpoint_dir, exist_ok=True)
            print("  ‚úì Checkpoints cleared")
        except Exception as e:
            print(f"  WARNING: Checkpoint clear failed: {e}")


# ============================================
# MEMORY MONITORING
# ============================================


def log_memory_usage(label=""):
    """Print and return the resident set size of this process in GiB.

    Args:
        label: Free text inserted after the word "Memory" in the log line.

    Returns:
        The RSS of the current process divided by 1024**3.
    """
    rss_bytes = psutil.Process().memory_info().rss
    memory_gb = rss_bytes / 1024**3
    print(f"  üíæ Memory {label}: {memory_gb:.2f} GB")
    return memory_gb


# ============================================
# CORE FUNCTIONS (OPTIMIZED)
# ============================================


def load_graph_from_csv_networkit(file_path, graph_name="graph"):
    """Build a directed, unweighted NetworKit graph from an edge-list CSV.

    Only the ``FROM`` and ``TO`` columns are read. Rows with a missing value
    in either column are dropped and the remaining values are treated as
    strings. Node ids are the positions of the URLs in the sorted array of
    unique URLs.

    Args:
        file_path: Path of the CSV file.
        graph_name: Label used in progress messages only.

    Returns:
        ``(graph, all_urls, url_to_idx)`` on success, where ``all_urls`` is
        the sorted unique URL array and ``url_to_idx`` maps URL -> node id.
        ``(None, None, None)`` if the file has no usable edges or any error
        occurs (the error and its traceback are printed).
    """
    failure = (None, None, None)
    try:
        print(f"  Loading {graph_name} from {os.path.basename(file_path)}...")
        # usecols restricts the frame to exactly FROM and TO, so a frame-wide
        # string cast covers the same two columns.
        frame = pd.read_csv(file_path, usecols=["FROM", "TO"]).dropna().astype(str)

        if frame.empty:
            print(f"  ERROR: No valid edges found in {file_path}")
            return failure

        sources = frame["FROM"].values
        targets = frame["TO"].values
        unique_urls = np.unique(np.concatenate([sources, targets]))
        index_of = dict(zip(unique_urls, range(len(unique_urls))))

        graph = nk.Graph(n=len(unique_urls), weighted=False, directed=True)
        for source, target in zip(sources, targets):
            graph.addEdge(index_of[source], index_of[target])

        print(f"    Loaded: {len(unique_urls):,} nodes, {len(frame):,} edges")
        return graph, unique_urls, index_of
    except Exception as e:
        print(f"  ERROR loading {file_path}: {str(e)}")
        traceback.print_exc()
        return failure


def load_www_graph_optimized(www_csv_path, use_cache=True):
    """Return the WWW graph and its node array, using the binary cache if possible.

    With ``use_cache`` enabled the graph and its node array are first looked
    up in a ``GraphCache`` under the id ``www_fineweb_500k``; both must be
    present for a hit. On a miss the CSV is parsed and, if caching is
    enabled, the result is written back to the cache.

    Args:
        www_csv_path: Edge-list CSV to fall back to.
        use_cache: Whether to read from and write to the cache.

    Returns:
        ``(www_graph, www_nodes)``.

    Raises:
        ValueError: If the CSV could not be loaded.
    """
    graph_store = GraphCache()
    graph_key = "www_fineweb_500k"

    if use_cache:
        print("\n  Checking cache...")
        graph_hit = graph_store.load_graph(graph_key)
        nodes_hit = graph_store.load_metadata(graph_key)

        cache_complete = graph_hit is not None and nodes_hit is not None
        if cache_complete:
            print(
                f"  ‚úì Loaded from cache: {graph_hit.numberOfNodes():,} nodes, "
                f"{graph_hit.numberOfEdges():,} edges"
            )
            log_memory_usage("after cache load")
            return graph_hit, nodes_hit

    print("\n  Cache miss - loading WWW graph from CSV...")
    www_graph, www_nodes, _unused_index = load_graph_from_csv_networkit(
        www_csv_path, graph_name="WWW graph"
    )
    if www_graph is None:
        raise ValueError(f"Failed to load WWW graph from {www_csv_path}")

    if use_cache:
        graph_store.save_graph(www_graph, graph_key)
        graph_store.save_metadata(graph_key, www_nodes)

    log_memory_usage("after WWW load")
    return www_graph, www_nodes


# Global for worker processes
_worker_www_graph = None
_worker_www_nodes = None


def init_worker(www_cache_path, www_nodes):
    """Pool initializer: load the WWW graph into this process's globals.

    Reads the NetworKit binary file at *www_cache_path* and stores it,
    together with *www_nodes*, in the module-level ``_worker_*`` variables.
    On any failure an error is printed and both globals are left as ``None``.
    """
    global _worker_www_graph, _worker_www_nodes

    graph, nodes = None, None
    try:
        graph = nk.readGraph(www_cache_path, nk.Format.NetworkitBinary)
        nodes = www_nodes
        print(f"  Worker initialized: {graph.numberOfNodes():,} nodes")
    except Exception as e:
        print(f"  ERROR initializing worker: {e}")
        graph, nodes = None, None

    _worker_www_graph, _worker_www_nodes = graph, nodes


def process_configuration_networkit(
    www_graph,
    www_nodes,
    kalicube_edges,
    kalicube_nodes,
    kalicube_url_mapping,
    min_connections,
    max_connections,
):
    """Embed a site graph into the WWW graph and return its PageRank scores.

    A fresh directed graph is built containing every WWW edge, followed by
    one new node per entry of *kalicube_nodes* (ids shifted by the WWW node
    count) and the shifted *kalicube_edges*. Randomly chosen WWW nodes are
    then linked to randomly chosen site nodes, one edge per pair, and
    PageRank is run with the module's damping factor and tolerance.

    NOTE(review): ``min_connections``, ``www_nodes`` and
    ``kalicube_url_mapping`` are not read by this function. The number of
    bridging edges is always ``min(max_connections, n_www, n_site)`` - confirm
    whether a draw from [min_connections, max_connections] was intended.

    Returns:
        Dict mapping each URL in *kalicube_nodes* to its PageRank score.
    """
    www_size = www_graph.numberOfNodes()
    site_size = len(kalicube_nodes)
    offset = www_size  # site node i lives at id i + offset in the merged graph

    combined = nk.Graph(n=www_size, weighted=False, directed=True)
    for head, tail in www_graph.iterEdges():
        combined.addEdge(head, tail)

    added = 0
    while added < site_size:
        combined.addNode()
        added += 1

    for head, tail in kalicube_edges or ():
        combined.addEdge(head + offset, tail + offset)

    # Two independent draws without replacement; zip below pairs them up and
    # stops at the shorter one.
    www_picks = np.random.choice(
        www_size, size=min(max_connections, www_size), replace=False
    )
    site_picks = np.random.choice(
        site_size, size=min(max_connections, site_size), replace=False
    )
    for www_id, site_idx in zip(www_picks, site_picks):
        combined.addEdge(www_id, site_idx + offset)

    ranker = nk.centrality.PageRank(
        combined, damp=DAMPING_FACTOR, tol=PAGERANK_TOLERANCE
    )
    ranker.run()
    scores = ranker.scores()

    return {url: scores[pos + offset] for pos, url in enumerate(kalicube_nodes)}


def run_single_simulation_networkit(
    simulation_id,
    www_graph,
    www_nodes,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
    min_connections,
    max_connections,
    return_distributions=False,
):
    """Compare PageRank of the old and new site graph in one seeded simulation.

    Both NumPy's and Python's global RNGs are seeded with
    ``42 + simulation_id``, then the old and the new configuration are each
    embedded into the WWW graph and scored. For every URL present in both
    results with a positive "before" score, the percentage change is
    computed and classified as up / down / neutral using NEUTRAL_THRESHOLD.

    Returns:
        ``None`` if the two graphs share no URL or no shared URL has a
        positive "before" score. Otherwise a dict with the mean/min/max
        percentage delta, the up/down/neutral counts and ``total_pages``
        (the number of shared URLs). With ``return_distributions`` the raw
        before/after scores of all shared URLs are included as arrays.
    """
    seed = 42 + simulation_id
    np.random.seed(seed)
    random.seed(seed)

    shared_args = (www_graph, www_nodes)
    limits = (min_connections, max_connections)

    scores_before = process_configuration_networkit(
        *shared_args,
        kalicube_old_edges,
        kalicube_nodes_old,
        kalicube_url_mapping_old,
        *limits,
    )
    scores_after = process_configuration_networkit(
        *shared_args,
        kalicube_new_edges,
        kalicube_nodes_new,
        kalicube_url_mapping_new,
        *limits,
    )

    shared_urls = set(scores_before.keys()) & set(scores_after.keys())
    if not shared_urls:
        return None

    score_pairs = [(scores_before[url], scores_after[url]) for url in shared_urls]

    # Pages with a non-positive baseline are skipped: a relative change is
    # undefined for them.
    deltas = [
        ((after - before) / before) * 100
        for before, after in score_pairs
        if before > 0
    ]
    if not deltas:
        return None

    risen = sum(1 for delta in deltas if delta > NEUTRAL_THRESHOLD)
    fallen = sum(1 for delta in deltas if delta < -NEUTRAL_THRESHOLD)
    unchanged = len(deltas) - risen - fallen

    summary = {
        "mean_delta_pct": np.mean(deltas),
        "min_delta_pct": np.min(deltas),
        "max_delta_pct": np.max(deltas),
        "pages_up": risen,
        "pages_down": fallen,
        "pages_neutral": unchanged,
        "total_pages": len(shared_urls),
    }

    if return_distributions:
        summary["before_distribution"] = np.array([b for b, _ in score_pairs])
        summary["after_distribution"] = np.array([a for _, a in score_pairs])

    return summary


def run_simulation_batch(args):
    """Worker task: run a batch of simulations against the pre-loaded WWW graph.

    Args:
        args: 9-tuple ``(sim_ids, old_edges, new_edges, nodes_old, nodes_new,
            url_mapping_old, url_mapping_new, min_conn, max_conn)``.

    Returns:
        List of per-simulation result dicts; simulations that produced
        ``None`` are omitted. An empty list if this worker's graph was never
        initialised.
    """
    (
        sim_ids,
        old_edges,
        new_edges,
        nodes_old,
        nodes_new,
        url_mapping_old,
        url_mapping_new,
        min_conn,
        max_conn,
    ) = args

    if _worker_www_graph is None:
        print("  ERROR: Worker graph not initialized!")
        return []

    outcomes = (
        run_single_simulation_networkit(
            sim_id,
            _worker_www_graph,
            _worker_www_nodes,
            old_edges,
            new_edges,
            nodes_old,
            nodes_new,
            url_mapping_old,
            url_mapping_new,
            min_conn,
            max_conn,
            return_distributions=False,
        )
        for sim_id in sim_ids
    )
    return [outcome for outcome in outcomes if outcome is not None]


def analyze_csv_pair(
    www_cache_path,
    www_nodes,
    old_csv_path,
    new_csv_path,
    min_connections,
    max_connections,
    range_name,
):
    """Run all simulations for one baseline/comparison CSV pair.

    Both CSVs are loaded as graphs, reduced to plain edge lists, and then
    TOTAL_SIMULATIONS simulations are executed - in a process pool when
    USE_PARALLEL is set, otherwise sequentially in this process.

    Args:
        www_cache_path: Path of the WWW graph in NetworKit binary format.
        www_nodes: Node array of the WWW graph (forwarded to the simulations).
        old_csv_path: Baseline edge-list CSV.
        new_csv_path: Comparison edge-list CSV.
        min_connections: Lower bound of the connection range.
        max_connections: Upper bound of the connection range.
        range_name: Label stored in the result.

    Returns:
        A dict of aggregate statistics plus the raw ``sim_results`` list, or
        ``None`` if either CSV could not be loaded or no simulation produced
        a result.

    Raises:
        ValueError: If ``min_connections`` is not below ``max_connections``
            or the comparison graph has no nodes.
        FileNotFoundError: If either CSV path does not exist.
    """
    print(f"\n  Analyzing: {os.path.basename(new_csv_path)}")
    start_time = time.time()

    # Explicit raises instead of `assert`: assertions are stripped when Python
    # runs with -O, which would silently disable this validation.
    if not (min_connections < max_connections):
        raise ValueError("Invalid connection range")
    if not os.path.exists(old_csv_path):
        raise FileNotFoundError(f"Baseline not found: {old_csv_path}")
    if not os.path.exists(new_csv_path):
        raise FileNotFoundError(f"Comparison not found: {new_csv_path}")

    kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (
        load_graph_from_csv_networkit(old_csv_path, "baseline")
    )
    if kalicube_graph_old is None:
        return None

    kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (
        load_graph_from_csv_networkit(new_csv_path, "comparison")
    )
    if kalicube_graph_new is None:
        return None

    if len(kalicube_nodes_new) == 0:
        raise ValueError("Empty comparison graph")

    # Keep only plain edge tuples: they are cheap to send to worker processes,
    # and the graph objects can be released before the simulations start.
    kalicube_old_edges = list(kalicube_graph_old.iterEdges())
    kalicube_new_edges = list(kalicube_graph_new.iterEdges())

    del kalicube_graph_old, kalicube_graph_new
    gc.collect()

    print(
        f"    Running {TOTAL_SIMULATIONS} simulations with {min_connections}-{max_connections} connections..."
    )

    all_sim_results = []

    if USE_PARALLEL:
        all_sim_ids = list(range(TOTAL_SIMULATIONS))
        sim_id_batches = [
            all_sim_ids[i : i + BATCH_SIZE]
            for i in range(0, TOTAL_SIMULATIONS, BATCH_SIZE)
        ]

        batch_args = [
            (
                sim_ids,
                kalicube_old_edges,
                kalicube_new_edges,
                kalicube_nodes_old,
                kalicube_nodes_new,
                kalicube_url_mapping_old,
                kalicube_url_mapping_new,
                min_connections,
                max_connections,
            )
            for sim_ids in sim_id_batches
        ]

        # A new pool is created for every CSV pair; each worker loads the WWW
        # graph once (in init_worker) and reuses it for all of its batches.
        with Pool(
            NUM_WORKERS, initializer=init_worker, initargs=(www_cache_path, www_nodes)
        ) as pool:
            with tqdm(
                total=len(sim_id_batches),
                desc="    Batches",
                unit="batch",
                leave=False,
                ncols=100,
            ) as pbar:
                # Completion order is irrelevant: only aggregates are computed.
                for batch_results in pool.imap_unordered(
                    run_simulation_batch, batch_args
                ):
                    all_sim_results.extend(batch_results)
                    pbar.update(1)
    else:
        # Non-parallel fallback
        www_graph = nk.readGraph(www_cache_path, nk.Format.NetworkitBinary)
        with tqdm(total=TOTAL_SIMULATIONS, desc="    Simulations", leave=False) as pbar:
            for sim_id in range(TOTAL_SIMULATIONS):
                result = run_single_simulation_networkit(
                    sim_id,
                    www_graph,
                    www_nodes,
                    kalicube_old_edges,
                    kalicube_new_edges,
                    kalicube_nodes_old,
                    kalicube_nodes_new,
                    kalicube_url_mapping_old,
                    kalicube_url_mapping_new,
                    min_connections,
                    max_connections,
                )
                if result is not None:
                    all_sim_results.append(result)
                pbar.update(1)

    if not all_sim_results:
        return None

    elapsed = time.time() - start_time
    # Guard the throughput figure against a zero-length timing interval.
    sims_per_sec = len(all_sim_results) / elapsed if elapsed > 0 else float("inf")
    print(
        f"    ‚úì Completed in {elapsed / 60:.1f} min ({sims_per_sec:.1f} sims/sec)"
    )

    mean_deltas = [r["mean_delta_pct"] for r in all_sim_results]

    return {
        "filename": os.path.basename(new_csv_path),
        "range_name": range_name,
        "min_connections": min_connections,
        "max_connections": max_connections,
        "total_simulations": len(all_sim_results),
        "avg_mean_delta_pct": np.mean(mean_deltas),
        "avg_min_delta_pct": np.mean([r["min_delta_pct"] for r in all_sim_results]),
        "avg_max_delta_pct": np.mean([r["max_delta_pct"] for r in all_sim_results]),
        "avg_pages_up": np.mean([r["pages_up"] for r in all_sim_results]),
        "avg_pages_down": np.mean([r["pages_down"] for r in all_sim_results]),
        "avg_pages_neutral": np.mean([r["pages_neutral"] for r in all_sim_results]),
        "std_mean": np.std(mean_deltas),
        "elapsed_seconds": elapsed,
        "sim_results": all_sim_results,
    }


def calculate_confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its t-based interval.

    Args:
        data: Sequence of numbers.
        confidence: Two-sided confidence level, e.g. 0.95.

    Returns:
        ``(mean, margin)`` so that the interval is ``mean +/- margin``. With
        fewer than two samples no spread can be estimated and the margin
        is 0.
    """
    sample_size = len(data)
    center = np.mean(data)
    if sample_size < 2:
        return center, 0

    # Standard error scaled by the two-sided Student-t critical value with
    # n - 1 degrees of freedom.
    critical_value = t.ppf((1 + confidence) / 2, sample_size - 1)
    half_width = stats.sem(data) * critical_value
    return center, half_width


def process_strategy_with_range(
    www_cache_path,
    www_nodes,
    baseline_path,
    folder_path,
    strategy_name,
    min_conn,
    max_conn,
    range_name,
    checkpoint_mgr,
):
    """Analyse every CSV of one strategy folder for one connection range.

    If a checkpoint for this (range, strategy) exists it is returned
    immediately. Otherwise each ``.csv`` in *folder_path* (sorted by name) is
    compared against *baseline_path* and the per-simulation results of all
    files are pooled into one summary.

    A checkpoint is written only when the loop over all files finished
    without an exception. If the run was interrupted, the partial summary is
    still returned for the current session but not checkpointed, so a later
    resume re-runs the strategy instead of loading truncated results.

    Args:
        www_cache_path: Path of the WWW graph in NetworKit binary format.
        www_nodes: Node array of the WWW graph.
        baseline_path: Baseline edge-list CSV.
        folder_path: Folder with the strategy's comparison CSVs.
        strategy_name: Display name of the strategy.
        min_conn: Lower bound of the connection range.
        max_conn: Upper bound of the connection range.
        range_name: Label of the connection range.
        checkpoint_mgr: Object providing ``load_checkpoint(range, strategy)``
            and ``save_checkpoint(range, strategy, results)``.

    Returns:
        The summary dict, or ``None`` if the folder is missing, contains no
        CSV files, or no file produced a result.
    """
    print(f"\n{'=' * 70}")
    print(f"STRATEGY: {strategy_name} | RANGE: {range_name} ({min_conn}-{max_conn})")
    print(f"{'=' * 70}")

    # Check for existing checkpoint
    checkpoint = checkpoint_mgr.load_checkpoint(range_name, strategy_name)
    if checkpoint is not None:
        print(f"  ‚úì Loaded from checkpoint!")
        return checkpoint

    if not os.path.exists(folder_path):
        print(f"  ERROR: Folder not found: {folder_path}")
        return None

    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))
    if not csv_files:
        print(f"  ERROR: No CSV files found in {folder_path}")
        return None

    print(f"  Found {len(csv_files)} CSV files")

    results = []
    run_completed = True
    try:
        for csv_file in csv_files:
            csv_path = os.path.join(folder_path, csv_file)
            result = analyze_csv_pair(
                www_cache_path,
                www_nodes,
                baseline_path,
                csv_path,
                min_conn,
                max_conn,
                range_name,
            )
            if result is not None:
                results.append(result)
    except Exception as e:
        # Keep whatever finished so the current session can still report it.
        run_completed = False
        print(f"  ERROR during analysis: {e}")
        traceback.print_exc()
        if results:
            print(f"  Partial results: {len(results)} files analyzed")

    if not results:
        return None

    # Aggregate results: pool the simulations of all files.
    all_sims = [sim for r in results for sim in r["sim_results"]]

    mean_deltas = [s["mean_delta_pct"] for s in all_sims]
    mean_val, margin = calculate_confidence_interval(mean_deltas)

    result_summary = {
        "strategy_name": strategy_name,
        "range_name": range_name,
        "min_connections": min_conn,
        "max_connections": max_conn,
        "files_analyzed": len(results),
        "total_simulations": len(all_sims),
        "overall_mean_delta_pct": mean_val,
        "overall_std": np.std(mean_deltas),
        "confidence_interval_95": margin,
        "avg_pages_up": np.mean([s["pages_up"] for s in all_sims]),
        "avg_pages_down": np.mean([s["pages_down"] for s in all_sims]),
        "efficiency": mean_val / ((min_conn + max_conn) / 2),  # Delta per connection
        "file_results": results,
    }

    # Save checkpoint only for complete runs; a checkpoint of partial results
    # would be returned as-is on resume and the missing files never retried.
    if run_completed:
        checkpoint_mgr.save_checkpoint(range_name, strategy_name, result_summary)
    else:
        print("  WARNING: Run incomplete - checkpoint not saved, strategy will be retried on resume")

    log_memory_usage("after strategy")
    gc.collect()

    return result_summary


def create_multi_range_comparison_plot(all_results, output_dir):
    """Render a 2x2 comparison figure of all strategy/range summaries.

    Panels: (1) heatmap of the mean PageRank delta per strategy and range,
    (2) mean delta versus minimum connections with 95% CI error bars,
    (3) efficiency versus minimum connections, (4) summary table. The figure
    is saved as ``multi_range_comparison_expert.png`` in *output_dir*.

    Ranges are ordered by their lower connection bound rather than by name,
    because a lexicographic sort would place "Range_35-65" before
    "Range_5-35". Every strategy keeps the same colour in panels 2 and 3.
    Strategy/range combinations without a result appear as "n/a" in the
    heatmap instead of a misleading 0.000.

    Args:
        all_results: List of summary dicts as returned by
            ``process_strategy_with_range``.
        output_dir: Directory the PNG is written to.

    Returns:
        None. Nothing is drawn when *all_results* is empty.
    """
    print("\n  Creating multi-range comparison visualization...")

    if not all_results:
        print("  WARNING: No results to plot - skipping visualization")
        return

    strategies = sorted({r["strategy_name"] for r in all_results})

    # Order range labels numerically by lower bound (name breaks ties).
    range_lower_bound = {}
    for r in all_results:
        range_lower_bound.setdefault(r["range_name"], r["min_connections"])
    ranges = sorted(
        range_lower_bound, key=lambda name: (range_lower_bound[name], name)
    )

    # One colour per strategy, shared by all panels.
    colors = ["#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#6A994E"]
    strategy_colors = {
        strategy: colors[i % len(colors)] for i, strategy in enumerate(strategies)
    }

    # First result wins if a (strategy, range) pair occurs more than once.
    result_by_cell = {}
    for r in all_results:
        result_by_cell.setdefault((r["strategy_name"], r["range_name"]), r)

    fig, axes = plt.subplots(2, 2, figsize=(20, 14))
    fig.suptitle(
        "Multi-Range Strategy Comparison Analysis - Expert Led",
        fontsize=20,
        fontweight="bold",
    )

    # 1. Mean Delta Heatmap
    ax1 = axes[0, 0]
    heatmap_data = np.full((len(strategies), len(ranges)), np.nan)
    for i, strategy in enumerate(strategies):
        for j, range_name in enumerate(ranges):
            cell = result_by_cell.get((strategy, range_name))
            if cell is not None:
                heatmap_data[i, j] = cell["overall_mean_delta_pct"]

    im = ax1.imshow(heatmap_data, cmap="RdYlGn", aspect="auto")
    ax1.set_xticks(np.arange(len(ranges)))
    ax1.set_yticks(np.arange(len(strategies)))
    ax1.set_xticklabels(ranges, rotation=45, ha="right")
    ax1.set_yticklabels(strategies)
    ax1.set_title("Mean PageRank Delta (%) - Heatmap", fontsize=14, fontweight="bold")

    for (i, j), value in np.ndenumerate(heatmap_data):
        ax1.text(
            j,
            i,
            "n/a" if np.isnan(value) else f"{value:.3f}",
            ha="center",
            va="center",
            color="black",
            fontsize=10,
        )

    plt.colorbar(im, ax=ax1, label="Mean Delta %")

    # 2. Line plot: Strategy performance across ranges
    ax2 = axes[0, 1]
    for strategy in strategies:
        strategy_data = sorted(
            (r for r in all_results if r["strategy_name"] == strategy),
            key=lambda x: x["min_connections"],
        )

        ax2.errorbar(
            [r["min_connections"] for r in strategy_data],
            [r["overall_mean_delta_pct"] for r in strategy_data],
            yerr=[r["confidence_interval_95"] for r in strategy_data],
            marker="o",
            linewidth=2.5,
            label=strategy,
            color=strategy_colors[strategy],
            capsize=5,
        )

    ax2.set_xlabel("Minimum Connections", fontsize=12, fontweight="bold")
    ax2.set_ylabel("Mean PageRank Delta (%)", fontsize=12, fontweight="bold")
    ax2.set_title(
        "Strategy Performance vs Connection Range (95% CI)",
        fontsize=14,
        fontweight="bold",
    )
    ax2.legend(loc="best")
    ax2.grid(True, alpha=0.3)

    # 3. Efficiency plot: Delta per connection
    ax3 = axes[1, 0]
    for strategy in strategies:
        points = sorted(
            (
                (r["min_connections"], r["efficiency"])
                for r in all_results
                if r["strategy_name"] == strategy
            ),
            key=lambda point: point[0],
        )
        ax3.plot(
            [x for x, _ in points],
            [y for _, y in points],
            marker="s",
            linewidth=2.5,
            label=strategy,
            color=strategy_colors[strategy],
        )

    ax3.set_xlabel("Minimum Connections", fontsize=12, fontweight="bold")
    ax3.set_ylabel("Efficiency (Œî% per connection)", fontsize=12, fontweight="bold")
    ax3.set_title("ROI: PageRank Gain per Connection", fontsize=14, fontweight="bold")
    ax3.legend(loc="best")
    ax3.grid(True, alpha=0.3)

    # 4. Summary table
    ax4 = axes[1, 1]
    ax4.axis("tight")
    ax4.axis("off")

    table_data = [["Strategy", "Range", "Mean Œî%", "95% CI", "‚ÜëPages", "Efficiency"]]
    # Rows grouped by range in numeric order, best mean delta first within
    # each range.
    for result in sorted(
        all_results,
        key=lambda x: (
            x["min_connections"],
            x["range_name"],
            -x["overall_mean_delta_pct"],
        ),
    ):
        table_data.append(
            [
                result["strategy_name"][:12],
                result["range_name"].replace("Range_", ""),
                f"{result['overall_mean_delta_pct']:.3f}",
                f"¬±{result['confidence_interval_95']:.3f}",
                f"{result['avg_pages_up']:.1f}",
                f"{result['efficiency']:.4f}",
            ]
        )

    table = ax4.table(
        cellText=table_data,
        cellLoc="center",
        loc="center",
        colWidths=[0.22, 0.15, 0.13, 0.13, 0.12, 0.15],
    )
    table.auto_set_font_size(False)
    table.set_fontsize(7)
    table.scale(1, 2)

    # Style header
    for col in range(len(table_data[0])):
        table[(0, col)].set_facecolor("#40466e")
        table[(0, col)].set_text_props(weight="bold", color="white")

    plt.tight_layout()

    output_path = os.path.join(output_dir, "multi_range_comparison_expert.png")
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(fig)

    print(f"  ‚úì Saved: {output_path}")


def _trend_label(values):
    """Label a series by comparing its last value with its first one."""
    if len(values) < 2:
        return "Insufficient data"
    if values[-1] > values[0]:
        return "Increasing"
    if values[-1] < values[0]:
        return "Decreasing"
    return "Stable"


def _write_report_sections(f, all_results):
    """Write the statistics, ranking, trend and recommendation sections to *f*."""
    # Overall statistics
    f.write("OVERALL STATISTICS\n")
    f.write("-" * 80 + "\n")
    all_deltas = [r["overall_mean_delta_pct"] for r in all_results]
    total_sims = sum(r["total_simulations"] for r in all_results)
    f.write(f"Total Runs: {len(all_results)}\n")
    f.write(f"Total Simulations: {total_sims:,}\n")
    f.write(f"Overall Mean Delta: {np.mean(all_deltas):.4f}%\n")
    f.write(f"Overall Std Dev: {np.std(all_deltas):.4f}%\n")
    f.write(f"Range: [{np.min(all_deltas):.4f}%, {np.max(all_deltas):.4f}%]\n\n")

    # Best overall
    best = max(all_results, key=lambda x: x["overall_mean_delta_pct"])
    f.write("BEST OVERALL COMBINATION\n")
    f.write("-" * 80 + "\n")
    f.write(f"Strategy: {best['strategy_name']}\n")
    f.write(
        f"Range: {best['range_name']} ({best['min_connections']}-{best['max_connections']})\n"
    )
    f.write(
        f"Mean Delta: {best['overall_mean_delta_pct']:.4f}% ¬± {best['confidence_interval_95']:.4f}%\n"
    )
    f.write(f"Efficiency: {best['efficiency']:.5f}% per connection\n")
    f.write(f"Pages Up: {best['avg_pages_up']:.1f}\n")
    f.write(f"Pages Down: {best['avg_pages_down']:.1f}\n")
    f.write(f"Total Simulations: {best['total_simulations']:,}\n\n")

    # Analysis by range: strategies ranked by mean delta, best first.
    ranges = sorted({r["range_name"] for r in all_results})
    f.write("ANALYSIS BY CONNECTION RANGE\n")
    f.write("=" * 80 + "\n\n")

    for range_name in ranges:
        f.write(f"{range_name}\n")
        f.write("-" * 80 + "\n")
        range_results = sorted(
            (r for r in all_results if r["range_name"] == range_name),
            key=lambda x: x["overall_mean_delta_pct"],
            reverse=True,
        )

        for i, r in enumerate(range_results, 1):
            f.write(
                f"  {i}. {r['strategy_name']:<25} "
                f"{r['overall_mean_delta_pct']:>8.4f}% ¬± {r['confidence_interval_95']:>6.4f}%  "
                f"Eff: {r['efficiency']:>7.5f}  "
                f"‚Üë{r['avg_pages_up']:>5.1f} ‚Üì{r['avg_pages_down']:>5.1f}\n"
            )
        f.write("\n")

    # Strategy trends across ranges (ordered by min_connections).
    f.write("STRATEGY TRENDS ACROSS RANGES\n")
    f.write("=" * 80 + "\n")
    strategies = sorted({r["strategy_name"] for r in all_results})

    for strategy in strategies:
        f.write(f"\n{strategy}\n")
        f.write("-" * 80 + "\n")
        strat_results = sorted(
            (r for r in all_results if r["strategy_name"] == strategy),
            key=lambda x: x["min_connections"],
        )

        # Trends only compare the lowest and highest range, not the full curve.
        delta_trend = _trend_label(
            [r["overall_mean_delta_pct"] for r in strat_results]
        )
        eff_trend = _trend_label([r["efficiency"] for r in strat_results])

        f.write(f"  Delta Trend: {delta_trend}\n")
        f.write(f"  Efficiency Trend: {eff_trend}\n")
        f.write("  Performance:\n")
        for r in strat_results:
            f.write(
                f"    {r['range_name']}: {r['overall_mean_delta_pct']:.4f}% "
                f"(Eff: {r['efficiency']:.5f})\n"
            )

    # Recommendations
    f.write("\n" + "=" * 80 + "\n")
    f.write("RECOMMENDATIONS\n")
    f.write("=" * 80 + "\n\n")

    # Best efficiency
    best_eff = max(all_results, key=lambda x: x["efficiency"])
    f.write(
        f"1. Most Efficient: {best_eff['strategy_name']} @ {best_eff['range_name']}\n"
    )
    f.write(f"   Efficiency: {best_eff['efficiency']:.5f}% per connection\n")
    f.write(f"   Mean Delta: {best_eff['overall_mean_delta_pct']:.4f}%\n\n")

    # Best per range (ranges come from all_results, so each has >= 1 entry).
    f.write("2. Best Strategy per Range:\n")
    for range_name in ranges:
        range_results = [r for r in all_results if r["range_name"] == range_name]
        best_in_range = max(range_results, key=lambda x: x["overall_mean_delta_pct"])
        f.write(
            f"   {range_name}: {best_in_range['strategy_name']} "
            f"({best_in_range['overall_mean_delta_pct']:.4f}%)\n"
        )

    f.write("\n3. Scalability Analysis:\n")
    for strategy in strategies:
        strat_results = [r for r in all_results if r["strategy_name"] == strategy]
        # Strategies with fewer than three ranges are left out of this section.
        if len(strat_results) < 3:
            continue
        strat_results = sorted(strat_results, key=lambda x: x["min_connections"])
        deltas = [r["overall_mean_delta_pct"] for r in strat_results]
        # Relative change from the lowest to the highest range; reported as 0
        # when the baseline delta is exactly zero to avoid dividing by zero.
        improvement = (
            ((deltas[-1] - deltas[0]) / abs(deltas[0])) * 100 if deltas[0] != 0 else 0
        )
        f.write(f"   {strategy}: {improvement:+.1f}% change from low to high range\n")


def create_detailed_analysis_report(all_results, output_dir):
    """Create detailed text report with statistical analysis.

    Writes ``detailed_analysis_report_expert.txt`` into ``output_dir``.

    Args:
        all_results: List of per-run result dicts. The keys read here are
            ``strategy_name``, ``range_name``, ``min_connections``,
            ``max_connections``, ``overall_mean_delta_pct``,
            ``confidence_interval_95``, ``efficiency``, ``avg_pages_up``,
            ``avg_pages_down`` and ``total_simulations``. An empty list
            produces a report that only states that no results exist.
        output_dir: Existing directory in which the report file is created.

    Returns:
        None. The report path is printed once the file has been written.
    """
    report_path = os.path.join(output_dir, "detailed_analysis_report_expert.txt")

    # The report contains non-ASCII symbols, so the encoding is pinned instead
    # of relying on the platform's locale default.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("=" * 80 + "\n")
        f.write("MULTI-RANGE PAGERANK ANALYSIS - EXPERT LED - DETAILED REPORT\n")
        f.write("=" * 80 + "\n\n")

        if all_results:
            _write_report_sections(f, all_results)
        else:
            # max()/np.min() on an empty list would raise and leave a
            # truncated file behind; state the situation explicitly instead.
            f.write("No results available.\n")

    print(f"  ‚úì Detailed report saved: {report_path}")


# ============================================
# MAIN EXECUTION
# ============================================

if __name__ == "__main__":
    # Entry point: runs every (connection range x strategy) combination,
    # collects the per-run result dicts, then writes the CSV, the comparison
    # plot and the detailed text report into MULTI_RANGE_OUTPUT_DIR.
    print("=" * 70)
    print("OPTIMIZED MULTI-RANGE PAGERANK ANALYSIS - EXPERT LED")
    print("=" * 70)
    print(f"\nPerformance Optimizations:")
    print(f"  ‚úì Binary graph caching (50-100x faster)")
    print(f"  ‚úì Worker pool with pre-loaded graphs")
    print(f"  ‚úì Checkpoint system for crash recovery")
    print(f"  ‚úì Memory monitoring and cleanup")
    print(f"\nConnection Ranges: {len(CONNECTION_RANGES)}")
    for min_c, max_c, name in CONNECTION_RANGES:
        print(f"  - {name}: {min_c}-{max_c} connections")
    print(f"\nStrategies: {len(COMPARISON_FOLDERS)}")
    for _, name in COMPARISON_FOLDERS:
        print(f"  - {name}")
    print(
        f"\nTotal Runs: {len(CONNECTION_RANGES)} ranges √ó {len(COMPARISON_FOLDERS)} strategies = {len(CONNECTION_RANGES) * len(COMPARISON_FOLDERS)} runs"
    )
    print(f"Simulations per run: {TOTAL_SIMULATIONS:,} (EXPERT LED)")
    print(f"Parallel workers: {NUM_WORKERS}")
    print(f"\nEstimated runtime: 50-60 hours")
    print("=" * 70)

    log_memory_usage("initial")

    # Initialize systems
    cache = GraphCache()
    checkpoint_mgr = CheckpointManager()

    # Check for existing checkpoints
    existing_checkpoints = checkpoint_mgr.list_checkpoints()
    if existing_checkpoints:
        print(f"\n‚úì Found {len(existing_checkpoints)} existing checkpoints")
        print("  (Will skip already completed runs)")

    # Load and cache WWW graph
    print("\n" + "=" * 70)
    print("LOADING WWW GRAPH")
    print("=" * 70)

    try:
        www_graph, www_nodes = load_www_graph_optimized(
            FINEWEB_WWW_PATH, use_cache=True
        )

        # Cache for workers
        www_cache_path = cache.save_graph(www_graph, "www_workers")
        if www_cache_path is None:
            # save_graph returned no path; fall back to the cache's own path
            # for this key (presumably the file already exists - verify
            # against GraphCache).
            www_cache_path = cache._get_cache_path("www_workers")

        print(
            f"‚úì WWW graph ready: {www_graph.numberOfNodes():,} nodes, {www_graph.numberOfEdges():,} edges"
        )

    except Exception as e:
        print(f"‚úó FATAL ERROR loading WWW graph: {e}")
        traceback.print_exc()
        # Raise SystemExit directly rather than calling the interactive
        # ``exit`` helper: that helper is injected by ``site``/IPython and is
        # not guaranteed to stop the currently running notebook cell, in
        # which case the code below would run without a graph.
        raise SystemExit(1)

    all_results = []
    total_runs = len(CONNECTION_RANGES) * len(COMPARISON_FOLDERS)
    current_run = 0
    failed_runs = []

    overall_start = time.time()

    # Main execution loop
    for min_conn, max_conn, range_name in CONNECTION_RANGES:
        print(f"\n\n{'#' * 70}")
        print(f"# PROCESSING CONNECTION RANGE: {range_name} ({min_conn}-{max_conn})")
        print(f"{'#' * 70}")

        for folder_path, strategy_name in COMPARISON_FOLDERS:
            current_run += 1
            print(f"\n[RUN {current_run}/{total_runs}]")

            try:
                result = process_strategy_with_range(
                    www_cache_path,
                    www_nodes,
                    BASELINE_PATH,
                    folder_path,
                    strategy_name,
                    min_conn,
                    max_conn,
                    range_name,
                    checkpoint_mgr,
                )

                if result is not None:
                    all_results.append(result)
                    print(
                        f"  ‚úì {strategy_name} @ {range_name}: "
                        f"Mean Œî = {result['overall_mean_delta_pct']:+.4f}% "
                        f"¬± {result['confidence_interval_95']:.4f}% "
                        f"(Eff: {result['efficiency']:.5f})"
                    )
                else:
                    print(f"  ‚úó No results for {strategy_name} @ {range_name}")
                    failed_runs.append((strategy_name, range_name))

            except Exception as e:
                # A single failing run must not abort the remaining runs;
                # record it and carry on (memory logging is skipped for it).
                print(f"  ‚úó ERROR: {e}")
                traceback.print_exc()
                failed_runs.append((strategy_name, range_name))
                continue

            log_memory_usage(f"run {current_run}")

    overall_elapsed = time.time() - overall_start

    # Save and visualize results
    if all_results:
        print(f"\n{'=' * 70}")
        print("SAVING RESULTS")
        print(f"{'=' * 70}")

        # CSV output
        summary_df = pd.DataFrame(
            [
                {
                    "strategy": r["strategy_name"],
                    "range": r["range_name"],
                    "min_connections": r["min_connections"],
                    "max_connections": r["max_connections"],
                    "total_simulations": r["total_simulations"],
                    "mean_delta_pct": r["overall_mean_delta_pct"],
                    "ci_95": r["confidence_interval_95"],
                    "std_delta": r["overall_std"],
                    "avg_pages_up": r["avg_pages_up"],
                    "avg_pages_down": r["avg_pages_down"],
                    "efficiency": r["efficiency"],
                    "files_analyzed": r["files_analyzed"],
                }
                for r in all_results
            ]
        )

        csv_path = os.path.join(
            MULTI_RANGE_OUTPUT_DIR, "multi_range_results_expert.csv"
        )
        summary_df.to_csv(csv_path, index=False)
        print(f"  ‚úì Results CSV: {csv_path}")

        # Create visualizations
        create_multi_range_comparison_plot(all_results, MULTI_RANGE_OUTPUT_DIR)

        # Create detailed report
        create_detailed_analysis_report(all_results, MULTI_RANGE_OUTPUT_DIR)

        # Print summary
        print(f"\n{'=' * 70}")
        print("FINAL RESULTS SUMMARY - EXPERT LED")
        print(f"{'=' * 70}\n")

        # Group by range
        for min_conn, max_conn, range_name in CONNECTION_RANGES:
            print(f"\n{range_name} ({min_conn}-{max_conn} connections):")
            print("-" * 70)
            range_results = [r for r in all_results if r["range_name"] == range_name]
            range_results = sorted(
                range_results, key=lambda x: x["overall_mean_delta_pct"], reverse=True
            )

            for i, r in enumerate(range_results, 1):
                print(
                    f"  {i}. {r['strategy_name']:<20} {r['overall_mean_delta_pct']:>8.4f}% ¬± {r['confidence_interval_95']:>6.4f}%  "
                    f"Eff: {r['efficiency']:>7.5f}  (‚Üë{r['avg_pages_up']:>5.1f} ‚Üì{r['avg_pages_down']:>5.1f})"
                )

        # Key findings
        print(f"\n{'=' * 70}")
        print("KEY FINDINGS - EXPERT LED ANALYSIS")
        print(f"{'=' * 70}")

        best_overall = max(all_results, key=lambda x: x["overall_mean_delta_pct"])
        print(f"\nüèÜ BEST PERFORMANCE:")
        print(f"   Strategy: {best_overall['strategy_name']}")
        print(
            f"   Range: {best_overall['range_name']} ({best_overall['min_connections']}-{best_overall['max_connections']})"
        )
        print(
            f"   Mean Delta: {best_overall['overall_mean_delta_pct']:+.4f}% ¬± {best_overall['confidence_interval_95']:.4f}%"
        )
        print(f"   Efficiency: {best_overall['efficiency']:.5f}% per connection")

        best_eff = max(all_results, key=lambda x: x["efficiency"])
        print(f"\nüí∞ BEST ROI (Efficiency):")
        print(f"   Strategy: {best_eff['strategy_name']}")
        print(f"   Range: {best_eff['range_name']}")
        print(f"   Efficiency: {best_eff['efficiency']:.5f}% per connection")
        print(f"   Mean Delta: {best_eff['overall_mean_delta_pct']:+.4f}%")

        print(f"\nüìä BEST STRATEGY PER RANGE:")
        for min_conn, max_conn, range_name in CONNECTION_RANGES:
            range_results = [r for r in all_results if r["range_name"] == range_name]
            if not range_results:
                # Every strategy failed for this range. max() on an empty
                # list raises ValueError, which would abort the rest of the
                # summary and the cleanup below.
                continue
            best_in_range = max(
                range_results, key=lambda x: x["overall_mean_delta_pct"]
            )
            print(
                f"   {range_name}: {best_in_range['strategy_name']} ({best_in_range['overall_mean_delta_pct']:+.4f}%)"
            )

        print(f"\n‚è±Ô∏è  PERFORMANCE METRICS:")
        print(f"   Total Runtime: {overall_elapsed / 3600:.2f} hours")
        print(f"   Successful Runs: {len(all_results)}/{total_runs}")
        print(f"   Failed Runs: {len(failed_runs)}")
        print(
            f"   Total Simulations: {sum(r['total_simulations'] for r in all_results):,}"
        )

    else:
        print("\n‚ùå No valid results obtained")

    # Listed after both branches so the failures are also visible when no
    # run produced a result at all.
    if failed_runs:
        print(f"\n‚ö†Ô∏è  FAILED RUNS:")
        for strategy, range_name in failed_runs:
            print(f"   - {strategy} @ {range_name}")

    # Cleanup
    print(f"\n{'=' * 70}")
    print("CLEANUP")
    print(f"{'=' * 70}")

    del www_graph
    gc.collect()
    log_memory_usage("final")

    print(f"\n{'=' * 70}")
    print("MULTI-RANGE EXPERT LED ANALYSIS COMPLETE!")
    print(f"{'=' * 70}")
    print(f"\n‚úì Results saved to: {MULTI_RANGE_OUTPUT_DIR}")
    print(f"‚úì Visualization: multi_range_comparison_expert.png")
    print(f"‚úì Detailed report: detailed_analysis_report_expert.txt")
    print(f"‚úì Raw data: multi_range_results_expert.csv")
    print(f"\nüìö Next Steps:")
    print(f"  1. Review the heatmap for strategy√órange performance patterns")
    print(f"  2. Check efficiency plot to optimize connection budget")
    print(f"  3. Read detailed_analysis_report_expert.txt for recommendations")
    print(f"  4. Compare with automatic-led results for strategy insights")
    print(f"{'=' * 70}")

In [None]:
# === CHECK KALICUBE PAGE COUNT ===
# Standalone inspection cell: prints page/edge statistics for the baseline
# link graph and, for each strategy folder, for one sample batch file.
# `os` is imported here as well so the cell also runs on a fresh kernel.
import os

import numpy as np
import pandas as pd

print("=" * 70)
print("KALICUBE DATASET ANALYSIS")
print("=" * 70)

# Baseline graph
baseline_path = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"

if os.path.exists(baseline_path):
    print(f"\nüìä BASELINE GRAPH:")
    df = pd.read_csv(baseline_path, usecols=["FROM", "TO"])
    df = df.dropna()

    # Get unique pages (a page counts whether it appears as source or target)
    all_pages = pd.concat([df["FROM"], df["TO"]]).unique()

    # An edge list with no valid rows has no pages; report 0 instead of
    # failing with ZeroDivisionError.
    avg_connections = len(df) / len(all_pages) if len(all_pages) else 0.0

    print(f"   Total Pages: {len(all_pages):,}")
    print(f"   Total Edges: {len(df):,}")
    print(f"   Avg Connections per Page: {avg_connections:.2f}")

    # Additional stats
    from_counts = df["FROM"].value_counts()
    to_counts = df["TO"].value_counts()

    print(f"\n   Outgoing Links:")
    print(f"      Max: {from_counts.max()} (from 1 page)")
    print(f"      Mean: {from_counts.mean():.2f}")
    print(f"      Median: {from_counts.median():.0f}")

    print(f"\n   Incoming Links:")
    print(f"      Max: {to_counts.max()} (to 1 page)")
    print(f"      Mean: {to_counts.mean():.2f}")
    print(f"      Median: {to_counts.median():.0f}")
else:
    print(f"‚ùå Baseline file not found: {baseline_path}")

# Check comparison strategies
print(f"\n{'=' * 70}")
print("COMPARISON STRATEGIES:")
print(f"{'=' * 70}")

strategies = [
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/high_batches/",
        "High Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/",
        "Low Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/mixed_batches/",
        "Mixed Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/random_batches/",
        "Random Candidates",
    ),
    (
        "/content/drive/MyDrive/WebKnoGraph/results/expert_led/folder_batches/",
        "Folder Candidates",
    ),
]

for folder_path, strategy_name in strategies:
    if not os.path.isdir(folder_path):
        print(f"\nüìÅ {strategy_name}: Folder not found")
        continue

    # Sorted so the sample file is the same on every run; os.listdir returns
    # entries in arbitrary order.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

    if not csv_files:
        print(f"\nüìÅ {strategy_name}: No CSV files found")
        continue

    print(f"\nüìÅ {strategy_name}:")
    print(f"   CSV Files: {len(csv_files)}")

    # Analyze first file as sample
    sample_path = os.path.join(folder_path, csv_files[0])
    try:
        df_sample = pd.read_csv(sample_path, usecols=["FROM", "TO"])
    except (OSError, ValueError) as exc:
        # Unreadable file, or one without FROM/TO columns: report it and keep
        # inspecting the remaining strategies.
        print(f"   Sample File: {csv_files[0]}")
        print(f"      Could not read sample: {exc}")
        continue

    df_sample = df_sample.dropna()
    all_pages_sample = pd.concat([df_sample["FROM"], df_sample["TO"]]).unique()

    print(f"   Sample File: {csv_files[0]}")
    print(f"      Pages: {len(all_pages_sample):,}")
    print(f"      Edges: {len(df_sample):,}")

print(f"\n{'=' * 70}")