In [None]:
# NetworKit-based PageRank Simulation - Multi-Strategy (Final Results Only)
# Processes multiple strategy folders with cumulative overall tracking
# NO intermediate files - only final strategy summaries

# === INSTALLATION ===
!pip install networkit pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import networkit as nk
import time
import gc
from pathlib import Path
from collections import defaultdict

# === BOOSTING CONFIGURATION ===
# Each comparison file is simulated NUM_BOOSTING_ROUNDS times, and every round
# runs BRIDGINGS_PER_ROUND independent simulations, each with its own seed.
NUM_BOOSTING_ROUNDS = 20
BRIDGINGS_PER_ROUND = 20
# Simulations per comparison file; also reported in the per-comparison result.
TOTAL_SIMULATIONS = NUM_BOOSTING_ROUNDS * BRIDGINGS_PER_ROUND

# === THRESHOLD CONFIGURATION ===
# Expressed in percent: it is compared directly against percentage deltas.
NEUTRAL_THRESHOLD = 0.01  # ±0.01% considered neutral

# === SIMULATION PARAMETERS ===
# Upper bound on the number of WWW -> site bridging edges drawn per configuration.
MIN_CONNECTIONS = 5
# NOTE(review): not referenced by any code visible in this file.
MAX_CONNECTIONS = 50
# Size of the synthetic WWW graph (passed as nMax to the Barabási-Albert generator).
TOTAL_NODES_WWW = 100000
# Passed as both k and n0 to the Barabási-Albert generator.
EDGES_PER_NEW_NODE = 2
# Passed as the `tol` argument of nk.centrality.PageRank.
PAGERANK_TOLERANCE = 1e-3

# === FILE PATHS ===
# Baseline edge list; read as a CSV with FROM and TO columns.
BASELINE_PATH = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"

# List of comparison folders to process
# Each folder is one strategy (the folder name is used as the strategy name);
# every *.csv inside it is compared against the baseline. The parent of the
# first entry hosts the FINAL_STRATEGY_RESULTS output folder.
COMPARISON_FOLDERS = [
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/random_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/high_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/folder_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/mixed_batches/",
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/low_batches/",
]

# === CACHE ===
# Either None or a (cache_key, graph) tuple written by create_www_graph_networkit;
# reset to None after each comparison file has been processed.
_www_graph_cache = None


def mount_google_drive():
    """Mount Google Drive at ``/content/drive`` when ``google.colab`` is available.

    This is a best-effort helper: it never raises for an ordinary failure and
    reports the outcome on stdout instead.

    Returns:
        bool: True when the drive was mounted; False when ``google.colab``
        cannot be imported or when the mount call raised an error.
    """
    try:
        from google.colab import drive
    except ImportError:
        # Missing module is treated as "not running inside Colab".
        print("⚠ Not in Colab - skipping drive mount")
        return False

    try:
        drive.mount("/content/drive")
    except Exception as e:
        # Keep the best-effort contract, but report the real cause instead of
        # mislabelling a mount failure as "not in Colab". KeyboardInterrupt
        # and SystemExit are deliberately allowed to propagate.
        print(f"⚠ Google Drive mount failed: {e}")
        return False

    print("✓ Google Drive mounted successfully!")
    return True


def load_graph_from_csv_networkit(file_path):
    """Read a FROM/TO edge list from CSV and build a directed NetworKit graph.

    Rows with a missing FROM or TO value are dropped and both columns are
    converted to strings. Node ids are the positions of the URLs in the
    sorted, de-duplicated array of all URLs seen in either column.

    Args:
        file_path: Location of the CSV; must contain FROM and TO columns.

    Returns:
        tuple: ``(graph, all_urls, url_to_idx)`` where ``all_urls`` is the
        array of unique URLs and ``url_to_idx`` maps each URL to its node id.
        ``(None, None, None)`` is returned when the file cannot be read or
        contains no usable edge.
    """
    try:
        edge_table = pd.read_csv(file_path, usecols=["FROM", "TO"]).dropna()
        for column in ("FROM", "TO"):
            edge_table[column] = edge_table[column].astype(str)
    except Exception as e:
        print(f"✗ Error loading {file_path}: {str(e)}")
        return None, None, None

    sources = edge_table["FROM"].values
    targets = edge_table["TO"].values
    if len(sources) == 0:
        return None, None, None

    # np.unique both de-duplicates and sorts, which fixes the node numbering.
    all_urls = np.unique(np.concatenate([sources, targets]))
    url_to_idx = dict(zip(all_urls, range(len(all_urls))))

    g = nk.Graph(n=len(all_urls), weighted=False, directed=True)
    for source_url, target_url in zip(sources, targets):
        g.addEdge(url_to_idx[source_url], url_to_idx[target_url])

    return g, all_urls, url_to_idx


def create_www_graph_networkit(n_nodes, m_edges, seed=42):
    """Create a Barabási-Albert graph representing the WWW, caching the last one.

    The most recently generated graph is kept in the module-level
    ``_www_graph_cache`` under the key ``(n_nodes, m_edges, seed)``; a call
    with the same key reuses it instead of re-running the generator.

    Every call returns a *fresh directed copy* of the generated graph, so the
    caller always gets the same kind of object (previously a cache miss
    returned the generator's own graph while a cache hit returned a directed
    copy) and may mutate the result without affecting the cache.

    Args:
        n_nodes: Number of nodes to generate (``nMax`` of the generator).
        m_edges: Used for both ``k`` and ``n0`` of the generator.
        seed: Seed handed to ``nk.setSeed`` before generating.

    Returns:
        A new directed, unweighted ``nk.Graph`` holding one directed edge
        ``(u, v)`` for every pair yielded by the generated graph's
        ``iterEdges()``.
    """
    global _www_graph_cache

    cache_key = (n_nodes, m_edges, seed)
    if _www_graph_cache is not None and _www_graph_cache[0] == cache_key:
        generated = _www_graph_cache[1]
    else:
        nk.setSeed(seed, False)
        generated = nk.generators.BarabasiAlbertGenerator(
            k=m_edges, nMax=n_nodes, n0=m_edges
        ).generate()
        # Cache the generator output itself; it is never handed to callers,
        # so it cannot be modified behind the cache's back.
        _www_graph_cache = (cache_key, generated)

    # Copy edge by edge rather than via a converting constructor so that each
    # generated edge becomes exactly one directed edge (u, v).
    directed_copy = nk.Graph(
        n=generated.numberOfNodes(), weighted=False, directed=True
    )
    for u, v in generated.iterEdges():
        directed_copy.addEdge(u, v)
    return directed_copy


def process_configuration_networkit(www_graph, kalicube_edges, kalicube_nodes):
    """Merge a site graph into the WWW graph and compute PageRank for the site.

    The merged graph contains the WWW nodes first (ids ``0 .. n_www - 1``)
    followed by the site ("Kalicube") nodes, whose ids are shifted by
    ``n_www``. A handful of random bridging edges, directed from a WWW node to
    a site node, connect the two parts. Bridges are drawn with
    ``np.random.choice``, i.e. from NumPy's global random state.

    Args:
        www_graph: NetworKit graph standing in for the WWW; only its node
            count and ``iterEdges()`` are used, and it is not modified.
        kalicube_edges: Iterable of ``(src, tgt)`` pairs using site-local node
            indices (positions in ``kalicube_nodes``).
        kalicube_nodes: Sequence of site URLs; position ``i`` is node ``i``.

    Returns:
        dict: URL -> PageRank score of that URL's node in the merged graph.
    """
    # Use the size of the graph actually passed in (not the TOTAL_NODES_WWW
    # global) so node ids stay valid for a WWW graph of any size.
    n_www = www_graph.numberOfNodes()
    kalicube_offset = n_www
    n_kalicube = len(kalicube_nodes)

    # Allocate WWW and site nodes in one go; ids are contiguous either way.
    merged_graph = nk.Graph(n=n_www + n_kalicube, weighted=False, directed=True)
    for u, v in www_graph.iterEdges():
        merged_graph.addEdge(u, v)

    # Site-internal links, shifted into the merged id space.
    if kalicube_edges:
        for src, tgt in kalicube_edges:
            merged_graph.addEdge(src + kalicube_offset, tgt + kalicube_offset)

    # Bridging connections: at most MIN_CONNECTIONS distinct WWW nodes, each
    # linking to a distinct site node (zip stops at the shorter sample).
    n_www_sample = min(MIN_CONNECTIONS, n_www)
    n_kalicube_sample = min(MIN_CONNECTIONS, n_kalicube)

    www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)
    kalicube_indices = np.random.choice(
        n_kalicube, size=n_kalicube_sample, replace=False
    )

    for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices):
        # Convert NumPy integers to plain ints before handing them to NetworKit.
        merged_graph.addEdge(int(www_node_id), int(kalicube_idx) + kalicube_offset)

    pagerank_algo = nk.centrality.PageRank(
        merged_graph, damp=0.85, tol=PAGERANK_TOLERANCE
    )
    pagerank_algo.run()
    pagerank_scores = pagerank_algo.scores()

    # Only the site nodes' scores are of interest to the caller.
    return {
        url: pagerank_scores[i + kalicube_offset]
        for i, url in enumerate(kalicube_nodes)
    }


def classify_delta(delta_pct):
    """Label a percentage delta relative to ``NEUTRAL_THRESHOLD``.

    Returns "positive" when the delta is strictly above the threshold,
    "negative" when it is strictly below the negated threshold, and
    "neutral" for everything else (including values exactly on a boundary).
    """
    if delta_pct > NEUTRAL_THRESHOLD:
        return "positive"
    if delta_pct < -NEUTRAL_THRESHOLD:
        return "negative"
    return "neutral"


def run_boosting_round(
    round_id, old_edges, new_edges, old_nodes, new_nodes, page_deltas_tracker
):
    """Run the ``BRIDGINGS_PER_ROUND`` simulations that make up one round.

    Each simulation seeds NumPy and ``random`` with ``42 + sim_id``, builds a
    WWW graph with that seed, computes PageRank for the old and the new site
    configuration, and records the percentage change for every URL present in
    both results.

    NOTE(review): the seed is set once per simulation and both configurations
    then draw their bridging edges one after the other from the same NumPy
    stream, so the two configurations do not get identical bridges — confirm
    this is intended.

    Args:
        round_id: Zero-based round number; determines the simulation ids.
        old_edges, new_edges: Site-local edge lists of the two configurations.
        old_nodes, new_nodes: URL sequences of the two configurations.
        page_deltas_tracker: Mapping URL -> list of deltas; appended to in
            place (must create missing lists, e.g. ``defaultdict(list)``).

    Returns:
        tuple: ``(deltas, page_deltas_tracker)`` where ``deltas`` is the flat
        list of percentage changes collected in this round.
    """
    round_deltas = []
    first_sim_id = round_id * BRIDGINGS_PER_ROUND

    for sim_id in range(first_sim_id, first_sim_id + BRIDGINGS_PER_ROUND):
        sim_seed = 42 + sim_id
        np.random.seed(sim_seed)
        random.seed(sim_seed)

        www_graph = create_www_graph_networkit(
            TOTAL_NODES_WWW, EDGES_PER_NEW_NODE, sim_seed
        )

        scores_before = process_configuration_networkit(
            www_graph, old_edges, old_nodes
        )
        scores_after = process_configuration_networkit(
            www_graph, new_edges, new_nodes
        )

        # Only URLs present in both configurations can be compared; an empty
        # intersection simply contributes nothing for this simulation.
        for url in set(scores_before.keys()) & set(scores_after.keys()):
            before = scores_before[url]
            change = scores_after[url] - before
            # The floor on the denominator guards against division by zero.
            change_pct = (change / max(before, 1e-10)) * 100
            round_deltas.append(change_pct)
            page_deltas_tracker[url].append(change_pct)

    return round_deltas, page_deltas_tracker


def run_boosted_comparison(baseline_data, comparison_file):
    """Compare one comparison CSV against the baseline over all boosting rounds.

    Args:
        baseline_data: ``(graph, nodes, url_to_idx)`` tuple of the baseline,
            as returned by ``load_graph_from_csv_networkit``.
        comparison_file: ``Path`` of the comparison edge-list CSV.

    Returns:
        dict | None: Final statistics (mean/max/min/std of all percentage
        deltas, per-page up/down/neutral counts based on each page's average
        delta, page total, duration in seconds, simulation count), or None
        when the file cannot be loaded or no delta was produced.
    """
    comparison_name = comparison_file.stem
    print(f"  Processing: {comparison_name}...", end=" ", flush=True)

    started_at = time.time()

    baseline_graph, baseline_nodes, _ = baseline_data
    candidate_graph, candidate_nodes, _ = load_graph_from_csv_networkit(
        comparison_file
    )
    if candidate_graph is None:
        print("✗ Failed")
        return None

    # The simulations work on plain edge lists plus the URL arrays.
    old_edges = [(u, v) for u, v in baseline_graph.iterEdges()]
    new_edges = [(u, v) for u, v in candidate_graph.iterEdges()]

    # The comparison graph object is no longer needed once its edges are extracted.
    del candidate_graph
    gc.collect()

    per_page_deltas = defaultdict(list)
    collected = []
    for round_id in range(NUM_BOOSTING_ROUNDS):
        round_deltas, per_page_deltas = run_boosting_round(
            round_id,
            old_edges,
            new_edges,
            baseline_nodes,
            candidate_nodes,
            per_page_deltas,
        )
        collected.extend(round_deltas)

    if not collected:
        print("✗ No results")
        return None

    # A page is classified by the mean of its deltas over all simulations.
    tally = {"positive": 0, "negative": 0, "neutral": 0}
    for page_deltas in per_page_deltas.values():
        tally[classify_delta(np.mean(page_deltas))] += 1

    total_pages = len(per_page_deltas)
    delta_array = np.array(collected)

    final_mean = np.mean(delta_array)
    final_max = np.max(delta_array)
    final_min = np.min(delta_array)
    final_std = np.std(delta_array)

    duration = time.time() - started_at
    print(f"✓ Mean: {final_mean:+.3f}% [{duration:.1f}s]")

    return {
        "name": comparison_name,
        "duration": duration,
        "mean": final_mean,
        "max": final_max,
        "min": final_min,
        "std": final_std,
        "pages_up": tally["positive"],
        "pages_down": tally["negative"],
        "pages_neutral": tally["neutral"],
        "total_pages": total_pages,
        "num_simulations": TOTAL_SIMULATIONS,
    }


def create_strategy_summary(all_results, output_folder, strategy_name):
    """Write and print the summary of one strategy.

    Falsy entries in ``all_results`` (failed comparisons) are ignored. The
    remaining rows are sorted by mean delta (best first), written to
    ``STRATEGY_SUMMARY_<strategy_name>.csv`` inside ``output_folder`` and
    printed as a ranking together with the column averages.

    Args:
        all_results: Per-comparison result dicts (or None for failures).
        output_folder: ``Path`` of the directory receiving the CSV.
        strategy_name: Name used in headings and in the file name.

    Returns:
        dict | None: The strategy averages plus the number of comparisons, or
        None when there is no valid result.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"STRATEGY SUMMARY: {strategy_name}")
    print(banner)

    rows = [
        {
            "Comparison": result["name"],
            "Mean_Delta_%": result["mean"],
            "Max_Delta_%": result["max"],
            "Min_Delta_%": result["min"],
            "Std_Delta_%": result["std"],
            "Pages_Up": result["pages_up"],
            "Pages_Down": result["pages_down"],
            "Pages_Neutral": result["pages_neutral"],
            "Total_Pages": result["total_pages"],
            "Duration_m": result["duration"] / 60,
        }
        for result in all_results
        if result
    ]

    if not rows:
        print("✗ No valid results")
        return None

    df = pd.DataFrame(rows).sort_values("Mean_Delta_%", ascending=False)

    # This CSV is the only file written per strategy.
    summary_path = output_folder / f"STRATEGY_SUMMARY_{strategy_name}.csv"
    df.to_csv(summary_path, index=False)

    print("\nRankings by Mean Delta %:")
    for _, row in df.iterrows():
        mean_delta = row["Mean_Delta_%"]
        if mean_delta > NEUTRAL_THRESHOLD:
            symbol = "↑"
        elif mean_delta < -NEUTRAL_THRESHOLD:
            symbol = "↓"
        else:
            symbol = "→"
        print(
            f"  {symbol} {row['Comparison']}: {mean_delta:+.3f}% "
            f"(↑{row['Pages_Up']:.0f} ↓{row['Pages_Down']:.0f} →{row['Pages_Neutral']:.0f})"
        )

    # Unweighted averages over the comparisons of this strategy.
    avg_mean = df["Mean_Delta_%"].mean()
    avg_max = df["Max_Delta_%"].mean()
    avg_min = df["Min_Delta_%"].mean()
    avg_up = df["Pages_Up"].mean()
    avg_down = df["Pages_Down"].mean()
    avg_neutral = df["Pages_Neutral"].mean()

    print("\nStrategy Averages:")
    print(f"  Avg Mean Delta:   {avg_mean:+.3f}%")
    print(f"  Avg Max Delta:    {avg_max:+.3f}%")
    print(f"  Avg Min Delta:    {avg_min:+.3f}%")
    print(f"  Avg Pages Up:     {avg_up:.1f}")
    print(f"  Avg Pages Down:   {avg_down:.1f}")
    print(f"  Avg Pages Neutral: {avg_neutral:.1f}")
    print(f"\n✓ Saved: {summary_path.name}")

    return {
        "strategy_name": strategy_name,
        "avg_mean": avg_mean,
        "avg_max": avg_max,
        "avg_min": avg_min,
        "avg_up": avg_up,
        "avg_down": avg_down,
        "avg_neutral": avg_neutral,
        "num_comparisons": len(rows),
    }


def update_overall_tracker(overall_tracker_path, strategy_result):
    """Append one strategy's averages to the overall tracker CSV and print them.

    The tracker is read if it already exists, one row for ``strategy_result``
    is appended, and the whole table is written back. Existing rows are kept
    as they are, so running the same strategy again adds another row.

    Args:
        overall_tracker_path: ``Path`` of the tracker CSV.
        strategy_result: Dict returned by ``create_strategy_summary``.
    """
    if overall_tracker_path.exists():
        rows = pd.read_csv(overall_tracker_path).to_dict("records")
    else:
        rows = []

    name = strategy_result["strategy_name"]
    num_comparisons = strategy_result["num_comparisons"]

    # The values are already averages over all comparisons of the strategy.
    rows.append(
        {
            "Strategy": name,
            "Overall_Avg_Mean_%": strategy_result["avg_mean"],
            "Overall_Avg_Max_%": strategy_result["avg_max"],
            "Overall_Avg_Min_%": strategy_result["avg_min"],
            "Overall_Avg_Pages_Up": strategy_result["avg_up"],
            "Overall_Avg_Pages_Down": strategy_result["avg_down"],
            "Overall_Avg_Pages_Neutral": strategy_result["avg_neutral"],
            "Num_Comparisons": num_comparisons,
        }
    )

    pd.DataFrame(rows).to_csv(overall_tracker_path, index=False)

    banner = "=" * 70
    print(f"\n{banner}")
    print(f"OVERALL AVERAGES FOR STRATEGY: {name}")
    print(banner)
    print(f"Comparison Files:        {num_comparisons}")
    print(f"Total Simulations:       {num_comparisons * TOTAL_SIMULATIONS:,}")
    print("\nOverall Averages (across all simulations in this strategy):")
    print(f"  Overall Avg Mean:        {strategy_result['avg_mean']:+.3f}%")
    print(f"  Overall Avg Max:         {strategy_result['avg_max']:+.3f}%")
    print(f"  Overall Avg Min:         {strategy_result['avg_min']:+.3f}%")
    print(f"  Overall Avg Pages Up:    {strategy_result['avg_up']:.1f}")
    print(f"  Overall Avg Pages Down:  {strategy_result['avg_down']:.1f}")
    print(f"  Overall Avg Pages Neutral: {strategy_result['avg_neutral']:.1f}")
    print(f"\n✓ Saved to: {overall_tracker_path.name}")
    print(banner)


def process_strategy_folder(
    baseline_data, comparison_folder, main_output_folder, overall_tracker_path
):
    """Run every comparison CSV of one strategy folder and summarise the result.

    Args:
        baseline_data: Baseline tuple from ``load_graph_from_csv_networkit``.
        comparison_folder: Folder (str or Path) holding the comparison CSVs;
            its last path component is used as the strategy name.
        main_output_folder: ``Path`` receiving the strategy summary CSV.
        overall_tracker_path: ``Path`` of the cumulative tracker CSV.

    Returns:
        dict | None: The strategy summary, or None when the folder has no CSV
        files or no comparison produced a valid result.
    """
    global _www_graph_cache

    comparison_folder = Path(comparison_folder)
    strategy_name = comparison_folder.name

    divider = "#" * 70
    print(f"\n\n{divider}")
    print(f"STRATEGY: {strategy_name}")
    print(divider)

    comparison_files = list(comparison_folder.glob("*.csv"))
    if not comparison_files:
        print(f"✗ No CSV files in: {comparison_folder}")
        return None

    print(f"✓ Found {len(comparison_files)} comparison files")

    # Failed comparisons yield None; the summary step filters them out.
    all_results = []
    for comp_file in comparison_files:
        all_results.append(run_boosted_comparison(baseline_data, comp_file))

        # Drop the cached WWW graph between files to release memory.
        _www_graph_cache = None
        gc.collect()

    strategy_result = create_strategy_summary(
        all_results, main_output_folder, strategy_name
    )
    if strategy_result:
        update_overall_tracker(overall_tracker_path, strategy_result)

    return strategy_result


# Script entry point: load the baseline once, then process every strategy
# folder in COMPARISON_FOLDERS and write the summaries next to them.
if __name__ == "__main__":
    print("=" * 70)
    print("NetworKit PageRank Simulation - MULTI-STRATEGY (FINAL RESULTS ONLY)")
    print("=" * 70)
    print(f"NetworKit version: {nk.__version__}")
    print(f"Boosting Rounds: {NUM_BOOSTING_ROUNDS}")
    print(f"Bridgings per Round: {BRIDGINGS_PER_ROUND}")
    print(f"Total Simulations per Comparison: {TOTAL_SIMULATIONS}")
    print(f"Neutral Threshold: ±{NEUTRAL_THRESHOLD}%")
    print(f"Strategies to Process: {len(COMPARISON_FOLDERS)}")
    print("\n⚠ NO INTERMEDIATE FILES - Only final strategy summaries will be saved")

    mount_google_drive()

    # Verify baseline.
    # Abort by raising SystemExit rather than calling `exit(1)`: `exit` is the
    # interactive helper injected by `site` (and replaced by notebook shells),
    # so it is not a dependable way to stop before the code below runs on a
    # missing baseline.
    baseline_path = Path(BASELINE_PATH)
    if not baseline_path.exists():
        print(f"\n✗ Baseline not found: {BASELINE_PATH}")
        raise SystemExit(1)

    print(f"\n✓ Baseline: {baseline_path.name}")

    # Load baseline once; it is shared by every strategy and comparison.
    print("\nLoading baseline graph...")
    baseline_data = load_graph_from_csv_networkit(baseline_path)
    if baseline_data[0] is None:
        print("✗ Failed to load baseline")
        raise SystemExit(1)

    g = baseline_data[0]
    print(f"✓ Loaded: {g.numberOfNodes():,} nodes, {g.numberOfEdges():,} edges")

    # Create main output folder as a sibling of the strategy folders.
    main_output_folder = Path(COMPARISON_FOLDERS[0]).parent / "FINAL_STRATEGY_RESULTS"
    main_output_folder.mkdir(exist_ok=True, parents=True)
    print(f"✓ Output folder: {main_output_folder}")

    # Initialize overall tracker path
    overall_tracker_path = main_output_folder / "OVERALL_AVERAGES_TRACKER.csv"

    # Process each strategy folder; strategies without a result are skipped
    # in the final count.
    all_strategy_results = []
    for i, folder in enumerate(COMPARISON_FOLDERS, 1):
        print(f"\n\n{'█' * 70}")
        print(f"PROCESSING STRATEGY {i}/{len(COMPARISON_FOLDERS)}")
        print(f"{'█' * 70}")

        strategy_result = process_strategy_folder(
            baseline_data, folder, main_output_folder, overall_tracker_path
        )

        if strategy_result:
            all_strategy_results.append(strategy_result)

    # Final summary
    print(f"\n\n{'=' * 70}")
    print("✓ ALL STRATEGIES PROCESSED")
    print(f"{'=' * 70}")
    print(
        f"Strategies Completed: {len(all_strategy_results)}/{len(COMPARISON_FOLDERS)}"
    )
    print("\nFinal output files:")
    print(f"  • 1 summary CSV per strategy ({len(all_strategy_results)} files)")
    print("  • 1 overall tracker CSV (cumulative)")
    print(f"\nSaved to: {main_output_folder}")
    print(f"{'=' * 70}")