In [None]:
# NetworKit-based PageRank Simulation - Clean Version
# Calculates Mean Delta %, Max Delta %, Min Delta % only

# === INSTALLATION ===
!pip install networkit pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import networkit as nk
import time
import gc
from pathlib import Path

# === CONFIGURATION ===
# CSV edge list (FROM/TO columns) of the baseline link graph.
BASELINE_PATH = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"
# Folder scanned for the *.csv edge lists compared against the baseline; results
# are written to a "simulation_results" subfolder inside it.
COMPARISON_FOLDER = "/content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/"

# Simulations run per comparison file (simulation i is seeded with 42 + i).
NUM_SIMULATIONS = 100
# Upper bound on the number of random background -> site links added per
# configuration in process_configuration_networkit.
MIN_CONNECTIONS = 5
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed.
MAX_CONNECTIONS = 50
# Node count requested from the Barabasi-Albert generator (its nMax argument).
TOTAL_NODES_WWW = 100000
# Passed to the Barabasi-Albert generator as both k and n0.
EDGES_PER_NEW_NODE = 2
# Tolerance handed to nk.centrality.PageRank (its tol argument).
PAGERANK_TOLERANCE = 1e-6

# Single-entry cache written by create_www_graph_networkit: either None or a
# (cache_key, graph) tuple where cache_key is (n_nodes, m_edges, seed).
_www_graph_cache = None


def mount_google_drive():
    """Mount Google Drive at ``/content/drive`` when running inside Colab.

    This is best-effort: any ordinary failure is reported and the function
    returns False so the caller can continue without the drive.

    Returns:
        bool: True when the drive was mounted, False when ``google.colab`` is
        not importable or the mount attempt raised an error.
    """
    try:
        from google.colab import drive

        drive.mount("/content/drive")
    except ImportError:
        # google.colab is missing, so we are not running in Colab.
        print("Not in Colab - skipping drive mount")
        return False
    except Exception as e:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate, and report the real
        # cause instead of claiming we are outside Colab.
        print(f"Drive mount failed - skipping drive mount: {e}")
        return False

    print("Google Drive mounted successfully!")
    return True


def load_graph_from_csv_networkit(file_path):
    """Build a directed NetworKit graph from a CSV edge list.

    Only the ``FROM`` and ``TO`` columns are read. Rows with a missing value in
    either column are dropped and the remaining values are converted to strings.
    One edge is added per remaining row, so repeated rows add repeated edges.

    Args:
        file_path: Location of the CSV file.

    Returns:
        tuple: ``(graph, all_urls, url_to_idx)`` where ``all_urls`` is the array
        of unique URLs (node ``i`` corresponds to ``all_urls[i]``) and
        ``url_to_idx`` maps each URL to its node id. ``(None, None, None)`` is
        returned when the file cannot be loaded or contains no usable rows.
    """
    try:
        edge_table = pd.read_csv(file_path, usecols=["FROM", "TO"]).dropna()
        for column in ("FROM", "TO"):
            edge_table[column] = edge_table[column].astype(str)
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None, None, None

    sources = edge_table["FROM"].values
    targets = edge_table["TO"].values
    if len(sources) == 0:
        return None, None, None

    # Node ids are positions in the de-duplicated URL array.
    all_urls = np.unique(np.concatenate([sources, targets]))
    url_to_idx = dict(zip(all_urls, range(len(all_urls))))

    graph = nk.Graph(n=len(all_urls), weighted=False, directed=True)
    node_id = url_to_idx.__getitem__
    for src_id, tgt_id in zip(map(node_id, sources), map(node_id, targets)):
        graph.addEdge(src_id, tgt_id)

    return graph, all_urls, url_to_idx


def _copy_as_directed(graph):
    """Return a new directed, unweighted graph mirroring ``graph``.

    The copy has the same number of nodes and one arc ``(u, v)`` for every pair
    yielded by ``graph.iterEdges()``.
    """
    directed = nk.Graph(n=graph.numberOfNodes(), weighted=False, directed=True)
    for u, v in graph.iterEdges():
        directed.addEdge(u, v)
    return directed


def create_www_graph_networkit(n_nodes, m_edges, seed=42):
    """Create (or fetch from cache) the Barabasi-Albert background graph.

    The module-level ``_www_graph_cache`` holds the most recent graph keyed by
    ``(n_nodes, m_edges, seed)``. A fresh directed copy is returned on every
    call, so callers never share a graph object with the cache and cache hits
    and misses yield the same kind of graph.

    Args:
        n_nodes: Passed to the generator as ``nMax``.
        m_edges: Passed to the generator as both ``k`` and ``n0``.
        seed: Seed handed to ``nk.setSeed`` before generating.

    Returns:
        A directed, unweighted ``nk.Graph``.
    """
    global _www_graph_cache

    cache_key = (n_nodes, m_edges, seed)
    if _www_graph_cache is not None and _www_graph_cache[0] == cache_key:
        return _copy_as_directed(_www_graph_cache[1])

    nk.setSeed(seed, False)
    generator = nk.generators.BarabasiAlbertGenerator(
        k=m_edges, nMax=n_nodes, n0=m_edges
    )
    # Convert the generator output once so the cached graph and the returned
    # graph are both directed, exactly like the graph handed out on a cache hit.
    directed_graph = _copy_as_directed(generator.generate())
    _www_graph_cache = (cache_key, directed_graph)
    return _copy_as_directed(directed_graph)


def process_configuration_networkit(www_graph, kalicube_edges, kalicube_nodes):
    """Attach a site graph to the background graph and compute PageRank.

    A new directed graph is built from the edges of ``www_graph``. The site
    nodes are appended after the background nodes, so site node ``i`` gets id
    ``i + www_graph.numberOfNodes()``. Up to ``MIN_CONNECTIONS`` random links
    from background nodes to site nodes are then added. The samples come from
    NumPy's global random state, so the result depends on that state at call
    time.

    Args:
        www_graph: Background graph; only ``numberOfNodes()`` and
            ``iterEdges()`` are used.
        kalicube_edges: Iterable of ``(src, tgt)`` pairs indexing into
            ``kalicube_nodes``; may be empty or None.
        kalicube_nodes: Sequence of site node labels (URLs).

    Returns:
        dict: Maps each entry of ``kalicube_nodes`` to its PageRank score in
        the merged graph.
    """
    n_www = www_graph.numberOfNodes()
    kalicube_offset = n_www
    n_kalicube = len(kalicube_nodes)

    merged_graph = nk.Graph(n=n_www, weighted=False, directed=True)
    for u, v in www_graph.iterEdges():
        merged_graph.addEdge(u, v)

    for _ in range(n_kalicube):
        merged_graph.addNode()

    for src, tgt in kalicube_edges or ():
        merged_graph.addEdge(src + kalicube_offset, tgt + kalicube_offset)

    # Sample from the graph that was actually passed in rather than from the
    # global TOTAL_NODES_WWW, so ids are always valid background nodes even if
    # the graph has a different size. When the sizes match, the draws are
    # identical to sampling from the constant.
    n_www_sample = min(MIN_CONNECTIONS, n_www)
    n_kalicube_sample = min(MIN_CONNECTIONS, n_kalicube)

    # Keep this draw order (background first, then site) so seeded runs stay
    # reproducible.
    www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)
    kalicube_indices = np.random.choice(
        n_kalicube, size=n_kalicube_sample, replace=False
    )

    for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices):
        merged_graph.addEdge(www_node_id, kalicube_idx + kalicube_offset)

    pagerank_algo = nk.centrality.PageRank(
        merged_graph, damp=0.85, tol=PAGERANK_TOLERANCE
    )
    pagerank_algo.run()
    pagerank_scores = pagerank_algo.scores()

    return {
        url: pagerank_scores[i + kalicube_offset]
        for i, url in enumerate(kalicube_nodes)
    }


def run_single_simulation(sim_id, old_edges, new_edges, old_nodes, new_nodes):
    """Run one seeded simulation and summarise the PageRank change per URL.

    Both RNGs (NumPy and ``random``) are seeded with ``42 + sim_id``. The old
    and the new configuration are each merged with the same background graph.
    Percentage changes are computed over the URLs present in both results.

    Args:
        sim_id: Zero-based simulation index.
        old_edges: Edge list of the baseline configuration.
        new_edges: Edge list of the comparison configuration.
        old_nodes: Node labels of the baseline configuration.
        new_nodes: Node labels of the comparison configuration.

    Returns:
        dict | None: ``Simulation`` (one-based), ``Mean_Delta_%``,
        ``Max_Delta_%`` and ``Min_Delta_%``; None when the two configurations
        share no URL.
    """
    sim_seed = sim_id + 42
    np.random.seed(sim_seed)
    random.seed(sim_seed)

    www_graph = create_www_graph_networkit(
        TOTAL_NODES_WWW, EDGES_PER_NEW_NODE, sim_seed
    )

    scores_before = process_configuration_networkit(www_graph, old_edges, old_nodes)
    scores_after = process_configuration_networkit(www_graph, new_edges, new_nodes)

    shared_urls = set(scores_before.keys()) & set(scores_after.keys())
    if len(shared_urls) == 0:
        return None

    def percent_change(url):
        before = scores_before[url]
        # The floor on the denominator avoids dividing by zero.
        return ((scores_after[url] - before) / max(before, 1e-10)) * 100

    delta_pcts = np.array([percent_change(url) for url in shared_urls])

    summary = {"Simulation": sim_id + 1}
    summary["Mean_Delta_%"] = np.mean(delta_pcts)
    summary["Max_Delta_%"] = np.max(delta_pcts)
    summary["Min_Delta_%"] = np.min(delta_pcts)
    return summary


def run_comparison(baseline_data, comparison_file, output_folder):
    """Simulate one comparison edge list against the baseline.

    Loads the comparison CSV, runs ``NUM_SIMULATIONS`` simulations, writes the
    per-simulation table to ``<name>_summary.csv`` in ``output_folder`` and
    prints averaged statistics plus ordering checks.

    Args:
        baseline_data: ``(graph, nodes, url_to_idx)`` tuple of the baseline.
        comparison_file: ``Path`` of the comparison CSV.
        output_folder: ``Path`` of the folder receiving the summary CSV.

    Returns:
        dict | None: ``name``, ``duration``, ``mean``, ``max``, ``min`` and
        ``valid``; None when the file cannot be loaded or no simulation
        produced a result.
    """
    comparison_name = comparison_file.stem
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"Starting: {comparison_name}")
    print(f"{rule}")

    start_time = time.time()

    g_old, nodes_old, _ = baseline_data
    g_new, nodes_new, _ = load_graph_from_csv_networkit(comparison_file)
    if g_new is None:
        print(f"Failed to load {comparison_file.name}")
        return None

    print(f"Baseline: {g_old.numberOfNodes():,} nodes, {g_old.numberOfEdges():,} edges")
    print(
        f"Comparison: {g_new.numberOfNodes():,} nodes, {g_new.numberOfEdges():,} edges"
    )

    # The simulations work on plain edge lists, so the comparison graph object
    # can be released once its edges have been extracted.
    old_edges = list(g_old.iterEdges())
    new_edges = list(g_new.iterEdges())
    del g_new
    gc.collect()

    print(f"\nRunning {NUM_SIMULATIONS} simulations...")

    results = []
    for completed in range(1, NUM_SIMULATIONS + 1):
        outcome = run_single_simulation(
            completed - 1, old_edges, new_edges, nodes_old, nodes_new
        )
        if outcome:
            results.append(outcome)
        if completed % 10 == 0:
            print(f"  Completed {completed}/{NUM_SIMULATIONS}")

    if len(results) == 0:
        print("No valid results")
        return None

    results_df = pd.DataFrame(results)

    safe_name = comparison_name.replace(" ", "_").replace("/", "-")
    results_df.to_csv(output_folder / f"{safe_name}_summary.csv", index=False)

    # Each statistic is the average of the per-simulation values.
    mean_val = results_df["Mean_Delta_%"].mean()
    max_val = results_df["Max_Delta_%"].mean()
    min_val = results_df["Min_Delta_%"].mean()

    print(f"\nResults:")
    print(f"  Mean Delta: {mean_val:.2f}%")
    print(f"  Max Delta: {max_val:.2f}%")
    print(f"  Min Delta: {min_val:.2f}%")

    print(f"\nValidation:")
    max_ge_mean = max_val >= mean_val
    mean_ge_min = mean_val >= min_val
    max_ge_min = max_val >= min_val
    mean_in_range = min_val <= mean_val <= max_val
    checks = (
        ("Max >= Mean", max_ge_mean),
        ("Mean >= Min", mean_ge_min),
        ("Max >= Min", max_ge_min),
        ("Min <= Mean <= Max", mean_in_range),
    )
    for label, passed in checks:
        print(f"  {label}: {'PASS' if passed else 'FAIL'}")

    all_pass = max_ge_mean and mean_ge_min and max_ge_min and mean_in_range
    print(f"  Overall: {'ALL PASSED' if all_pass else 'SOME FAILED'}")

    duration = time.time() - start_time
    print(f"\nCompleted in {duration:.1f}s")

    return {
        "name": comparison_name,
        "duration": duration,
        "mean": mean_val,
        "max": max_val,
        "min": min_val,
        "valid": all_pass,
    }


def create_master_summary(all_results, output_folder):
    """Rank all comparisons by mean delta and save ``MASTER_SUMMARY.csv``.

    Falsy entries of ``all_results`` (e.g. None for failed comparisons) are
    skipped. When nothing remains, a message is printed and no file is written.

    Args:
        all_results: Result dicts as returned by ``run_comparison``.
        output_folder: ``Path`` of the folder receiving the CSV.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("MASTER SUMMARY")
    print(f"{rule}")

    rows = [
        {
            "Comparison": entry["name"],
            "Mean_Delta_%": entry["mean"],
            "Max_Delta_%": entry["max"],
            "Min_Delta_%": entry["min"],
            "Valid": entry["valid"],
        }
        for entry in all_results
        if entry
    ]
    if len(rows) == 0:
        print("No valid results")
        return

    ranking = pd.DataFrame(rows).sort_values("Mean_Delta_%", ascending=False)
    ranking.to_csv(output_folder / "MASTER_SUMMARY.csv", index=False)

    print("\nRankings:")
    columns = (ranking["Comparison"], ranking["Mean_Delta_%"], ranking["Valid"])
    for name, mean_delta, is_valid in zip(*columns):
        if mean_delta > 0:
            symbol = "+"
        else:
            symbol = "-"
        if is_valid:
            valid = "[OK]"
        else:
            valid = "[!]"
        print(f"  {valid} {symbol} {name}: {mean_delta:.2f}%")

    valid_count = ranking["Valid"].sum()
    print(f"\nValid: {valid_count}/{len(ranking)}")


def calculate_final_averages(all_results, output_folder):
    """Average the per-comparison statistics into one strategy-level summary.

    None entries are ignored. For the remaining results the mean, max and min
    deltas are averaged, their standard deviations computed (``np.std``), and
    an overall figure is taken as the mean of those three averages. The table
    is printed and saved as ``FINAL_STRATEGY_AVERAGES.csv``.

    Args:
        all_results: Result dicts (or None) as returned by ``run_comparison``.
        output_folder: ``Path`` of the folder receiving the CSV.

    Returns:
        dict | None: The averages, standard deviations, number of comparisons
        and the ordering-check flag; None when every entry is None.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("FINAL STRATEGY AVERAGES")
    print(f"{rule}")

    valid_results = list(filter(lambda entry: entry is not None, all_results))
    if len(valid_results) == 0:
        print("No valid results to calculate averages")
        return None

    metric_keys = ("mean", "max", "min")
    series = {key: [entry[key] for entry in valid_results] for key in metric_keys}

    final_mean_avg, final_max_avg, final_min_avg = (
        np.mean(series[key]) for key in metric_keys
    )
    # Grand mean of the three averages above.
    overall_avg = np.mean([final_mean_avg, final_max_avg, final_min_avg])
    mean_std, max_std, min_std = (np.std(series[key]) for key in metric_keys)

    n_comparisons = len(valid_results)
    print(f"\nComparison Files Processed: {n_comparisons}")
    print(f"Total Simulations: {n_comparisons * NUM_SIMULATIONS}")
    print(f"\nFinal Averages:")
    print(f"  Average Mean Delta: {final_mean_avg:.2f}% (±{mean_std:.2f})")
    print(f"  Average Max Delta:  {final_max_avg:.2f}% (±{max_std:.2f})")
    print(f"  Average Min Delta:  {final_min_avg:.2f}% (±{min_std:.2f})")
    print(f"\nOverall Strategy Average: {overall_avg:.2f}%")

    print(f"\nValidation:")
    max_ge_mean = final_max_avg >= final_mean_avg
    mean_ge_min = final_mean_avg >= final_min_avg
    max_ge_min = final_max_avg >= final_min_avg
    checks = (
        ("Max >= Mean", max_ge_mean),
        ("Mean >= Min", mean_ge_min),
        ("Max >= Min", max_ge_min),
    )
    for label, passed in checks:
        print(f"  {label}: {'PASS' if passed else 'FAIL'}")

    all_pass = max_ge_mean and mean_ge_min and max_ge_min
    print(f"  Overall: {'ALL PASSED' if all_pass else 'SOME FAILED'}")

    final_df = pd.DataFrame(
        {
            "Metric": [
                "Average_Mean_Delta_%",
                "Average_Max_Delta_%",
                "Average_Min_Delta_%",
                "Overall_Strategy_Average_%",
            ],
            "Value": [final_mean_avg, final_max_avg, final_min_avg, overall_avg],
            # The overall average has no standard deviation of its own.
            "Std_Dev": [mean_std, max_std, min_std, np.nan],
        }
    )
    final_path = output_folder / "FINAL_STRATEGY_AVERAGES.csv"
    final_df.to_csv(final_path, index=False)

    print(f"\nSaved to: {final_path.name}")

    return {
        "final_mean_avg": final_mean_avg,
        "final_max_avg": final_max_avg,
        "final_min_avg": final_min_avg,
        "overall_avg": overall_avg,
        "mean_std": mean_std,
        "max_std": max_std,
        "min_std": min_std,
        "num_comparisons": n_comparisons,
        "valid": all_pass,
    }


if __name__ == "__main__":
    # Script entry point: load the baseline once, simulate every comparison
    # CSV against it, then write the master summary and the final averages.
    print("NetworKit PageRank Simulation")
    print(f"NetworKit version: {nk.__version__}")

    mount_google_drive()

    # Verify baseline
    baseline_path = Path(BASELINE_PATH)
    if not baseline_path.exists():
        print(f"Baseline not found: {BASELINE_PATH}")
        # `exit` is an interactive helper injected by the `site` module; raise
        # SystemExit directly so the script reliably stops here.
        raise SystemExit(1)

    print(f"\nBaseline: {baseline_path.name}")

    # Find comparison files. glob() order is unspecified, so sort to process
    # the files in the same order on every run.
    comparison_folder = Path(COMPARISON_FOLDER)
    comparison_files = sorted(comparison_folder.glob("*.csv"))

    if not comparison_files:
        print(f"No CSV files in: {COMPARISON_FOLDER}")
        raise SystemExit(1)

    print(f"Found {len(comparison_files)} comparison files")

    # Create output folder
    output_folder = comparison_folder / "simulation_results"
    output_folder.mkdir(exist_ok=True)

    # Load baseline
    print("\nLoading baseline...")
    baseline_data = load_graph_from_csv_networkit(baseline_path)
    if baseline_data[0] is None:
        print("Failed to load baseline")
        raise SystemExit(1)

    g, nodes, _ = baseline_data
    print(f"Loaded: {g.numberOfNodes():,} nodes, {g.numberOfEdges():,} edges")

    # Run comparisons
    all_results = []
    for i, comp_file in enumerate(comparison_files, 1):
        print(f"\n\n{'#' * 70}")
        print(f"Comparison {i}/{len(comparison_files)}")
        print(f"{'#' * 70}")

        result = run_comparison(baseline_data, comp_file, output_folder)
        all_results.append(result)

        # Drop the cached background graph between comparisons to free memory.
        _www_graph_cache = None
        gc.collect()

    # Create master summary
    create_master_summary(all_results, output_folder)

    # Calculate final averages
    final_results = calculate_final_averages(all_results, output_folder)

    print(f"\n{'=' * 70}")
    print("COMPLETE")
    print(f"Results: {output_folder}")
    print(f"{'=' * 70}")