# In [None]:
# NetworKit-based PageRank Simulation - Real Web Network Version
# This uses your 500K real web page network instead of synthetic data

# Installation (run first)
# !pip install networkit pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import networkit as nk
import time
import gc
import os

# --- User Input for Graph Files ---
# Edge-list CSVs for the two Kalicube link graphs that are compared ("old"
# versus "new"). The loaders in this file read only the "FROM" and "TO"
# columns of each CSV.
old_graph_filename = "link_graph_edges.csv"
new_graph_filename = "240_best_updated_link_graph_1.csv"

# REAL WEB NETWORK FILE - Your downloaded FineWeb network
# Also read as a "FROM"/"TO" edge list; it provides the background WWW graph.
real_web_network_file = "fineweb_500k_pages.csv"

# Number of simulations to run
# Simulation i seeds the NumPy and stdlib RNGs with 42 + i.
NUM_SIMULATIONS = 100

# Connection range for WWW-Kalicube interconnections
# NOTE(review): only MIN_CONNECTIONS is read by the visible code (it caps the
# number of sampled WWW -> Kalicube links); MAX_CONNECTIONS is not referenced
# anywhere in this file - confirm whether a random count in the range was
# intended.
MIN_CONNECTIONS = 5
MAX_CONNECTIONS = 50

# Simulation parameters
# TOTAL_NODES_WWW is overwritten by create_www_graph_networkit_real() with
# the node count of the graph it produced.
TOTAL_NODES_WWW = None  # Will be set from real network
# EDGES_PER_NEW_NODE is not referenced by the visible code.
EDGES_PER_NEW_NODE = 2  # Not used with real network
# Passed as `tol` to nk.centrality.PageRank.
PAGERANK_TOLERANCE = 1e-6

# Global WWW graph cache
# None until create_www_graph_networkit_real() populates it with a
# (cache_key, graph) tuple; reset to None at the end of the script.
_www_graph_cache = None


def load_real_web_network(network_file, sample_size=100000):
    """
    Build a directed NetworKit graph from a CSV of real page-to-page links.

    The CSV must provide "FROM" and "TO" columns. Rows with missing values
    are dropped and both columns are converted to strings. If the file holds
    more than `sample_size` links, a fixed-seed (42) random subset of
    `sample_size` rows is used instead; a falsy `sample_size` disables
    sampling.

    Args:
        network_file: Path of the edge-list CSV.
        sample_size: Maximum number of links to keep.

    Returns:
        A `(graph, all_pages, page_to_idx)` tuple, where `all_pages` is the
        array of unique page identifiers and `page_to_idx` maps each
        identifier to its node id. `(None, None, None)` is returned when the
        file does not exist or any step raises.
    """
    failure = (None, None, None)
    print(f"Loading REAL web network from: {network_file}")

    try:
        # Missing input: report it, list the CSVs that are present, give up.
        if not os.path.exists(network_file):
            print(f"Error: Real web network file not found: {network_file}")
            print("Available CSV files:")
            for name in filter(lambda entry: entry.endswith(".csv"), os.listdir(".")):
                print(f"   - {name}")
            return failure

        # Read the link table and normalise both endpoints to strings.
        print("Loading page-to-page links...")
        links = pd.read_csv(network_file, usecols=["FROM", "TO"]).dropna()
        for column in ("FROM", "TO"):
            links[column] = links[column].astype(str)

        print(f"Loaded {len(links):,} real web page links")

        # Keep memory bounded by working on a reproducible subset.
        if sample_size and len(links) > sample_size:
            links = links.sample(n=sample_size, random_state=42)
            print(f"Sampled down to {sample_size:,} links for simulation")

        sources = links["FROM"].values
        targets = links["TO"].values

        # Node ids are positions in the array of unique page identifiers.
        all_pages = np.unique(np.concatenate([sources, targets]))
        page_to_idx = dict(zip(all_pages, range(len(all_pages))))

        n_nodes = len(all_pages)
        g = nk.Graph(n=n_nodes, weighted=False, directed=True)

        # One directed edge per CSV row (duplicate rows are not collapsed).
        source_ids = [page_to_idx[page] for page in sources]
        target_ids = [page_to_idx[page] for page in targets]
        for u, v in zip(source_ids, target_ids):
            g.addEdge(u, v)

        print(f"Created REAL web network:")
        print(f"   Nodes (web pages): {n_nodes:,}")
        print(f"   Edges (hyperlinks): {g.numberOfEdges():,}")
        print(f"   Network type: REAL web pages")

        return g, all_pages, page_to_idx

    except Exception as e:
        # Best-effort loader: callers treat (None, None, None) as "fall back".
        print(f"Error loading real web network: {e}")
        return failure


def create_www_graph_networkit_real(n_nodes=None, m_edges=None, seed=42):
    """
    Return the background WWW graph, preferring the real web network.

    The real network is loaded from `real_web_network_file` on the first
    call and a copy is cached at module level; later calls return a fresh
    copy of that cached graph. A failed load is remembered too, so the CSV
    is not re-read (and its errors not re-printed) on every simulation; in
    that case a synthetic Barabasi-Albert graph is generated on each call
    using `seed`. Reset `_www_graph_cache` to None to force a new load
    attempt.

    Args:
        n_nodes: Node count of the synthetic fallback graph (default 100000).
            Not used when the real network is available.
        m_edges: Attachments per new node in the synthetic fallback graph
            (default 2). Not used when the real network is available.
        seed: NetworKit seed for the synthetic fallback only; the real
            network does not depend on it.

    Returns:
        A NetworKit graph that the caller may use freely without affecting
        the cached copy.

    Side effects:
        Updates the globals `TOTAL_NODES_WWW` (node count of the returned
        graph) and `_www_graph_cache`.
    """
    global _www_graph_cache
    global TOTAL_NODES_WWW

    # Cache keys are independent of `seed`: the real network is the same
    # for every simulation.
    real_key = "real_web_network"
    fallback_key = "synthetic_fallback"

    cached = _www_graph_cache
    if cached is not None and cached[0] == real_key:
        # Silent cache hit: hand out a copy so the cached graph stays intact.
        source_graph = cached[1]
        graph_copy = nk.Graph(
            n=source_graph.numberOfNodes(), weighted=False, directed=True
        )
        for u, v in source_graph.iterEdges():
            graph_copy.addEdge(u, v)
        return graph_copy

    # A previous call already found the real network unusable.
    load_known_to_fail = cached is not None and cached[0] == fallback_key

    real_graph = None
    if not load_known_to_fail:
        print("Loading real web network (ONCE)...")
        real_graph, _pages, _page_mapping = load_real_web_network(
            real_web_network_file, sample_size=100000
        )

    if real_graph is None:
        if not load_known_to_fail:
            print("Falling back to synthetic network...")
            # Remember the failure so later simulations skip the reload.
            _www_graph_cache = (fallback_key, None)
        nk.setSeed(seed, False)
        generator = nk.generators.BarabasiAlbertGenerator(
            k=m_edges if m_edges else 2,
            nMax=n_nodes if n_nodes else 100000,
            n0=2,
        )
        synthetic_graph = generator.generate()
        TOTAL_NODES_WWW = synthetic_graph.numberOfNodes()
        return synthetic_graph

    # Update global parameter
    TOTAL_NODES_WWW = real_graph.numberOfNodes()
    print(f"Set TOTAL_NODES_WWW to {TOTAL_NODES_WWW:,} nodes")

    # Cache a private copy; the freshly loaded graph itself goes to the caller.
    cached_graph = nk.Graph(
        n=real_graph.numberOfNodes(), weighted=False, directed=True
    )
    for u, v in real_graph.iterEdges():
        cached_graph.addEdge(u, v)
    _www_graph_cache = (real_key, cached_graph)
    print("Network cached for future simulations")

    return real_graph


def load_graph_from_csv_networkit(file_name):
    """
    Load an edge-list CSV into a directed NetworKit graph.

    Only the "FROM" and "TO" columns are read; rows with missing values are
    dropped and both columns are converted to strings. Every remaining row
    becomes one directed edge.

    Args:
        file_name: Path of the CSV file.

    Returns:
        A `(graph, all_urls, url_to_idx)` tuple, where `all_urls` is the
        array of unique URLs and `url_to_idx` maps each URL to its node id.
        `(None, None, None)` is returned (after printing the reason) when
        the file cannot be read or contains no edges.
    """
    failed = (None, None, None)

    try:
        # Restrict parsing to the two columns that define an edge.
        edge_table = pd.read_csv(file_name, usecols=["FROM", "TO"]).dropna()
        for column in ("FROM", "TO"):
            edge_table[column] = edge_table[column].astype(str)

    except FileNotFoundError:
        print(f"Error: {file_name} not found. Please ensure the file exists.")
        return failed
    except KeyError as e:
        print(f"Error: Required column {e} not found in {file_name}")
        return failed
    except Exception as e:
        print(f"Error loading {file_name}: {str(e)}")
        return failed

    sources = edge_table["FROM"].values
    targets = edge_table["TO"].values

    if not len(sources) or not len(targets):
        print(f"Error: No valid edges found in {file_name}")
        return failed

    # Node ids are positions in the array of unique URLs.
    all_urls = np.unique(np.concatenate([sources, targets]))
    url_to_idx = dict(zip(all_urls, range(len(all_urls))))

    g = nk.Graph(n=len(all_urls), weighted=False, directed=True)

    # One directed edge per CSV row.
    source_ids = [url_to_idx[url] for url in sources]
    target_ids = [url_to_idx[url] for url in targets]
    for u, v in zip(source_ids, target_ids):
        g.addEdge(u, v)

    return g, all_urls, url_to_idx


def process_configuration_networkit(
    www_graph, kalicube_edges, kalicube_nodes, kalicube_url_mapping
):
    """
    Embed one Kalicube graph into the WWW graph and run PageRank on the result.

    The merged directed graph consists of every edge of `www_graph`, the
    Kalicube nodes appended after the WWW nodes (their ids are shifted by the
    WWW node count), the Kalicube edges, and a handful of randomly chosen
    WWW -> Kalicube links drawn with `np.random` (seed it beforehand for
    reproducible results).

    The WWW endpoints are sampled from the nodes of the `www_graph` that was
    actually passed in, so the function never links to a node id outside the
    merged graph, regardless of the value of the global `TOTAL_NODES_WWW`.

    NOTE(review): the number of interconnections is
    min(MIN_CONNECTIONS, available nodes); MAX_CONNECTIONS is not consulted.
    Confirm whether a random count in [MIN_CONNECTIONS, MAX_CONNECTIONS] was
    intended.

    Args:
        www_graph: Background NetworKit graph; it is read, not modified.
        kalicube_edges: Iterable of `(src, tgt)` pairs using Kalicube-local
            node indices; may be empty.
        kalicube_nodes: Sequence of Kalicube URLs; position i corresponds to
            Kalicube-local node index i.
        kalicube_url_mapping: Unused; kept for interface compatibility.

    Returns:
        Dict mapping each Kalicube URL to its PageRank score in the merged
        graph.
    """
    n_www = www_graph.numberOfNodes()
    n_kalicube = len(kalicube_nodes)
    # Kalicube-local index i lives at node id i + kalicube_offset.
    kalicube_offset = n_www

    # Create merged graph, starting from a copy of the WWW graph
    merged_graph = nk.Graph(n=n_www, weighted=False, directed=True)
    for u, v in www_graph.iterEdges():
        merged_graph.addEdge(u, v)

    # Add Kalicube nodes
    for _ in range(n_kalicube):
        merged_graph.addNode()

    # Add Kalicube edges
    if kalicube_edges:
        for src, tgt in kalicube_edges:
            merged_graph.addEdge(src + kalicube_offset, tgt + kalicube_offset)

    # Sample distinct endpoints on both sides; the population is the real
    # size of each side so every sampled id exists in merged_graph.
    n_www_sample = min(MIN_CONNECTIONS, n_www)
    n_kalicube_sample = min(MIN_CONNECTIONS, n_kalicube)

    www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)
    kalicube_indices = np.random.choice(
        n_kalicube, size=n_kalicube_sample, replace=False
    )

    # Add interconnection edges (WWW -> Kalicube), pairing samples in order
    for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices):
        merged_graph.addEdge(int(www_node_id), int(kalicube_idx) + kalicube_offset)

    # Run PageRank using NetworKit's implementation
    pagerank_algo = nk.centrality.PageRank(
        merged_graph, damp=0.85, tol=PAGERANK_TOLERANCE
    )
    pagerank_algo.run()
    pagerank_scores = pagerank_algo.scores()

    # Extract results for Kalicube nodes only
    return {
        url: pagerank_scores[i + kalicube_offset]
        for i, url in enumerate(kalicube_nodes)
    }


def create_comparison_dataframe_networkit(
    pagerank_old_dict, pagerank_new_dict, simulation
):
    """
    Compare PageRank scores and ranks of the URLs present in both results.

    Output is deterministic: rows are ordered by URL, and URLs with equal
    scores are ranked in that same order (stable sort), so repeated runs with
    the same inputs produce identical frames regardless of set/hash ordering.

    Args:
        pagerank_old_dict: Mapping URL -> PageRank score before the change.
        pagerank_new_dict: Mapping URL -> PageRank score after the change.
        simulation: Zero-based simulation index; stored one-based in the
            "Simulation" column.

    Returns:
        DataFrame with one row per common URL and the columns URL,
        PageRank_Before, PageRank_After, Rank_Before, Rank_After,
        PageRank_Delta, PageRank_Delta_%, Rank_Change, Rank_Change_% and
        Simulation. Rank 1 is the highest score; a negative Rank_Change is
        an improvement. An empty DataFrame is returned when the two mappings
        share no URL.
    """
    common_urls = set(pagerank_old_dict) & set(pagerank_new_dict)

    if not common_urls:
        return pd.DataFrame()

    # Sort so that row order (and tie-breaking below) does not depend on the
    # iteration order of a set, which varies between interpreter runs.
    urls = sorted(common_urls, key=str)
    n_urls = len(urls)

    pagerank_before = np.array([pagerank_old_dict[url] for url in urls])
    pagerank_after = np.array([pagerank_new_dict[url] for url in urls])

    # Rank by descending score; a stable sort keeps ties in URL order.
    positions = np.arange(1, n_urls + 1)
    rank_before = np.empty(n_urls)
    rank_after = np.empty(n_urls)
    rank_before[np.argsort(-pagerank_before, kind="stable")] = positions
    rank_after[np.argsort(-pagerank_after, kind="stable")] = positions

    # Calculate changes; the 1e-10 floor avoids division by zero.
    pagerank_delta = pagerank_after - pagerank_before
    pagerank_delta_pct = (pagerank_delta / np.maximum(pagerank_before, 1e-10)) * 100
    rank_change = rank_after - rank_before
    rank_change_pct = (rank_change / np.maximum(rank_before, 1e-10)) * 100

    return pd.DataFrame(
        {
            "URL": urls,
            "PageRank_Before": pagerank_before,
            "PageRank_After": pagerank_after,
            "Rank_Before": rank_before,
            "Rank_After": rank_after,
            "PageRank_Delta": pagerank_delta,
            "PageRank_Delta_%": pagerank_delta_pct,
            "Rank_Change": rank_change,
            "Rank_Change_%": rank_change_pct,
            "Simulation": simulation + 1,
        }
    )


def run_single_simulation_networkit(
    simulation_id,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """
    Execute one old-versus-new PageRank comparison.

    Both RNGs (NumPy and stdlib) are seeded with 42 + `simulation_id`, the
    WWW graph is obtained from create_www_graph_networkit_real(), and the old
    and new Kalicube graphs are each merged into it and scored.

    Returns:
        `(summary, comparison_df)` where `summary` is a dict of aggregate
        metrics for this simulation and `comparison_df` holds the per-URL
        comparison; `(None, None)` when the two configurations share no URL.
    """
    seed = 42 + simulation_id
    np.random.seed(seed)
    random.seed(seed)

    # Background web graph shared by both configurations of this simulation.
    www_graph = create_www_graph_networkit_real(seed=seed)

    scores_old = process_configuration_networkit(
        www_graph, kalicube_old_edges, kalicube_nodes_old, kalicube_url_mapping_old
    )
    scores_new = process_configuration_networkit(
        www_graph, kalicube_new_edges, kalicube_nodes_new, kalicube_url_mapping_new
    )

    comparison_df = create_comparison_dataframe_networkit(
        scores_old, scores_new, simulation_id
    )
    if comparison_df.empty:
        return None, None

    # Aggregate PageRank mass over the common URLs.
    total_before = comparison_df["PageRank_Before"].sum()
    total_after = comparison_df["PageRank_After"].sum()
    total_delta = total_after - total_before
    if total_before > 0:
        delta_pct = (total_delta / total_before) * 100
    else:
        delta_pct = 0

    # Negative rank change means the URL moved up the ranking.
    rank_changes = comparison_df["Rank_Change"].values

    summary = {
        "Simulation": simulation_id + 1,
        "Total_Before": total_before,
        "Total_After": total_after,
        "Total_Delta": total_delta,
        "Delta_Percent": delta_pct,
        "Rank_Improvements": np.sum(rank_changes < 0),
        "Rank_Drops": np.sum(rank_changes > 0),
        "Rank_Unchanged": np.sum(rank_changes == 0),
        "Avg_Rank_Change": np.mean(rank_changes),
    }

    return summary, comparison_df


def run_batch_simulations_networkit(
    start_idx,
    end_idx,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """
    Run the simulations with ids in [start_idx, end_idx) one after another.

    Returns:
        `(batch_results, batch_comparisons)`: the summary dicts and the
        per-URL comparison frames of every simulation that produced a
        result, in execution order. Simulations without a result are skipped.
    """
    summaries, frames = [], []

    # Everything except the simulation id is identical for the whole batch.
    shared_args = (
        kalicube_old_edges,
        kalicube_new_edges,
        kalicube_nodes_old,
        kalicube_nodes_new,
        kalicube_url_mapping_old,
        kalicube_url_mapping_new,
    )

    sim_id = start_idx
    while sim_id < end_idx:
        summary, frame = run_single_simulation_networkit(sim_id, *shared_args)

        if summary is not None:
            summaries.append(summary)
            frames.append(frame)

        # Report progress after every fifth simulation of the batch.
        finished = sim_id - start_idx + 1
        if finished % 5 == 0:
            print(f"    Completed {sim_id - start_idx + 1} simulations in batch")

        sim_id += 1

    return summaries, frames


def check_available_files():
    """Print and return the names of the CSV files in the working directory."""
    csv_names = list(filter(lambda entry: entry.endswith(".csv"), os.listdir(".")))
    print("Available CSV files:")
    for name in csv_names:
        print(f"  - {name}")
    return csv_names


def plot_results_networkit(results_df):
    """
    Plot a 2x2 overview of the simulation summaries and save it as a PNG.

    Panels: histogram of Delta_Percent, mean counts of rank improvements /
    drops / unchanged, Delta_Percent per simulation, and box plots of
    Delta_Percent and Avg_Rank_Change.

    Args:
        results_df: DataFrame with the columns "Simulation", "Delta_Percent",
            "Rank_Improvements", "Rank_Drops", "Rank_Unchanged" and
            "Avg_Rank_Change".

    Side effects:
        Writes "networkit_real_web_simulation_results.png" to the working
        directory and shows the figure.
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # Distribution of delta percentages
    ax1.hist(results_df["Delta_Percent"], bins=20, alpha=0.7, color="skyblue")
    ax1.set_xlabel("Delta Percentage (%)")
    ax1.set_ylabel("Frequency")
    ax1.set_title("Distribution of PageRank Changes")
    ax1.axvline(0, color="red", linestyle="--", alpha=0.8)

    # Rank improvements vs drops
    improvements = results_df["Rank_Improvements"].mean()
    drops = results_df["Rank_Drops"].mean()
    unchanged = results_df["Rank_Unchanged"].mean()

    ax2.bar(
        ["Improvements", "Drops", "Unchanged"],
        [improvements, drops, unchanged],
        color=["green", "red", "gray"],
        alpha=0.7,
    )
    ax2.set_ylabel("Average Count")
    ax2.set_title("Average Rank Changes per Simulation")

    # Time series of delta percentages
    ax3.plot(results_df["Simulation"], results_df["Delta_Percent"], "o-", alpha=0.7)
    ax3.set_xlabel("Simulation Number")
    ax3.set_ylabel("Delta Percentage (%)")
    ax3.set_title("PageRank Changes Over Simulations")
    ax3.axhline(0, color="red", linestyle="--", alpha=0.5)

    # Box plot of key metrics. The tick labels are set separately because
    # boxplot's `labels` keyword is deprecated in recent Matplotlib releases
    # (renamed `tick_labels`); set_xticklabels works across versions.
    metrics_data = [results_df["Delta_Percent"], results_df["Avg_Rank_Change"]]
    ax4.boxplot(metrics_data)
    ax4.set_xticklabels(["Delta %", "Avg Rank Change"])
    ax4.set_title("Distribution of Key Metrics")
    ax4.axhline(0, color="red", linestyle="--", alpha=0.5)

    plt.tight_layout()
    plt.savefig(
        "networkit_real_web_simulation_results.png", dpi=300, bbox_inches="tight"
    )
    plt.show()


if __name__ == "__main__":
    # Script entry point: validate the input files, load both Kalicube
    # graphs, run NUM_SIMULATIONS old-vs-new PageRank comparisons in batches,
    # then save, plot and summarise the results.
    print("Starting NetworKit PageRank simulation with REAL WEB NETWORK!")
    print(f"NetworKit version: {nk.__version__}")

    # Check available files
    print("\nChecking available files...")
    available_files = check_available_files()

    # Validate Kalicube files. The path itself is tested (rather than
    # membership in the working-directory listing) so that paths pointing
    # outside the working directory are accepted too.
    if not os.path.isfile(new_graph_filename):
        print(f"\nWarning: {new_graph_filename} not found!")
        print("Available options:")
        for f in available_files:
            print(f"  - {f}")
        print(
            "\nPlease update the 'new_graph_filename' variable or ensure the file exists."
        )
        # SystemExit instead of exit(): the `exit` builtin is injected by the
        # `site` module and is not guaranteed to exist.
        raise SystemExit(1)

    # Check for real web network file; if it is missing, try any CSV in the
    # working directory whose name looks like a FineWeb page network.
    if not os.path.isfile(real_web_network_file):
        print(f"\nReal web network file not found: {real_web_network_file}")
        print("Looking for FineWeb network files...")
        fineweb_files = [
            f
            for f in available_files
            if "fineweb" in f.lower() and "pages" in f.lower()
        ]
        if fineweb_files:
            real_web_network_file = fineweb_files[0]
            print(f"Found: {real_web_network_file}")
        else:
            print(
                "No FineWeb network files found. Using synthetic network as fallback."
            )

    start_time = time.time()

    print("\nLoading graphs with NetworKit optimization...")

    # Load old Kalicube graph
    kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (
        load_graph_from_csv_networkit(old_graph_filename)
    )
    if kalicube_graph_old is None:
        print(f"Failed to load old graph from {old_graph_filename}. Exiting.")
        raise SystemExit(1)
    print(f"Loaded OLD Kalicube graph from: {old_graph_filename}")

    # Load new Kalicube graph
    kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (
        load_graph_from_csv_networkit(new_graph_filename)
    )
    if kalicube_graph_new is None:
        print(f"Failed to load new graph from {new_graph_filename}. Exiting.")
        raise SystemExit(1)
    print(f"Loaded NEW Kalicube graph from: {new_graph_filename}")

    print("Pre-processing graph data...")
    # Convert NetworKit edges to list format so the graphs can be released
    kalicube_old_edges = [(u, v) for u, v in kalicube_graph_old.iterEdges()]
    kalicube_new_edges = [(u, v) for u, v in kalicube_graph_new.iterEdges()]

    print("\nNetwork Statistics:")
    print("=" * 50)
    num_nodes_old = kalicube_graph_old.numberOfNodes()
    num_edges_old = kalicube_graph_old.numberOfEdges()
    print(f"OLD Kalicube Graph: {num_nodes_old:,} nodes, {num_edges_old:,} edges")

    num_nodes_new = kalicube_graph_new.numberOfNodes()
    num_edges_new = kalicube_graph_new.numberOfEdges()
    print(f"NEW Kalicube Graph: {num_nodes_new:,} nodes, {num_edges_new:,} edges")

    print(f"WWW Graph: REAL web network from {real_web_network_file}")
    print("=" * 50)

    # Clean up: only the edge lists and node arrays are needed from here on
    del kalicube_graph_old, kalicube_graph_new
    gc.collect()

    print(f"Running {NUM_SIMULATIONS} NetworKit simulations with REAL web data...")

    # Run simulations in batches
    BATCH_SIZE = 5
    all_results = []
    all_comparison_dfs = []

    for batch_start in range(0, NUM_SIMULATIONS, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, NUM_SIMULATIONS)
        print(
            f"Processing batch {batch_start // BATCH_SIZE + 1}: simulations {batch_start + 1}-{batch_end}"
        )

        batch_results, batch_comparisons = run_batch_simulations_networkit(
            batch_start,
            batch_end,
            kalicube_old_edges,
            kalicube_new_edges,
            kalicube_nodes_old,
            kalicube_nodes_new,
            kalicube_url_mapping_old,
            kalicube_url_mapping_new,
        )

        all_results.extend(batch_results)
        all_comparison_dfs.extend(batch_comparisons)

        # Show batch results
        for result in batch_results:
            effect_symbol = (
                "+"
                if result["Total_Delta"] > 0
                else "-"
                if result["Total_Delta"] < 0
                else "="
            )
            print(
                f"Sim {result['Simulation']:3d}: {effect_symbol} PageRank:{result['Total_Delta']:+.6f} "
                f"({result['Delta_Percent']:+.2f}%) | Ranks: "
                f"{result['Rank_Improvements']}up {result['Rank_Drops']}down {result['Rank_Unchanged']}same"
            )

        # Memory cleanup
        gc.collect()

    end_time = time.time()
    print(f"All REAL WEB simulations completed in {end_time - start_time:.2f} seconds!")
    # max(..., 1) keeps the report from dividing by zero when no simulation
    # was requested.
    print(
        f"Average time per simulation: {(end_time - start_time) / max(NUM_SIMULATIONS, 1):.2f} seconds"
    )

    # Process and save results
    if all_results:
        results_df = pd.DataFrame(all_results)
        all_comparisons_df = pd.concat(all_comparison_dfs, ignore_index=True)

        results_df.to_csv("simulation_summary_real_web.csv", index=False)
        all_comparisons_df.to_csv("all_simulations_detailed_real_web.csv", index=False)

        print("Saved REAL WEB simulation results:")
        print(" - simulation_summary_real_web.csv: Overall metrics")
        print(" - all_simulations_detailed_real_web.csv: Detailed results")

        # Generate visualization
        print("\nGenerating result visualizations...")
        plot_results_networkit(results_df)

        print("\nREAL WEB Network Statistical Analysis:")
        print("=" * 50)
        print(f"Mean overall delta: {results_df['Total_Delta'].mean():.6f}")
        print(f"Std dev overall delta: {results_df['Total_Delta'].std():.6f}")
        print(f"Mean delta percentage: {results_df['Delta_Percent'].mean():.2f}%")
        print(f"Std dev delta percentage: {results_df['Delta_Percent'].std():.2f}%")

        positive_outcomes = (results_df["Total_Delta"] > 0).sum()
        negative_outcomes = (results_df["Total_Delta"] < 0).sum()
        neutral_outcomes = (results_df["Total_Delta"] == 0).sum()

        # Shares are relative to the simulations that produced a result, so
        # the three percentages always add up to 100 even if some
        # simulations were skipped. Non-empty here because all_results is.
        completed_simulations = len(results_df)

        print(f"\nREAL WEB Outcome Distribution:")
        print("=" * 35)
        print(
            f" - Positive outcomes: {positive_outcomes}/{completed_simulations} "
            f"({positive_outcomes / completed_simulations * 100:.1f}%)"
        )
        print(
            f" - Negative outcomes: {negative_outcomes}/{completed_simulations} "
            f"({negative_outcomes / completed_simulations * 100:.1f}%)"
        )
        print(
            f" - Neutral outcomes: {neutral_outcomes}/{completed_simulations} "
            f"({neutral_outcomes / completed_simulations * 100:.1f}%)"
        )

        print(f"\nREAL WEB NETWORK SIMULATION COMPLETE!")
        print(f"Total simulation time: {end_time - start_time:.1f} seconds")
        print(f"Analyzed PageRank on real web pages from FineWeb dataset")
        print(f"Results are based on actual web topology, not synthetic!")

    else:
        print("No valid simulation results generated.")

    # Clear cache and cleanup
    _www_graph_cache = None
    gc.collect()
    print("Memory cleaned up. REAL WEB simulation complete!")