In [1]:
# --- INSTALLATION: This cell will restart the kernel ---
# Install condacolab, which sets up the conda/mamba environment that the next
# cell uses to install graph-tool.
!pip install -q condacolab
import condacolab

condacolab.install()

# The runtime restarts after this cell. Once it is back, continue from the
# next cell.

‚ú®üç∞‚ú® Everything looks OK!


In [2]:
# --- INSTALLATION CONTINUED: Run this cell after the kernel restart ---
# Install graph-tool and its dependencies with mamba (set up by condacolab in
# the previous cell). The author notes this is faster and more reliable than a
# plain pip install for this library.
!mamba install -q graph-tool

# --- END OF INSTALLATION ---
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import graph_tool.all as gt
import time
import gc

# --- User Input for Graph Files ---
# Both files are CSV edge lists read by load_graph_from_csv_turbo, which
# requires the columns FROM and TO.
old_graph_filename = "link_graph_edges.csv"
new_graph_filename = (
    "240_worst_updated_link_graph_egdes.csv"  # "egdes" (sic) is the file's actual name
)

# Number of simulations to run
NUM_SIMULATIONS = 20

# Connection range for WWW-Kalicube interconnections
# NOTE(review): only MIN_CONNECTIONS is read by the code in this cell;
# MAX_CONNECTIONS is not referenced anywhere visible here.
MIN_CONNECTIONS = 5
MAX_CONNECTIONS = 50

# --- OPTIMIZED SIMULATION PARAMETERS ---
TOTAL_NODES_WWW = 100000  # 100K vertices in the synthetic WWW graph
EDGES_PER_NEW_NODE = 2  # Reduced from 3 to 2 (fewer edges = faster)
PAGERANK_ITERATIONS = 20  # Passed to gt.pagerank as max_iter
# -----------------------------

# Global WWW graph cache: None, or a (cache_key, graph) tuple written by
# create_www_graph_turbo, where cache_key is (n_nodes, m_edges, seed).
_www_graph_cache = None


def load_graph_from_csv_turbo(file_name):
    """
    Load a directed graph from a CSV edge list.

    The file must provide ``FROM`` and ``TO`` columns. Rows with a missing
    value in either column are dropped and the remaining values are converted
    to strings; every distinct string becomes one vertex.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A tuple ``(graph, all_urls, url_to_idx)``:

        * ``graph`` - directed ``gt.Graph`` with one edge per remaining row.
        * ``all_urls`` - sorted array of the unique URLs; the position of a
          URL in this array is its vertex index.
        * ``url_to_idx`` - dict mapping each URL to that vertex index.

        ``(None, None, None)`` is returned, after printing an error message,
        when the file cannot be read or contains no usable edges.
    """
    try:
        # Read only the two needed columns. NaN rows are dropped *before* the
        # string conversion so that missing values never become "nan" URLs.
        df = pd.read_csv(file_name, usecols=["FROM", "TO"]).dropna()
        from_urls = df["FROM"].astype(str).to_numpy(dtype=object)
        to_urls = df["TO"].astype(str).to_numpy(dtype=object)

    except FileNotFoundError:
        print(f"‚ùå Error: {file_name} not found. Please ensure the file exists.")
        return None, None, None
    except KeyError as e:
        print(f"‚ùå Error: Required column {e} not found in {file_name}")
        return None, None, None
    except Exception as e:
        print(f"‚ùå Error loading {file_name}: {str(e)}")
        return None, None, None

    # Handle empty dataframe (both columns always have the same length)
    n_edges = len(from_urls)
    if n_edges == 0:
        print(f"‚ùå Error: No valid edges found in {file_name}")
        return None, None, None

    # One pass over the concatenated endpoints yields both the sorted unique
    # URLs and, via the inverse index, the vertex id of every endpoint. This
    # replaces a per-edge Python loop of dictionary lookups.
    all_urls, inverse = np.unique(
        np.concatenate([from_urls, to_urls]), return_inverse=True
    )
    inverse = inverse.reshape(-1)

    url_to_idx = {url: i for i, url in enumerate(all_urls)}

    # First half of `inverse` belongs to FROM, second half to TO.
    edge_array = np.column_stack((inverse[:n_edges], inverse[n_edges:]))

    g = gt.Graph(directed=True)
    g.add_vertex(len(all_urls))

    # Bulk edge addition from an (E, 2) integer array
    g.add_edge_list(edge_array)

    return g, all_urls, url_to_idx


def create_www_graph_turbo(n_nodes, m_edges, seed=42):
    """
    Build (or fetch from cache) the synthetic directed "WWW" background graph.

    The graph is generated with graph-tool's ``price_network``. If that call
    fails, a simple hand-built random graph is used instead.

    Args:
        n_nodes: Number of vertices in the generated graph.
        m_edges: Out-degree of each newly added vertex.
        seed: Seed applied to NumPy, ``random`` and graph-tool's RNG so the
            same arguments yield the same graph.

    Returns:
        A directed ``gt.Graph``. When the module-level cache already holds a
        graph for ``(n_nodes, m_edges, seed)``, a copy of it is returned;
        otherwise the new graph is returned and a copy is stored in the cache.
    """
    global _www_graph_cache

    # Check cache first
    cache_key = (n_nodes, m_edges, seed)
    if _www_graph_cache is not None and _www_graph_cache[0] == cache_key:
        return _www_graph_cache[1].copy()

    np.random.seed(seed)
    random.seed(seed)
    # graph-tool generators draw from graph-tool's own RNG, which the two
    # calls above do not touch; seed it as well so `seed` is actually honoured.
    gt.seed_rng(seed)

    try:
        www_graph = gt.price_network(n_nodes, m=m_edges, directed=True, gamma=1.0)
    except Exception as exc:
        # Do not swallow KeyboardInterrupt/SystemExit, and make the switch to
        # a different graph model visible instead of silent.
        print(f"Warning: gt.price_network failed ({exc}); using fallback generator.")

        www_graph = gt.Graph(directed=True)
        www_graph.add_vertex(n_nodes)

        # Each vertex i + m_edges links to m_edges vertices drawn uniformly
        # (with replacement) from the vertices created before it. The upper
        # bound of randint is exclusive, so self-loops cannot occur.
        edges = []
        for i in range(n_nodes - m_edges):
            new_vertex = i + m_edges
            targets = np.random.randint(0, new_vertex, size=m_edges)
            edges.extend((new_vertex, target) for target in targets)

        www_graph.add_edge_list(edges)

    # Cache a private copy so later mutation of the returned graph cannot
    # corrupt the cached one.
    _www_graph_cache = (cache_key, www_graph.copy())
    return www_graph


def process_configuration_turbo(
    www_graph, kalicube_edges, kalicube_nodes, kalicube_url_mapping
):
    """
    Embed one Kalicube link graph into the WWW graph and compute PageRank.

    A copy of ``www_graph`` is extended with one vertex per Kalicube node
    (appended after the WWW vertices), the Kalicube edges are added with their
    indices shifted accordingly, and up to ``MIN_CONNECTIONS`` randomly chosen
    WWW vertices are each linked to a randomly chosen Kalicube vertex.

    Args:
        www_graph: Background graph; it is copied, not modified.
        kalicube_edges: Sequence of ``(source, target)`` index pairs that
            refer to positions in ``kalicube_nodes``.
        kalicube_nodes: Sequence of Kalicube URLs, ordered by vertex index.
        kalicube_url_mapping: Accepted for interface compatibility; not used.

    Returns:
        dict mapping each Kalicube URL to its PageRank (float) in the merged
        graph.
    """
    # Kalicube vertices are appended after the WWW ones, so the WWW vertex
    # count doubles as the index offset of the first Kalicube vertex.
    n_www = www_graph.num_vertices()
    kalicube_offset = n_www
    n_kalicube = len(kalicube_nodes)

    merged_graph = www_graph.copy()
    merged_graph.add_vertex(n_kalicube)

    # Add kalicube edges in bulk
    if kalicube_edges:
        offset_edges = [
            (s + kalicube_offset, t + kalicube_offset) for s, t in kalicube_edges
        ]
        merged_graph.add_edge_list(offset_edges)

    # Sample endpoints from the graph that was actually passed in rather than
    # from the TOTAL_NODES_WWW global, so indices are always valid for it.
    n_www_sample = min(MIN_CONNECTIONS, n_www)
    n_kalicube_sample = min(MIN_CONNECTIONS, n_kalicube)

    www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)
    kalicube_indices = np.random.choice(
        n_kalicube, size=n_kalicube_sample, replace=False
    )

    # zip() pairs the two samples and stops at the shorter one; the edges run
    # from the WWW vertex to the Kalicube vertex.
    interconnection_edges = [
        (int(www_node_id), int(kalicube_idx) + kalicube_offset)
        for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices)
    ]

    if interconnection_edges:
        merged_graph.add_edge_list(interconnection_edges)

    # PageRank with a capped number of iterations
    pagerank_values = gt.pagerank(
        merged_graph, damping=0.85, max_iter=PAGERANK_ITERATIONS
    )

    # Only the Kalicube vertices are reported back to the caller.
    return {
        url: float(pagerank_values[merged_graph.vertex(i + kalicube_offset)])
        for i, url in enumerate(kalicube_nodes)
    }


def create_comparison_dataframe_turbo(pagerank_old_dict, pagerank_new_dict, simulation):
    """
    Compare two PageRank results for the URLs present in both.

    Args:
        pagerank_old_dict: Mapping URL -> PageRank before the change.
        pagerank_new_dict: Mapping URL -> PageRank after the change.
        simulation: Zero-based simulation index; stored one-based in the
            ``Simulation`` column.

    Returns:
        A DataFrame with one row per common URL, sorted by URL, with columns
        ``URL``, ``PageRank_Before``, ``PageRank_After``, ``Rank_Before``,
        ``Rank_After``, ``PageRank_Delta``, ``PageRank_Delta_%``,
        ``Rank_Change``, ``Rank_Change_%`` and ``Simulation``. Rank 1 is the
        highest PageRank; a negative ``Rank_Change`` is an improvement.
        An empty DataFrame is returned when the inputs share no URL.
    """
    common_urls = set(pagerank_old_dict) & set(pagerank_new_dict)

    if not common_urls:
        return pd.DataFrame()

    # Set iteration order depends on string hashing and can differ between
    # interpreter runs; sorting makes row order (and tie-breaking below)
    # reproducible.
    urls = sorted(common_urls)
    n_urls = len(urls)

    pagerank_before = np.array([pagerank_old_dict[url] for url in urls])
    pagerank_after = np.array([pagerank_new_dict[url] for url in urls])

    # Rank by descending PageRank. A stable sort resolves equal scores by URL
    # order instead of leaving it to the sorting algorithm.
    positions = np.arange(1, n_urls + 1)
    rank_before = np.empty(n_urls)
    rank_after = np.empty(n_urls)
    rank_before[np.argsort(-pagerank_before, kind="stable")] = positions
    rank_after[np.argsort(-pagerank_after, kind="stable")] = positions

    # Deltas; the 1e-10 floor avoids division by zero for zero scores.
    pagerank_delta = pagerank_after - pagerank_before
    pagerank_delta_pct = (pagerank_delta / np.maximum(pagerank_before, 1e-10)) * 100
    rank_change = rank_after - rank_before
    rank_change_pct = (rank_change / np.maximum(rank_before, 1e-10)) * 100

    comparison_df = pd.DataFrame(
        {
            "URL": urls,
            "PageRank_Before": pagerank_before,
            "PageRank_After": pagerank_after,
            "Rank_Before": rank_before,
            "Rank_After": rank_after,
            "PageRank_Delta": pagerank_delta,
            "PageRank_Delta_%": pagerank_delta_pct,
            "Rank_Change": rank_change,
            "Rank_Change_%": rank_change_pct,
            "Simulation": simulation + 1,
        }
    )

    return comparison_df


def run_single_simulation_turbo(
    simulation_id,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """
    Run one simulation: build a WWW graph, score both Kalicube graphs, compare.

    The seed is ``42 + simulation_id`` and is applied to NumPy and ``random``
    before the WWW graph is created. The old and new Kalicube graphs are then
    each embedded into that WWW graph and scored with PageRank.

    Returns:
        ``(result, comparison_df)`` where ``result`` is a dict of summary
        metrics for this simulation and ``comparison_df`` is the per-URL
        comparison table, or ``(None, None)`` when that table is empty.
    """
    run_seed = simulation_id + 42
    np.random.seed(run_seed)
    random.seed(run_seed)

    # Background graph shared by both configurations of this simulation
    background = create_www_graph_turbo(TOTAL_NODES_WWW, EDGES_PER_NEW_NODE, run_seed)

    # Score the "before" configuration first, then the "after" one
    scores_before = process_configuration_turbo(
        background, kalicube_old_edges, kalicube_nodes_old, kalicube_url_mapping_old
    )
    scores_after = process_configuration_turbo(
        background, kalicube_new_edges, kalicube_nodes_new, kalicube_url_mapping_new
    )

    comparison_df = create_comparison_dataframe_turbo(
        scores_before, scores_after, simulation_id
    )
    if comparison_df.empty:
        return None, None

    # Aggregate PageRank over all compared URLs
    sum_before = comparison_df["PageRank_Before"].sum()
    sum_after = comparison_df["PageRank_After"].sum()
    sum_delta = sum_after - sum_before
    if sum_before > 0:
        pct_delta = (sum_delta / sum_before) * 100
    else:
        pct_delta = 0

    # Negative rank change means the URL moved up the ranking
    rank_deltas = comparison_df["Rank_Change"].values

    summary = {
        "Simulation": simulation_id + 1,
        "Total_Before": sum_before,
        "Total_After": sum_after,
        "Total_Delta": sum_delta,
        "Delta_Percent": pct_delta,
        "Rank_Improvements": np.sum(rank_deltas < 0),
        "Rank_Drops": np.sum(rank_deltas > 0),
        "Rank_Unchanged": np.sum(rank_deltas == 0),
        "Avg_Rank_Change": np.mean(rank_deltas),
    }

    return summary, comparison_df


def run_batch_simulations(
    start_idx,
    end_idx,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """
    Run the simulations with ids ``start_idx`` .. ``end_idx - 1`` in order.

    Returns:
        ``(batch_results, batch_comparisons)``: the summary dicts and the
        per-URL comparison DataFrames of every simulation that produced a
        result. Simulations that return no result are skipped.
    """
    # Arguments that are identical for every simulation in the batch
    graph_inputs = (
        kalicube_old_edges,
        kalicube_new_edges,
        kalicube_nodes_old,
        kalicube_nodes_new,
        kalicube_url_mapping_old,
        kalicube_url_mapping_new,
    )

    batch_results, batch_comparisons = [], []

    for sim_id in range(start_idx, end_idx):
        summary, table = run_single_simulation_turbo(sim_id, *graph_inputs)

        produced_result = summary is not None
        if produced_result:
            batch_results.append(summary)
            batch_comparisons.append(table)

        # Report progress after every fifth simulation of the batch,
        # whether or not it produced a result.
        if (sim_id - start_idx + 1) % 5 == 0:
            print(f"    ‚úì Completed {sim_id - start_idx + 1} simulations in batch")

    return batch_results, batch_comparisons


def check_available_files():
    """Print and return the names of the ``.csv`` files in the working directory."""
    import os

    csv_files = []
    for entry in os.listdir("."):
        if entry.endswith(".csv"):
            csv_files.append(entry)

    print("Available CSV files:")
    for f in csv_files:
        print(f"  - {f}")

    return csv_files


if __name__ == "__main__":
    # Driver: load both Kalicube graphs, run NUM_SIMULATIONS simulations in
    # batches, print per-simulation results, write two CSV reports and print
    # summary statistics. All names stay at module level so that later cells
    # can keep using them.
    import sys  # imported once here; used for the early exits below

    print("üöÄ Starting TURBO-OPTIMIZED PageRank simulation...")

    # Check what files are available
    print("\nüìÅ Checking available files...")
    available_files = check_available_files()

    # Stop early with a helpful listing when the "new" graph file is missing.
    # (A missing "old" file is reported by load_graph_from_csv_turbo below.)
    if new_graph_filename not in available_files:
        print(f"\n‚ö†Ô∏è  Warning: {new_graph_filename} not found!")
        print("Available options:")
        for f in available_files:
            print(f"  - {f}")
        print(
            "\nPlease update the 'new_graph_filename' variable or ensure the file exists."
        )
        sys.exit(1)

    start_time = time.time()

    print("\nüìÇ Loading graphs with turbo mode...")

    # Load old graph
    kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (
        load_graph_from_csv_turbo(old_graph_filename)
    )
    if kalicube_graph_old is None:
        print(f"‚ùå Failed to load old graph from {old_graph_filename}. Exiting.")
        sys.exit(1)
    print(f"‚úÖ Loaded OLD Kalicube graph from: {old_graph_filename}")

    # Load new graph
    kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (
        load_graph_from_csv_turbo(new_graph_filename)
    )
    if kalicube_graph_new is None:
        print(f"‚ùå Failed to load new graph from {new_graph_filename}. Exiting.")
        sys.exit(1)
    print(f"‚úÖ Loaded NEW Kalicube graph from: {new_graph_filename}")

    # Flatten each graph to a plain list of (source, target) index pairs so
    # the graph objects themselves can be released before the simulations.
    print("‚öôÔ∏è Pre-processing graph data...")
    kalicube_old_edges = [
        (int(e.source()), int(e.target())) for e in kalicube_graph_old.edges()
    ]
    kalicube_new_edges = [
        (int(e.source()), int(e.target())) for e in kalicube_graph_new.edges()
    ]

    print("\nüìä Network Statistics:")
    print("=" * 50)
    num_nodes_old = kalicube_graph_old.num_vertices()
    num_edges_old = kalicube_graph_old.num_edges()
    print(f"üìà OLD Kalicube Graph: {num_nodes_old:,} nodes, {num_edges_old:,} edges")

    num_nodes_new = kalicube_graph_new.num_vertices()
    num_edges_new = kalicube_graph_new.num_edges()
    print(f"üìà NEW Kalicube Graph: {num_nodes_new:,} nodes, {num_edges_new:,} edges")

    print(f"üåê WWW Graph (per simulation): {TOTAL_NODES_WWW:,} nodes (OPTIMIZED)")
    print("=" * 50)

    # Clean up
    del kalicube_graph_old, kalicube_graph_new
    gc.collect()

    print(f"üîÑ Running {NUM_SIMULATIONS} TURBO simulations...")

    # Run simulations in batches for better memory management
    BATCH_SIZE = 5
    all_results = []
    all_comparison_dfs = []

    for batch_start in range(0, NUM_SIMULATIONS, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, NUM_SIMULATIONS)
        print(
            f"‚ö° Processing batch {batch_start // BATCH_SIZE + 1}: simulations {batch_start + 1}-{batch_end}"
        )

        batch_results, batch_comparisons = run_batch_simulations(
            batch_start,
            batch_end,
            kalicube_old_edges,
            kalicube_new_edges,
            kalicube_nodes_old,
            kalicube_nodes_new,
            kalicube_url_mapping_old,
            kalicube_url_mapping_new,
        )

        all_results.extend(batch_results)
        all_comparison_dfs.extend(batch_comparisons)

        # Show batch results
        for result in batch_results:
            effect_symbol = (
                "‚úÖ"
                if result["Total_Delta"] > 0
                else "‚ö†Ô∏è"
                if result["Total_Delta"] < 0
                else "‚ûñ"
            )
            # NOTE(review): Rank_Before and Rank_After are both permutations of
            # 1..n over the same URLs (see create_comparison_dataframe_turbo),
            # so Avg_Rank_Change is always 0 and this symbol is always the
            # neutral one. Comparing Rank_Improvements with Rank_Drops would
            # be more informative.
            rank_symbol = (
                "üîº"
                if result["Avg_Rank_Change"] < 0
                else "üîª"
                if result["Avg_Rank_Change"] > 0
                else "‚ûñ"
            )
            print(
                f"Sim {result['Simulation']:3d}: {effect_symbol} PageRank:{result['Total_Delta']:+.6f} ({result['Delta_Percent']:+.2f}%) | {rank_symbol} Ranks: {result['Rank_Improvements']}‚Üë {result['Rank_Drops']}‚Üì {result['Rank_Unchanged']}‚Üí"
            )

        # Memory cleanup between batches
        gc.collect()

    end_time = time.time()
    print(f"‚úÖ All TURBO simulations completed in {end_time - start_time:.2f} seconds!")
    # max(..., 1) keeps the average well-defined when NUM_SIMULATIONS is 0.
    print(
        f"‚ö° Average time per simulation: {(end_time - start_time) / max(NUM_SIMULATIONS, 1):.2f} seconds"
    )

    # Process and save results
    if all_results:
        results_df = pd.DataFrame(all_results)
        all_comparisons_df = pd.concat(all_comparison_dfs, ignore_index=True)

        results_df.to_csv("simulation_summary_turbo.csv", index=False)
        all_comparisons_df.to_csv("all_simulations_detailed_turbo.csv", index=False)

        print("‚úÖ Saved TURBO results:")
        print(" - simulation_summary_turbo.csv: Overall metrics")
        print(" - all_simulations_detailed_turbo.csv: Detailed results")

        print("\nüìà TURBO Statistical Analysis:")
        print("=" * 50)
        print(f"Mean overall delta: {results_df['Total_Delta'].mean():.6f}")
        print(f"Std dev overall delta: {results_df['Total_Delta'].std():.6f}")
        print(f"Mean delta percentage: {results_df['Delta_Percent'].mean():.2f}%")
        print(f"Std dev delta percentage: {results_df['Delta_Percent'].std():.2f}%")

        positive_outcomes = (results_df["Total_Delta"] > 0).sum()
        negative_outcomes = (results_df["Total_Delta"] < 0).sum()
        neutral_outcomes = (results_df["Total_Delta"] == 0).sum()

        # Simulations without a result are absent from results_df, so the
        # shares are taken over the simulations that actually completed;
        # this way the three percentages always add up to 100%.
        n_completed = len(results_df)

        print("\nüéØ TURBO Outcome Distribution:")
        print("=" * 35)
        print(
            f" - Positive outcomes: {positive_outcomes}/{n_completed} ({positive_outcomes / n_completed * 100:.1f}%)"
        )
        print(
            f" - Negative outcomes: {negative_outcomes}/{n_completed} ({negative_outcomes / n_completed * 100:.1f}%)"
        )
        print(
            f" - Neutral outcomes: {neutral_outcomes}/{n_completed} ({neutral_outcomes / n_completed * 100:.1f}%)"
        )

        # NOTE(review): this figure is derived from TOTAL_NODES_WWW alone; it
        # is not a measured speed-up.
        print(
            f"\n‚ö° TURBO mode delivered {10 * (1000000 / TOTAL_NODES_WWW):.1f}x speed improvement!"
        )
        print(f"üéØ Total simulation time: {end_time - start_time:.1f} seconds")
    else:
        print("‚ùå No valid simulation results generated.")

    # Clear cache
    _www_graph_cache = None
    gc.collect()
    print("üßπ Memory cleaned up. TURBO simulation complete!")

🚀 Starting TURBO-OPTIMIZED PageRank simulation...

📁 Checking available files...
Available CSV files:
  - 240_worst_updated_link_graph_egdes.csv
  - link_graph_edges.csv

📂 Loading graphs with turbo mode...
✅ Loaded OLD Kalicube graph from: link_graph_edges.csv
✅ Loaded NEW Kalicube graph from: 240_worst_updated_link_graph_egdes.csv
⚙️ Pre-processing graph data...

📊 Network Statistics:
📈 OLD Kalicube Graph: 1,841 nodes, 122,066 edges
📈 NEW Kalicube Graph: 1,841 nodes, 122,306 edges
🌐 WWW Graph (per simulation): 100,000 nodes (OPTIMIZED)
🔄 Running 20 TURBO simulations...
⚡ Processing batch 1: simulations 1-5
    ✓ Completed 5 simulations in batch
Sim   1: ✅ PageRank:+0.001492 (+9.48%) | ➖ Ranks: 360↑ 1460↓ 21→
Sim   2: ✅ PageRank:+0.001442 (+9.26%) | ➖ Ranks: 319↑ 1509↓ 13→
Sim   3: ✅ PageRank:+0.001545 (+9.77%) | ➖ Ranks: 361↑ 1461↓ 19→
Sim   4: ✅ PageRank:+0.001522 (+9.54%) | ➖ Ranks: 342↑ 1480↓ 19→
Sim   5: 