In [None]:
# Google Drive Folder-Level PageRank Analysis with REAL WWW Data
# Uses FineWeb dataset CSV with FROM and TO columns as WWW graph
# Calculates overall averages across all files in the strategy

# === INSTALLATION CELL (Run first) ===
!pip install networkit pandas numpy

# === MOUNT GOOGLE DRIVE ===
# Colab-specific import; this cell only works inside a Google Colab runtime.
from google.colab import drive

# Mount Drive at /content/drive; the input paths configured further down in
# this file are all located under that mount point.
drive.mount("/content/drive")

# === MAIN CODE ===
import pandas as pd
import numpy as np
import random
import networkit as nk
import os
import gc
from pathlib import Path

# ============================================
# USER CONFIGURATION
# ============================================
# CSV edge list (FROM/TO columns) of the baseline site graph; every comparison
# file is measured against it.
BASELINE_PATH = "/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv"
# Folder scanned for *.csv edge lists; each file found is compared with the
# baseline.
COMPARISON_FOLDER = (
    "/content/drive/MyDrive/WebKnoGraph/results/automatic_led/high_batches/"
)

# NEW: Path to FineWeb WWW graph CSV
FINEWEB_WWW_PATH = "/content/drive/MyDrive/WebKnoGraph/results/fineweb_500k_pages.csv"

# Number of seeded simulation runs executed per comparison file.
NUM_SIMULATIONS = 10

# Simulation Parameters
# NOTE(review): MIN_CONNECTIONS is not read by any function visible in this
# file - confirm whether it is still needed.
MIN_CONNECTIONS = 5
# Upper bound on how many WWW nodes / site nodes are sampled when linking the
# WWW graph to the site graph.
MAX_CONNECTIONS = 50
# Passed as `tol` to nk.centrality.PageRank.
PAGERANK_TOLERANCE = 1e-6

# ============================================
# CORE FUNCTIONS
# ============================================

# Module-level cache managed by load_www_graph_networkit: either None or a
# (csv_path, graph_copy, node_array) tuple for the last WWW graph loaded.
_www_graph_cache = None


def load_graph_from_csv_networkit(file_path, graph_name="graph"):
    """Build a directed, unweighted NetworKit graph from a FROM/TO edge CSV.

    Rows with a missing FROM or TO value are dropped and the remaining values
    are converted to strings. Node ids are the positions of the URLs in the
    sorted array of unique URLs.

    Args:
        file_path: Path of the CSV file; it must contain FROM and TO columns.
        graph_name: Label used only in the progress messages.

    Returns:
        A ``(graph, all_urls, url_to_idx)`` tuple, or ``(None, None, None)``
        when the file has no usable edges or any exception occurs while
        loading (the error is printed, not raised).
    """
    failure = (None, None, None)
    try:
        print(f"  Loading {graph_name} from {os.path.basename(file_path)}...")
        edge_table = pd.read_csv(file_path, usecols=["FROM", "TO"]).dropna()
        for column in ("FROM", "TO"):
            edge_table[column] = edge_table[column].astype(str)

        if edge_table.empty:
            print(f"  ERROR: No valid edges found in {file_path}")
            return failure

        sources = edge_table["FROM"].values
        targets = edge_table["TO"].values

        # np.unique sorts, so node ids follow the sorted order of the URLs.
        all_urls = np.unique(np.concatenate([sources, targets]))
        url_to_idx = {url: position for position, url in enumerate(all_urls)}

        graph = nk.Graph(n=len(all_urls), weighted=False, directed=True)
        for source, target in zip(sources, targets):
            graph.addEdge(url_to_idx[source], url_to_idx[target])

        print(f"    Loaded: {len(all_urls):,} nodes, {len(edge_table):,} edges")
        return graph, all_urls, url_to_idx
    except Exception as err:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"  ERROR loading {file_path}: {err!s}")
        return failure


def load_www_graph_networkit(www_csv_path):
    """Return the WWW graph for ``www_csv_path``, reusing the module cache.

    On a cache hit (same path as the cached entry) a fresh edge-by-edge copy
    of the cached graph is returned. On a miss the CSV is loaded, a copy is
    stored in ``_www_graph_cache`` and the freshly loaded graph is returned.

    Args:
        www_csv_path: Path of the FROM/TO CSV describing the WWW graph.

    Returns:
        A ``(graph, nodes)`` tuple, where ``nodes`` is the URL array produced
        by ``load_graph_from_csv_networkit``.

    Raises:
        ValueError: If the CSV could not be loaded.
    """
    global _www_graph_cache

    def _clone(source_graph):
        # Rebuild a directed, unweighted graph with the same nodes and edges.
        duplicate = nk.Graph(
            n=source_graph.numberOfNodes(), weighted=False, directed=True
        )
        for tail, head in source_graph.iterEdges():
            duplicate.addEdge(tail, head)
        return duplicate

    cache_entry = _www_graph_cache
    if cache_entry is not None and cache_entry[0] == www_csv_path:
        print("  Using cached WWW graph")
        return _clone(cache_entry[1]), cache_entry[2]

    print("\nLoading REAL WWW graph from FineWeb dataset...")
    www_graph, www_nodes, _unused_mapping = load_graph_from_csv_networkit(
        www_csv_path, graph_name="WWW graph"
    )
    if www_graph is None:
        raise ValueError(f"Failed to load WWW graph from {www_csv_path}")

    # Keep a private copy so the cached graph is separate from the one
    # handed back to the caller.
    _www_graph_cache = (www_csv_path, _clone(www_graph), www_nodes)

    print("  WWW graph cached successfully")
    return www_graph, www_nodes


def process_configuration_networkit(
    www_graph, www_nodes, kalicube_edges, kalicube_nodes, kalicube_url_mapping
):
    """Merge the Kalicube graph into the WWW graph and score it with PageRank.

    A copy of ``www_graph`` is extended with one node per entry of
    ``kalicube_nodes`` (ids start at the WWW node count), the Kalicube
    internal edges are added with that offset, and randomly sampled WWW nodes
    are linked to randomly sampled Kalicube nodes. Up to ``MAX_CONNECTIONS``
    nodes are drawn on each side without replacement and paired with ``zip``.

    Args:
        www_graph: NetworKit graph of the WWW; it is read, not modified.
        www_nodes: Unused here; kept for the caller's signature.
        kalicube_edges: Iterable of ``(src, tgt)`` Kalicube-local node ids.
        kalicube_nodes: Sequence of Kalicube URLs, indexed by local node id.
        kalicube_url_mapping: Unused here; kept for the caller's signature.

    Returns:
        Dict mapping each Kalicube URL to its PageRank score in the merged
        graph.
    """
    www_count = www_graph.numberOfNodes()
    site_count = len(kalicube_nodes)
    # Kalicube node i is stored at merged id site_base + i.
    site_base = www_count

    # Start from an edge-by-edge copy of the WWW graph.
    combined = nk.Graph(n=www_count, weighted=False, directed=True)
    for tail, head in www_graph.iterEdges():
        combined.addEdge(tail, head)

    # Append the Kalicube nodes after the WWW nodes.
    remaining = site_count
    while remaining > 0:
        combined.addNode()
        remaining -= 1

    # Kalicube internal links, shifted into the merged id space.
    for local_src, local_tgt in kalicube_edges or ():
        combined.addEdge(local_src + site_base, local_tgt + site_base)

    # Random WWW -> Kalicube links; the WWW draw happens first, then the
    # Kalicube draw, and the shorter sample bounds the number of links.
    www_picks = np.random.choice(
        www_count, size=min(MAX_CONNECTIONS, www_count), replace=False
    )
    site_picks = np.random.choice(
        site_count, size=min(MAX_CONNECTIONS, site_count), replace=False
    )
    for www_id, site_idx in zip(www_picks, site_picks):
        combined.addEdge(www_id, site_idx + site_base)

    ranker = nk.centrality.PageRank(combined, damp=0.85, tol=PAGERANK_TOLERANCE)
    ranker.run()
    scores = ranker.scores()

    # Keep only the Kalicube scores, keyed by URL.
    return {
        url: scores[site_base + position]
        for position, url in enumerate(kalicube_nodes)
    }


def run_single_simulation_networkit(
    simulation_id,
    www_graph,
    www_nodes,
    kalicube_old_edges,
    kalicube_new_edges,
    kalicube_nodes_old,
    kalicube_nodes_new,
    kalicube_url_mapping_old,
    kalicube_url_mapping_new,
):
    """Run one seeded simulation and summarise the PageRank change per URL.

    Both random generators are seeded with ``42 + simulation_id``, the old
    and new configurations are scored, and the percentage change is computed
    for every URL present in both results whose old score is positive.

    Returns:
        Dict with ``mean_delta_pct``, ``min_delta_pct`` and ``max_delta_pct``,
        or ``None`` when the configurations share no URL or no delta could be
        computed.
    """
    seed_value = 42 + simulation_id
    np.random.seed(seed_value)
    random.seed(seed_value)

    # NOTE(review): the generators are seeded once, so the second call below
    # draws different random WWW links than the first - confirm this is the
    # intended experimental design.
    scores_before = process_configuration_networkit(
        www_graph,
        www_nodes,
        kalicube_old_edges,
        kalicube_nodes_old,
        kalicube_url_mapping_old,
    )
    scores_after = process_configuration_networkit(
        www_graph,
        www_nodes,
        kalicube_new_edges,
        kalicube_nodes_new,
        kalicube_url_mapping_new,
    )

    # Only URLs scored in both configurations can be compared.
    shared_urls = set(scores_before.keys()) & set(scores_after.keys())
    if not shared_urls:
        return None

    # Relative change in percent; URLs with a non-positive old score are
    # skipped to avoid dividing by zero.
    pct_changes = [
        ((scores_after[url] - scores_before[url]) / scores_before[url]) * 100
        for url in shared_urls
        if scores_before[url] > 0
    ]
    if not pct_changes:
        return None

    return {
        "mean_delta_pct": np.mean(pct_changes),
        "min_delta_pct": np.min(pct_changes),
        "max_delta_pct": np.max(pct_changes),
    }


def analyze_csv_pair(www_graph, www_nodes, old_csv_path, new_csv_path):
    """Compare one comparison CSV with the baseline over several simulations.

    Both edge lists are loaded, ``NUM_SIMULATIONS`` seeded simulations are run
    and the per-simulation mean/min/max deltas are averaged.

    Args:
        www_graph: WWW graph passed through to every simulation.
        www_nodes: WWW node array passed through to every simulation.
        old_csv_path: Path of the baseline edge list.
        new_csv_path: Path of the comparison edge list.

    Returns:
        Dict with ``filename``, ``avg_mean_delta_pct``, ``avg_min_delta_pct``,
        ``avg_max_delta_pct`` and the raw ``sim_results`` list, or ``None``
        when a graph fails to load or no simulation yields a result.
    """
    new_name = os.path.basename(new_csv_path)
    print(f"\nAnalyzing: {new_name}")

    old_graph, old_nodes, old_mapping = load_graph_from_csv_networkit(
        old_csv_path, "baseline Kalicube"
    )
    if old_graph is None:
        print("  Failed to load old graph")
        return None

    new_graph, new_nodes, new_mapping = load_graph_from_csv_networkit(
        new_csv_path, "comparison Kalicube"
    )
    if new_graph is None:
        print("  Failed to load new graph")
        return None

    # Only the edge lists are needed from here on; drop the graph objects.
    old_edges = [(src, dst) for src, dst in old_graph.iterEdges()]
    new_edges = [(src, dst) for src, dst in new_graph.iterEdges()]
    del old_graph, new_graph
    gc.collect()

    print(f"  Running {NUM_SIMULATIONS} simulations...")
    outcomes = []
    for completed in range(1, NUM_SIMULATIONS + 1):
        outcome = run_single_simulation_networkit(
            completed - 1,
            www_graph,
            www_nodes,
            old_edges,
            new_edges,
            old_nodes,
            new_nodes,
            old_mapping,
            new_mapping,
        )
        if outcome is not None:
            outcomes.append(outcome)

        # Progress line after every 20th simulation.
        if completed % 20 == 0:
            print(f"    Progress: {completed}/{NUM_SIMULATIONS}")

    if not outcomes:
        print("  No valid results")
        return None

    def _average(key):
        # Mean of one statistic across all successful simulations.
        return np.mean([entry[key] for entry in outcomes])

    return {
        "filename": new_name,
        "avg_mean_delta_pct": _average("mean_delta_pct"),
        "avg_min_delta_pct": _average("min_delta_pct"),
        "avg_max_delta_pct": _average("max_delta_pct"),
        "sim_results": outcomes,
    }


def validate_results(result):
    """Check that a file's averaged deltas satisfy max >= mean >= min.

    The comparison allows a tiny relative tolerance: a mean of (nearly)
    identical floating-point values can land one rounding step above the
    maximum or below the minimum, and such results are still valid.

    Args:
        result: Dict with ``filename``, ``avg_max_delta_pct``,
            ``avg_mean_delta_pct`` and ``avg_min_delta_pct``, or ``None``.

    Returns:
        ``True`` when the ordering holds (within tolerance); ``False`` when
        ``result`` is ``None``, the ordering is violated, or a value is NaN.
        A warning is printed for violated orderings.
    """
    if result is None:
        return False

    avg_max = result["avg_max_delta_pct"]
    avg_mean = result["avg_mean_delta_pct"]
    avg_min = result["avg_min_delta_pct"]

    # Tolerance scaled to the magnitude of the values (at least 1e-9).
    # 1.0 is listed first so a NaN argument can never become the maximum.
    slack = 1e-9 * max(1.0, abs(avg_max), abs(avg_mean), abs(avg_min))
    ordered = (avg_max + slack >= avg_mean) and (avg_mean + slack >= avg_min)

    # Comparisons involving NaN are False, so NaN results are rejected here.
    if not ordered:
        print(f"  WARNING: Validation failed for {result['filename']}")
        print(f"     Max: {avg_max:.2f}%, Mean: {avg_mean:.2f}%, Min: {avg_min:.2f}%")
        return False

    return True


# ============================================
# MAIN EXECUTION
# ============================================

if __name__ == "__main__":
    # Driver: load the WWW graph once, compare every CSV in COMPARISON_FOLDER
    # with the baseline, then print per-file results and overall averages.
    print("=" * 70)
    print("PAGERANK ANALYSIS WITH REAL WWW DATA (FineWeb)")
    print("=" * 70)

    # Validate paths. SystemExit is raised directly because the `exit` helper
    # is installed by the `site` module for interactive shells and is not
    # meant to be used from program code.
    if not os.path.exists(BASELINE_PATH):
        print(f"\nERROR: Baseline file not found: {BASELINE_PATH}")
        raise SystemExit(1)

    if not os.path.exists(FINEWEB_WWW_PATH):
        print(f"\nERROR: FineWeb WWW file not found: {FINEWEB_WWW_PATH}")
        raise SystemExit(1)

    # The folder is listed below, so it must be a directory, not just exist.
    if not os.path.isdir(COMPARISON_FOLDER):
        print(f"\nERROR: Comparison folder not found: {COMPARISON_FOLDER}")
        raise SystemExit(1)

    print(f"\nWWW Graph Source: {os.path.basename(FINEWEB_WWW_PATH)}")
    print(f"Baseline: {os.path.basename(BASELINE_PATH)}")

    # Load REAL WWW graph (only once, then cached)
    www_graph, www_nodes = load_www_graph_networkit(FINEWEB_WWW_PATH)

    # Find comparison files (sorted for a reproducible processing order)
    csv_files = sorted([f for f in os.listdir(COMPARISON_FOLDER) if f.endswith(".csv")])

    if not csv_files:
        print(f"\nERROR: No CSV files found in {COMPARISON_FOLDER}")
        raise SystemExit(1)

    print(f"\nFound {len(csv_files)} CSV files in comparison folder")
    print("=" * 70)

    results = []
    all_simulation_results = []

    for new_csv_filename in csv_files:
        new_csv_path = os.path.join(COMPARISON_FOLDER, new_csv_filename)
        result = analyze_csv_pair(www_graph, www_nodes, BASELINE_PATH, new_csv_path)

        # Only files that loaded, produced results and passed validation
        # contribute to the tables below.
        if result is not None and validate_results(result):
            results.append(result)
            all_simulation_results.extend(result["sim_results"])
            # "\u2713" is the check mark; written as an escape so the source
            # file's encoding cannot garble it.
            print("  \u2713 Valid results obtained")

    print("\n" + "=" * 70)
    print("INDIVIDUAL FILE RESULTS")
    print("=" * 70)

    if not results:
        print("\nNo valid results obtained")
    else:
        print(f"\n{'File':<50} {'Avg Mean':<12} {'Avg Min':<12} {'Avg Max':<12}")
        print("-" * 90)

        for result in results:
            print(
                f"{result['filename']:<50} "
                f"{result['avg_mean_delta_pct']:>10.2f}% "
                f"{result['avg_min_delta_pct']:>10.2f}% "
                f"{result['avg_max_delta_pct']:>10.2f}%"
            )

        print("-" * 90)
        print(f"\nSuccessfully analyzed {len(results)}/{len(csv_files)} files")

    # Calculate overall averages over every simulation of every valid file
    if all_simulation_results:
        print("\n" + "=" * 70)
        print("OVERALL AVERAGES")
        print("=" * 70)

        overall_mean = np.mean([r["mean_delta_pct"] for r in all_simulation_results])
        overall_min = np.mean([r["min_delta_pct"] for r in all_simulation_results])
        overall_max = np.mean([r["max_delta_pct"] for r in all_simulation_results])

        print(f"\nTotal simulations across all files: {len(all_simulation_results)}")
        print(f"Total files analyzed: {len(results)}")
        print(f"\nOverall Average Mean Delta: {overall_mean:>10.2f}%")
        print(f"Overall Average Min Delta:  {overall_min:>10.2f}%")
        print(f"Overall Average Max Delta:  {overall_max:>10.2f}%")

        # Additional statistics on the per-simulation mean deltas
        all_means = [r["mean_delta_pct"] for r in all_simulation_results]
        print(f"\nStandard Deviation (Mean): {np.std(all_means):>10.2f}%")
        print(f"Median (Mean):             {np.median(all_means):>10.2f}%")

    # Drop the cached WWW graph so its memory can be reclaimed.
    _www_graph_cache = None
    gc.collect()
    print("\n" + "=" * 70)
    print("Analysis complete!")
    print("=" * 70)