In [40]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib as plt

In [41]:
# Read the HepPh node and edge tables from their CSV files.
nodes = pd.read_csv('HepPh/nodes.csv')
edges = pd.read_csv('HepPh/edges.csv')

# Quick look at what was loaded: edge table shape and column names, then node table shape.
print(edges.shape, edges.columns)
print(nodes.shape)

(421578, 2) Index(['source', 'target'], dtype='object')
(34546, 4)


In [42]:
# Build the directed graph: every row of the edge table becomes a source -> target edge.
G = nx.DiGraph()
edge_list = edges[['source', 'target']].values.tolist()
G.add_edges_from(edge_list)

# Attach the per-node metadata columns. Nodes are keyed by the 'index' column;
# the '_pos' column is stored under the attribute name 'pos'.
for _, row in nodes.iterrows():
    G.add_node(
        row['index'],
        name=row['name'],
        date=row['date'],
        pos=row['_pos'],
    )

# Report the final size of the graph.
print('Number of edges: ', G.number_of_edges())
print('Number of nodes: ', G.number_of_nodes())

Number of edges:  421578
Number of nodes:  34546


In [None]:
import numpy as np

# Function for random selection based on in-degree
def ran_indeg_strategy(G, num_pivots):
    """
    Randomly select pivots, favouring nodes with a high in-degree.

    Nodes are drawn without replacement with probability proportional to
    their in-degree. If more pivots are requested than there are nodes with a
    positive in-degree (or the graph has no edges at all), every
    positive-in-degree node is selected and the remaining pivots are drawn
    uniformly from the zero-in-degree nodes, instead of failing inside
    ``np.random.choice``.

    Parameters:
    - G: graph exposing ``nodes`` and ``in_degree()`` (e.g. a NetworkX DiGraph)
    - num_pivots: Number of distinct pivots to select

    Returns:
    - NumPy array containing ``num_pivots`` distinct pivot nodes

    Raises:
    - ValueError: if ``num_pivots`` is negative or larger than the node count
    """
    # Get the in-degree of each node
    indegrees = dict(G.in_degree())
    all_nodes = list(G.nodes)

    if not 0 <= num_pivots <= len(all_nodes):
        raise ValueError(
            f"num_pivots must be between 0 and {len(all_nodes)}, got {num_pivots}"
        )

    weights = np.array([indegrees[node] for node in all_nodes], dtype=float)
    weighted_idx = np.flatnonzero(weights > 0)

    if weighted_idx.size > 0 and num_pivots <= weighted_idx.size:
        # Enough candidates with non-zero weight: sample positions with
        # probability proportional to in-degree.
        chosen = np.random.choice(
            len(all_nodes), size=num_pivots, replace=False, p=weights / weights.sum()
        )
    else:
        # Sampling without replacement can never return a zero-probability
        # node, so take every weighted node and fill the remainder uniformly
        # from the zero-in-degree nodes.
        unweighted_idx = np.flatnonzero(weights <= 0)
        filler = np.random.choice(
            unweighted_idx, size=num_pivots - weighted_idx.size, replace=False
        )
        chosen = np.concatenate([weighted_idx, filler])

    return np.array(all_nodes)[chosen]

In [None]:
# Function for random selection based on out-degree
def ran_outdeg_strategy(G, num_pivots):
    """
    Randomly select pivots, favouring nodes with a high out-degree.

    Nodes are drawn without replacement with probability proportional to
    their out-degree. If more pivots are requested than there are nodes with a
    positive out-degree (or the graph has no edges at all), every
    positive-out-degree node is selected and the remaining pivots are drawn
    uniformly from the zero-out-degree nodes, instead of failing inside
    ``np.random.choice``.

    Parameters:
    - G: graph exposing ``nodes`` and ``out_degree()`` (e.g. a NetworkX DiGraph)
    - num_pivots: Number of distinct pivots to select

    Returns:
    - NumPy array containing ``num_pivots`` distinct pivot nodes

    Raises:
    - ValueError: if ``num_pivots`` is negative or larger than the node count
    """
    # Get the out-degree of each node
    outdegrees = dict(G.out_degree())
    all_nodes = list(G.nodes)

    if not 0 <= num_pivots <= len(all_nodes):
        raise ValueError(
            f"num_pivots must be between 0 and {len(all_nodes)}, got {num_pivots}"
        )

    weights = np.array([outdegrees[node] for node in all_nodes], dtype=float)
    weighted_idx = np.flatnonzero(weights > 0)

    if weighted_idx.size > 0 and num_pivots <= weighted_idx.size:
        # Enough candidates with non-zero weight: sample positions with
        # probability proportional to out-degree.
        chosen = np.random.choice(
            len(all_nodes), size=num_pivots, replace=False, p=weights / weights.sum()
        )
    else:
        # Sampling without replacement can never return a zero-probability
        # node, so take every weighted node and fill the remainder uniformly
        # from the zero-out-degree nodes.
        unweighted_idx = np.flatnonzero(weights <= 0)
        filler = np.random.choice(
            unweighted_idx, size=num_pivots - weighted_idx.size, replace=False
        )
        chosen = np.concatenate([weighted_idx, filler])

    return np.array(all_nodes)[chosen]

In [47]:
def pagerank_pivot_selection(G, num_pivots):
    """
    Pick the nodes with the highest PageRank scores as pivots.

    Parameters:
    - G: NetworkX directed graph (DiGraph)
    - num_pivots: How many pivots to return

    Returns:
    - List of the `num_pivots` best-scoring nodes, highest score first
    """
    scores = nx.pagerank(G)

    # Rank (node, score) pairs by score, best first; the sort is stable, so
    # nodes with equal scores keep the order in which PageRank reported them.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Keep only the node of each leading pair.
    return [node for node, _ in ranked[:num_pivots]]

In [48]:
# Example: the three nodes with the highest PageRank score.
pivots = pagerank_pivot_selection(G, num_pivots=3)
pivots

[3892, 2274, 9250]

In [None]:

def estimate_closeness_centrality(G, pivots):
    """
    Estimate the closeness centrality of every node from a sample of pivots.

    For each node the estimate is the reciprocal of its average shortest-path
    distance from the pivots (paths run from the pivot to the node). One
    breadth-first search is run per pivot rather than one shortest-path query
    per (node, pivot) pair.

    Pivots that cannot reach a node are left out of that node's average, so
    disconnected pairs no longer raise ``NetworkXNoPath``. When every pivot
    reaches a node, the average is taken over all pivots.

    Parameters:
    - G: NetworkX graph
    - pivots: Iterable of pivot nodes (each must be a node of G)

    Returns:
    - Dict mapping every node of G to its estimated closeness. The value is 0
      when no pivot reaches the node, or when the average distance is 0
      (the node's only reaching pivot is the node itself).
    """
    distance_sums = dict.fromkeys(G.nodes, 0)
    reach_counts = dict.fromkeys(G.nodes, 0)

    for pivot in pivots:
        # A single BFS yields the distance from this pivot to every node it can reach.
        reachable = dict(nx.single_source_shortest_path_length(G, pivot))
        for node, distance in reachable.items():
            distance_sums[node] += distance
            reach_counts[node] += 1

    closeness_estimates = {}
    for node in G.nodes:
        reached_by = reach_counts[node]
        avg_distance = distance_sums[node] / reached_by if reached_by else 0
        closeness_estimates[node] = 1 / avg_distance if avg_distance > 0 else 0
    return closeness_estimates

In [None]:
# Euclidean distance between exact and estimated centrality
def euclidean_distance(exact, estimated):
    """
    Return the Euclidean (L2) distance between two centrality vectors.

    Parameters:
    - exact: Reference values, either a dict keyed by node or an array-like
    - estimated: Values to compare, in the same form as `exact`

    Returns:
    - The L2 norm of the element-wise difference

    Raises:
    - KeyError: if both inputs are dicts and `estimated` lacks a key of `exact`

    When both inputs are dicts (the form returned by the centrality functions),
    values are aligned by the keys of `exact`, so the insertion order of the
    two dicts does not matter.
    """
    if isinstance(exact, dict) and isinstance(estimated, dict):
        keys = list(exact)
        exact = [exact[key] for key in keys]
        estimated = [estimated[key] for key in keys]

    # asarray also lets plain Python sequences be subtracted element-wise.
    difference = np.asarray(exact, dtype=float) - np.asarray(estimated, dtype=float)
    return np.linalg.norm(difference)

In [54]:
def run_experiment(G, pivot_strategy, runs = 20, exact_closeness=None):
    """
    Measure how well pivot-based closeness estimates match the exact values.

    For each run and each pivot count (5%, 10%, ..., 100% of the nodes), pivots
    are chosen with `pivot_strategy`, closeness is estimated from them, and the
    Euclidean distance to the exact closeness vector is recorded.

    Parameters:
    - G: NetworkX graph
    - pivot_strategy: Callable ``(G, num_pivots) -> pivots``
    - runs: Number of repetitions of the whole pivot-count sweep
    - exact_closeness: Optional precomputed dict ``node -> exact closeness``.
      When omitted it is computed with ``nx.closeness_centrality(G)``, which
      can be very slow on large graphs; pass a cached result to skip that step.

    Returns:
    - List of dicts with keys "num_pivots" and "euclidean_distance", one per
      (run, pivot count) combination
    """
    if exact_closeness is None:
        print('Calculating Closeness centrality')
        exact_closeness = nx.closeness_centrality(G)

    # Fix one node order so exact and estimated values line up element-wise.
    node_order = list(G.nodes)
    exact_vector = np.array([exact_closeness[node] for node in node_order], dtype=float)

    # Integer arithmetic avoids float rounding. On graphs with fewer than 20
    # nodes, clamp to at least one pivot and drop duplicate counts.
    num_nodes = G.number_of_nodes()
    if num_nodes:
        num_pivots_list = sorted({max(1, num_nodes * i // 20) for i in range(1, 21)})
    else:
        num_pivots_list = []

    results = []
    for _ in range(runs):
        for num_pivots in num_pivots_list:
            pivots = pivot_strategy(G, num_pivots)
            estimated_closeness = estimate_closeness_centrality(G, pivots)
            estimated_vector = np.array(
                [estimated_closeness[node] for node in node_order], dtype=float
            )
            euclidean_dist = euclidean_distance(exact_vector, estimated_vector)
            results.append({
                "num_pivots": num_pivots,
                "euclidean_distance": float(euclidean_dist)
            })
    return results

# Run the experiment with PageRank-based pivot selection and the default number of runs.
results = run_experiment(G, pivot_strategy=pagerank_pivot_selection)
results

Calculating Closeness centrality


KeyboardInterrupt: 