In [37]:
import os
import networkx as nx
import numpy as np

In [None]:
def calculate_distance_metrics(G: nx.Graph) -> list:
    """
    Collect the shortest-path lengths between nodes of the graph G.

    For every node, the shortest-path length to each node reachable from it
    is computed with ``nx.shortest_path_length``; zero-length entries (the
    source node itself) are dropped.

    Args:
        G: Graph whose pairwise distances are collected.

    Returns:
        A flat list with one path length per (source, reachable target)
        pair. Each ordered pair contributes its own entry, so in an
        undirected graph every pair of nodes appears twice.
    """
    distances = []
    for source in G.nodes():
        # Mapping target -> hop count for everything reachable from `source`.
        lengths_from_source = nx.shortest_path_length(G, source=source)
        # A length of 0 is the source itself; it is not a real distance.
        distances.extend(d for d in lengths_from_source.values() if d != 0)
    return distances

def CCDF(G: nx.Graph) -> np.ndarray:
    """
    Calculate the (unnormalised) complementary cumulative degree distribution of G.

    Args:
        G: Graph whose node degrees are analysed.

    Returns:
        A 2-column array with one row per distinct degree, in ascending
        degree order. Column 0 is the degree value k, column 1 is the number
        of nodes whose degree is >= k. Counts are raw node counts; divide by
        the number of nodes to obtain a probability. An empty graph yields an
        array of shape (0, 2).
    """
    degrees = [degree for _, degree in G.degree()]
    # np.unique already returns the distinct values sorted ascending, so no
    # explicit sort of the degree sequence is needed.
    values, counts = np.unique(degrees, return_counts=True)
    # Reverse cumulative sum: tail_counts[i] = counts[i] + counts[i+1] + ...
    tail_counts = np.cumsum(counts[::-1])[::-1]
    return np.column_stack((values, tail_counts))

def generate_metrics_statistics(metric_name: str, data_list: list) -> str:
    """
    Generate summary statistics of a metric. Return in a string format.

    Args:
        metric_name: Name used in the report heading.
        data_list: Non-empty sequence of numeric observations.

    Returns:
        A multi-line string: a heading followed by one ``Label: value`` line
        each for the average, median, standard deviation, variance, max, min
        and sum of ``data_list``.

    Raises:
        ValueError: If ``data_list`` is empty (max/min are undefined).
    """
    # Fail early with a clear message instead of emitting nan warnings and
    # then hitting numpy's ValueError from np.max on an empty sequence.
    if np.size(data_list) == 0:
        raise ValueError(f"Cannot compute {metric_name} statistics: data_list is empty")

    report_lines = [
        f"### {metric_name} Analysis ###",
        f"Average: {np.mean(data_list)}",
        f"Median: {np.median(data_list)}",
        f"Standard Deviation: {np.std(data_list)}",
        f"Variance: {np.var(data_list)}",
        f"Max: {np.max(data_list)}",
        f"Min: {np.min(data_list)}",
        # This value is the plain sum of the observations; it was previously
        # mislabelled "CCDF".
        f"Sum: {np.sum(data_list)}",
    ]
    # Every line, including the last, is newline-terminated.
    return "\n".join(report_lines) + "\n"

def generate_graph_metrics(G: nx.Graph) -> dict:
    """
    Generate the metrics of the graph G.

    Args:
        G: Graph to analyse.

    Returns:
        A dict mapping each metric name ("Degree", "Distances",
        "Betweenness", "Closeness", "Eigenvector", "PageRank") to the
        formatted statistics string produced by
        ``generate_metrics_statistics`` for that metric's per-node (or, for
        distances, per-pair) values.
    """
    result = {}
    result["Degree"] = generate_metrics_statistics("Degree", list(dict(G.degree()).values()))
    # Heading now matches the metric; it previously read "Clustering".
    result["Distances"] = generate_metrics_statistics("Distances", calculate_distance_metrics(G))
    result["Betweenness"] = generate_metrics_statistics("Betweenness", list(nx.betweenness_centrality(G).values()))
    result["Closeness"] = generate_metrics_statistics("Closeness", list(nx.closeness_centrality(G).values()))
    result["Eigenvector"] = generate_metrics_statistics("Eigenvector", list(nx.eigenvector_centrality(G).values()))
    result["PageRank"] = generate_metrics_statistics("PageRank", list(nx.pagerank(G).values()))
    return result


In [None]:
# all as data
as_dir = 'data/as'
# os.listdir returns entries in arbitrary order; sorting makes the merge order
# (and therefore node/edge insertion order in G) the same on every run.
# Non-regular entries (e.g. checkpoint directories) are skipped because
# read_edgelist cannot open them.
files = sorted(
    name for name in os.listdir(as_dir)
    if os.path.isfile(os.path.join(as_dir, name))
)

G = nx.DiGraph()

# read the graph: merge the edges of every snapshot file into one directed graph
len_files = len(files)
for i, file in enumerate(files, start=1):
    # Progress counter is 1-based: "1 Out of N" ... "N Out of N".
    print('Reading file: ', i, 'Out of ', len_files)
    snapshot = nx.read_edgelist(os.path.join(as_dir, file), create_using=nx.DiGraph(), nodetype=int)
    G.add_edges_from(snapshot.edges())

In [38]:
# Degree analysis: max, min, avg, std and median of the node degrees of G.
degree = [deg for _node, deg in G.degree()]
print('Degree analysis')
degree_summary = (
    ('Max: ', max(degree)),
    ('Min: ', min(degree)),
    ('Avg: ', sum(degree) / len(degree)),
    ('Std: ', np.std(degree)),
    ('Median: ', np.median(degree)),
)
# One "label value" line per statistic, in the order listed above.
for label, value in degree_summary:
    print(label, value)

Degree analysis
Max:  3704
Min:  2
Avg:  11.831259720062208
Std:  63.49473339592536
Median:  6.0


In [61]:
# Distance analysis: max, min, avg, std and median of the shortest-path lengths in G.
# The collection loop is the same as calculate_distance_metrics, so reuse the
# helper instead of keeping a second copy of it in this cell.
distances = calculate_distance_metrics(G)
print('Distance analysis')
print('Max: ', max(distances))
print('Min: ', min(distances))
print('Avg: ', sum(distances) / len(distances))
print('Std: ', np.std(distances))
print('Median: ', np.median(distances))



Distance analysis
Max:  8
Min:  1
Avg:  3.4712252225556175
Std:  0.8476635946952388
Median:  3.0


In [63]:
# Component size analysis: max, min, avg, std and median of the sizes of the
# strongly connected components of G, largest first.
components = sorted(
    (len(component) for component in nx.strongly_connected_components(G)),
    reverse=True,
)
print('Size of connected components analysis')
component_summary = (
    ('Max: ', max(components)),
    ('Min: ', min(components)),
    ('Avg: ', sum(components) / len(components)),
    ('Std: ', np.std(components)),
    ('Median: ', np.median(components)),
)
for label, value in component_summary:
    print(label, value)


Size of connected components analysis
Max:  7716
Min:  7716
Avg:  7716.0
Std:  0.0
Median:  7716.0


In [65]:
# Clustering coefficient analysis: max, min, avg, std and median over all nodes of G.
clustering = nx.clustering(G)
# Materialise the per-node coefficients once; every statistic below reads this list.
clustering_values = list(clustering.values())
print('Clustering coefficient analysis')
print('Max: ', max(clustering_values))
print('Min: ', min(clustering_values))
print('Avg: ', sum(clustering_values) / len(clustering_values))
print('Std: ', np.std(clustering_values))
print('Median: ', np.median(clustering_values))


Clustering coefficient analysis
Max:  1.0
Min:  0
Avg:  0.3915524437675338
Std:  0.41536848103878343
Median:  0.2559602302459445


In [66]:
# Betweenness centrality analysis: max, min, avg, std and median over all nodes of G.
betweenness = nx.betweenness_centrality(G)
# Materialise the per-node scores once; every statistic below reads this list.
betweenness_values = list(betweenness.values())
print('Betweenness centrality analysis')
print('Max: ', max(betweenness_values))
print('Min: ', min(betweenness_values))
print('Avg: ', sum(betweenness_values) / len(betweenness_values))
print('Std: ', np.std(betweenness_values))
print('Median: ', np.median(betweenness_values))


Betweenness centrality analysis
Max:  0.31277529553324546
Min:  0.0
Avg:  0.0003203558753637029
Std:  0.004862849332956846
Median:  0.0
