In [1]:
import os
import networkx as nx
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [2]:
def calculate_distance_metrics(G: nx.Graph) -> list:
    """
    Collect the shortest-path lengths between the nodes of the graph G.

    Every node of G is used in turn as the source of
    ``nx.shortest_path_length``; each non-zero length found in the returned
    mapping is appended to the result. Because every node acts as a source,
    a pair of mutually reachable nodes contributes one entry per direction.

    Args:
        G: Graph whose pairwise distances are collected.

    Returns:
        A flat list of the non-zero shortest-path lengths (not a dict).
        The list is empty when no node can reach any other node.
    """
    distances = []
    for node in G.nodes():
        lengths_from_node = nx.shortest_path_length(G, source=node)
        # A length of 0 is the source node itself; it is not a distance
        # between two distinct nodes, so it is left out.
        distances.extend(d for d in lengths_from_node.values() if d != 0)
    return distances

def plot_CCDF(data: list, title: str, xlabel: str, ylabel: str, save_path: str) -> None:
    """
    Plot the CCDF of a given array on log-log axes and save the figure.

    The x coordinate of each point is an observed value (sorted ascending)
    and the y coordinate is the matching entry returned by ``CCDF``, which is
    aligned with the sorted data. Plotting against the values rather than
    their rank keeps the x axis consistent with ``xlabel``.

    Note that logarithmic axes cannot display non-positive numbers, so
    observations equal to 0 (and CCDF entries equal to 0) do not appear.

    Args:
        data: Observations to plot.
        title: Figure title.
        xlabel: Label of the x axis (the quantity being measured).
        ylabel: Label of the y axis.
        save_path: File the figure is written to. Missing parent directories
            are created; a bare file name is saved in the working directory.
    """
    ccdf = CCDF(data)
    # CCDF() sorts its input, so sort here as well to pair each value with
    # its own CCDF entry.
    values = np.sort(np.asarray(data, dtype=float).ravel())

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.loglog(values, ccdf, 'o', markersize=3)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    fig.savefig(save_path)

def CCDF(data: list) -> np.ndarray:
    """
    Calculate the empirical CCDF of a given array.

    For each observation x (taken in ascending order) the result holds the
    fraction of observations strictly greater than x, i.e. P(X > x). Tied
    observations therefore share the same value, and the entry for the
    maximum is always 0.

    The previous implementation returned ``1 - cumsum(values) / sum(values)``,
    which is the complementary share of the *total*, not a probability, and
    produced NaN when every value was 0.

    Args:
        data: Observations (any array-like of numbers). It is not modified.

    Returns:
        A float array with one entry per observation, aligned with the
        ascending sort of ``data``. Empty input gives an empty array.
    """
    values = np.sort(np.asarray(data, dtype=float).ravel())
    count = values.size
    if count == 0:
        return np.empty(0, dtype=float)
    # side='right' counts every observation <= x, so ties are handled
    # together and 1 - (that fraction) is the share strictly above x.
    at_or_below = np.searchsorted(values, values, side='right')
    return 1.0 - at_or_below / count

def generate_metrics_statistics(metric_name: str, data_list: list) -> str:
    """
    Generate the statistics of a metric in the graph. Return in a string format.

    The report starts with a ``### <metric_name> Analysis ###`` header and is
    followed by one ``Label: value`` line each for the average, median,
    standard deviation, variance, maximum, minimum and sum of the data.

    Args:
        metric_name: Name shown in the header line.
        data_list: Numeric observations of the metric.

    Returns:
        The multi-line report. When ``data_list`` is empty the report holds
        the header and a "No data available" line, because the NumPy
        reductions used here are undefined (or raise) on empty input.
    """
    result = f"### {metric_name} Analysis ###\n"
    if len(data_list) == 0:
        result += "No data available\n"
        return result
    result += f"Average: {np.mean(data_list)}\n"
    result += f"Median: {np.median(data_list)}\n"
    result += f"Standard Deviation: {np.std(data_list)}\n"
    result += f"Variance: {np.var(data_list)}\n"
    result += f"Max: {np.max(data_list)}\n"
    result += f"Min: {np.min(data_list)}\n"
    # This line used to be labelled "CCDF", but the value is the plain sum.
    result += f"Sum: {np.sum(data_list)}\n"
    return result

def generate_graph_metrics(G: nx.Graph, save_path: str ='') -> dict:
    """
    Compute the per-graph metric lists for G and pickle them to disk.

    The returned dictionary maps a metric name to a list of values:
    "Degree", "Distances", "SizeConnectedComponent" (component sizes in
    descending order; strongly connected components when G is directed),
    "Clustering", "Betweenness" and "Closeness".

    The same dictionary is written to ``metrics.pickle`` inside ``save_path``.
    When ``save_path`` is the empty string, the directory
    ``<cwd>/results/<G.name with spaces replaced by underscores>`` is used.
    The directory is created if it does not exist.
    """
    # Pick the component finder that matches the graph type; it is only
    # called further down so the metrics are evaluated in their listed order.
    if G.is_directed():
        find_components = nx.strongly_connected_components
    else:
        find_components = nx.connected_components

    metrics = {
        "Degree": list(dict(G.degree()).values()),
        "Distances": calculate_distance_metrics(G),
        'SizeConnectedComponent': sorted(
            (len(component) for component in find_components(G)),
            reverse=True,
        ),
        "Clustering": list(nx.clustering(G).values()),
        "Betweenness": list(nx.betweenness_centrality(G).values()),
        "Closeness": list(nx.closeness_centrality(G).values()),
    }

    # Persist the raw metric lists next to the other results of this graph
    target_dir = save_path
    if target_dir == '':
        graph_folder = G.name.replace(' ', '_')
        target_dir = os.path.join(os.getcwd(), 'results', graph_folder)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    pickle_path = os.path.join(target_dir, 'metrics.pickle')
    with open(pickle_path, 'wb') as handle:
        pickle.dump(metrics, handle)

    return metrics

def generate_metrics_statistics_file(G: nx.Graph, filename: str = '') -> None:
    """
    Generate the metrics statistics of the graph G and save them in a file.

    The metrics come from ``generate_graph_metrics(G)``. For each metric the
    text produced by ``generate_metrics_statistics`` is appended to the
    report, followed by two newlines. Afterwards one CCDF plot per metric is
    saved through ``plot_CCDF``.

    Args:
        G: Graph to analyse. ``G.name`` (spaces replaced by underscores)
            names the results folder ``<cwd>/results/<name>``.
        filename: Path of the text report. When empty, the report is written
            to ``metrics_statistics.txt`` inside the results folder. A bare
            file name is written to the working directory.

    The plots are always saved as ``<metric>_ccdf.png`` in the ``Images``
    sub-folder of the results folder, regardless of ``filename``.
    """
    metrics = generate_graph_metrics(G)
    graph_dir = os.path.join(os.getcwd(), 'results', G.name.replace(' ', '_'))

    report = "".join(
        generate_metrics_statistics(metric_name, metric_data) + "\n\n"
        for metric_name, metric_data in metrics.items()
    )

    if filename == '':
        filename = os.path.join(graph_dir, 'metrics_statistics.txt')
    # dirname() is '' for a bare file name, and os.makedirs('') raises,
    # so only create the parent directory when there is one.
    report_dir = os.path.dirname(filename)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(filename, 'w') as f:
        f.write(report)

    # Generate the CCDF plots
    images_dir = os.path.join(graph_dir, 'Images')
    for metric_name, metric_data in metrics.items():
        image_name = metric_name.replace(' ', '_') + '_ccdf.png'
        plot_CCDF(metric_data, metric_name, metric_name, 'CCDF', os.path.join(images_dir, image_name))


In [None]:
# Autonomous System data: every file in data/as is read as an edge list
files = os.listdir('data/as')

G = nx.Graph()
G.name = 'Autonomous System'

# Merge the edges of all files into one undirected graph
len_files = len(files)
for i, file in enumerate(files, start=1):
    print('Reading file: ', i, 'Out of ', len_files, end='\r')
    part = nx.read_edgelist(f'data/as/{file}', create_using=nx.Graph(), nodetype=int)
    G.add_edges_from(part.edges())

edge_count, node_count = G.number_of_edges(), G.number_of_nodes()
print("\nEdges: ", edge_count, "Nodes: ", node_count)

# Write the statistics report and the CCDF plots for the merged graph
print('Generating metrics statistics...')
generate_metrics_statistics_file(G)
print('Done!')

In [None]:
# Wikipedia link data: a single edge-list file, loaded as a directed graph
file = 'data/wikispeedia_paths-and-graph/links.tsv'

G = nx.read_edgelist(file, nodetype=str, create_using=nx.DiGraph())
G.name = 'Wikipedia'

node_count, edge_count = G.number_of_nodes(), G.number_of_edges()
print(node_count, edge_count)

# Write the statistics report and the CCDF plots for this graph
print('Generating metrics statistics...')
generate_metrics_statistics_file(G)
print('Done!')

In [None]:
# twitter data of only the edges
# Upper bound on how many '.edges' files are merged into the graph.
MAX_TWITTER_FILES = 100

# os.listdir returns names in arbitrary order, so sort before truncating.
# Otherwise the selected subset, and every statistic computed from it,
# can differ between runs and machines.
files = sorted(file for file in os.listdir('data/twitter') if file.endswith('.edges'))
files = files[:MAX_TWITTER_FILES]

G = nx.DiGraph()
G.name = 'Twitter'

# read the graph: merge the edges of every selected file into one directed graph
len_files = len(files)
for i, file in enumerate(files, start=1):
    print('Reading file: ', i, 'Out of ', len_files, end='\r')
    part = nx.read_edgelist('data/twitter/' + file, create_using=nx.DiGraph(), nodetype=int)
    G.add_edges_from(part.edges())

print(G.number_of_nodes(), G.number_of_edges())

# generate the metrics statistics
print('Generating metrics statistics...')
generate_metrics_statistics_file(G)
print('Done!')