In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import json
import glob
import multiprocessing
import matplotlib.pyplot as plt
import os

In [None]:
from mult_func import get_neighbors, sample_network, pick_ego_network

In [None]:
# Every entry directly under ./Data is treated as one data source.
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort so
# that the sources are always visited in the same order. The random draws made
# while sampling are consumed in visiting order, so a stable order is needed
# for the fixed seed to reproduce the same samples on a re-run.
data_sources = sorted(glob.glob("./Data/*"))
data_sources

In [None]:
# Map each data-source directory name to the edge-list CSVs it contains
# (files whose name contains "edges" and ends in ".csv").
# The per-directory lists are sorted because glob.glob gives no ordering
# guarantee, and the files are later processed in list order.
ds_to_files = {
    os.path.basename(data_src): sorted(glob.glob(os.path.join(data_src, "*edges*.csv")))
    for data_src in data_sources
}
ds_to_files

In [None]:
# Create the root output directory if it is missing. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test, and raises
# FileExistsError if "./outputs" exists but is not a directory instead of
# letting later writes fail.
os.makedirs("./outputs", exist_ok=True)

In [None]:
# Fix the state of NumPy's global legacy RNG so that np.random.* draws made in
# this process repeat when the notebook is re-run from a fresh kernel.
np.random.seed(seed=123)

In [None]:
def read_network_edges(in_source_file):
    """Load an edge list from CSV and validate its node indexing.

    The first two columns are taken as the edge endpoints and renamed to
    "Source" and "Target". Any further columns (weights, attributes, ...) are
    kept as they are and are ignored by the index check.

    Parameters
    ----------
    in_source_file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(df, num_nodes, num_edges)``: the edge DataFrame, the number of
        distinct ids appearing in either endpoint column, and the number of
        rows.

    Raises
    ------
    ValueError
        If there are no edges, or the smallest endpoint id is not 0, or the
        largest endpoint id is not ``num_nodes - 1``.
    """
    df = pd.read_csv(in_source_file)
    df = df.rename(columns={df.columns[0]: "Source", df.columns[1]: "Target"})
    num_edges = df.shape[0]
    num_nodes = len(set(df["Source"].unique()).union(df["Target"].unique()))

    # Look at the endpoint columns only: taking the max over the whole frame
    # would let an unrelated column (e.g. an edge weight) fail the check.
    endpoints = df[["Source", "Target"]]
    min_id = endpoints.min().min()
    max_id = endpoints.max().max()
    if num_nodes == 0 or min_id != 0 or max_id != num_nodes - 1:
        raise ValueError("ERROR: Node index is incompatible")

    # The largest valid id is num_nodes - 1, not num_nodes.
    print(f"Nodes indexed 0 to {num_nodes - 1}")
    return df, num_nodes, num_edges

In [None]:
def generate_samples(in_edges_df, in_num_nodes, in_num_networks, in_sampled_net_size):
    """Sample several sub-networks in parallel, one per randomly chosen root.

    Parameters
    ----------
    in_edges_df : pandas.DataFrame
        Edge list handed unchanged to ``sample_network``.
    in_num_nodes : int
        Roots are drawn from ``range(in_num_nodes)``.
    in_num_networks : int
        Number of distinct roots to draw (without replacement), and therefore
        the number of ``sample_network`` calls.
    in_sampled_net_size : int
        Passed unchanged to ``sample_network`` as its third argument.

    Returns
    -------
    pandas.DataFrame
        One row per call, with columns "root", "cc_mean", "cc_std", "nodes"
        and "G", and an index named "idxOld". The meaning of each column is
        whatever ``sample_network`` returns in that position.

    Raises
    ------
    ValueError
        From ``np.random.choice`` when ``in_num_networks > in_num_nodes``.
    """
    # Roots come from NumPy's global RNG in this process.
    # NOTE(review): if sample_network itself draws from the global NumPy RNG,
    # forked workers may start from identical RNG state - confirm in mult_func.
    roots = np.random.choice(in_num_nodes, size=in_num_networks, replace=False)
    param_list = [(r, in_edges_df, in_sampled_net_size) for r in roots]

    # Leave one core free, but never ask for zero workers: on a single-core
    # host cpu_count() - 1 == 0 and Pool(0) raises ValueError.
    num_workers = max(1, multiprocessing.cpu_count() - 1)
    with multiprocessing.Pool(num_workers) as P:
        net_list = P.starmap(sample_network, param_list)

    net_list = pd.DataFrame(net_list, columns=["root","cc_mean","cc_std", "nodes", "G"])
    net_list.index.name = "idxOld"

    return net_list

In [None]:
def check_isomorphism(in_net_list):
    """Group sampled graphs that could be isomorphic to an earlier sample.

    Each graph is compared with every later graph using NetworkX's three
    increasingly strict screening tests (``faster_could_be_isomorphic``,
    ``fast_could_be_isomorphic``, ``could_be_isomorphic``). These are
    necessary conditions only: a pair that passes all three is treated as a
    duplicate here, but no exact ``nx.is_isomorphic`` check is performed.

    Parameters
    ----------
    in_net_list : pandas.DataFrame
        Must have a "G" column holding the graphs. Positions (0-based row
        numbers) are used as graph identifiers.

    Returns
    -------
    tuple
        ``(similar_graphs, skip_graphs)`` where ``similar_graphs`` maps the
        position of the first graph of a group to the list of later positions
        matching it, and ``skip_graphs`` is the set of all those later
        positions.
    """
    # Pull the graphs out once; indexing the DataFrame row by row inside the
    # nested loops builds a fresh row Series on every access.
    graphs = list(in_net_list["G"])
    num_graphs = len(graphs)

    similar_graphs = {}
    skip_graphs = set()
    for idxA in range(num_graphs):
        # A graph already recorded as a duplicate cannot start a new group:
        # all three tests compare sorted invariants for equality, so anything
        # matching it has already matched the group's first graph.
        if idxA in skip_graphs:
            continue
        graphA = graphs[idxA]
        for idxB in range(idxA + 1, num_graphs):
            if idxB in skip_graphs:
                continue
            graphB = graphs[idxB]
            # Cheapest test first; stop at the first one that rules the pair out.
            if not nx.faster_could_be_isomorphic(graphA, graphB):
                continue
            print("Degree Sequences are equal: ", idxA, idxB)
            if not nx.fast_could_be_isomorphic(graphA, graphB):
                continue
            print("Triangle Sequences are equal: ", idxA, idxB)
            if not nx.could_be_isomorphic(graphA, graphB):
                continue
            print("Clique Sequences are equal: ", idxA, idxB)
            similar_graphs.setdefault(idxA, []).append(idxB)
            skip_graphs.add(idxB)
    return similar_graphs, skip_graphs

In [None]:
# Number of roots drawn (i.e. samples requested) per input file and per size.
# Samples flagged as possible duplicates are dropped afterwards, so fewer may
# actually be written out.
num_networks_per_type = 30
for data_src in ds_to_files:
    for data_file in ds_to_files[data_src]:
        print(data_file)
        # Strip the extension by name rather than by a fixed character count.
        data_file_name = os.path.splitext(os.path.basename(data_file))[0]
        out_dir = f"./outputs/{data_file_name}"
        os.makedirs(out_dir, exist_ok=True)
        df, num_nodes, num_edges = read_network_edges(data_file)
        for net_size in [100, 200, 300, 500]:
            net_list = generate_samples(df, num_nodes, num_networks_per_type, net_size)
            similar_graphs, skip_graphs = check_isomorphism(net_list)
            # skip_graphs holds row positions; net_list still has its default
            # 0..n-1 index here, so they are valid labels for drop().
            net_list = net_list.drop(sorted(skip_graphs)).reset_index(drop=True).rename_axis("idx")
            # save info csv
            net_list[["root","cc_mean","cc_std"]].to_csv(f"{out_dir}/info_{data_file_name}_n{net_size}.csv")
            # save histogram
            # Draw on an explicit figure so that histograms from earlier
            # iterations can never end up on the same axes.
            fig, ax = plt.subplots()
            net_list["cc_mean"].hist(ax=ax)
            # Report the number of samples actually plotted (after duplicate
            # removal), not the number requested.
            ax.set_title("#nodes:{} #samples:{} , cc_mean ~ N({:.2f}, {:.2f})".format(net_size, net_list.shape[0], net_list["cc_mean"].mean(), net_list["cc_mean"].std()))
            ax.set_xlabel(f"Mean Clustering Coefficient for samples from {data_file_name}")
            ax.set_ylabel("Count")
            fig.tight_layout()
            fig.savefig(f"{out_dir}/cchist_{data_file_name}_n{net_size}.png")
            plt.show()
            # Release the figure; one is created per file and size.
            plt.close(fig)
            # save each edges csv file
            for idx, graph in enumerate(net_list["G"]):
                nx.write_edgelist(graph, f"{out_dir}/edges_{data_file_name}_n{net_size}_i{idx}.csv", delimiter=",", data=False)