In [None]:
from notebook_prelude import *

## WL phi feature map distributions

In [None]:
# Output folder for the rendered phi-distribution figures (created if missing).
os.makedirs('tmp/phi-distributions', exist_ok = True)

def print_phi_distribution(title, X_phi, Y, sort_y = True, add_class_label = True, add_class_line = True, figsize = (15, 12), flip_axis = False, draw_vertice_count_line = True, h_lines = [], sns_color_palette = "bright", class_v_line_kwargs=dict(linestyle = 'solid', color = 'black', linewidth = 1, alpha = 0.1), class_h_line_kwargs = dict(linestyle ='solid', linewidth = 1, alpha = 0.6)):
    """Scatter-plot the non-zero entries of feature maps, one subplot per map.

    Every non-zero entry of a feature map becomes one point, colored by the
    class of the graph (row) it belongs to.

    Args:
        title: Figure title.
        X_phi: Non-empty sequence of feature maps (one subplot each). Each map
            must support ``.shape``, ``.nonzero()`` and row indexing with an
            index array. Rows are graphs, columns are phi indices.
        Y: Class label per graph (row).
        sort_y: Stable-sort the graphs by class label before plotting.
        add_class_label: Write the class name at the first graph of each class.
        add_class_line: Draw a line at the first graph of each class.
        figsize: Figure size passed to ``plt.subplots``.
        flip_axis: ``False`` puts the phi index on x and the graph on y;
            ``True`` puts the graph on x and the phi index on y.
        draw_vertice_count_line: Limit the phi axis to the number of
            feature-map columns and draw a red line just below that limit.
        h_lines: Optional; one list per feature map with phi-index positions
            to mark. An item may be a ``(position, color)`` tuple, otherwise
            the palette color at the item's position in its list is used.
            The lines are drawn across the phi axis, i.e. horizontally when
            ``flip_axis`` is True.
        sns_color_palette: Name of the seaborn palette used for the classes.
        class_v_line_kwargs: Line kwargs for the class boundary lines.
        class_h_line_kwargs: Line kwargs for the ``h_lines``.

    Returns:
        ``(fig, ax)`` where ``ax`` is the axes of the last subplot.
    """
    if not len(X_phi):
        raise ValueError('X_phi must contain at least one feature map')

    Y = np.array(Y)
    if sort_y:
        # Use a stable sort algorithm so graphs keep their order within a class
        Y_sorted_indices = np.argsort(Y, kind = 'mergesort')
        Y = Y[Y_sorted_indices]
        X_phi = [phi[Y_sorted_indices] for phi in X_phi]
    labels = Y.tolist()

    # Color index and first graph index per class, in order of first appearance
    clazz_2_color_map = dict()
    occurences = dict()
    for idx, clazz in enumerate(labels):
        if clazz not in clazz_2_color_map:
            clazz_2_color_map[clazz] = len(clazz_2_color_map)
            occurences[clazz] = idx
    colors = [clazz_2_color_map[clazz] for clazz in labels]

    # The palette must cover all classes and the longest list of h_lines
    # (h_lines may be empty, so it must not be indexed unconditionally).
    num_colors = max([len(clazz_2_color_map)] + [len(lines) for lines in h_lines])
    cmap_ = sns.color_palette(sns_color_palette, num_colors)

    ax_labels = ['phi index', 'graph']
    if flip_axis:
        ax_labels = list(reversed(ax_labels))

    # squeeze = False keeps `axes` indexable even for a single feature map
    fig, axes = plt.subplots(ncols = len(X_phi), figsize = figsize, sharey = True, squeeze = False)
    axes = axes.ravel()
    axes[0].set_ylabel(ax_labels[1])

    num_graphs, num_vertices = X_phi[0].shape

    def draw_phi_line(ax, position, **kwargs):
        # Line at a fixed phi index: crosses the graph axis
        return (ax.axhline if flip_axis else ax.axvline)(position, **kwargs)

    def draw_graph_line(ax, position, **kwargs):
        # Line at a fixed graph index: crosses the phi axis
        return (ax.axvline if flip_axis else ax.axhline)(position, **kwargs)

    for h, (phi, ax) in enumerate(zip(X_phi, axes)):
        graph_idxs, phi_idxs = phi.nonzero()
        if flip_axis:
            x, y = graph_idxs, phi_idxs
        else:
            x, y = phi_idxs, graph_idxs

        # Plot marker lines (eg. max phi index of class)
        if len(h_lines):
            for class_idx, hline in enumerate(h_lines[h]):
                if isinstance(hline, tuple):
                    hline, line_color = hline
                else:
                    line_color = cmap_[class_idx]
                draw_phi_line(ax, hline, color = line_color, **class_h_line_kwargs)

        # Color by the graph's class, independent of the axis orientation
        colors_ = [cmap_[colors[graph_idx]] for graph_idx in graph_idxs]
        ax.scatter(x = x, y = y, c = colors_, s = 1)
        ax.set_title('Iteration: {}'.format(h))

    for ax in axes:
        ax.set_xlabel(ax_labels[0])
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

        # Add class occurrence lines / labels at the first graph of each class
        for clazz, occurence_idx in occurences.items():
            if add_class_line:
                draw_graph_line(ax, occurence_idx, **class_v_line_kwargs)
            if add_class_label:
                x, y = 1, occurence_idx
                if flip_axis:
                    y, x = x, y
                ax.text(x = x, y = y, s = str(clazz), color = 'red')

        if draw_vertice_count_line:
            if flip_axis:
                ax.set_ylim(0, num_vertices * 1.03)
            else:
                ax.set_xlim(0, num_vertices * 1.03)
            draw_phi_line(ax, num_vertices - (num_vertices / 100), color = 'red', linewidth = 1)

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top = .9)

    return fig, ax

def filter(cache_file):
    """Return True if the phi cache file should be plotted.

    Only files whose name contains 'splitted' are selected. Further name
    based restrictions (e.g. 'same-label', 'concept-map', 'ling-spam' or a
    specific dataset) can be and-ed onto the returned expression when needed.
    """
    return 'splitted' in cache_file


NUM_ITERATIONS = 3


def get_hlines(phis, Y):
    """Return, per feature map, the highest non-zero column index of each class.

    Args:
        phis: Sequence of feature maps supporting ``.nonzero()``; rows are
            graphs, columns are phi indices.
        Y: Class label per row.

    Returns:
        One list per feature map. The entries are ordered by sorted class
        label, so that position ``i`` belongs to the ``i``-th class (this is
        the order in which ``print_phi_distribution`` assigns colors when Y is
        sorted). Classes without any non-zero entry get ``-1``.
    """
    Y = np.array(Y)
    classes = sorted(set(Y.tolist()))
    highest_per_class = []
    for phi in phis:
        non_zero_y, non_zero_x = phi.nonzero()
        row_classes = Y[non_zero_y]
        highest_per_class_ = []
        for clazz in classes:
            class_columns = non_zero_x[row_classes == clazz]
            highest_per_class_.append(int(class_columns.max()) if len(class_columns) else -1)
        highest_per_class.append(highest_per_class_)
    return highest_per_class


for cache_file in dataset_helper.get_all_cached_graph_phi_datasets():
    if not filter(cache_file): continue
    print(cache_file)

    def plot(X_phi, Y, suffix = '', h_lines = [], plot_kwargs = dict()):
        """Render the phi distribution of the current cache file, unless the image already exists."""
        basename = cache_file.split('/')[-1]
        filename = 'tmp/phi-distributions/{}{}.png'.format(basename, suffix)
        # The plot is created with sort_y = False, so Y has to be sorted already
        assert np.array_equal(np.sort(Y), Y)
        if os.path.exists(filename):
            return
        print(filename.split('/')[-1])
        kwargs = dict(dict(sort_y = False, add_class_label = False, flip_axis = True, add_class_line = True, h_lines = h_lines), **plot_kwargs)
        fig, ax = print_phi_distribution('File: {} {} (#vertices: {})'.format(basename, suffix, X_phi[0].shape[1]), X_phi, Y, **kwargs)
        fig.savefig(filename)
        plt.close(fig)

    phi_res = dataset_helper.get_dataset_cached(cache_file, check_validity=False)
    if len(phi_res) == 2:
        # Unsplit dataset: (phi, Y)
        phi_train, Y_train = phi_res
        h_lines = get_hlines(phi_train, Y_train)
        plot(*phi_res, h_lines = h_lines)
    elif len(phi_res) == 6:
        # Train/test split; the marker lines are taken from the train set for both plots
        phi_train, phi_test, X_train, X_test, Y_train, Y_test = phi_res
        h_lines = get_hlines(phi_train, Y_train)
        kwargs = dict()
        if 'same-label' in cache_file:
            kwargs['draw_vertice_count_line'] = False
        if len(set(Y_train)) > 20:
            kwargs['add_class_line'] = False
        for res in [(phi_train, Y_train, '_train'), (phi_test, Y_test, '_test')]:
            plot(*res, h_lines = h_lines, plot_kwargs=kwargs)
    else:
        raise ValueError('Unexpected number of elements ({}) in cached dataset: {}'.format(len(phi_res), cache_file))
print("Finished")

### Copy saved figures into categorized folder

In [None]:
FOLDER_PHI_DIST = 'tmp/phi-distributions'

# One row per rendered figure; the remaining columns are parsed from the file name.
data = collections.defaultdict(lambda: [])
for file in glob('{}/*.png'.format(FOLDER_PHI_DIST)):
    data['file'].append(file)
# Passing `columns` keeps the 'file' column even when no figure was found
df = pd.DataFrame(data, columns = ['file'])
df['dataset'] = df.file.apply(filename_utils.get_dataset_from_filename)
df['same_label'] = df.file.str.contains('same-label') | df.file.str.contains('same_label')
# expand = False makes str.extract return a Series instead of a one-column DataFrame
df['type'] = df.file.str.extract(r'dataset_graph_(.+?)_', expand = False)
df['window_size'] = df.file.str.extract(r'dataset_graph_cooccurrence_(.+?)_', expand = False)
df['split'] = df.file.str.contains('splitted')
df['test_train'] = df.file.str.extract(r'phi\.npy_(.+?)\.png$', expand = False)
# '.splitted' is a literal marker, not a regular expression
df['original_file'] = df.file.str.extract(r'/([^/]+?\.npy)', expand = False).str.replace('.splitted', '', regex = False)

# Copy every figure to <folder>/_/<dataset>/<type>/<same_label>/<file name>
for idx, item in df.iterrows():
    old_folder, old_filename = item.file.rsplit("/", 1)
    new_filename = '{old_folder}/_/{t.dataset}/{t.type}/{t.same_label}/{old_filename}'.format(t = item, old_folder = old_folder, old_filename = old_filename)
    folder = new_filename.rsplit('/', 1)[0]
    os.makedirs(folder, exist_ok = True)
    shutil.copyfile(src = item.file, dst=new_filename)

## Statistics about predictions

F1 scores of the predictions, grouped by graph size (number of nodes).

In [None]:
def get_graphs(X_test):
    """Run the adjacency-tuple conversion on ``X_test`` and return it as an object array.

    The return value of ``graph_helper.convert_graphs_to_adjs_tuples`` is
    ignored, so the helper presumably converts ``X_test`` in place - TODO confirm.
    """
    graph_helper.convert_graphs_to_adjs_tuples(X_test)
    graphs = np.array(X_test, dtype=object)
    return graphs

# Maps prediction file name -> [Y_real, Y_pred, X_test] for all graph-based predictions.
prediction_results = {}
for prediction_filename, prediction in helper.log_progress(list(results_helper.get_predictions())):
    is_graph_type = 'graph' in prediction_filename
    if not is_graph_type: continue
    results = results_helper.get_result_for_prediction(prediction_filename)
    if not results:
        # NOTE(review): this only prints a warning; the prediction is still added below - confirm that no `continue` is intended.
        print("Results for predictions could not be found: {}".format(prediction_filename))
    prediction_results[prediction_filename] = [prediction['results'][attr] for attr in ['Y_real', 'Y_pred', 'X_test']]
    # Replace X_test (index 2) by the output of get_graphs
    prediction_results[prediction_filename][2] = get_graphs(prediction_results[prediction_filename][2])

In [None]:
# Number of equal-width bins for the graphs below / from the third quartile of num_nodes
NUM_BIN_SMALL = 20
NUM_BIN_BIGGER = 1

data = collections.defaultdict(lambda: [])
counter = 0

for prediction_filename, (Y_real, Y_pred, X_test) in helper.log_progress(list(prediction_results.items())):
    if 'gram' in prediction_filename: continue
    if not 'graph' in prediction_filename: continue

    Y_real, Y_pred, X_test = [np.array(x, dtype=object) for x in (Y_real, Y_pred, X_test)]

    # Size of every test graph; each element of X_test is an (adjacency, labels) pair
    df_graphs = pd.DataFrame({
        'num_nodes': [len(labels) for adj, labels  in X_test],
        'num_edges': [len(adj.nonzero()[0]) for adj, labels  in X_test]
    })

    third_quartile = df_graphs.num_nodes.quantile(q = 0.75)

    # .copy() so that the new columns are not written to a view of the full frame
    df_graphs_big = df_graphs[df_graphs.num_nodes >= third_quartile].copy()
    df_graphs = df_graphs[df_graphs.num_nodes < third_quartile].copy()

    binned_parts = []
    for df_part, num_bins, bin_type in [(df_graphs, NUM_BIN_SMALL, 'small'), (df_graphs_big, NUM_BIN_BIGGER, 'big')]:
        # pd.cut fails on an empty partition (e.g. when all graphs have the same size)
        if df_part.empty: continue
        df_part['bins'] = pd.cut(df_part.num_nodes, num_bins)
        df_part['bin_type'] = bin_type
        binned_parts.append(df_part)
    if not binned_parts: continue
    # The original index is kept on purpose: it is the position in Y_real/Y_pred/X_test
    df_new = pd.concat(binned_parts)

    for bin_, df_graph_quartile in df_new.sort_values('bins').groupby('bins'):
        indices = df_graph_quartile.index.tolist()
        Y_real_quart, Y_pred_quart, X_test_quart = [x[indices] for x in [Y_real, Y_pred, X_test]]
        f1 = metrics.f1_score(y_true=Y_real_quart, y_pred=Y_pred_quart, average = 'macro')
        data['file'].append(prediction_filename)
        data['bin'].append(bin_)
        data['bin_type'].append(df_graph_quartile.bin_type.unique()[0])
        data['f1'].append(f1)
        data['num_elements'].append(len(indices))

df_bins = pd.DataFrame(data)

In [None]:
# Derive plotting/grouping columns from the bin interval and the prediction file name.
# Rounded midpoint of the bin interval
df_bins['mean_bin'] = df_bins.bin.apply(lambda x: np.rint(x.mid))
df_bins['filename_only'] = df_bins.file.apply(lambda x: x.split('/')[-1])
df_bins['dataset'] = df_bins.filename_only.apply(filename_utils.get_dataset_from_filename)
df_bins['type'] = df_bins.filename_only.str.extract(r'dataset_graph_(.+?)_')
df_bins['combined'] = df_bins.filename_only.str.contains('combined')
# NOTE(review): checks 'same_label' (underscore) only; other cells of this notebook match 'same-label' - confirm the naming of prediction files.
df_bins['same_label'] = df_bins.filename_only.str.contains('same_label')
df_bins['kernel'] = df_bins.filename_only.apply(results_helper.get_kernel_from_filename)

In [None]:
FOLDER_BIN_IMGS = 'tmp/bin-scores'
os.makedirs(FOLDER_BIN_IMGS, exist_ok=True)

df_bins = df_bins.reset_index(drop = True)

# Name of the graph dataset file the prediction was made for
df_bins['graph_file'] = df_bins.filename_only.apply(lambda x: re.findall(r'(dataset_graph_.*?\.npy)', x)[0])
# Only keep graph files for which both same-label and normal predictions exist
df_bins_filtered = df_bins.groupby('graph_file').filter(lambda x: len(x.same_label.unique()) > 1)
df_bins_filtered = df_bins_filtered[df_bins_filtered.combined == False]
colors = sns.color_palette('deep')
# Group by a scalar key: a list key yields tuple group names, which would end up in the file name
for graph_file, df in df_bins_filtered.groupby('graph_file'):
    fig, ax = plt.subplots()
    for idx, ((filename, same_label), df_) in enumerate(df.groupby(['filename_only', 'same_label'])):
        df_ = df_.set_index('mean_bin').sort_index()
        # Cycle through the palette instead of running past its end
        color = colors[idx % len(colors)]
        df_.f1.plot(kind = 'bar', ax = ax, color = color, title = 'File: {}'.format(filename), label = 'same-label' if same_label else 'normal', alpha = 0.9)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)

    # Save before showing, so the saved image does not depend on the backend's show() behavior
    fig.savefig('{}/{}.png'.format(FOLDER_BIN_IMGS, graph_file))
    plt.show()
    plt.close(fig)

## Similar graphs by WL feature map


### Retrieve graph datasets (both cmap and coo)

Retrieve phi feature maps for coo and cmap, then get number of non-zero elements per graph (= per row of the feature map)

And calculate the gram matrix, then find the most similar graphs per graph (= per row of the gram matrix)


In [None]:
# Dataset to analyse ('ng20' is the alternative used before)
dataset_name = 'ling-spam'

# results[graph_phi_file][h] holds the similarity statistics of feature map `h` of that file
results = collections.defaultdict(lambda: {})
for graph_phi_file in dataset_helper.get_all_cached_graph_phi_datasets(dataset_name=dataset_name):
    print('Processing: {}'.format(graph_phi_file.split('/')[-1]))
    phi, Y = dataset_helper.get_dataset_cached(graph_phi_file, check_validity=False)

    for h, phi_used in enumerate(phi):
        print('\th={}'.format(h))
        # Kernel (gram) matrix: pairwise dot products of the feature map rows
        gram_matrix = phi_used.dot(phi_used.T).toarray()
        entry = results[graph_phi_file][h] = {
            'phi_used': phi_used,
            # Row sums of the feature map, flattened to a vector
            'non_zero_elements': np.squeeze(np.asarray(np.sum(phi_used, axis = 1).T)),
            'num_elements': phi_used.shape[0],
            'found_counter': collections.Counter(),
            'most_similar_scores': [],
            'similarity_pairs': [],
        }

        for idx, row in enumerate(gram_matrix):
            # Indices of the 10 highest kernel values of this row (ascending)
            indices = np.argsort(row)[-10:]
            # The graph itself is expected among its own most similar graphs;
            # count how often that is (not) the case
            outcome = 'found' if idx in indices else 'not_found'
            entry['found_counter'][outcome] += 1
            entry['most_similar_scores'].append(row[indices].tolist())
            entry['similarity_pairs'].append(indices)
print('Finished')

### Plot sparsity of feature maps

In [None]:
# Overlay the histograms of all selected feature maps in ONE figure: the legend
# and the single output file name both describe a combined plot.
fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE_BIG)
legend_labels = []
for graph_cache_file, iterations in results.items():
    # Skip before doing any work, so that no figure is created for ignored files
    if 'gml' not in graph_cache_file and 'cooccurrence_1_all_ling' not in graph_cache_file: continue
    # Only the first iteration is plotted
    if 0 not in iterations: continue
    # Do not call this variable `metrics`: that name is used as a module in the F1 cell
    iteration_results = iterations[0]
    df = pd.DataFrame(iteration_results['non_zero_elements'], columns=['non_zero_elements'])
    df.non_zero_elements.plot(kind='hist', bins = 40, alpha = 0.8, ax = ax, title = 'Histogram of non-zero entries in feature map (per row/element)')
    # Label by file type instead of relying on the iteration order of `results`
    # (assumes 'gml' files are the concept maps - TODO confirm)
    legend_labels.append('Concept maps' if 'gml' in graph_cache_file else 'Co-occurence graphs')

ax.set_xlim((0, 1000))
ax.set_xlabel('# of non-zero elements per row')
ax.legend(legend_labels)
fig.savefig('tmp/feature-map-sparsity-{}.png'.format(dataset_name), dpi = EXPORT_DPI)
plt.show()
plt.close(fig)

### Get graph dataset for the feature map

In [None]:
# Feature-map cache file that is inspected in the following cells.
graph_phi_file = 'data/CACHE/dataset_graph_gml_ling-spam-single.phi.npy'

In [None]:
# Find the cached graph dataset belonging to the feature-map file:
# strip folder and everything from '.phi' on, then match by substring.
filename = graph_phi_file.split('/')[-1].split('.phi')[0]
print(filename)
candidates = [x for x in dataset_helper.get_all_cached_graph_datasets() if filename in x]
# At least one cached graph dataset has to match
assert len(candidates)
# The first match is used
X, Y = dataset_helper.get_dataset_cached(candidates[0])

In [None]:
# Draw 10 randomly chosen graphs, edges only.
for i in range(10):
    fig, ax = plt.subplots()
    choice = np.random.choice(len(X))
    graph = nx.Graph(X[choice])
    nodes = list(graph.nodes())
    # Node size 0 hides the nodes. (Sizes derived from pagerank were computed
    # here before, but they were always overwritten by zeros.)
    node_sizes = [0 for _ in nodes]
    nx.draw_networkx(graph, ax = ax, nodelist = nodes, with_labels=False, node_size = node_sizes, node_color='#000000')
    ax.set_title('Graph#={}, connected_components={}'.format(choice, nx.number_connected_components(graph)))
    # grid('off') would switch the grid ON in current matplotlib versions
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])

    # Hide the frame around the plot
    for spine in ax.spines.values():
        spine.set_color('#FFFFFF')
    plt.show()
    plt.close(fig)


In [None]:
# Load the dataset named `dataset_name`; only the first element of the returned pair is kept.
X_text, _ = dataset_helper.get_dataset(dataset_name)

### Plot similar graphs

In [None]:
# X
# `results[graph_phi_file]` is keyed by iteration, so the iteration has to be
# selected before the statistics can be read.
# NOTE(review): iteration 0 is assumed here - confirm which iteration is meant.
SIMILARITY_ITERATION = 0
similarity_results = results[graph_phi_file][SIMILARITY_ITERATION]
similarity_pairs = similarity_results['similarity_pairs']
similarity_scores = np.array(similarity_results['most_similar_scores'])
phi_used = similarity_results['phi_used']

# Check that the similarity score can not be greater than the number of nodes!
for idx, graph, most_similar, most_similar_score in zip(range(len(X)), X, similarity_pairs, similarity_scores):
    assert max(most_similar_score) <= len(graph.nodes())

In [None]:
# Gram (kernel) matrix: pairwise dot products between the rows of phi_used, as a dense array.
gram_matrix = phi_used.dot(phi_used.T).toarray()

In [None]:
# `results[graph_phi_file]` is keyed by iteration, so the iteration has to be
# selected before the statistics can be read.
# NOTE(review): iteration 0 is assumed here - confirm which iteration is meant.
SIMILARITY_ITERATION = 0
similarity_pairs = results[graph_phi_file][SIMILARITY_ITERATION]['similarity_pairs']
similarity_scores = np.array(results[graph_phi_file][SIMILARITY_ITERATION]['most_similar_scores'])

def get_similarity(graph1_idx, graph2_idx):
    """Return the entry of the global ``gram_matrix`` for the two graph indices."""
    similarity = gram_matrix[graph1_idx, graph2_idx]
    return similarity

def get_non_zero_phi_elements(idx):
    """Return the column indices of the non-zero entries in row ``idx`` of the global ``phi_used``."""
    _, non_zero_columns = phi_used[idx].nonzero()
    return non_zero_columns

def plot_similar_graphs(graph_idx, num_to_plot = 2):
    """Print similarity details for a graph and draw it next to its most similar graphs.

    Uses the globals ``gram_matrix``, ``phi_used``, ``X``, ``X_text`` and ``Y``.

    Args:
        graph_idx: Index of the reference graph.
        num_to_plot: Number of most similar graphs drawn in addition to the
            reference graph.
    """
    reference_graph = X[graph_idx]
    reference_nodes = set(reference_graph.nodes())

    def is_candidate(other_idx):
        # Non-empty graph, not the reference itself, similarity other than zero
        return (nx.number_of_nodes(X[other_idx]) > 0
                and other_idx != graph_idx
                and get_similarity(graph_idx, other_idx) != 0)

    # argsort is ascending, so the most similar candidates are at the end
    ranked_candidates = [other_idx for other_idx in np.argsort(gram_matrix[graph_idx]) if is_candidate(other_idx)]
    similar_graph_idxs = np.array(ranked_candidates[-num_to_plot:])
    graph_idxs = [graph_idx] + similar_graph_idxs.tolist()
    similarities = [get_similarity(graph_idx, other_idx) for other_idx in graph_idxs]
    similar_labels = [set(X[other_idx].nodes()) & reference_nodes for other_idx in graph_idxs]

    print(phi_used.shape[0], gram_matrix.shape[0])
    for template, value in [
        ('Graph={}', graph_idx),
        ('NonZeroPhi={}', get_non_zero_phi_elements(graph_idx)),
        ('SimilarOwn={}', get_similarity(graph_idx, graph_idx)),
        ('SimilarIdxs={}', similar_graph_idxs),
        ('Similarities={}', similarities),
        ('SimilarLabels={}', similar_labels),
    ]:
        print(template.format(value))

    # Look up graph and text of every plotted index up front
    plot_items = [(other_idx, X[other_idx], X_text[other_idx]) for other_idx in graph_idxs]
    for similar_graph_idx, graph, text in plot_items:
        # Only draw the nodes that also exist in the reference graph
        graph = graph.copy()
        graph.remove_nodes_from(set(graph.nodes()) - reference_nodes)
        print(Y[graph_idx], Y[similar_graph_idx])
        fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
        nx.draw_circular(graph, ax = ax, node_size = 14, with_labels = True, node_color = '#000000')
        ax.text(0, 0, str(similar_graph_idx))

# Plot the most similar graphs for 3 randomly chosen graphs with at most 10 nodes.
# The candidates are collected first: starting from a fixed index (1000) fails on
# small datasets and retrying random indices never ends if no small graph exists.
small_graph_idxs = [idx for idx, graph in enumerate(X) if nx.number_of_nodes(graph) <= 10]
assert len(small_graph_idxs), 'No graph with at most 10 nodes found'

for i in range(3):
    random_choice = np.random.choice(small_graph_idxs)
    plot_similar_graphs(random_choice)
    plt.show()


## Augment edges

Add an edge from node1 to node2 if they are connected by a path of length at most N (`WALK_LENGTH`)

In [None]:
WALK_LENGTH = 2
dataset_name = 'ng20'

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets(dataset_name=dataset_name):
    if 'coo' not in graph_cache_file or 'all' in graph_cache_file: continue
    print(graph_cache_file)
    X_old, Y_old = dataset_helper.get_dataset_cached(graph_cache_file)

    # TODO: only the first 10 graphs are processed
    X_old, Y_old = X_old[:10], Y_old[:10]

    # Work on copies, the cached graphs stay untouched
    X, Y = copy.deepcopy(X_old), copy.deepcopy(Y_old)
    for idx, graph in enumerate(X):
        if idx % 100 == 0: sys.stdout.write('\r{:3.0f}%'.format(idx / len(X) * 100))
        if graph.number_of_edges() == 0 or graph.number_of_nodes() == 0: continue
        # Materialize the paths first: newer networkx versions return a generator,
        # and the graph must not be modified while the paths are still being computed
        shortest_paths = dict(nx.all_pairs_shortest_path(graph, cutoff=WALK_LENGTH))
        for source, target_dict in shortest_paths.items():
            for target, path in target_dict.items():
                # Every node has a trivial path to itself, which must not become a self-loop
                if source == target: continue
                # `path` is the list of nodes on the path, so longer paths get a lower weight
                graph.add_edge(source, target, weight = 1 / len(path))
    # Only the first matching cache file is processed
    break

## Distribution of classes per dataset

In [None]:
# Collect (dataset, label, count) records first and build the frame once;
# growing a DataFrame with append() in a loop is slow and no longer supported by pandas 2.
label_counts = []
for dataset in dataset_helper.get_all_available_dataset_names():
    if 'ana' in dataset: continue
    X, Y = dataset_helper.get_dataset(dataset)
    label_counter = collections.Counter(Y)
    label_counts += [(dataset, label, count) for label, count in label_counter.items()]

df = pd.DataFrame(label_counts, columns = ['dataset', 'label', 'counts'])
df['counts'] = df['counts'].astype(np.uint64)

# One horizontal bar chart of the label counts per dataset
for dataset, items in df.groupby('dataset'):
    fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE_BIG)
    num_els = items.counts.sum()
    # Standard deviation of the relative class frequencies
    stdd = (items.counts / num_els).std()
    items = items.set_index('label')
    items.sort_values('counts').counts.plot(kind = 'barh', ax = ax, title = 'Dataset: {}, stdd/#docs: {:.2f}'.format(dataset, stdd))
    plt.show()
    plt.close(fig)


## Graph statistics

Retrieves concept-maps and coo-graphs graph datasets

In [None]:
def get_window_size(graph_cache_file):
    """Return the window size encoded in a co-occurrence cache file name (as a string)."""
    after_prefix = graph_cache_file.split('cooccurrence_')[1]
    return after_prefix.split('_')[0]

# List of tuples: (dataset_name, graph_type, (X, Y)).
# Per dataset there is one concept-map entry and, for every co-occurrence
# window size, one randomly chosen cache file.
graph_datasets = []
for dataset in dataset_helper.get_all_available_dataset_names():
    if dataset not in dataset_helper.DATASETS_LIMITED: continue
    print('{:30} start'.format(dataset))
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name=dataset)
    gml_graph_cache = [x for x in graph_cache_files if 'concept' in x][0]
    coo_graph_caches = [x for x in graph_cache_files if 'cooc' in x]

    # Group the co-occurrence cache files by their window size
    coo_graphs_by_window_size = collections.defaultdict(lambda: [])
    for cache_file in coo_graph_caches:
        window_size = get_window_size(cache_file)
        coo_graphs_by_window_size[window_size].append(cache_file)

    X_cmap, Y_cmap = dataset_helper.get_dataset_cached(gml_graph_cache)
    # Elements that are not graphs themselves carry the graph as their first item
    X_cmap = [x if isinstance(x, nx.Graph) else x[0] for x in X_cmap]
    graph_datasets.append((dataset, 'CMap', (X_cmap, Y_cmap)))

    for window_size in sorted(coo_graphs_by_window_size):
        cached_files = coo_graphs_by_window_size[window_size]
        # Take random element from the co-occurence graph datasets
        coo_graph_cache = np.random.choice(cached_files)
        print('\tRetrieving co-occurence graphs for window_size={} ({})'.format(window_size, coo_graph_cache))
        coo_dataset = dataset_helper.get_dataset_cached(coo_graph_cache)
        graph_datasets.append((dataset, 'Coo - {}'.format(window_size), coo_dataset))
    print('{:30} finished'.format(dataset))

### Connected components

In [None]:
# One row per concept-map graph: number of connected components and their
# mean size relative to the number of nodes of the graph.
data = collections.defaultdict(lambda: [])
for dataset, graph_type, (X, Y) in graph_datasets:
    if graph_type != 'CMap': continue
    print(dataset, graph_type)
    for graph in X:
        # Determine the components once and derive both statistics from their sizes
        component_sizes = [len(x) for x in nx.connected_components(graph)]
        data['dataset'].append(dataset)
        data['graph_type'].append(graph_type)
        data['connected_components'].append(len(component_sizes))
        data['mean_connected_components_size'].append(np.mean(component_sizes) / nx.number_of_nodes(graph))

df_connected_components = pd.DataFrame(data)

In [None]:
# Histograms of the connected component statistics, per dataset (concept maps only).
for dataset, df_dataset in df_connected_components.groupby('dataset'):
    for graph_type, df in df_dataset.groupby('graph_type'):
        if graph_type != 'CMap': continue

        # Mean connected components size (shown only, not saved).
        # The `normed` argument is omitted: it was removed from matplotlib and
        # un-normalized counts are the default anyway.
        fig, ax = plt.subplots(figsize = EXPORT_FIG_SIZE)
        df.mean_connected_components_size.plot(kind = 'hist', bins = 60, logy = True, ax = ax, title = 'Dataset: {}, Graph Type: {}\nmedian={:.1f}, mean={:.1f}'.format(dataset, 'Concept Map', df.mean_connected_components_size.median(), df.mean_connected_components_size.mean()))
        ax.set_xlabel('mean connected component size')
        ax.set_ylabel('Frequency')
        fig.tight_layout()
        plt.show()
        plt.close(fig)

        # Number of connected components (saved to tmp/)
        fig, ax = plt.subplots(figsize = (EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT - 1))
        df.connected_components.plot(kind = 'hist', bins = 60, logy = True, ax = ax, title = 'Dataset: {}, Graph Type: {}\nmedian={:.0f}, mean={:.1f}'.format(dataset, 'Concept Map', df.connected_components.median(), df.connected_components.mean()))
        ax.set_xlabel('# connected components')
        ax.set_ylabel('Frequency (log)')
        fig.tight_layout()
        for ext in ['png']:
            fig.savefig('tmp/hist-connected-components-{}-{}.{}'.format(dataset, graph_type, ext))
        plt.show()
        plt.close(fig)

In [None]:
# Percentage of graphs with more than one connected component, per dataset
data_ = collections.defaultdict(lambda: [])
for dataset, df_ in df_connected_components.groupby('dataset'):
    data_['dataset'].append(dataset)
    data_['connected_components_percentage_over_1'].append(len(df_[df_.connected_components > 1]) / len(df_) * 100)
df_ = pd.DataFrame(data_).set_index('dataset').sort_index(ascending = False)

fig, ax = plt.subplots(figsize = (EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT - 2.3))
df_.connected_components_percentage_over_1.plot(kind = 'barh', ax = ax)
ax.set_xlim(0, 100)
# grid('off') would switch the grid ON in current matplotlib versions
ax.grid(False)
ax.set_xlabel('% of graphs with more than one connected component')
fig.tight_layout()
fig.savefig('tmp/percentage_more_than_one_connected_component.pdf')

### Density, #nodes, #edges histograms

In [None]:
NUM_BINS = 60
alpha = 0.6

# (name, function graph -> value); -99 marks graphs for which the metric is undefined
graph_metrics = [
    #('density', lambda graph: nx.density(graph) if graph.number_of_nodes() > 0 else 0.0),
    #('number of nodes', lambda graph: graph.number_of_nodes()),
    #('number of edges', lambda graph: graph.number_of_edges()),
    #('connected components', lambda graph: nx.number_connected_components(graph)),
    #('num_nodes_div_num_edges', lambda graph:  graph.number_of_nodes() / graph.number_of_edges() if graph.number_of_edges() > 0 else -99),
    ('#edges / #nodes', lambda graph:  graph.number_of_edges() / graph.number_of_nodes() if graph.number_of_nodes() > 0 else -99)
]

# The figures are saved into this folder, so it has to exist
os.makedirs('tmp/graph-statistics', exist_ok = True)

for metric_name, metric in graph_metrics:
    metric_name_clean = re.sub(r'[^a-zA-Z\d]', '', metric_name)
    # Separate name: rebinding `graph_metrics` here would replace the list being iterated
    metric_values = []
    for dataset, graph_type, (X, Y) in graph_datasets:
        metric_values += [(dataset, graph_type, metric(graph)) for graph in X]

    df = pd.DataFrame(metric_values, columns = ['dataset', 'graph_type', 'graph_metric'])
    # Drop the graphs marked as undefined (-99)
    df = df[df.graph_metric > -10]
    if df.empty: continue

    fig, ax = plt.subplots(figsize = (EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT - 1))

    # Shared bin edges for all graph types; linspace also works when all values are equal
    bins = np.linspace(df.graph_metric.min(), df.graph_metric.max(), NUM_BINS + 1)
    df.groupby('graph_type').graph_metric.plot(kind = 'hist', bins = bins, alpha = alpha, ax = ax, logy = True, legend = True)
    # Dashed vertical line at the median of every graph type
    for median in df.groupby('graph_type').graph_metric.median():
        ax.axvline(median, ymax = 1, linewidth=1, alpha = alpha, color='b', linestyle='dashed')
    ax.set_xlabel(metric_name)
    # grid('off') would switch the grid ON in current matplotlib versions
    ax.grid(False)
    fig.tight_layout()
    fig.savefig('tmp/graph-statistics/hist-{}.pdf'.format(metric_name_clean))
    plt.show()
    plt.close(fig)

### Plot examples of graph types
concept map and co-occurrence

In [None]:
# One row per entry of graph_datasets: (dataset, graph_type, graph_dataset).
df = pd.DataFrame(graph_datasets, columns = ['dataset', 'graph_type', 'graph_dataset'])

In [None]:
NUM_GRAPHS_PER_TYPE = 3

# Per dataset: a grid with one column per graph type and NUM_GRAPHS_PER_TYPE
# randomly chosen example graphs (5 to 9 nodes) per column.
for dataset, data in df.groupby('dataset'):
    # squeeze = False keeps `axes` two-dimensional even with a single graph type
    fig, axes = plt.subplots(ncols=data.graph_type.value_counts().size, nrows=NUM_GRAPHS_PER_TYPE, squeeze=False)

    # Collect the drawable graphs per column once. Picking from this list cannot
    # loop forever when a dataset has no graph of suitable size.
    columns = []
    for _, item in data.iterrows():
        X, Y = item.graph_dataset
        candidates = [graph for graph in X if nx.number_of_nodes(graph) in range(5, 10)]
        columns.append((item.graph_type, candidates))

    for idx, row_ax in enumerate(axes):
        print('Row: {}/{}'.format(idx + 1, len(axes)))
        for (graph_type, candidates), ax in zip(columns, row_ax):
            if idx == 0:
                ax.set_title(graph_type)

            if candidates:
                # Choose by index: np.random.choice would try to convert the graphs to an array
                random_graph = candidates[np.random.randint(len(candidates))]
                nx.draw_networkx(random_graph, ax = ax, node_size = 14, with_labels = False, node_color = '#000000')

            ax.set_xticks([])
            ax.set_yticks([])
            # grid('off') would switch the grid ON in current matplotlib versions
            ax.grid(False)
            for spine in ax.spines.values():
                spine.set_color('#FFFFFF')

    fig.tight_layout(h_pad=3, w_pad = 3)
    # NOTE(review): all datasets are saved to the same file, so only the last one is kept - confirm.
    fig.savefig('tmp/graph-examples.png')
    plt.close(fig)

## Plot unique word counts

In [None]:
# Tuples of: (dataset_name, number of unique words, number of words).
# Words are the lower-cased, whitespace separated tokens of the documents.
word_counts = []
for dataset_name in dataset_helper.get_all_available_dataset_names():
    X, Y = dataset_helper.get_dataset(dataset_name)
    unique_words = set()
    num_words = 0
    # Count document by document instead of joining the whole corpus into one string.
    # str.split() without arguments splits on any whitespace (including newlines)
    # and never yields empty tokens.
    for t in X:
        tokens = t.lower().split()
        num_words += len(tokens)
        unique_words.update(tokens)
    word_counts.append((dataset_name, len(unique_words), num_words))

In [None]:

# Pre-process the texts of the first dataset whose name does not contain 'ana'.
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if 'ana' in dataset_name: continue
    print(dataset_name)
    X, Y = dataset_helper.get_dataset(dataset_name)
    X_pp = preprocessing.preprocess_text_spacy(X, concat = False, only_nouns = False)
    # Stop after the first processed dataset
    break

In [None]:

# Word statistics per dataset, sorted by the number of unique words.
df = pd.DataFrame(word_counts, columns = ['dataset', 'unique_words', 'words']).set_index('dataset').sort_values('unique_words')
df['unique_words_ratio'] = df.unique_words / df.words

# Absolute counts (logarithmic x axis)
fig, ax = plt.subplots(figsize = (12, 6))
df[['unique_words', 'words']].plot(kind = 'barh', logx = True, title = 'Unique word count', ax = ax)
# Ratio of unique words to all words
fig, ax = plt.subplots(figsize = (12, 6))
df.unique_words_ratio.plot(kind = 'barh', title = '#Unique words/#words', ax = ax);

## Merge node labels

In [None]:
# NOTE(review): `labels` and `results` (here keyed by (n, threshold) tuples) are not
# defined in the visible part of this notebook - confirm where they come from.
num_labels = len(labels)

# One figure per (n, threshold): clique length histogram and merged vs. not merged label counts
for (n, treshold), lookup in results.items():
    cliques = coreference.get_cliques_from_lookup(lookup)
    similarity_counter = {'similar': len(lookup.keys()), 'unsimilar': num_labels - len(lookup.keys())}
    clique_lenghts = [len(x) for x in list(cliques.values())]
    fig, axes = plt.subplots(1, 2, figsize = (14, 6))
    fig.suptitle('Treshold: {}, N={}'.format(treshold, n), fontsize = 16)

    # The title has no placeholder, so .format() leaves it unchanged
    pd.DataFrame(clique_lenghts).plot(ax = axes[0], kind = 'hist', logy = True, legend = False, title = "Histogram of clique lengths".format(treshold))
    pd.DataFrame(list(similarity_counter.items()), columns = ['name', 'count']).set_index('name').plot(ax = axes[1], kind = 'bar', legend = False, title = '# of labels that have been merged vs. not merged')
    fig.tight_layout()
    # Leave room for the suptitle
    fig.subplots_adjust(top=0.85)
    fig.savefig('tmp/{:.5f}.{}.png'.format(treshold, n), dpi = 120)
    plt.close(fig)

In [None]:
def plot_by(df, by, bins = 15, title = '', figsize = (12, 5), fontsize = 16):
    """Plot a log-scaled histogram of ``clique_length`` with one series per group.

    Args:
        df: DataFrame with a ``clique_length`` column and the column(s) named by ``by``.
        by: Grouping passed to ``df.groupby``; every group becomes one histogram series.
        bins: Number of histogram bins.
        title: Figure title.
        figsize: Figure size.
        fontsize: Font size of title and legend.

    Returns:
        The created ``(fig, ax)``.
    """
    groups = list(df.groupby(by))
    labels = [group_name for group_name, _ in groups]
    data = [group.clique_length for _, group in groups]

    fig, ax = plt.subplots(figsize = figsize)
    ax.hist(data, bins = bins, alpha=0.7, label=labels, log = True)
    ax.legend(loc='upper right', fontsize = fontsize)
    ax.set_xlabel('clique sizes')
    fig.suptitle(title, fontsize = fontsize)
    fig.tight_layout()
    # Leave room for the suptitle
    fig.subplots_adjust(top=0.9)
    return fig, ax
# NOTE(review): `df` needs the columns 'n', 'threshold' and 'clique_length' here;
# no visible cell creates such a frame - confirm where it comes from.
fig, ax = plot_by(df, 'n', title = 'Clique size histogram by n (all thresholds together)')
fig.savefig('tmp/clique_size_by_n_all_thresholds.png', dpi = 120)
fig, ax = plot_by(df, 'threshold', title = 'Clique size histogram by threshold (all n together)')
fig.savefig('tmp/clique_size_by_threshold_all_n.png', dpi = 120)
fig, ax = plot_by(df[df.threshold == 0.6], 'n', title = 'Clique size histogram by n (threshold=0.6)')
fig.savefig('tmp/clique_size_by_n_threshold_0.6.png', dpi = 120)
plt.show()

In [None]:
# Plot a histogram for every stored label lookup.
for lookup_file in glob('data/embeddings/graph-embeddings/*.threshold-*.*.label-lookup.npy'):
    threshold, topn = get_treshold_and_topn_from_lookupfilename(lookup_file)
    # NOTE: pickle.load executes arbitrary code from the file - only load trusted files.
    with open(lookup_file, 'rb') as f:
        lookup = pickle.load(f)
    # Sanity check: print "?" once if a lookup VALUE is neither str nor int
    for key in lookup.values():
        if not isinstance(key, (str, int)):
            print("?")
            break
    fig, axes = coreference.plot_lookup_histogram(lookup=lookup, title = 'threshold={}, topn={}'.format(threshold, topn))
    plt.show()
    plt.close(fig)

## Create small dataset

In [None]:
# Write the test part of the first stratified shuffle split of the ng20
# concept-map graphs as a new, small dataset.
graph_cache_file = 'dataset_graph_gml_ng20-single.npy'
X, Y = dataset_helper.get_dataset_cached('data/CACHE/{}'.format(graph_cache_file))
X, Y = np.array(X, dtype=object), np.array(Y, dtype=object)
sss = sklearn.model_selection.StratifiedShuffleSplit(n_splits = 40, random_state=42)
# Only the first of the generated splits is used
train_index, test_index = next(sss.split(X, Y))
X_test, Y_test = X[test_index], Y[test_index]
small_dataset = (X_test.tolist(), Y_test.tolist())
with open('data/CACHE/dataset_graph_gml_small-single.npy', 'wb') as f:
    pickle.dump(small_dataset, f)

## Label occurrence histogram

In [None]:
# Column-wise records: for every distinct node label of a graph dataset, how often it occurs.
counts = collections.defaultdict(lambda: [])
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in ['ling-spam', 'ng20', 'webkb', 'reuters-21578']: continue
    
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if not len(graph_cache_files): continue
    
    # Graph types that have already been processed for this dataset
    has_already = collections.defaultdict(lambda: False)
    for graph_cache_file in graph_cache_files:
        graph_type = graph_helper.get_graph_type_from_filename(graph_cache_file)
        assert graph_type
        # Stop once one concept-map and one co-occurrence file have been processed
        if has_already[TYPE_CONCEPT_MAP] and has_already[TYPE_COOCCURRENCE]: break
        # Only the first file of every graph type is used
        if has_already[graph_type]: continue
        has_already[graph_type] = True

        print('Loading dataset: {}'.format(graph_cache_file))
        X_old, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        label_counter = collections.Counter()
        # The return value is ignored; X_old is iterated as (adj, labels) pairs afterwards
        graph_helper.convert_graphs_to_adjs_tuples(X_old)
        for adj, labels in X_old:
            label_counter.update(labels)
        # One entry per distinct label
        counts_ = list(label_counter.values())
        counts['dataset'] += [dataset_name] * len(counts_)
        counts['type'] += [graph_type] * len(counts_)
        counts['counts'] += counts_
print('Finished')

In [None]:
# Histogram of the label occurrence counts, one subplot per dataset.
df = pd.DataFrame(counts)
# 2 x 2 grid of subplots
fig, axes = plt.subplots(nrows=2, ncols=2)
axes = df.hist(log = True, bins = 120, by = 'dataset', ax = axes)

# Same x range for all subplots
for ax in axes.flatten():
    ax.set_xlim((0, 10000))

fig.tight_layout()
fig.savefig('tmp/label-distribution-per-dataset.png', dpi = EXPORT_DPI)

In [None]:
# Percentage of labels that occur exactly once, per dataset and graph type.
data = collections.defaultdict(lambda: [])
for dataset, df_ in df.groupby('dataset'):
    data['dataset'].append(dataset)
    # NOTE(review): a dataset that lacks one of the graph types leaves the column lists with different lengths - verify that every dataset has all types.
    for t, df__ in df_.groupby('type'):
        data['percentage_labels_once_{}'.format(t)].append(len(df__[df__.counts == 1]) / len(df__) * 100)

df_ = pd.DataFrame(data).set_index('dataset').sort_index(ascending = False)

# Plot is disabled; change the condition to create the bar chart
if False:
    fig, ax = plt.subplots(figsize=(EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT - 1.5))
    df_.plot(kind = 'barh', ax = ax)
    ax.set_xlabel('% of labels that occur once')
    ax.set_ylabel('')
    ax.set_xlim(0, 110)
    # Ticks at 0, 10, ..., 100
    x_ticks = np.array(range(11)) * 10
    ax.set_xticks(x_ticks);
    fig.tight_layout()
    fig.savefig('tmp/percentage_one_label_occurrence.pdf')

In [None]:
# Add the fraction of concept maps with at most one connected component to the label statistics.
df_ = df_.sort_index()
data = []
for dataset, df__ in df_connected_components.groupby('dataset'):
    # Fraction (0..1) of graphs with at most one connected component
    percentage = len(df__[df__.connected_components <= 1]) / len(df__)
    data.append(percentage)
    print(dataset, percentage)
    #df_.loc[dataset][] = percentage
# NOTE(review): assigned by position - assumes both frames contain the same datasets in the same sorted order; confirm.
df_['percentage_of_one_connected_component_concept-map'] = data
df_