In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import networkx as nx
from scipy.sparse import lil_matrix, csr_matrix, vstack
import copy
import os
from wl import *

In [3]:
if 0 == 1:
    for start, end in [(0, 7)]:
        print("# YES", test_graph_names[start:end])
        k, _, label_lookup = get_wl_for_graph_names(test_graph_names[start:end], fn = WL_compute)
        k_org, _ = get_wl_for_graph_names(test_graph_names[start:end], fn = WL_compute_original)
        print(k_org[-1] == k[-1])

## Retrieve graphs from Tobias' concept-graph extraction library

In [4]:
import networkx as nx
import matplotlib.pyplot as plt
from glob import glob
GRAPH_DIR = 'extract-concept-graphs/code/data/ng20/{}_graphs'

def get_graphs(directory):
    graphs = {}
    for topic in os.listdir(directory):
        graph_dir = os.path.join(directory, topic)
        if not os.path.isdir(graph_dir): continue
        graph_file = glob(os.path.join(graph_dir, '*.gml'))[0]
        with open(graph_file) as f:
            graph = f.read().split('\n')
        for idx, line in enumerate(graph):
            if line.startswith('label'):
                next_line = graph[idx + 1]
                label = next_line.replace('name', 'label')
                graph[idx] = label
        new_graph_file = graph_file.replace('.gml', '.renamed.gml')
        with open(new_graph_file, 'w') as f:
            f.write('\n'.join(graph))
        graph = nx.read_gml(new_graph_file, label = 'label')
        if graph.number_of_nodes() > 0 and graph.number_of_edges() > 0:
            graphs[topic] = graph
        else:
            graphs[topic] = None
            print("Found empty graph: {}".format(graph_file))
    return graphs
graphs_test_docs = get_graphs(GRAPH_DIR.format('test'))
graphs_train_docs = get_graphs(GRAPH_DIR.format('train'))
assert graphs_test_docs.keys() == graphs_train_docs.keys()
assert len(graphs_test_docs.keys()) > 0

all_nodes = [val for sublist in [g.nodes() for g in list(graphs_train_docs.values()) + list(graphs_test_docs.values()) if g is not None] for val in sublist]

Found empty graph: extract-concept-graphs/code/data/ng20/test_graphs/rec.sport.baseball/ng-20.graph.gml
Found empty graph: extract-concept-graphs/code/data/ng20/test_graphs/sci.med/ng-20.graph.gml
Found empty graph: extract-concept-graphs/code/data/ng20/test_graphs/sci.space/ng-20.graph.gml
Found empty graph: extract-concept-graphs/code/data/ng20/test_graphs/talk.politics.guns/ng-20.graph.gml
Found empty graph: extract-concept-graphs/code/data/ng20/test_graphs/talk.politics.misc/ng-20.graph.gml


## Some helper

In [5]:
def get_wl_for_graphs(graphs, all_nodes = all_nodes,  h = 10, fn = WL_compute):
    nodes = [g.nodes() for g in graphs]
    adjs = [nx.adjacency_matrix(g).toarray() for g in graphs]
    return fn(adjs, nodes, all_nodes = all_nodes, h = h)

def get_wl_for_graph_names(graph_names, add_all_nodes = False, h = 10, fn = WL_compute):
    graphs = [graphs_test_docs[x] for x in graph_names]
    graphs[0] = graphs[0].copy()
    if add_all_nodes:
        for node in all_nodes:
            graphs[0].add_node(node)
    return get_wl_for_graphs(graphs, all_nodes, h = h, fn = fn)

test_graph_names = list(sorted(graphs_test_docs.keys()))

## Calculate gram-matrix for WL kernel for the training concept-graphs

In [98]:
#%%time
from sklearn import svm

augmented_graph = graphs_train_docs[list(graphs_train_docs.keys())[0]]
#for node in all_nodes:
#    augmented_graph.add_node(node)
adjs = []
nodes = []
for topic, graph in sorted(graphs_train_docs.items(), key = lambda x: x[0]):
    adjs.append(nx.adjacency_matrix(graph).toarray())
    nodes.append(graph.nodes())
print('Calculating WL_train: start')
K_train, phi_list_train, label_lookup_train, label_counters_train = WL_compute(adjs, nodes, 4, all_nodes = all_nodes, DEBUG = True)
print('Calculating WL_train: end')

Calculating WL_train: start
Number of original labels 5529
K original is computed
Iteration 0: phi is computed
Number of compressed labels 5187
Itaration 0: phi sparse saved
Iteration 0: K is computed
Iteration 1: phi is computed
Number of compressed labels 5187
Itaration 1: phi sparse saved
Iteration 1: K is computed
Iteration 2: phi is computed
Number of compressed labels 5187
Itaration 2: phi sparse saved
Iteration 2: K is computed
Iteration 3: phi is computed
Number of compressed labels 5187
Itaration 3: phi sparse saved
Iteration 3: K is computed
Calculating WL_train: end


In [7]:
phi_list_train[0].shape

(10994, 20)

## Train SVM classifier on the gram-matrix

In [8]:
clf = svm.SVC(kernel = 'precomputed', verbose = True, probability=True, )
clf.fit(K_train[-1], sorted(graphs_train_docs.keys()))

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto',
  kernel='precomputed', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

## Test classifier

In [207]:
RECOMPUTE = True
all_graphs_test = [('test', topic, graph) for topic, graph in graphs_test_docs.items()]
all_graphs_train = [('train', topic, graph) for topic, graph in graphs_train_docs.items()]

def print_predict(X_test):
    print('Predicted:\t{}'.format(clf.predict(X_test)[0]))
    
vals = []
for set_, test_topic, graph in sorted(all_graphs_train, key = lambda x: x[0]):
    if graph is None: continue
    print('Testing graph:\t{:40} ({})'.format(test_topic, set_))
    test_graph = graph
    K_test, phi_list_test = WL_compute_new(
        ad_list=[nx.adjacency_matrix(test_graph).toarray()],
        node_label=[test_graph.nodes()],
        all_nodes= all_nodes,
        h = 4,
        k_prev = np.copy(K_train),
        phi_prev = np.copy(phi_list_train),
        label_counters_prev = label_counters_train,
        label_lookups_prev = np.copy(label_lookup_train)
    )

    X_test = K_test[-1][-1, :-1].reshape(1, -1)
    vals.append((K_test, phi_list_test, X_test))
    print_predict(X_test)
    
    if RECOMPUTE and True:
        adjs_test =  adjs + [nx.adjacency_matrix(test_graph).toarray()]
        nodes_test =  nodes + [test_graph.nodes()]
        K_test, phi_list_test, label_lookup_test, label_counters_test = WL_compute(adjs_test, nodes_test, 4, all_nodes = all_nodes, DEBUG = False)
        X_test = K_test[-1][-1, :-1].reshape(1, -1)
        print_predict(X_test)
        vals.append((K_test, phi_list_test, X_test))
    

    probs_test = clf.predict_proba(X_test)[0]
    probs_zipped = list(zip(clf.classes_, probs_test))
    print()
    #for clazz, prob in sorted(probs_zipped, key = lambda x: x[1])[:5]:
    #    print('\t{:.3f}\t{}'.format(prob, clazz))


Testing graph:	soc.religion.christian                   (train)
Predicted:	soc.religion.christian
Predicted:	soc.religion.christian

Testing graph:	sci.electronics                          (train)
Predicted:	sci.electronics
Predicted:	sci.electronics

Testing graph:	comp.os.ms-windows.misc                  (train)
Predicted:	comp.os.ms-windows.misc
Predicted:	comp.os.ms-windows.misc

Testing graph:	sci.crypt                                (train)
Predicted:	sci.crypt
Predicted:	sci.crypt

Testing graph:	talk.politics.misc                       (train)
Predicted:	talk.politics.misc
Predicted:	talk.politics.misc

Testing graph:	comp.windows.x                           (train)
Predicted:	comp.windows.x
Predicted:	comp.windows.x

Testing graph:	sci.med                                  (train)
Predicted:	sci.med
Predicted:	sci.med

Testing graph:	talk.politics.mideast                    (train)
Predicted:	talk.politics.mideast
Predicted:	talk.politics.mideast

Testing graph:	comp.sys.mac.ha

In [205]:
K_test, phi_list_test, X_test = vals[-2]
K_real, phi_list_real, X_real = vals[-1]

for i, (a, b) in enumerate(zip(phi_list_test, phi_list_real)):
    b = b.todense()[:a.shape[0]] #.astype(np.float32)
    if not np.array_equal(a.astype(np.int32) - b.astype(np.int32), np.zeros(b.shape, dtype = np.int32)):
        break
        
#print(lil_matrix(phi_list_test[0][:,5]), '\n', phi_list_train[0][:,5])
for i, (a, b) in enumerate(zip(K_test, K_real)):
    print(a.dtype, b.dtype)
    if not np.array_equal(a, b):
        print("K different!")


float32 float32
float32 float32
float32 float32
float32 float32
float32 float32


## Draw graphs

In [None]:
if 0 == 1:
    fig = plt.figure(figsize=(30, 30))
    pos=nx.spring_layout(g_mult_dir)
    nx.draw(g_mult_dir, pos = pos)

    edge_labels=dict([((u,v,),d['name'])
                 for u,v,d in g_mult_dir.edges(data=True)])
    nx.draw_networkx_edge_labels(g_mult_dir,pos,edge_labels=edge_labels)
    nx.draw_networkx_nodes(g_mult_dir, pos = pos, label='name', )
    nx.draw_networkx_labels(g_mult_dir, pos = pos)
    plt.show()

    graphs = sorted(list(nx.weakly_connected_component_subgraphs(g_mult_dir)), key = len)#, reverse = True)

    for graph in graphs[-4:]:
        fig = plt.figure(figsize=(20, 20))
        pos=nx.spring_layout(graph)
        #nx.draw(graph, pos = pos)
        edge_labels=dict([((u,v,),d['name'])
                     for u,v,d in graph.edges(data=True)])
        nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels)
        nx.draw_networkx_edges(graph, pos, arrows = True)
        nx.draw_networkx_nodes(graph, pos = pos, label='name')
        nx.draw_networkx_labels(graph, pos = pos)
        plt.show()
