In [None]:
# Load the autoreload extension and enable mode 2 (re-import modules before running code),
# so edits to the project helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# seaborn is imported here because this is the first cell that uses `sns`;
# without it the cell raises NameError on a fresh kernel.
import seaborn as sns

# Shared settings for figures that are exported to disk.
EXPORT_DPI = 100
EXPORT_FIG_SIZE = (8, 4)
EXPORT_FIG_WIDTH, EXPORT_FIG_HEIGHT = EXPORT_FIG_SIZE

# Plotting context and style, passed by keyword so the meaning of each value is explicit.
sns.set(context='notebook', style='whitegrid')

## Statistics about edges

In [None]:
import dataset_helper
import collections
import networkx as nx
import sys

# Search depth handed to the shortest-path computation as `cutoff`.
WALK_LENGTH = 2
dataset_name = 'ng20'

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets(dataset_name=dataset_name):
    # Only cache files with 'coo' in their name, excluding the '*all*' variants.
    if 'coo' not in graph_cache_file or 'all' in graph_cache_file: continue
    print(graph_cache_file)
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    for idx, graph in enumerate(X):
        # Lightweight progress indicator, rewritten in place every 100 graphs.
        if idx % 100 == 0: sys.stdout.write('\r{:3.0f}%'.format(idx / len(X) * 100))
        if graph.number_of_edges() == 0 or graph.number_of_nodes() == 0: continue
        # dict(...) works whether networkx returns a dict (1.x) or a lazy iterator (2.x+),
        # and makes sure all paths are computed BEFORE edges are added to the graph below.
        shortest_paths = dict(nx.all_pairs_shortest_path(graph, cutoff=WALK_LENGTH))
        for source, target_dict in shortest_paths.items():
            for target, path in target_dict.items():
                # A path with a single node is the trivial source == target entry.
                if len(path) < 2: continue
                # Keyword form sets the 'weight' attribute on every networkx version
                # (`attr_dict=` is only understood by networkx 1.x).
                graph.add_edge(source, target, weight=1 / len(path))
    # Only the first matching cache file is processed.
    break

In [None]:
import numpy as np

for dataset in ['ng20']:
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name=dataset)

    # First cache whose name mentions 'gml'.
    gml_candidates = [cache for cache in graph_cache_files if 'gml' in cache]
    gml_graph_cache = gml_candidates[0]

    # One randomly picked 'cooccurrence_1' cache, skipping the '*all*' variants.
    coo_graph_caches = [
        cache for cache in graph_cache_files
        if 'cooccurrence_1' in cache and 'all' not in cache
    ]
    coo_graph_cache = np.random.choice(coo_graph_caches)
    print(gml_graph_cache, coo_graph_cache)

    X_gml, Y = dataset_helper.get_dataset_cached(gml_graph_cache)
    X_coo, _ = dataset_helper.get_dataset_cached(coo_graph_cache)
    print(len(X_gml), len(X_coo))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

figsize = (16, 4)
NUM_BINS = 40
alpha = 0.7

# (display name, function graph -> number). -99 marks "undefined for this graph";
# such rows are filtered out before plotting.
graph_metrics = [
    ('density', lambda graph: nx.density(graph) if graph.number_of_nodes() > 0 else 0.0),
    ('number of nodes', lambda graph: graph.number_of_nodes()),
    ('number of edges', lambda graph: graph.number_of_edges()),
    ('#nodes/#edges', lambda graph:  graph.number_of_nodes() / graph.number_of_edges() if graph.number_of_edges() > 0 else -99),
    ('#edges/#nodes', lambda graph:  graph.number_of_edges() / graph.number_of_nodes() if graph.number_of_nodes() > 0 else -99)
]

for metric_name, metric in graph_metrics:
    # Collected under its own name: the original rebound `graph_metrics` here,
    # replacing the metric list with the last metric's rows once the cell finished.
    metric_rows = []
    for graph_type, graphs in [('concept-map', X_gml), ('co-occurrence (ws=1)', X_coo)]:
        metric_rows += [(graph_type, metric(graph)) for graph in graphs]

    df = pd.DataFrame(metric_rows, columns = ['graph_type', 'graph_metric'])
    # Drop the -99 "undefined" sentinel rows.
    df = df[df.graph_metric > -10]
    if df.empty:
        print('No values to plot for metric: {}'.format(metric_name))
        continue

    lowest, highest = df.graph_metric.min(), df.graph_metric.max()
    if highest > lowest:
        # Shared, explicit bin edges so both graph types use identical bins.
        bins = np.linspace(lowest, highest, NUM_BINS + 1)
    else:
        # All values identical: a zero bin width would break np.arange, so let
        # the plotting backend choose NUM_BINS bins around the single value.
        bins = NUM_BINS

    fig, ax = plt.subplots(figsize = figsize)
    df.groupby('graph_type').graph_metric.plot(kind = 'hist', bins = bins, alpha = alpha, ax = ax, title = 'Histogram of {}'.format(metric_name), logy = True, legend = True)
    ax.set_xlabel(metric_name)
    plt.show()
    plt.close(fig)

In [None]:
import matplotlib.pyplot as plt

figsize = (10, 10)

# Number of example rows; each row shows one concept map and one co-occurrence graph.
NUM_GRAPHS = 2
used_figsize = EXPORT_FIG_SIZE
# squeeze=False keeps `axes` two-dimensional, so the row loop also works for NUM_GRAPHS == 1.
fig, axes = plt.subplots(ncols=2, nrows=NUM_GRAPHS, figsize = used_figsize, squeeze = False)

for ax in axes.flatten():
    ax.set_xticks([])
    ax.set_yticks([])

for idx, row_ax in enumerate(axes):
    # Pick by random index. np.random.choice(list_of_graphs) first converts the list
    # into an array, which treats the graph objects as nested sequences and can
    # fail or return something other than a graph.
    graph_gml = X_gml[np.random.randint(len(X_gml))]
    graph_coo = X_coo[np.random.randint(len(X_coo))]

    ax = row_ax[0]
    nx.draw_networkx(graph_gml, ax = ax, node_size = 0, style = 'dotted')

    if idx == 0:
        ax.set_title('Concept map')

    ax = row_ax[1]
    nx.draw_networkx(graph_coo, ax = ax, node_size = 0, style = 'dotted')

    if idx == 0:
        # The window size is parsed from the cache file name ('...cooccurrence_<ws>_...').
        ax.set_title('Co-Occurrence (window size: {})'.format(coo_graph_cache.split('cooccurrence_')[-1].split('_')[0]))

fig.tight_layout()
fig.savefig('tmp/graph-examples.png', dpi = EXPORT_DPI)

## Mixed tfidf and co-occurrence features

In [None]:
import sklearn
from sklearn import pipeline
from sklearn import feature_extraction
from sklearn import svm
from sklearn.base import BaseEstimator, TransformerMixin

import dataset_helper

# Raw 'ling-spam' dataset plus the first cached graph-phi dataset for the same corpus.
X_text, Y_text = dataset_helper.get_dataset('ling-spam')
graph_cache_files = dataset_helper.get_all_cached_graph_phi_datasets(dataset_name='ling-spam')
first_phi_cache = graph_cache_files[0]
X_graph_phi, Y_graph_phi = dataset_helper.get_dataset_cached(first_phi_cache, check_validity=False)

# The first phi matrix must have one row per text document.
num_documents = len(X_text)
assert num_documents == X_graph_phi[0].shape[0]

# See http://scikit-learn.org/stable/auto_examples/plot_feature_stacker.html#sphx-glr-auto-examples-plot-feature-stacker-py
class TupleSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one position from every sample.

    Each sample is expected to support indexing; ``tuple_index`` selects the
    element that is passed on.
    """

    def __init__(self, tuple_index):
        self.tuple_index = tuple_index

    def fit(self, x, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a list with ``sample[tuple_index]`` for every sample in ``X``."""
        position = self.tuple_index
        selected = []
        for sample in X:
            selected.append(sample[position])
        return selected
    
# A bare `import sklearn` does not load this submodule, so import it explicitly
# before `sklearn.model_selection.GridSearchCV` is used below.
from sklearn import model_selection

# Two parallel feature branches over (text, phi) samples:
#   - 'tfidf': element 0 of each sample, vectorised with TF-IDF
#   - 'phi':   element 1 of each sample, passed through unchanged
combined_features = sklearn.pipeline.FeatureUnion([
    ('tfidf', sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=0)),
        ('tfidf', sklearn.feature_extraction.text.TfidfVectorizer(stop_words = 'english')),
    ])),
    ('phi', sklearn.pipeline.Pipeline([
        ('selector', TupleSelector(tuple_index=1))
    ]))
])

# Empty grid: GridSearchCV is only used for its cross-validation loop here.
param_grid = dict()
X_combined = list(zip(X_text, X_graph_phi[0]))

# Named so that they do not shadow the imported `svm` and `pipeline` modules.
classifier = sklearn.svm.SVC(class_weight = 'balanced')
classification_pipeline = sklearn.pipeline.Pipeline([("features", combined_features), ("svm", classifier)])
gscv = sklearn.model_selection.GridSearchCV(classification_pipeline, param_grid=param_grid, verbose=11)
gscv.fit(X_combined, Y_text)

In [None]:
import dataset_helper
import sympy
import pandas as pd


def count_unique_words(documents):
    """Return the number of distinct lower-cased, whitespace-separated tokens.

    Args:
        documents: iterable of strings.

    Returns:
        int: size of the vocabulary over all documents.
    """
    unique_words = set()
    for document in documents:
        # str.split() without arguments splits on any run of whitespace (newlines
        # included) and never yields empty tokens, so no stripping or filtering is
        # needed. Processing one document at a time avoids building a single
        # corpus-sized string and token list.
        unique_words.update(document.lower().split())
    return len(unique_words)


# (dataset name, vocabulary size) for every available dataset.
word_counts = []
for dataset_name in dataset_helper.get_all_available_dataset_names():
    X, Y = dataset_helper.get_dataset(dataset_name)
    word_counts.append((dataset_name, count_unique_words(X)))

In [None]:
# Vocabulary size per dataset, smallest first, indexed by dataset name.
df = (
    pd.DataFrame(word_counts, columns=['dataset', 'unique_words'])
    .set_index('dataset')
    .sort_values(by='unique_words')
)
# Horizontal bars on a logarithmic x-axis.
df['unique_words'].plot(kind='barh', logx=True)
df

In [None]:
import pickle

# NOTE: pickle.load executes arbitrary code from the file; only use trusted files.
with open('data/primes.npy', 'rb') as f:
    primes = pickle.load(f)

# Largest key of the loaded mapping and the number of primes stored under it.
max_prime_range = sorted(primes.keys())[-1]
num_primes = len(primes[max_prime_range])

# Largest vocabulary size among the datasets.
max_words = df.unique_words.max()
print(max_words, num_primes, num_primes - max_words)

## Mixed gml classification (single document, merged document)

In [None]:
import dataset_helper
from kernels import fast_wl
import networkx as nx
import graph_helper
import numpy as np

# Datasets to load; 'reuters-21578' is currently left out.
datasets_to_load = ['ng20']
for dataset in datasets_to_load:
    X, Y = dataset_helper.get_gml_graph_dataset(dataset)

In [None]:
# First ng20 cache whose file name contains both 'gml' and 'single'.
ng20_caches = dataset_helper.get_all_cached_graph_datasets(dataset_name='ng20')
single_gml_caches = [cache for cache in ng20_caches if 'gml' in cache and 'single' in cache]
X_single, Y_single = dataset_helper.get_dataset_cached(single_gml_caches[0])

In [None]:
def filter_classes(x, y, classes_to_keep = ()):
    """Keep only the samples whose label is contained in ``classes_to_keep``.

    The samples are filtered in plain Python instead of via a numpy object
    array. Converting arbitrary objects (tuples, graphs, ...) to an array lets
    numpy descend into sequence-like samples, which changes tuples into lists
    and can return something other than the original objects.

    Args:
        x: sized sequence of samples.
        y: sized sequence of labels, aligned with ``x``.
        classes_to_keep: container of labels to keep (membership is tested with ``in``).

    Returns:
        tuple: ``(filtered_x, filtered_y)`` as two lists holding the original objects.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.
    """
    if len(x) != len(y):
        raise ValueError('x and y must have the same length, got {} and {}'.format(len(x), len(y)))
    kept = [(x_, y_) for x_, y_ in zip(x, y) if y_ in classes_to_keep]
    return [x_ for x_, _ in kept], [y_ for _, y_ in kept]

# Keep only single-document graphs whose class also occurs in Y.
classes_in_y = set(Y)
X_single_filtered, Y_single_filtered = filter_classes(X_single, Y_single, classes_in_y)

# Compute phi
# The return value is not used, so the helper presumably converts each list in place (TODO confirm).
for graph_list in (X_single_filtered, X):
    graph_helper.convert_graphs_to_adjs_tuples(graph_list)

In [None]:
# X comes first and the filtered single-document samples second; the
# classification cell relies on this order when it slices at len(X).
X_merged = np.concatenate((X, X_single_filtered))
Y_merged = np.concatenate((Y, Y_single_filtered))

merged_samples = X_merged.tolist()
phi_lists, new_label_lookups, new_label_counters = fast_wl.fast_wl_compute(merged_samples, h = 4)

In [None]:
import sklearn
import sklearn.metrics
import sklearn.svm
import pandas as pd

# Train on the phi rows of X and evaluate on the rows of the filtered
# single-document graphs; rows are ordered X first, then X_single_filtered.
svm = sklearn.svm.LinearSVC()
results = []
num_train = len(X)
for idx, phi in enumerate(phi_lists):
    # phi is indexed features x samples; transpose once to samples x features.
    features = phi.T
    svm.fit(features[:num_train, :], Y)
    Y_pred = svm.predict(features[num_train:, :])
    results.append((idx, sklearn.metrics.f1_score(y_true = Y_single_filtered, y_pred=Y_pred, average = 'micro')))
# The score is micro-averaged, so the column is labelled accordingly
# (it was previously called 'f1_macro').
pd.DataFrame(results, columns = ['phi', 'f1_micro']).set_index('phi')

## SPGK

In [None]:
from kernels import spgk
import dataset_helper
import numpy as np
import sklearn

graph_dataset_caches = dataset_helper.get_all_cached_graph_datasets()
for graph_cache in graph_dataset_caches:
    # Build the kernel for the first cache whose name mentions 'ling-spam', then stop.
    if 'ling-spam' in graph_cache:
        print(graph_cache)
        X, Y = dataset_helper.get_dataset_cached(graph_cache)
        # No hold-out split here: the whole dataset goes into the kernel matrix.
        X_train, y_train = X, Y
        K = spgk.build_kernel_matrix(X_train, depth = 1)
        break

In [None]:
import collections
# The leading 80% of the kernel matrix (rows and columns) is used for fitting.
NUM_ELEMENTS = int(K.shape[0] * 0.8)
K_train = K[:NUM_ELEMENTS, :NUM_ELEMENTS]

svm = sklearn.svm.SVC(kernel = 'precomputed', class_weight='balanced')
# Prediction on the remaining rows K[NUM_ELEMENTS:, :NUM_ELEMENTS] is not run here.
svm.fit(K_train, y_train[:NUM_ELEMENTS])

## DeepWalk

In [None]:
import deepwalk
from deepwalk import graph
from deepwalk import walks as serialized_walks
from gensim.models import Word2Vec
from deepwalk.skipgram import Skipgram
import dataset_helper
import graph_helper
import random
from gensim.models import Word2Vec
import tsne
import matplotlib.pyplot as plt

# DeepWalk / Word2Vec settings. Only some of them are read by the code below.
max_memory_data_size = 1000000000
number_walks = 1000
representation_size = 64
seed = 0
undirected = True
vertex_freq_degree = False
walk_length = 60
window_size = 10
workers = 1
output = 'data/DUMP'

# Load the co-occurrence graph dataset of the first available dataset only.
for dataset in dataset_helper.get_all_available_dataset_names():
    cache_file = dataset_helper.CACHE_PATH + '/dataset_graph_cooccurrence_{}.npy'.format(dataset)
    X, Y = dataset_helper.get_dataset(
        dataset,
        preprocessed = False,
        use_cached=True,
        transform_fn=graph_helper.convert_dataset_to_co_occurence_graph_dataset,
        cache_file=cache_file,
    )
    break

# One Word2Vec model per graph, for at most the first three graphs.
models = []
for idx, g in enumerate(X):
    if idx >= 3:
        break
    print('Graph: {:>4}'.format(idx))
    G = graph.from_networkx(g)

    node_count = len(G.nodes())
    print("Number of nodes: {}".format(node_count))
    if node_count == 0:
        continue

    num_walks = node_count * number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(
        G,
        num_paths=number_walks,
        path_length=walk_length,
        alpha=0,
        rand=random.Random(seed),
    )
    print("Training...")
    model = Word2Vec(
        walks,
        size=representation_size,
        window=window_size,
        min_count=0,
        workers=workers,
    )
    models.append(model)
print('Finished')

## tSNE

In [None]:
# One t-SNE plot per trained model.
for model in models:
    print('Next')
    tsne.plot_embedding(model, tsne.get_tsne_embedding(model))
    plt.show()

## Test WL phi computation

In [None]:
# Disabled consistency check (the condition is always False, so nothing below runs).
# NOTE(review): phi_1_2_3_test, phi_1_2_3, K_1_2_3_test and K_1_2_3 are not defined
# anywhere in the visible notebook; they would have to be provided before enabling this.
if 0 == 1:
    # Compare the phi matrices pairwise and report the positions that differ.
    for i, (a, b) in enumerate(zip(phi_1_2_3_test, phi_1_2_3)):
        # b is densified for the comparison (it has a .todense() method).
        if not np.array_equiv(a - b.todense(), np.zeros(b.shape, dtype = np.int32)):
            print("\tPhi different! {}".format(i))
            print(np.argwhere((a - b) != 0))

    # Compare the kernel matrices pairwise and report the positions that differ.
    for i, (a, b) in enumerate(zip(K_1_2_3_test, K_1_2_3)):
        if not np.array_equal(a, b):
            print(np.argwhere((a - b) != 0))
            print("\tK different! {}".format(i))

## Merge node labels

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import coreference  # used below; otherwise only imported by a later cell
sns.set()

# NOTE(review): `labels` and a dict `results` keyed by (n, threshold) are not created
# by any cell above. When the notebook is run top to bottom, `results` is the list
# built in the classification cell, so `results.items()` fails. The cell that
# produces these inputs has to be restored before this one.
num_labels = len(labels)

for (n, treshold), lookup in results.items():
    cliques = coreference.get_cliques_from_lookup(lookup)
    num_merged = len(lookup.keys())
    # A list of pairs instead of a dict keeps the bar order fixed on every Python version.
    similarity_counts = [('similar', num_merged), ('unsimilar', num_labels - num_merged)]
    clique_lenghts = [len(x) for x in cliques.values()]
    fig, axes = plt.subplots(1, 2, figsize = (14, 6))
    fig.suptitle('Treshold: {}, N={}'.format(treshold, n), fontsize = 16)

    # Left panel: distribution of clique sizes (log-scaled counts).
    pd.DataFrame(clique_lenghts).plot(ax = axes[0], kind = 'hist', logy = True, legend = False, title = "Histogram of clique lengths")
    # Right panel: number of labels present in the lookup vs. not present.
    pd.DataFrame(similarity_counts, columns = ['name', 'count']).set_index('name').plot(ax = axes[1], kind = 'bar', legend = False, title = '# of labels that have been merged vs. not merged')
    fig.tight_layout()
    # Leave room for the suptitle after tight_layout.
    fig.subplots_adjust(top=0.85)
    fig.savefig('tmp/{:.5f}.{}.png'.format(treshold, n), dpi = 120)
    plt.close(fig)

In [None]:
sns.set('notebook', 'white')
def plot_by(df, by, bins = 15, title = '', figsize = (12, 5), fontsize = 16):
    """Plot log-scaled histograms of ``clique_length``, one series per group of ``by``.

    Args:
        df: DataFrame with a ``clique_length`` column and the grouping column(s).
        by: grouping key passed to ``DataFrame.groupby``.
        bins: number of histogram bins.
        title: figure title.
        figsize: figure size in inches.
        fontsize: font size of the title and the legend.

    Returns:
        tuple: ``(fig, ax)`` of the created figure.
    """
    fig, ax = plt.subplots(figsize = figsize)

    groups = list(df.groupby(by))
    labels = [group_key for group_key, _ in groups]
    data = [group.clique_length for _, group in groups]

    ax.hist(data, bins = bins, alpha=0.7, label=labels, log = True)
    fig.suptitle(title, fontsize = fontsize)
    ax.legend(loc='upper right', fontsize = fontsize)
    ax.set_xlabel('clique sizes')
    fig.tight_layout()
    # Leave room for the suptitle after tight_layout.
    fig.subplots_adjust(top=0.9)
    return fig, ax

# (frame selector, grouping column, title, output path); the selectors are evaluated
# lazily so every frame is built right before its plot.
plot_specs = [
    (lambda: df, 'n', 'Clique size histogram by n (all thresholds together)', 'tmp/clique_size_by_n_all_thresholds.png'),
    (lambda: df, 'threshold', 'Clique size histogram by threshold (all n together)', 'tmp/clique_size_by_threshold_all_n.png'),
    (lambda: df[df.threshold == 0.6], 'n', 'Clique size histogram by n (threshold=0.6)', 'tmp/clique_size_by_n_threshold_0.6.png'),
]
for select_frame, group_column, plot_title, target_path in plot_specs:
    fig, ax = plot_by(select_frame(), group_column, title = plot_title)
    fig.savefig(target_path, dpi = 120)
plt.show()

## Create small dataset

In [None]:
import dataset_helper
import pickle
import numpy as np
import sklearn
# A bare `import sklearn` does not load this submodule, so import it explicitly.
from sklearn import model_selection

graph_cache_file = 'dataset_graph_gml_ng20-single.npy'
X, Y = dataset_helper.get_dataset_cached('data/CACHE/{}'.format(graph_cache_file))

# Only the labels are turned into an array (needed for stratification). The graphs
# stay in a plain list: converting graph objects to an array would let numpy treat
# them as nested sequences.
Y_array = np.array(Y, dtype=object)
sss = sklearn.model_selection.StratifiedShuffleSplit(n_splits = 40, random_state=42)

# Only the test part of the first split is used as the "small" dataset.
train_index, test_index = next(sss.split(X, Y_array))
X_test = [X[i] for i in test_index]
Y_test = [Y[i] for i in test_index]

with open('data/CACHE/dataset_graph_gml_small-single.npy', 'wb') as f:
    pickle.dump((X_test, Y_test), f)

In [None]:
import dataset_helper
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

sns.set('notebook', 'white')
limit_dataset = ['ng20', 'ling-spam', 'reuters-21578', 'webkb']


def get_num_nodes(graph_file):
    """Load the cached graph dataset ``graph_file`` and return the node count of every graph."""
    X_graph, _ = dataset_helper.get_dataset_cached(graph_file)
    return [nx.number_of_nodes(x) for x in X_graph]


# dataset name -> {'graphs': node counts per graph type, 'text': document lengths,
#                  'num_docs': ..., 'num_classes': ...}
all_stats = {}
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in limit_dataset: continue
    print(dataset_name)
    X, Y = dataset_helper.get_dataset(dataset_name)
    graphs = dataset_helper.get_all_cached_graph_datasets(dataset_name)

    # First cache file of each graph type.
    coo_graph = [x for x in graphs if 'cooccurrence' in x][0]
    gml_graph = [x for x in graphs if 'gml' in x][0]

    stats = {
        'cooccurrence': get_num_nodes(coo_graph),
        'concept-graphs': get_num_nodes(gml_graph)
    }

    # Truncate both lists to the shorter one so they have equal length.
    min_len = min(len(v) for v in stats.values())
    graph_stats = {k: v[:min_len] for k, v in stats.items()}

    all_stats[dataset_name] = {
        'graphs': graph_stats,
        'text': [len(x) for x in X],
        'num_docs': len(X),
        'num_classes': len(set(Y))
    }


In [None]:
figsize = (12, 4)
bins = 20

for dataset_name, stats in all_stats.items():
    graph_stats = stats['graphs']
    text_stats = stats['text']

    fig, ax = plt.subplots(nrows = 1, ncols=2, figsize = figsize)

    # Left: node counts per graph, one series per graph type.
    df = pd.DataFrame(graph_stats)
    df.plot(kind = 'hist', bins=bins, alpha = 0.7, logy = True, ax=ax[0])
    ax[0].set_xlabel('# nodes per graph')

    # Right: document lengths in characters.
    df = pd.DataFrame(text_stats)
    df.plot(kind = 'hist', bins=bins, logy = True, legend = False, ax=ax[1])
    ax[1].set_xlabel('# characters per document')
    fig.savefig('tmp/other/stats-{}.png'.format(dataset_name), dpi = 150)
    plt.show()
    # Release the figure; without this every iteration leaves an open figure behind.
    plt.close(fig)

In [None]:
import numpy
import numpy as np  # the code below uses the `np` alias

# One summary row per dataset: mean document length, mean node counts, sizes.
out = []
for dataset, stats in all_stats.items():
    out.append((dataset, np.mean(stats['text']), np.mean(stats['graphs']['cooccurrence']), np.mean(stats['graphs']['concept-graphs']), stats['num_docs'], stats['num_classes']))

df = pd.DataFrame(out, columns = ['dataset', 'avg_doc_length', 'avg_coo_node_num', 'avg_cp_node_num', 'num_docs', 'num_classes'])
df = df.set_index('dataset')

# (panel title, DataFrame column); 'num_classes' is collected but not plotted.
panels = [
    ('Average document length', 'avg_doc_length'),
    ('# documents', 'num_docs'),
    ('Average of number of concept-graph nodes', 'avg_cp_node_num'),
    ('Average of number of co-occurence nodes', 'avg_coo_node_num'),
]

# One row per panel, derived from the list above so the two cannot get out of sync.
fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize = (8, 6), squeeze = False)
for ax, (name, x) in zip(axes.flatten(), panels):
    df[x].plot(kind = 'barh', logx = True, title = name, ax = ax)
fig.tight_layout()
fig.savefig('tmp/other/stats-datasets.png', dpi = 120)
plt.show()

In [None]:
import pickle
import dataset_helper
import collections
import coreference
import matplotlib.pyplot as plt
from transformers.nx_graph_to_tuple_transformer import NxGraphToTupleTransformer

check_graphs = False

trans = NxGraphToTupleTransformer()
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in dataset_helper.DATASETS_LIMITED:
        continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if len(graph_cache_files) == 0:
        continue

    # NOTE: pickle.load executes arbitrary code from the file; only use trusted files.
    lookup_path = 'data/embeddings/graph-embeddings/{}.label-lookup.npy'.format(dataset_name)
    with open(lookup_path, 'rb') as f:
        lookup = pickle.load(f)

    # First histogram: the number of lookup entries is passed as second argument.
    fig, axes = coreference.plot_lookup_histogram(lookup, len(lookup.keys()))
    plt.show()

    # Count, over all cached graphs, how many node labels the lookup maps to a
    # different (whitespace-stripped) label.
    counter = collections.Counter()
    all_labels = set()
    for graph_cache_file in graph_cache_files:
        X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X = trans.transform(X)

        for adj, labels in X:
            all_labels.update(labels)
            for label in labels:
                is_remapped = label in lookup and str(lookup[label]).strip() != str(label).strip()
                counter['found' if is_remapped else 'not_found'] += 1
    print(counter)
    print(len(all_labels))
    # Second histogram: the number of distinct labels is passed as second argument.
    fig, axes = coreference.plot_lookup_histogram(lookup, len(all_labels))
    plt.show()
    # Only the first eligible dataset is processed.
    break

In [None]:
import pickle
import dataset_helper
import collections
import coreference
import matplotlib.pyplot as plt
from transformers.nx_graph_to_tuple_transformer import NxGraphToTupleTransformer
import networkx as nx

def merge_graphs(graphs):
    """Return a single graph composed of all ``graphs`` (via ``nx.compose_all``)."""
    return nx.compose_all(graphs)

trans = NxGraphToTupleTransformer()
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in dataset_helper.DATASETS_LIMITED: continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if not len(graph_cache_files): continue
    
    for graph_cache_file in graph_cache_files:
        # Only cache files with 'gml' in their name are inspected.
        if 'gml' not in graph_cache_file: continue
        
        X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        #X = trans.transform(X)
        
        # The counters are reset for every gml file, so after this loop they
        # describe the LAST gml file only.
        num_labels = 0
        all_labels = set()
        all_labels_stripped = set()
        
        #for adj, labels in X:
        for g in X:
            # Node identifiers are treated as the labels here.
            labels = g.nodes()
            all_labels |= set(labels)
            all_labels_stripped |= set([str(label).strip() for label in labels])
            num_labels += len(labels)
    
    # NOTE(review): everything below runs once per dataset, after the file loop, and
    # uses X, Y and the counters of the last gml file. If no file name contains 'gml',
    # these names are undefined or stale from an earlier cell. Confirm whether this
    # part was meant to be inside the file loop.
    num_uniq_labels = len(all_labels)
    num_uniq_labels_stripped = len(all_labels_stripped)
    print('#labels:\t\t{}'.format(num_labels))
    print('#uniq. labels:\t\t{}'.format(num_uniq_labels))
    print('#uniq. labels stripped:\t{}'.format(num_uniq_labels_stripped))
    print('#non-uniq. labels: \t{}'.format(num_labels - num_uniq_labels))
    
    # Presumably maps each class label to its list of graphs (TODO confirm against
    # dataset_helper.get_dataset_dict); every list is merged into one graph.
    d = dataset_helper.get_dataset_dict(X, Y)
    merged = {label: merge_graphs(graphs) for label, graphs in d.items()}
    # Only the first eligible dataset is processed.
    break

In [None]:
import pickle
import dataset_helper
import collections
import coreference
import matplotlib.pyplot as plt
from transformers.nx_graph_to_tuple_transformer import NxGraphToTupleTransformer
from transformers.relabel_graphs_transformer import RelabelGraphsTransformer
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import coreference 
sns.set_style('white')

def get_treshold_and_topn_from_lookupfilename(filename):
    """Extract the threshold and topn values (as strings) from a lookup file name.

    The name is expected to look like ``...threshold-<T>.topn-<N>.label...``.

    Returns:
        tuple: ``(threshold, topn)``, both as the raw substrings of ``filename``.

    Raises:
        IndexError: if ``'topn-'`` or ``'threshold-'`` does not occur in ``filename``.
    """
    after_topn = filename.split('topn-')[1]
    after_threshold = filename.split('threshold-')[1]
    topn = after_topn.split('.label')[0]
    threshold = after_threshold.split('.topn')[0]
    return threshold, topn

trans = NxGraphToTupleTransformer()
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in ['ling-spam', 'ng20']:
        continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if len(graph_cache_files) == 0:
        continue

    for graph_cache_file in graph_cache_files:
        print('Loading dataset: Start ({})'.format(graph_cache_file))
        X_old, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        print('TupleTransform: Start')
        X_old = trans.transform(X_old)
        # Labels before relabelling, one list per graph.
        X_labels = [labels for _, labels in X_old]

        lookups = glob('data/embeddings/graph-embeddings/{}.*.*.label-lookup.npy'.format(dataset_name))
        for lookup_file in lookups:
            # Load the lookup and the parameters encoded in its file name.
            threshold, topn = get_treshold_and_topn_from_lookupfilename(lookup_file)
            with open(lookup_file, 'rb') as f:
                lookup = pickle.load(f)

            # Relabel all graphs with this lookup.
            X = RelabelGraphsTransformer(lookup).transform(X_old)

            # Per graph: (#labels changed by relabelling, #duplicate labels, #labels).
            rows = []
            for (_, labels), old_labels in zip(X, X_labels):
                num_changed = sum(
                    str(new).lower().strip() != str(old).lower().strip()
                    for new, old in zip(labels, old_labels)
                )
                num_duplicates = len(labels) - len(set(labels))
                rows.append((num_changed, num_duplicates, len(labels)))

            df = pd.DataFrame(rows, columns = ['different_count', 'duplicate_labels_count', 'label_count'])
            df['relabeled_ratio'] = df.different_count / df.label_count
            df['duplicate_ratio'] = df.duplicate_labels_count / df.label_count

            fig, ax = plt.subplots(figsize = (16, 4))
            histogram_title = 'Histogram (threshold={}, topn={})'.format(threshold, topn)
            df[['relabeled_ratio', 'duplicate_ratio']].plot(kind = 'hist', bins = 100, log = True, title = histogram_title, ax = ax, alpha = 0.7)
            plt.show()
            plt.close(fig)
        # Only the first cache file of each dataset is analysed.
        break


In [None]:
import coreference
for lookup_file in glob('data/embeddings/graph-embeddings/*.threshold-*.*.label-lookup.npy'):
    threshold, topn = get_treshold_and_topn_from_lookupfilename(lookup_file)
    with open(lookup_file, 'rb') as f:
        lookup = pickle.load(f)

    # Sanity check: print a single "?" if any lookup value is neither str nor int.
    if any(not isinstance(value, (str, int)) for value in lookup.values()):
        print("?")

    histogram_title = 'threshold={}, topn={}'.format(threshold, topn)
    fig, axes = coreference.plot_lookup_histogram(lookup=lookup, title = histogram_title)
    plt.show()
    plt.close(fig)

In [None]:
import dataset_helper
import graph_helper
import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# (dataset name, number of graphs a label occurs in), one row per distinct label.
counts = []
wanted_datasets = ['ling-spam', 'ng20', 'webkb', 'reuters-21578']
for dataset_name in dataset_helper.get_all_available_dataset_names():
    if dataset_name not in wanted_datasets:
        continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if len(graph_cache_files) == 0:
        continue
    for graph_cache_file in graph_cache_files:
        # Cache files with 'gml' in their name are skipped.
        if 'gml' in graph_cache_file:
            continue
        print('Loading dataset: {}'.format(graph_cache_file))
        X_old, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        label_counter = collections.Counter()
        for graph in X_old:
            label_counter.update(str(node).strip() for node in graph.nodes())
        counts.extend((dataset_name, count) for count in label_counter.values())
        # Only the first non-gml cache file per dataset is used.
        break


In [None]:
df = pd.DataFrame(counts, columns = ['dataset', 'counts'])

# One log-scaled histogram per dataset on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize = (7, 4))
axes = df.hist(log = True, bins = 60, by = 'dataset', ax = axes)

# Same x-range on every panel so the datasets can be compared.
x_limits = (0, 10000)
for ax in axes.flatten():
    ax.set_xlim(x_limits)

fig.tight_layout()
fig.savefig('tmp/label-distribution-per-dataset.png', dpi = 150)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Toy data: the values 0..num_elements-1 once for group "a" and once for group "b".
num_elements = 10
data = [(name, value) for name in ("a", "b") for value in range(num_elements)]
df = pd.DataFrame(data, columns = ["name", "counts"])

# Two ways of asking pandas for per-group histograms.
df.plot(kind = "hist", by = df.name)
df.hist(by = df.name)
plt.show()


In [None]:
for dataset_name in dataset_helper.get_all_available_dataset_names():
    # Restricted to 'ling-spam' for now.
    if dataset_name != 'ling-spam':
        continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if len(graph_cache_files) == 0:
        continue
    for graph_cache_file in graph_cache_files:
        # Load the first cache file without 'gml' in its name, then stop.
        if 'gml' not in graph_cache_file:
            X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
            break

In [None]:
import re
pattern = re.compile("^[a-zA-Z0-9]+$")
all_labels = graph_helper.get_all_node_labels(X)

# Every label in both its original and its whitespace-stripped form.
new_labels = set()
for label in all_labels:
    new_labels.update((label.strip(), label))

# Displayed as the cell output: (#labels, #labels plus stripped variants).
len(all_labels), len(new_labels)

In [None]:
import scipy
from scipy import sparse
import numpy as np
# Empty 10x10 LIL matrix, then fancy-indexing a handful of its rows.
mat = sparse.lil_matrix((10, 10))
indices = [0, 1, 5, 8]
mat[indices]

In [None]:
import pickle
import os
from glob import glob
import sklearn
from sklearn import metrics
import helper
import matplotlib.pyplot as plt

for pred_file in glob('data/results/predictions/*.npy'):
    if 'gml' not in pred_file or 'ng20' not in pred_file: continue
    # NOTE: pickle.load executes arbitrary code from the file; only use trusted files.
    with open(pred_file, 'rb') as f:
        predictions = pickle.load(f)
    Y_true, Y_pred = predictions['Y_real'], predictions['Y_pred']
    assert len(Y_true) == len(Y_pred)

    # One explicit, sorted label order shared by the matrix and the tick labels.
    # A bare set() has arbitrary iteration order and need not match the row and
    # column order of the confusion matrix.
    class_labels = sorted(set(Y_true) | set(Y_pred))
    cmat = sklearn.metrics.confusion_matrix(y_true = Y_true, y_pred = Y_pred, labels = class_labels)

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        f1 = sklearn.metrics.f1_score(y_true = Y_true, y_pred = Y_pred, average = 'macro')
        helper.plot_confusion_matrix(cmat, classes=class_labels, normalize = True, show_non_horizontal_percent=False, title='{} ({})'.format(f1, pred_file.split('/')[-1]))
        plt.show()
    except Exception as e:
        # Still best effort (keep going with the next file), but report what failed.
        print('Could not plot confusion matrix for {}: {}'.format(pred_file, e))
    finally:
        # `fig` always belongs to this iteration, so closing it is safe.
        plt.close(fig)
    del predictions, Y_true, Y_pred

In [None]:
import dataset_helper
import graph_helper
import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display

counts = []
for dataset_name in dataset_helper.get_all_available_dataset_names():
    # Restricted to 'ng20' for now.
    if dataset_name not in ['ng20']:
        continue
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets(dataset_name)
    if len(graph_cache_files) == 0:
        continue
    # (cache file, number of nodes) for every graph of every non-gml cache file.
    node_counts = []
    for graph_cache_file in graph_cache_files:
        if 'gml' in graph_cache_file:
            continue
        print('Loading dataset: {}'.format(graph_cache_file))
        X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        node_counts.extend((graph_cache_file, len(x.nodes())) for x in X)
    

In [None]:
df = pd.DataFrame(node_counts, columns = ['filename', 'node_counts'])
# Only the rows whose cache file name contains '_all_'.
is_all_words = df.filename.str.contains('_all_')
df_all_words = df[is_all_words]
median = df_all_words.node_counts.median()

fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (14, 8))
df_all_words.node_counts.plot(kind = 'hist', log = False, bins = 400, ax = ax, title = 'Node count histogramm (ng20)')
ax.set_xscale('log')
ax.set_xlabel('node count per graph')

# Mark the median with a red vertical line and a label below the axis.
# The y positions (-400, 6000) are hard-coded in data coordinates.
median_label = 'Median: {:.0f}'.format(median)
ax.text(x = median, y = -400, s = median_label, fontdict={'fontsize': 12}, horizontalalignment = 'center')
ax.vlines(ymin = 0, ymax=6000, x = median, colors='red')

fig.tight_layout()
fig.savefig('tmp/node-count-ng20.png', dpi = 100)
summary = df_all_words.node_counts.describe().to_frame().T
display(summary)
plt.show()
plt.close(fig)

In [None]:
from glob import glob
import scipy
from scipy import sparse

# Load the first cached phi dataset for 'ling-spam' and stack its matrices side by side.
phi_cache_files = dataset_helper.get_all_cached_graph_phi_datasets(dataset_name = 'ling-spam')
for cache_file in phi_cache_files[:1]:
    print(cache_file)
    with open(cache_file, 'rb') as f:
        X, Y = pickle.load(f)
    stacked = scipy.sparse.hstack(X)

In [None]:

# Number of graphs with exactly one node vs. number of all graphs.
# len() counts rows; DataFrame.size would count rows * columns.
len(df_all_words[df_all_words.node_counts == 1]), len(df_all_words)

## SPGK

In [None]:
import dataset_helper
from kernels import spgk
import networkx as nx
from time import time

depth = 1
# Set to an int to limit the number of graphs per cache file; None uses all of them.
MAX_ELEMENTS = None

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets(dataset_name='ling-spam'):
    if 'all' in graph_cache_file:
        continue
    print('Cache file: {}'.format(graph_cache_file.split('/')[-1]))
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    if not isinstance(X[0], nx.Graph):
        print('\tWrong X type, expected: nx.Graphs, got: {}'.format(type(X[0])))
        continue

    num_elements = MAX_ELEMENTS or len(X)
    X = X[:num_elements]

    try:
        start_time = time()

        # Set every existing edge weight to 1 (in place), reporting weights below 1 first.
        for x in X:
            for u, v, edata in x.edges(data = True):
                if 'weight' in edata:
                    if edata['weight'] < 1:
                        print("Edge weight={:>2}, from='{}', to='{}'".format(edata['weight'], u, v))
                    edata['weight'] = 1

        K = spgk.build_kernel_matrix(X, depth)

        time_needed = time() - start_time

        # Share of non-zero kernel entries and timing figures.
        num_non_zero = len(K.nonzero()[0])
        print('\tNon-zero\t\t{:.0f}%'.format(100 * num_non_zero / (K.shape[0] ** 2)))
        print('\tTime needed')
        print('\t\ttotal {}:\t{:.2f}s'.format(num_elements, time_needed))
        print('\t\tper element:\t{:.2f}s'.format(time_needed / num_elements))
        print('\t\tper 10000:\t{:.2f}s'.format(time_needed / num_elements * 10000))
    except Exception as e:
        print('\tError: {}'.format(e))


In [None]:
# Experiment on the first graph only: set one entry, then reset every non-zero entry to 1.
for graph in X[:1]:
    adj = nx.adjacency_matrix(graph)
    adj[0, 0] = 2
    adj[adj.nonzero()] = 1

In [None]:
import sklearn
import sklearn.base
import sklearn.feature_extraction.text
import scipy
from scipy import sparse

# Stand-in for graph features: 2 samples x 10 features.
graph = sparse.eye(2, 10)


from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import FeatureUnion

# NOTE: this redefines the TupleSelector from the "Mixed tfidf" cell with a
# different constructor argument name (selected_index instead of tuple_index).
class TupleSelector(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Stateless transformer that extracts one position from every sample."""

    def __init__(self, selected_index = 0):
        self.selected_index = selected_index

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned. Returns ``self`` so that ``fit(...).transform(...)``
        and ``fit_transform`` work."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return a list with ``sample[selected_index]`` for every sample in ``X``."""
        return [x[self.selected_index] for x in X]

# NOTE(review): the first step is named 'TupleSelector' but holds a PCA instance.
estimators = [('TupleSelector', PCA()), ('kernel_pca', KernelPCA())]
combined = FeatureUnion(estimators)

# Fit TF-IDF on two toy documents and transform two one-word documents.
trans = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english')
trans.fit(['Yes scheint es ja zu schlaiken', 'Yes'])
X_text = trans.transform(['yes', 'yes'])

# Column-wise concatenation of both feature blocks (same number of rows).
X_combined = sparse.hstack([graph, X_text])
graph.shape, X_text.shape, X_combined.shape