In [1]:
from pathlib import Path
import numpy as np
import scipy.sparse as sp
from tqdm.notebook import tqdm

from gtn.dataloader.io import load_from_npz, save_to_npz
from gtn.dataloader.graphcollection import DistanceMatrix

In [2]:
dataname = "aids"
respath = Path(f"/nfs/staff-ssd/klicpera/{dataname}_ged")
datasets = ['train', 'val', 'test']
ngraphs = {'train': 144, 'val': 48, 'test': 48}

### Merge distances

In [25]:
graph1 = 0
graph2 = 1
fname = f"ged_{datasets[0]}_{graph1}_{graph2}.npz"
with np.load(respath / "dists_raw" / fname, 'r') as loader:
    loader = dict(loader)

In [5]:
for dataset in datasets:
    arr = np.zeros([ngraphs[dataset], ngraphs[dataset]], dtype=np.float32)
    for graph1 in tqdm(range(ngraphs[dataset] - 1)):
        missing = False
        for graph2 in range(graph1 + 1, ngraphs[dataset]):
            fname = f"ged_{dataset}_{graph1}_{graph2}.npz"
            with np.load(respath / "dists_raw" / fname, 'r') as loader:
                loader = dict(loader)
                if np.sum(loader['arr_0'] != 0) == 0:
                    print(f"No non-zero distance: {dataset}, {graph1} - {graph2}")
                arr[graph1, graph2] += loader['arr_0'][graph1, graph2]
                arr[graph2, graph1] += loader['arr_0'][graph1, graph2]
    nentries = np.sum(arr != 0)
    npairs = ngraphs[dataset] * (ngraphs[dataset] - 1) / 2
    if nentries != 2 * npairs:
        print(f"Not the right number of distances: {nentries} != {2 * npairs}")
    np.savez(respath / "dists" / f"{dataname}_ged_{dataset}.npz", arr)

HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))




#### Investigate graph pair with 0 distance

In [4]:
npz_file = f"/nfs/homedirs/klicpera/graph-distance/graph-distance/data/raw/{dataname}_ged_train.npz"
gcoll = load_from_npz(npz_file)
g1 = gcoll[11]
g2 = gcoll[138]
print(g1)
print(g2)
print((g1.adj_matrix - g2.adj_matrix).nonzero())
print((g1.attr_matrix - g2.attr_matrix).nonzero())
print((g1.edge_attr_matrix - g2.edge_attr_matrix).nonzero())

<Undirected, unweighted and connected SparseGraph with 54 edges (no self-loops). Data: adj_matrix (24x24), attr_matrix (24x1), edge_attr_matrix (54x1), attr_names (1), edge_attr_names (1), metadata>
<Undirected, unweighted and connected SparseGraph with 46 edges (no self-loops). Data: adj_matrix (20x20), attr_matrix (20x1), edge_attr_matrix (46x1), attr_names (1), edge_attr_names (1), metadata>


ValueError: inconsistent shapes

In [5]:
max([g.num_nodes() for g in gcoll])

30

#### Investigate distance matrix

In [6]:
with np.load(respath / "dists" / f"{dataname}_ged_train.npz", 'r') as loader:
    loader = dict(loader)
    dists = loader['arr_0']
    col, row = np.where(dists == 0)
    print(np.all(col == row))
    if not np.all(col == row):
        print(col[col != row], row[col != row])
    
print(dists)
print(f"Symmetric: {np.all(np.isclose(dists, dists.T))}")

True
[[ 0. 39. 31. ... 61. 41. 40.]
 [39.  0. 45. ... 63. 33. 36.]
 [31. 45.  0. ... 46. 37. 38.]
 ...
 [61. 63. 46. ...  0. 55. 52.]
 [41. 33. 37. ... 55.  0. 23.]
 [40. 36. 38. ... 52. 23.  0.]]
Symmetric: True


### Save distances in GraphCollection

In [8]:
for dataset in datasets:
    print(f"Convert {dataset}")
    gcoll = load_from_npz(Path.home() / "graph-distance" / "graph-distance" / "data" / "raw" / f"{dataname}_ged_{dataset}.npz")
    # assert np.sum(gcoll.dists.A != 0) == 0
    with np.load(respath / "dists" / f"{dataname}_ged_{dataset}.npz", 'r') as loader:
        loader = dict(loader)
        gcoll.dists = DistanceMatrix(loader['arr_0'])
    save_to_npz(Path.home() / "graph-distance" / "graph-distance" / "data" / "raw" / f"{dataname}_ged_{dataset}_new.npz", gcoll)

Convert test


In [36]:
dataset = 'train'
gcoll = load_from_npz(Path.home() / "graph-distance" / "graph-distance" / "data" / "raw" / f"{dataname}_ged_{dataset}_new.npz")

### Convert attr_matrix and edge_attr_matrix to ndarray

In [11]:
for dataset in datasets:
    print(f"Convert {dataset}")
    data_path = Path.home() / "graph-distance" / "graph-distance" / "data" / "raw" / f"{dataname}_ged_{dataset}.npz"
    gcoll = load_from_npz(data_path)
    for graph in gcoll:
        if sp.issparse(graph.attr_matrix):
            graph._attr_matrix = graph.attr_matrix.A
            graph._edge_attr_matrix = graph.edge_attr_matrix.A
    save_to_npz(data_path, gcoll)

Convert test
