In [8]:
import numpy as np
import scipy.sparse as sp
import csv
import scipy.io as scio
import networkx as nx

In [2]:
# Read the BlogCatalog MAT file; loadmat returns a dict keyed by variable name.
blogcatalog_mat_path = "BlogCatalog-dataset/small/BlogCatalog.mat"
data = scio.loadmat(blogcatalog_mat_path)

In [3]:
# Show the dict returned by loadmat to see which variables the MAT file holds.
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue Oct 17 16:57:18 2017',
 '__version__': '1.0',
 '__globals__': [],
 'Attributes': <5196x8189 sparse matrix of type '<class 'numpy.float64'>'
 	with 369435 stored elements in Compressed Sparse Column format>,
 'Label': array([[6],
        [2],
        [2],
        ...,
        [4],
        [4],
        [4]], dtype=uint8),
 'Network': <5196x5196 sparse matrix of type '<class 'numpy.float64'>'
 	with 343486 stored elements in Compressed Sparse Column format>}

In [4]:
# The MAT file stores both sparse matrices in CSC form and the labels as an
# (N, 1) column (see the dict displayed above); re-wrap the matrices as CSR
# and flatten the label column to a 1-D vector.
adj = sp.csr_matrix(data["Network"])
features = sp.csr_matrix(data["Attributes"])
label_column = data["Label"]
labels = np.ravel(label_column)

In [5]:
# Check shape / storage format of the converted attribute matrix.
features

<5196x8189 sparse matrix of type '<class 'numpy.float64'>'
	with 369435 stored elements in Compressed Sparse Row format>

In [6]:
# Check that the label vector is now 1-D.
labels

array([6, 2, 2, ..., 4, 4, 4], dtype=uint8)

In [7]:
# Check shape / storage format of the converted adjacency matrix.
adj

<5196x5196 sparse matrix of type '<class 'numpy.float64'>'
	with 343486 stored elements in Compressed Sparse Row format>

In [8]:
# Write the BlogCatalog arrays to disk: the two sparse matrices (adjacency,
# then features) as .npz, followed by the label vector as .npy.
for npz_path, sparse_matrix in (
    ("BlogCatalog-dataset/small_United/Adj.npz", adj),
    ("BlogCatalog-dataset/small_United/Features.npz", features),
):
    sp.save_npz(npz_path, sparse_matrix)
np.save("BlogCatalog-dataset/small_United/Labels.npy", labels)

In [12]:

# Convert each airport graph (edge list + label file) into a CSR adjacency
# matrix and a 1-D label vector, saved under "<name>/United/".
target = ["brazil","europe","usa"]
for item in target:
    # Each row of the edge list is one "u v" pair of integer node ids.
    # ndmin=2 keeps a single-edge file two-dimensional.
    edges = np.loadtxt("{}/{}-airports.edgelist".format(item,item),dtype=int,skiprows=0,ndmin=2)

    # The label file has one header row. Column 1 is the label; column 0 is
    # assumed to hold the node id -- TODO confirm against the label files.
    # The id column is kept so that adjacency rows can be put in the same
    # order as the label vector.
    label_table = np.loadtxt("{}/labels-{}-airports.txt".format(item,item),dtype=int,skiprows=1,usecols=(0,1),ndmin=2)
    node_ids = label_table[:, 0].tolist()
    labels = np.ravel(label_table[:, 1])

    G = nx.from_edgelist(map(tuple, edges.tolist()))
    # Labelled nodes that never appear in an edge still need a (zero) row.
    G.add_nodes_from(node_ids)

    # Without an explicit nodelist the rows follow the order in which nodes
    # first appear in the edge list, which need not match the label file.
    # Nodes absent from the label file are left out of the matrix.
    # nx.adjacency_matrix replaces the removed nx.adj_matrix alias.
    adj = sp.csr_matrix(nx.adjacency_matrix(G, nodelist=node_ids))

    sp.save_npz("{}/United/Adj.npz".format(item),adj)
    np.save("{}/United/labels.npy".format(item),labels)
    # Dense preview of the matrix just written (numpy abbreviates large arrays).
    print(adj.toarray())
    

[[1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 1 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
import pickle as pkl
import sys
def parse_index_file(filename):
    """Read a text file containing one integer per line.

    Args:
        filename: Path of the index file to read.

    Returns:
        list[int]: The integers in file order. Lines that are empty after
        stripping whitespace (e.g. a trailing blank line) are skipped.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not a valid integer.
    """
    index = []
    # The context manager closes the handle even if int() raises.
    with open(filename) as index_file:
        for line in index_file:
            entry = line.strip()
            if entry:
                index.append(int(entry))
    return index

In [17]:
# Convert the cora / pubmed / citeseer "ind.<name>.*" pickles into
# Adj.npz, Features.npz, Labels.npy and three index arrays under "<name>/United/".
# NOTE: pickle.load can execute arbitrary code; only run this on trusted dataset files.
target = ["cora","pubmed","citeseer"]
for item in target:
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    # On Python 3 the pickles are read with encoding='latin1'; Python 2 takes no such argument.
    load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}
    objects = []
    for name in names:
        with open("{}/ind.{}.{}".format(item,item , name), 'rb') as f:
            objects.append(pkl.load(f, **load_kwargs))
    x, y, tx, ty, allx, ally, graph = objects

    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(item,item))
    test_idx_range = np.sort(test_idx_reorder)

    if item == 'citeseer':
        # Build zero-filled blocks spanning every id from the smallest to the
        # largest test index, and drop the available test rows in at their
        # offsets; ids missing from the index file stay as all-zero rows.
        lowest_test_id = min(test_idx_reorder)
        test_idx_range_full = range(lowest_test_id, max(test_idx_reorder)+1)
        row_offsets = test_idx_range - lowest_test_id
        n_full = len(test_idx_range_full)

        tx_extended = sp.lil_matrix((n_full, x.shape[1]))
        tx_extended[row_offsets, :] = tx
        tx = tx_extended

        ty_extended = np.zeros((n_full, y.shape[1]))
        ty_extended[row_offsets, :] = ty
        ty = ty_extended

    # Stack the test rows below the 'allx' / 'ally' rows, then copy the rows
    # at the sorted test positions into the positions listed in the index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    # Splits: the first len(y) rows for training, the next 500 for validation,
    # and the sorted test indices for testing.
    n_train = len(y)
    idx_train = np.array(range(n_train))
    idx_val = np.array(range(n_train, n_train+500))
    idx_test = np.array(test_idx_range.tolist())

    sp.save_npz("{}/United/Adj.npz".format(item),adj.tocsr())
    sp.save_npz("{}/United/Features.npz".format(item),features.tocsr())
    # NOTE(review): `labels` is 2-D here (vstack output), so np.ravel writes the
    # flattened matrix (rows * columns entries). If the rows are one-hot
    # vectors -- TODO confirm -- this is not a per-node class-id vector.
    np.save("{}/United/Labels.npy".format(item),np.ravel(labels))

    np.save("{}/United/train_splits.npy".format(item),idx_train)
    np.save("{}/United/valid_splits.npy".format(item),idx_val)
    np.save("{}/United/test_splits.npy".format(item),idx_test)
    

In [18]:
# Read the Flickr MAT file; loadmat returns a dict keyed by variable name.
flickr_mat_path = "flickr/Flickr.mat"
data = scio.loadmat(flickr_mat_path)

In [19]:
# Show the dict returned by loadmat to see which variables the MAT file holds.
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue Oct 17 16:58:12 2017',
 '__version__': '1.0',
 '__globals__': [],
 'Attributes': <7575x12047 sparse matrix of type '<class 'numpy.float64'>'
 	with 182517 stored elements in Compressed Sparse Column format>,
 'Label': array([[8],
        [2],
        [7],
        ...,
        [3],
        [6],
        [4]], dtype=uint8),
 'Network': <7575x7575 sparse matrix of type '<class 'numpy.float64'>'
 	with 479476 stored elements in Compressed Sparse Column format>}

In [20]:
# Re-wrap the Flickr sparse matrices as CSR and flatten the (N, 1) label
# column to a 1-D vector.
adj = sp.csr_matrix(data["Network"])
features = sp.csr_matrix(data["Attributes"])
label_column = data["Label"]
labels = np.ravel(label_column)

# Write the arrays to disk: adjacency, then features, as .npz; labels as .npy.
for npz_path, sparse_matrix in (
    ("flickr/United/Adj.npz", adj),
    ("flickr/United/Features.npz", features),
):
    sp.save_npz(npz_path, sparse_matrix)
np.save("flickr/United/Labels.npy", labels)

In [34]:
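# NOTE: disabled NELL conversion, kept for reference only. It follows the same steps as the
# cora/pubmed/citeseer cell, reading "NELL/ind.nell.<item>.*" and writing to "NELL/United_<item>/".
# target = ["0.001","0.01","0.1"]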
# for item in target:
#     names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
#     objects = []
#     for i in range(len(names)):
#         with open("NELL/ind.nell.{}.{}".format(item , names[i]), 'rb') as f:
#             if sys.version_info > (3, 0):
#                 objects.append(pkl.load(f, encoding='latin1'))
#             else:
#                 objects.append(pkl.load(f))

#     x, y, tx, ty, allx, ally, graph = tuple(objects)
#     test_idx_reorder = parse_index_file("NELL/ind.nell.{}.test.index".format(item))
#     test_idx_range = np.sort(test_idx_reorder)
#     features = sp.vstack((allx, tx)).tolil()
#     features[test_idx_reorder, :] = features[test_idx_range, :]
#     adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

#     labels = np.vstack((ally, ty))
#     labels[test_idx_reorder, :] = labels[test_idx_range, :]
    
#     idx_test = np.array(test_idx_range.tolist())
#     idx_train = np.array(list(range(len(y))))
#     idx_val = np.array(list(range(len(y), len(y)+500)))
    
#     sp.save_npz("NELL/United_{}/Adj.npz".format(item),adj.tocsr())
#     sp.save_npz("NELL/United_{}/Features.npz".format(item),features.tocsr())
#     np.save("NELL/United_{}/Labels.npy".format(item),np.ravel(labels))
    
#     np.save("NELL/United_{}/train_splits.npy".format(item),idx_train)
#     np.save("NELL/United_{}/valid_splits.npy".format(item),idx_val)
#     np.save("NELL/United_{}/test_splits.npy".format(item),idx_test)