In [1]:
# from __future__ import division
from __future__ import print_function

import time
import argparse
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim

#from pygcn.utils import load_data, accuracy
from pygcn.models import GCN
import pandas as pd
import scipy.sparse as sp

In [6]:
def encode_onehot(labels):
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                    enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)),
                             dtype=np.int32)
    return labels_onehot

def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [91]:
# def encode_edges(edges_data):
#     new_series = edges_data[0].append(edges_data[1])
#     labels, levels = pd.factorize(new_series)

#     edgeId = {}

#     for i in range(len(levels)):
#         key = levels[i]
#         edgeId[key] = i

#     encoded_edges = []
#     for row in edges_data.itertuples(index=True, name='Pandas'):
#         encoded_edges.append([edgeId[row[1]], edgeId[row[2]]])
#     return np.asarray(encoded_edges)

In [153]:
# def remove_empty_edges(mapped_edges):
#     new_list = []
#     for edge in mapped_edges:
#         print(edge)
#         print(mapped_edges[edge])
#         if(edge[0] != None and edge[1] != None):
#             new_list.append(edge)
#     return edge

In [30]:
#https://github.com/tkipf/pygcn/issues/39
def load_data(content="embeddings.h5", edges="relations.txt"):
    print('Loading {} dataset...'.format(content))

    #idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
    idx_features_labels = pd.read_hdf(content, 'df')
    
    #reindex to make sure columns are in correct order
    cols = list(idx_features_labels)
    
    # move the article_name column to head of list using index, pop and insert
    cols.insert(0, cols.pop(cols.index('article_name')))
    idx_features_labels = idx_features_labels.ix[:, cols]
    idx_features_labels = idx_features_labels.values
    
    #print(idx_features_labels)
    
    features = sp.csr_matrix(idx_features_labels[:, 1:], dtype=np.float32)
    
    #TODO INSERT OUR OWN LABELS
    labels = encode_onehot(idx_features_labels[:, -1]) #invalid

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.str)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}.txt".format(edges), dtype=np.str)
    
    print(edges_unordered[:33])
    print(edges_unordered.shape)
    #print(list(map(idx_map.get, edges_unordered.flatten())))
    
    edges_unfiltered = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.str).reshape(edges_unordered.shape)
    
    print("=====")
    #print(type(edges_unfiltered))
    
    #filter out edges that contain invalid categories 'None'
    edges = []
    for edge in edges_unfiltered:
        if(not(edge[0] == 'None') and not(edge[1] == 'None')):
            edges.append(edge)
            
    edges = np.array(edges)
    print(edges)
    
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.int32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test

In [31]:
adj, features, labels, idx_train, idx_val, idx_test = load_data("embeddings.h5", "relations")

Loading embeddings.h5 dataset...
[['Artificial_neural_networks' 'Machine_learning']
 ['Applied_machine_learning' 'Machine_learning']
 ['Unsupervised_learning' 'Machine_learning']
 ['Structured_prediction' 'Machine_learning']
 ['Loss_functions' 'Machine_learning']
 ['Support_vector_machines' 'Machine_learning']
 ['Latent_variable_models' 'Machine_learning']
 ['Cluster_analysis' 'Machine_learning']
 ['Signal_processing_conferences' 'Machine_learning']
 ['Artificial_intelligence_conferences' 'Machine_learning']
 ['Data_mining_and_machine_learning_software' 'Machine_learning']
 ['Machine_learning_algorithms' 'Machine_learning']
 ['Bayesian_networks' 'Machine_learning']
 ['Machine_learning_portal' 'Machine_learning']
 ['Statistical_natural_language_processing' 'Machine_learning']
 ['Markov_models' 'Machine_learning']
 ['Evolutionary_algorithms' 'Machine_learning']
 ['Genetic_programming' 'Machine_learning']
 ['Datasets_in_machine_learning' 'Machine_learning']
 ['Dimension_reduction' 'Machin

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


In [32]:
adj

tensor(indices=tensor([[ 0,  4,  6,  7, 12, 14, 17, 20, 21, 22, 24, 28, 29, 31,
                        34, 41, 42, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                        57, 61, 62,  1,  2,  3,  0,  4,  5,  4,  5,  0,  6,  0,
                         7,  8,  7,  8, 10, 28, 33,  9,  8, 10, 11,  0, 12, 13,
                         0, 14, 15, 45, 14, 15, 16,  0, 17, 18, 19, 17, 18, 17,
                        19,  0, 20,  0, 21,  0, 22, 23, 22, 23,  0, 24, 36, 25,
                        26, 27,  0,  8, 28,  0, 29, 30, 29, 30,  0, 31, 32, 33,
                        31, 32,  8, 31, 33,  0, 34, 36, 39, 40, 41, 35, 24, 34,
                        36, 37, 39, 36, 37, 38, 34, 36, 39, 34, 40,  0, 34, 41,
                         0, 42, 43, 42, 43,  0, 44, 45, 14, 44, 45,  0, 46,  0,
                        47,  0, 48,  0, 49,  0, 50, 51,  0, 50, 51,  0, 52,  0,
                        53,  0, 54,  0, 55, 57, 60, 62, 56,  0, 55, 57, 58, 59,
                        57, 58, 57, 59, 