In [70]:
# from __future__ import division
from __future__ import print_function

import time
import argparse
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from numpy.random import seed
from numpy.random import randint

#from pygcn.utils import load_data, accuracy
#from pygcn.models import GCN
import pandas as pd
import scipy.sparse as sp
random.seed(3001)
seed(3001)

In [71]:
def encode_onehot(labels, num_classes):
    classes = set(list(range(num_classes + 1)))
    #classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    #print("Classes dict: ", classes_dict)
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    print("Labels onehot", labels_onehot)
    return labels_onehot

def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [54]:
# def encode_edges(edges_data):
#     new_series = edges_data[0].append(edges_data[1])
#     labels, levels = pd.factorize(new_series)

#     edgeId = {}

#     for i in range(len(levels)):
#         key = levels[i]
#         edgeId[key] = i

#     encoded_edges = []
#     for row in edges_data.itertuples(index=True, name='Pandas'):
#         encoded_edges.append([edgeId[row[1]], edgeId[row[2]]])
#     return np.asarray(encoded_edges)

In [55]:
# def remove_empty_edges(mapped_edges):
#     new_list = []
#     for edge in mapped_edges:
#         print(edge)
#         print(mapped_edges[edge])
#         if(edge[0] != None and edge[1] != None):
#             new_list.append(edge)
#     return edge

In [76]:
#https://github.com/tkipf/pygcn/issues/39
def load_data(content, edges):
    print('Loading {} dataset...'.format(content))

    #idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
    idx_features_labels = pd.read_hdf(content, 'df')
    
    #reindex to make sure columns are in correct order
    cols = list(idx_features_labels)
    
    # move the article_name column to head of list using index, pop and insert
    cols.insert(0, cols.pop(cols.index('article_name')))
    idx_features_labels = idx_features_labels.ix[:, cols]
    idx_features_labels = idx_features_labels.values
    
    features = sp.csr_matrix(idx_features_labels[:, 1:], dtype=np.float32)
    
    proto_labels = np.zeros(len(idx_features_labels))
    #Training indices
    idx_train = randint(0, 1121, 55)
    print(idx_train)
    count = 0 
    train_labels = randint(0, 10, 55)
    for idx in idx_train:
        proto_labels[idx] = train_labels[count] #set default "understanding" level
        count += 1
        print("Training label: ", proto_labels[idx])
    #TODO INSERT OUR OWN LABELS
    labels = encode_onehot(proto_labels, 10)
    #labels = np.zeros(len(idx_features_labels))


    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.str)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}".format(edges), dtype=np.str)

    #print(list(map(idx_map.get, edges_unordered.flatten())))
    
    edges_unfiltered = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.str).reshape(edges_unordered.shape)
    
    print("=====")
    #print(type(edges_unfiltered))
    
    #filter out edges that contain invalid categories 'None'
    edges = []
    for edge in edges_unfiltered:
        if(not(edge[0] == 'None') and not(edge[1] == 'None')):
            edges.append(edge)
            
    edges = np.array(edges)
    #print(edges)
    
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.int32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

            
    idx_val = range(200, 500)
    idx_test = range(500, 1500)
    #print("Np where: ", [np.where(r==1)[0][0] for r in labels])
    print("Np where: ", (np.where(labels)[1]).size)
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    #labels = torch.FloatTensor(labels)
    print(labels.shape)
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train

In [77]:
adj, features, labels, idx_train = load_data("data/Roman_republic_embeddings.h5", "data/Roman_republic_relations.txt")

Loading data/Roman_republic_embeddings.h5 dataset...


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


[ 597  412  920 1022  486 1091  145  907  716  148  825   95  721  600
  599   53   89  729  844  886  798  750  608  799  504  198  441  807
  593   65  665   48  786  921  614  859 1120  181  698  727  212   78
   35   16  121   43  654  540 1110  284  134  698  643 1099  518]
Training label:  2.0
Training label:  3.0
Training label:  8.0
Training label:  7.0
Training label:  3.0
Training label:  7.0
Training label:  2.0
Training label:  4.0
Training label:  1.0
Training label:  3.0
Training label:  3.0
Training label:  8.0
Training label:  3.0
Training label:  1.0
Training label:  3.0
Training label:  8.0
Training label:  0.0
Training label:  8.0
Training label:  7.0
Training label:  4.0
Training label:  2.0
Training label:  1.0
Training label:  8.0
Training label:  3.0
Training label:  3.0
Training label:  9.0
Training label:  6.0
Training label:  9.0
Training label:  7.0
Training label:  4.0
Training label:  6.0
Training label:  2.0
Training label:  7.0
Training label:  7.0
Traini

  del sys.path[0]


In [78]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'



class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass) 
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1) 

In [79]:

model = GCN(nfeat=features.shape[1],
            nhid=16,
            nclass=10, 
            dropout=0.5)
optimizer = optim.Adam(model.parameters(),
                       lr=0.01, weight_decay=5e-4)

In [80]:
#labels = labels.reshape((1121,1))
def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    #print((output[idx_train]).shape)
    #print(labels.shape)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()


    #loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    #acc_val = accuracy(output[idx_val], labels[idx_val])
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'time: {:.4f}s'.format(time.time() - t))


# Train model
t_total = time.time()
for epoch in range(500):
    train(epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

#Final Output
model.eval()
final_labels = model(features, adj)
print("Final Labels")
print(final_labels)

Epoch: 0001 loss_train: 2.3256 acc_train: 0.0364 time: 0.0624s
Epoch: 0002 loss_train: 2.3035 acc_train: 0.1273 time: 0.0128s
Epoch: 0003 loss_train: 2.2868 acc_train: 0.1636 time: 0.0126s
Epoch: 0004 loss_train: 2.2657 acc_train: 0.2000 time: 0.0125s
Epoch: 0005 loss_train: 2.2500 acc_train: 0.2000 time: 0.0125s
Epoch: 0006 loss_train: 2.2517 acc_train: 0.2000 time: 0.0119s
Epoch: 0007 loss_train: 2.2382 acc_train: 0.2182 time: 0.0124s
Epoch: 0008 loss_train: 2.2320 acc_train: 0.2182 time: 0.0124s
Epoch: 0009 loss_train: 2.2161 acc_train: 0.2182 time: 0.0128s
Epoch: 0010 loss_train: 2.1967 acc_train: 0.2182 time: 0.0122s
Epoch: 0011 loss_train: 2.1854 acc_train: 0.2000 time: 0.0122s
Epoch: 0012 loss_train: 2.1730 acc_train: 0.2364 time: 0.0123s
Epoch: 0013 loss_train: 2.1700 acc_train: 0.2182 time: 0.0126s
Epoch: 0014 loss_train: 2.1823 acc_train: 0.2000 time: 0.0126s
Epoch: 0015 loss_train: 2.1743 acc_train: 0.2000 time: 0.0124s
Epoch: 0016 loss_train: 2.1480 acc_train: 0.2000 time: 

Epoch: 0139 loss_train: 0.8911 acc_train: 0.8000 time: 0.0352s
Epoch: 0140 loss_train: 0.8958 acc_train: 0.7636 time: 0.0148s
Epoch: 0141 loss_train: 0.7699 acc_train: 0.8364 time: 0.0185s
Epoch: 0142 loss_train: 0.9083 acc_train: 0.7818 time: 0.0169s
Epoch: 0143 loss_train: 0.8915 acc_train: 0.7636 time: 0.0230s
Epoch: 0144 loss_train: 0.8855 acc_train: 0.8364 time: 0.0215s
Epoch: 0145 loss_train: 0.8805 acc_train: 0.7455 time: 0.0199s
Epoch: 0146 loss_train: 0.8758 acc_train: 0.8182 time: 0.0217s
Epoch: 0147 loss_train: 0.8383 acc_train: 0.7818 time: 0.0171s
Epoch: 0148 loss_train: 0.8200 acc_train: 0.8000 time: 0.0181s
Epoch: 0149 loss_train: 0.8472 acc_train: 0.8000 time: 0.0193s
Epoch: 0150 loss_train: 0.7794 acc_train: 0.8000 time: 0.0221s
Epoch: 0151 loss_train: 0.8071 acc_train: 0.8182 time: 0.0139s
Epoch: 0152 loss_train: 0.7441 acc_train: 0.8182 time: 0.0183s
Epoch: 0153 loss_train: 0.9504 acc_train: 0.7273 time: 0.0144s
Epoch: 0154 loss_train: 0.8326 acc_train: 0.7818 time: 

Epoch: 0273 loss_train: 0.5357 acc_train: 0.8909 time: 0.0154s
Epoch: 0274 loss_train: 0.6676 acc_train: 0.8000 time: 0.0122s
Epoch: 0275 loss_train: 0.6086 acc_train: 0.8182 time: 0.0131s
Epoch: 0276 loss_train: 0.5449 acc_train: 0.8727 time: 0.0185s
Epoch: 0277 loss_train: 0.5649 acc_train: 0.8545 time: 0.0139s
Epoch: 0278 loss_train: 0.5756 acc_train: 0.8182 time: 0.0127s
Epoch: 0279 loss_train: 0.5273 acc_train: 0.9273 time: 0.0122s
Epoch: 0280 loss_train: 0.5808 acc_train: 0.8727 time: 0.0124s
Epoch: 0281 loss_train: 0.6532 acc_train: 0.8545 time: 0.0129s
Epoch: 0282 loss_train: 0.5223 acc_train: 0.8727 time: 0.0127s
Epoch: 0283 loss_train: 0.5825 acc_train: 0.8545 time: 0.0121s
Epoch: 0284 loss_train: 0.6764 acc_train: 0.8182 time: 0.0120s
Epoch: 0285 loss_train: 0.5759 acc_train: 0.8909 time: 0.0125s
Epoch: 0286 loss_train: 0.5172 acc_train: 0.9273 time: 0.0125s
Epoch: 0287 loss_train: 0.5780 acc_train: 0.8727 time: 0.0120s
Epoch: 0288 loss_train: 0.5611 acc_train: 0.8909 time: 

Epoch: 0404 loss_train: 0.6239 acc_train: 0.8000 time: 0.0155s
Epoch: 0405 loss_train: 0.4434 acc_train: 0.9091 time: 0.0147s
Epoch: 0406 loss_train: 0.4921 acc_train: 0.8727 time: 0.0266s
Epoch: 0407 loss_train: 0.5827 acc_train: 0.8727 time: 0.0149s
Epoch: 0408 loss_train: 0.4942 acc_train: 0.8364 time: 0.0178s
Epoch: 0409 loss_train: 0.4252 acc_train: 0.8545 time: 0.0160s
Epoch: 0410 loss_train: 0.4647 acc_train: 0.8727 time: 0.0216s
Epoch: 0411 loss_train: 0.4817 acc_train: 0.8909 time: 0.0262s
Epoch: 0412 loss_train: 0.4911 acc_train: 0.8909 time: 0.0198s
Epoch: 0413 loss_train: 0.4928 acc_train: 0.8909 time: 0.0230s
Epoch: 0414 loss_train: 0.5082 acc_train: 0.9091 time: 0.0236s
Epoch: 0415 loss_train: 0.4857 acc_train: 0.8545 time: 0.0184s
Epoch: 0416 loss_train: 0.4791 acc_train: 0.8909 time: 0.0179s
Epoch: 0417 loss_train: 0.5053 acc_train: 0.8364 time: 0.0152s
Epoch: 0418 loss_train: 0.4218 acc_train: 0.9091 time: 0.0129s
Epoch: 0419 loss_train: 0.5323 acc_train: 0.8909 time: 

In [81]:
for label in final_labels:
    print(label)

tensor([-2.9637, -2.9271, -2.7678, -2.0931, -1.6526, -1.5308, -3.2482, -1.8576,
        -2.9252, -2.9515], grad_fn=<SelectBackward>)
tensor([-3.0707, -3.0670, -2.9735, -1.7880, -1.5629, -1.4117, -3.5926, -2.0748,
        -3.1183, -3.2698], grad_fn=<SelectBackward>)
tensor([-3.5270, -3.5131, -3.5723, -2.8011, -1.6829, -2.4546, -3.7663, -0.7621,
        -3.1081, -3.0849], grad_fn=<SelectBackward>)
tensor([-2.9599, -3.1188, -2.8948, -1.8213, -1.9930, -1.1385, -3.3913, -2.3170,
        -3.1470, -2.8996], grad_fn=<SelectBackward>)
tensor([-1.1848, -5.6012, -3.7450, -4.6125, -4.2048, -4.0217, -4.6984, -4.3062,
        -2.5329, -0.6500], grad_fn=<SelectBackward>)
tensor([-2.9599, -3.1188, -2.8948, -1.8213, -1.9930, -1.1385, -3.3913, -2.3170,
        -3.1470, -2.8996], grad_fn=<SelectBackward>)
tensor([-1.9557, -5.4160, -2.4326, -4.6497, -5.8946, -5.4977, -5.2238, -4.2564,
        -0.4022, -2.7890], grad_fn=<SelectBackward>)
tensor([-2.3837, -3.0735, -2.5964, -1.8448, -2.6369, -2.3028, -3.8573

tensor([-2.4443, -3.0932, -3.0939, -2.7573, -1.4096, -1.9313, -2.8298, -2.5169,
        -3.6624, -1.5874], grad_fn=<SelectBackward>)
tensor([-3.2449, -3.4199, -2.3580, -1.6159, -1.6025, -1.7165, -3.3508, -1.8664,
        -3.3213, -3.5729], grad_fn=<SelectBackward>)
tensor([-3.6290, -3.4241, -1.7110, -0.8595, -4.0531, -3.0662, -3.8545, -2.7348,
        -1.7901, -3.9200], grad_fn=<SelectBackward>)
tensor([-5.1579, -4.5706, -0.2200, -3.8916, -6.0414, -4.2035, -2.2535, -4.3234,
        -4.2525, -4.4966], grad_fn=<SelectBackward>)
tensor([-2.6842, -2.9184, -2.0849, -2.7943, -2.1830, -1.9767, -1.8848, -2.4866,
        -3.3936, -1.7582], grad_fn=<SelectBackward>)
tensor([-3.6861, -2.4879, -1.7678, -1.4639, -3.6639, -2.5064, -2.4511, -2.1308,
        -1.9705, -3.2632], grad_fn=<SelectBackward>)
tensor([-4.5031, -3.4066, -3.7748, -1.3437, -4.4021, -3.2955, -4.6875, -1.2387,
        -1.1757, -4.1941], grad_fn=<SelectBackward>)
tensor([-4.0046, -2.4931, -3.2934, -0.7599, -3.9976, -3.0848, -4.4770

In [82]:
preds = final_labels.max(1)[1].type_as(labels)

In [83]:
for l in preds:
    print(l)

tensor(5)
tensor(5)
tensor(7)
tensor(5)
tensor(9)
tensor(5)
tensor(8)
tensor(8)
tensor(9)
tensor(8)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(3)
tensor(0)
tensor(0)
tensor(9)
tensor(0)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(0)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(0)
tensor(7)
tensor(7)
tensor(7)
tensor(7)
tensor(7)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(0)
tensor(0)
tensor(9)
tensor(9)
tensor(9)
tensor(2)
tensor(9)
tensor(2)
tensor(9)
tensor(0)
tensor(8)
tensor(3)
tensor(8)
tensor(8)
tensor(8)
tensor(9)
tensor(9)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(7)
tensor(7)
tensor(7)
tensor(7)
tensor(9)
tensor(4)
tensor(4)
tensor(4)
tensor(3)
tensor(9)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(0)
tensor(0)
tensor(4)
tensor(4)
tensor(4)
tensor(4)
tensor(8)
tensor(8)
tensor(8)
tensor(8)
tensor(7)
tensor(7)


In [51]:
final_labels.max(1)[1]

tensor([86, 86, 86,  ..., 86, 86, 86])