In [94]:
# from __future__ import division
from __future__ import print_function

import time
import argparse
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#from pygcn.utils import load_data, accuracy
#from pygcn.models import GCN
import pandas as pd
import scipy.sparse as sp

In [95]:
def encode_onehot(labels, num_classes):
    classes = set(list(range(num_classes + 1)))
    #classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    #print("Classes dict: ", classes_dict)
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    print("Labels onehot", labels_onehot)
    return labels_onehot

def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [96]:
# def encode_edges(edges_data):
#     new_series = edges_data[0].append(edges_data[1])
#     labels, levels = pd.factorize(new_series)

#     edgeId = {}

#     for i in range(len(levels)):
#         key = levels[i]
#         edgeId[key] = i

#     encoded_edges = []
#     for row in edges_data.itertuples(index=True, name='Pandas'):
#         encoded_edges.append([edgeId[row[1]], edgeId[row[2]]])
#     return np.asarray(encoded_edges)

In [97]:
# def remove_empty_edges(mapped_edges):
#     new_list = []
#     for edge in mapped_edges:
#         print(edge)
#         print(mapped_edges[edge])
#         if(edge[0] != None and edge[1] != None):
#             new_list.append(edge)
#     return edge

In [98]:
#https://github.com/tkipf/pygcn/issues/39
def load_data(content, edges):
    print('Loading {} dataset...'.format(content))

    #idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
    idx_features_labels = pd.read_hdf(content, 'df')
    
    #reindex to make sure columns are in correct order
    cols = list(idx_features_labels)
    
    # move the article_name column to head of list using index, pop and insert
    cols.insert(0, cols.pop(cols.index('article_name')))
    idx_features_labels = idx_features_labels.ix[:, cols]
    idx_features_labels = idx_features_labels.values
    
    features = sp.csr_matrix(idx_features_labels[:, 1:], dtype=np.float32)
    
    proto_labels = np.zeros(len(idx_features_labels))
    #Training indices
    idx_train = range(4, 60)
    for idx in idx_train:
        proto_labels[idx] = random.randint(30,95) #set default "understanding" level
        print("Training label: ", proto_labels[idx])
    #TODO INSERT OUR OWN LABELS
    labels = encode_onehot(proto_labels, 100)
    #labels = np.zeros(len(idx_features_labels))


    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.str)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}".format(edges), dtype=np.str)

    #print(list(map(idx_map.get, edges_unordered.flatten())))
    
    edges_unfiltered = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.str).reshape(edges_unordered.shape)
    
    print("=====")
    #print(type(edges_unfiltered))
    
    #filter out edges that contain invalid categories 'None'
    edges = []
    for edge in edges_unfiltered:
        if(not(edge[0] == 'None') and not(edge[1] == 'None')):
            edges.append(edge)
            
    edges = np.array(edges)
    #print(edges)
    
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.int32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

            
    idx_val = range(200, 500)
    idx_test = range(500, 1500)
    #print("Np where: ", [np.where(r==1)[0][0] for r in labels])
    print("Np where: ", (np.where(labels)[1]).size)
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    #labels = torch.FloatTensor(labels)
    print(labels.shape)
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train

In [99]:
adj, features, labels, idx_train = load_data("data/Roman_republic_embeddings.h5", "data/Roman_republic_relations.txt")

Loading data/Roman_republic_embeddings.h5 dataset...


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


Training label:  68.0
Training label:  51.0
Training label:  69.0
Training label:  73.0
Training label:  31.0
Training label:  51.0
Training label:  78.0
Training label:  31.0
Training label:  93.0
Training label:  77.0
Training label:  49.0
Training label:  94.0
Training label:  31.0
Training label:  50.0
Training label:  56.0
Training label:  43.0
Training label:  32.0
Training label:  83.0
Training label:  54.0
Training label:  36.0
Training label:  33.0
Training label:  89.0
Training label:  83.0
Training label:  57.0
Training label:  59.0
Training label:  62.0
Training label:  42.0
Training label:  72.0
Training label:  76.0
Training label:  49.0
Training label:  35.0
Training label:  38.0
Training label:  54.0
Training label:  30.0
Training label:  65.0
Training label:  46.0
Training label:  76.0
Training label:  40.0
Training label:  37.0
Training label:  88.0
Training label:  94.0
Training label:  44.0
Training label:  65.0
Training label:  41.0
Training label:  42.0
Training l

  del sys.path[0]


In [88]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'



class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass) 
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1) 

In [89]:

model = GCN(nfeat=features.shape[1],
            nhid=16,
            nclass=100, 
            dropout=0.5)
optimizer = optim.Adam(model.parameters(),
                       lr=0.01, weight_decay=5e-4)

In [90]:
#labels = labels.reshape((1121,1))
def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    print((output[idx_train]).shape)
    print(labels.shape)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()


    #loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    #acc_val = accuracy(output[idx_val], labels[idx_val])
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'time: {:.4f}s'.format(time.time() - t))


# Train model
t_total = time.time()
for epoch in range(100):
    train(epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

#Final Output
model.eval()
final_labels = model(features, adj)
print("Final Labels")
print(final_labels)

torch.Size([56, 100])
torch.Size([1121])
Epoch: 0001 loss_train: 4.6022 acc_train: 0.0000 time: 0.0186s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0002 loss_train: 4.5810 acc_train: 0.0179 time: 0.0171s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0003 loss_train: 4.5630 acc_train: 0.0357 time: 0.0227s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0004 loss_train: 4.5453 acc_train: 0.0536 time: 0.0199s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0005 loss_train: 4.5237 acc_train: 0.0357 time: 0.0136s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0006 loss_train: 4.5027 acc_train: 0.0179 time: 0.0137s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0007 loss_train: 4.4807 acc_train: 0.0179 time: 0.0132s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0008 loss_train: 4.4605 acc_train: 0.0536 time: 0.0133s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0009 loss_train: 4.4227 acc_train: 0.0179 time: 0.0131s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0010 loss_train:

Epoch: 0086 loss_train: 3.1965 acc_train: 0.1250 time: 0.0137s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0087 loss_train: 3.1634 acc_train: 0.1786 time: 0.0158s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0088 loss_train: 3.2103 acc_train: 0.1250 time: 0.0141s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0089 loss_train: 3.1962 acc_train: 0.1250 time: 0.0149s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0090 loss_train: 3.2282 acc_train: 0.1071 time: 0.0169s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0091 loss_train: 3.0846 acc_train: 0.2143 time: 0.0140s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0092 loss_train: 3.1247 acc_train: 0.1429 time: 0.0135s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0093 loss_train: 3.0315 acc_train: 0.1786 time: 0.0135s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0094 loss_train: 3.1686 acc_train: 0.1964 time: 0.0133s
torch.Size([56, 100])
torch.Size([1121])
Epoch: 0095 loss_train: 3.1398 acc_train: 0.1250 time: 0.0132s
t

In [91]:
for label in final_labels:
    print(label)

tensor([-7.6755, -7.7215, -7.6747, -7.6871, -7.6819, -7.6853, -7.7407, -7.7196,
        -7.7268, -7.6845, -7.6918, -7.7100, -7.6856, -7.7156, -7.7214, -7.6834,
        -7.7483, -7.6791, -7.7097, -7.7247, -7.7124, -7.6866, -7.6613, -7.6888,
        -7.6880, -7.7055, -7.6929, -7.6856, -7.6578, -7.7182, -7.6943, -2.9510,
        -7.7183, -7.7081, -7.6495, -7.6986, -7.7232, -3.3735, -7.7015, -3.4808,
        -3.9088, -7.6620, -3.8289, -7.7196, -3.3867, -7.7170, -3.2430, -7.7137,
        -3.8533, -4.2360, -7.6764, -3.8463, -7.7175, -3.4869, -4.0652, -7.6734,
        -3.9610, -7.6961, -7.6977, -4.3482, -3.9655, -3.4618, -3.9895, -2.9959,
        -7.7286, -4.2991, -3.4754, -4.0769, -7.7008, -3.9274, -3.4547, -3.3121,
        -3.5339, -7.6812, -4.0497, -3.8592, -3.8821, -7.7226, -7.7164, -3.9157,
        -4.0024, -3.4496, -7.6701, -3.9347, -3.8388, -7.7001, -7.7113, -2.9797,
        -7.6772, -7.7182, -3.5581, -3.3542, -7.6823, -7.6997, -7.7252, -3.8612,
        -7.6760, -7.7054, -7.6797, -7.71

tensor([-7.6734, -7.7188, -7.6722, -7.6827, -7.6801, -7.6822, -7.7360, -7.7154,
        -7.7234, -7.6824, -7.6897, -7.7066, -7.6835, -7.7127, -7.7182, -7.6814,
        -7.7443, -7.6762, -7.7071, -7.7207, -7.7086, -7.6850, -7.6593, -7.6856,
        -7.6856, -7.7027, -7.6893, -7.6827, -7.6550, -7.7152, -7.6921, -2.9320,
        -7.7136, -7.7036, -7.6487, -7.6952, -7.7181, -3.3297, -7.6978, -3.4868,
        -3.9209, -7.6605, -3.8300, -7.7170, -3.3475, -7.7137, -3.2487, -7.7099,
        -3.8235, -4.2576, -7.6756, -3.8461, -7.7130, -3.5377, -4.0771, -7.6702,
        -3.9772, -7.6920, -7.6934, -4.3696, -3.9699, -3.4925, -3.9403, -3.0158,
        -7.7275, -4.3022, -3.4916, -4.0406, -7.6970, -3.9444, -3.4281, -3.3179,
        -3.5288, -7.6796, -3.9877, -3.8329, -3.8823, -7.7179, -7.7132, -3.9507,
        -4.0175, -3.4681, -7.6687, -3.9381, -3.8136, -7.6969, -7.7080, -2.9783,
        -7.6741, -7.7129, -3.5921, -3.3632, -7.6796, -7.6964, -7.7218, -3.8721,
        -7.6741, -7.7019, -7.6775, -7.70

tensor([-7.5917, -7.6387, -7.5914, -7.6046, -7.5966, -7.6014, -7.6588, -7.6373,
        -7.6459, -7.6020, -7.6106, -7.6266, -7.6031, -7.6337, -7.6398, -7.5983,
        -7.6674, -7.5949, -7.6270, -7.6398, -7.6295, -7.6018, -7.5761, -7.6045,
        -7.6063, -7.6249, -7.6107, -7.6023, -7.5748, -7.6358, -7.6099, -2.9841,
        -7.6371, -7.6245, -7.5635, -7.6159, -7.6401, -3.4147, -7.6173, -3.5397,
        -3.9126, -7.5777, -3.8342, -7.6357, -3.4222, -7.6331, -3.2587, -7.6297,
        -3.8529, -4.2197, -7.5930, -3.8391, -7.6370, -3.4554, -4.0465, -7.5893,
        -3.9477, -7.6113, -7.6135, -4.3648, -3.9496, -3.4357, -4.0081, -2.9897,
        -7.6477, -4.2853, -3.4376, -4.1020, -7.6175, -3.9130, -3.5112, -3.3233,
        -3.5783, -7.5956, -4.0689, -3.8603, -3.8733, -7.6415, -7.6337, -3.9036,
        -3.9703, -3.4284, -7.5866, -3.9243, -3.8232, -7.6157, -7.6292, -2.9755,
        -7.5929, -7.6360, -3.5226, -3.3334, -7.5972, -7.6166, -7.6436, -3.8713,
        -7.5927, -7.6227, -7.5962, -7.62

tensor([-7.6000, -7.6476, -7.6007, -7.6137, -7.6051, -7.6103, -7.6675, -7.6448,
        -7.6548, -7.6118, -7.6196, -7.6352, -7.6119, -7.6420, -7.6466, -7.6062,
        -7.6756, -7.6040, -7.6363, -7.6482, -7.6376, -7.6103, -7.5858, -7.6140,
        -7.6151, -7.6330, -7.6196, -7.6099, -7.5842, -7.6438, -7.6187, -2.9767,
        -7.6463, -7.6339, -7.5727, -7.6253, -7.6498, -3.4028, -7.6248, -3.5576,
        -3.9111, -7.5853, -3.8327, -7.6435, -3.4054, -7.6405, -3.2641, -7.6392,
        -3.8653, -4.1816, -7.6005, -3.8369, -7.6462, -3.4405, -4.0664, -7.5986,
        -3.9507, -7.6210, -7.6231, -4.3862, -3.9767, -3.4329, -3.9925, -2.9941,
        -7.6546, -4.3140, -3.4340, -4.1343, -7.6269, -3.9165, -3.4942, -3.3390,
        -3.5767, -7.6039, -4.0532, -3.8671, -3.8707, -7.6504, -7.6426, -3.9002,
        -3.9572, -3.4186, -7.5947, -3.9266, -3.8451, -7.6251, -7.6380, -2.9739,
        -7.6014, -7.6466, -3.5113, -3.3388, -7.6058, -7.6250, -7.6525, -3.8638,
        -7.6031, -7.6312, -7.6059, -7.63

tensor([-7.5852, -7.6315, -7.5847, -7.5984, -7.5890, -7.5947, -7.6522, -7.6309,
        -7.6397, -7.5953, -7.6040, -7.6198, -7.5966, -7.6269, -7.6333, -7.5908,
        -7.6610, -7.5882, -7.6199, -7.6327, -7.6227, -7.5945, -7.5689, -7.5976,
        -7.5995, -7.6184, -7.6042, -7.5956, -7.5683, -7.6293, -7.6027, -2.9952,
        -7.6305, -7.6180, -7.5560, -7.6092, -7.6333, -3.4328, -7.6107, -3.5638,
        -3.9190, -7.5706, -3.8380, -7.6287, -3.4358, -7.6262, -3.2626, -7.6227,
        -3.8568, -4.2183, -7.5862, -3.8373, -7.6307, -3.4408, -4.0425, -7.5828,
        -3.9447, -7.6045, -7.6067, -4.3802, -3.9456, -3.4218, -4.0209, -2.9819,
        -7.6410, -4.2882, -3.4122, -4.1211, -7.6110, -3.9109, -3.5374, -3.3291,
        -3.6018, -7.5882, -4.0816, -3.8622, -3.8724, -7.6351, -7.6270, -3.9047,
        -3.9475, -3.4091, -7.5800, -3.9250, -3.8204, -7.6085, -7.6226, -2.9624,
        -7.5860, -7.6294, -3.5011, -3.3199, -7.5899, -7.6096, -7.6369, -3.8801,
        -7.5855, -7.6162, -7.5893, -7.61

In [92]:
preds = final_labels.max(1)[1].type_as(labels)

In [93]:
for l in preds:
    print(l)

tensor(31)
tensor(31)
tensor(87)
tensor(87)
tensor(63)
tensor(87)
tensor(70)
tensor(31)
tensor(63)
tensor(31)
tensor(63)
tensor(63)
tensor(63)
tensor(31)
tensor(63)
tensor(63)
tensor(63)
tensor(63)
tensor(60)
tensor(70)
tensor(31)
tensor(63)
tensor(63)
tensor(39)
tensor(31)
tensor(63)
tensor(54)
tensor(31)
tensor(49)
tensor(37)
tensor(37)
tensor(37)
tensor(37)
tensor(37)
tensor(37)
tensor(63)
tensor(63)
tensor(63)
tensor(63)
tensor(63)
tensor(63)
tensor(65)
tensor(70)
tensor(63)
tensor(63)
tensor(63)
tensor(59)
tensor(31)
tensor(39)
tensor(31)
tensor(31)
tensor(87)
tensor(31)
tensor(87)
tensor(87)
tensor(87)
tensor(63)
tensor(63)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(63)
tensor(63)
tensor(63)
tensor(53)
tensor(53)
tensor(53)
tensor(63)
tensor(63)
tensor(63)
tensor(63)
tensor(39)
tensor(53)
tensor(53)
tensor(53)
tensor(87)
tensor(54)
tensor(54)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)
tensor(53)