In [1]:
from __future__ import division
from __future__ import print_function

import time, sys
import tensorflow as tf

In [2]:
from gcn.utils import *
from gcn.models import GCN, MLP

# Set random seed

In [3]:
# Seed NumPy's and TensorFlow's random number generators with the same fixed value.
# NOTE(review): `np` is never imported explicitly in the visible cells; presumably it
# arrives through `from gcn.utils import *` — confirm, or add `import numpy as np`.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings

In [4]:
# Run settings and hyperparameters, registered as TensorFlow flags and read back
# later in the notebook as FLAGS.<name>.
# NOTE(review): registering a flag that already exists usually raises an error, so
# re-running this cell in a live kernel probably fails — confirm for the TF version used.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

In [5]:
import scipy.sparse as sp

In [6]:
# Scratch example: build a 3x2 sparse matrix in LIL format from a dense array.
lil_mx = sp.lil_matrix(np.array([[1,2],[3,4],[5,6]]))

In [7]:
# Indexing with a single integer selects a row; printing lists its stored (row, col) value entries.
print(lil_mx[0])

  (0, 0)	1
  (0, 1)	2


In [8]:
# Scratch example: vertically stack a 2x3 array and a length-3 vector, then convert the result to LIL format.
vstacked_lil = sp.vstack((np.array([[1,2,3],[4,5,6]]),np.array([1,2,3]))).tolil()

In [9]:
# Show the stored entries of the first row of the stacked matrix.
print(vstacked_lil[0])

  (0, 0)	1
  (0, 1)	2
  (0, 2)	3


In [10]:
# Scratch example: element-wise reciprocal of the row sums of a small dense array.
arr = np.array([[1,2,3],[4,5,6]])
rowsum = arr.sum(axis=1)  # row sums: 6 and 15
# The float exponent -1.0 yields floating-point reciprocals (1/6 and 1/15).
np.power(rowsum,-1.0).flatten()

array([ 0.16666667,  0.06666667])

# Load data

In [11]:
# Load the dataset selected by FLAGS.dataset and unpack the adjacency matrix, the feature
# matrix, the label matrices and the masks for the train / validation / test splits.
# NOTE(review): load_data is not imported by name; presumably it comes from `from gcn.utils import *`.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)

# Some preprocessing (run only once)

In [12]:
# Preprocess the inputs and pick the model class according to FLAGS.model.
# WARNING: `features` is overwritten with its preprocessed form, so this cell is not
# idempotent — running it a second time would pass already-preprocessed features back
# into preprocess_features (hence "run only once"). Re-run the data-loading cell first.
features = preprocess_features(features)
if FLAGS.model == 'gcn':
    # A single support matrix: the preprocessed adjacency matrix.
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    # Supports come from chebyshev_polynomials; presumably one matrix per polynomial
    # order 0..max_degree, which would explain 1 + max_degree — confirm in gcn.utils.
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    # Fail fast on an unknown model name instead of continuing with undefined variables.
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Look at preprocessed data

### Adjacency matrix

adj.shape

### Feature matrix: Stored in sparse form as a tuple (indexed 0, 1, 2)
   * coordinates
   * values for coordinates
   * shape

In [13]:
# Inspect the three components of the preprocessed feature matrix.
print(features[0].shape)  # coordinates of the stored entries: one (row, col) pair per entry
print(features[1].shape)  # values of the stored entries
print(features[2])        # shape of the full feature matrix

(49216, 2)
(49216,)
(2708, 1433)


### Labels: are onehot encoded

The number of columns is the number of different groups in the data

In [14]:
# Shapes of the label matrices for the three splits.
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(2708, 7)
(2708, 7)
(2708, 7)


#### In the training data only 20 entities are revealed from each group (for the [Cora](https://relational.fit.cvut.cz/dataset/CORA) citation network)

In [15]:
# Column sums of the one-hot label matrices, i.e. the number of labelled records per class in each split.
print(np.sum(y_train,axis=0))
print(np.sum(y_test,axis=0))
print(np.sum(y_val,axis=0))

[ 20.  20.  20.  20.  20.  20.  20.]
[ 130.   91.  144.  319.  149.  103.   64.]
[  61.   36.   78.  158.   81.   57.   29.]


### Masks
Boolean vectors that indicate where the train, validation and test records are in the data

In [16]:
# The mask is a 1-D vector; its shape is displayed as the cell output.
train_mask.shape

(2708,)

# Define placeholders

In [17]:
# Graph inputs keyed by name; construct_feed_dict is later called with this dict to
# build the feed dictionaries for training and evaluation.
placeholders = {
    # One sparse placeholder per support matrix.
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    # Sparse feature matrix whose dense shape is fixed to features[2].
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    # Label matrix: any number of rows, one column per class (y_train.shape[1]).
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    # Mask over the rows; presumably selects which records count towards loss/accuracy — confirm in gcn.models.
    'labels_mask': tf.placeholder(tf.int32),
    # Dropout rate; falls back to 0. when not fed (the training loop feeds FLAGS.dropout).
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
}

# Create model

In [18]:
# Instantiate the selected model class; input_dim is the number of feature columns
# (second entry of the feature-matrix shape stored in features[2]).
model = model_func(placeholders, input_dim=features[2][1], logging=True)

# Initialize session

In [19]:
# Global TensorFlow session shared by evaluate() and the training loop.
# NOTE(review): the session is not closed anywhere in the visible cells.
sess = tf.Session()

# Define model evaluation function

In [20]:
def evaluate(features, support, labels, mask, placeholders):
    """Measure loss and accuracy of the global `model` on one masked split.

    A feed dict is built with `construct_feed_dict` from the given arguments and
    `model.loss` / `model.accuracy` are run in the global session `sess`.
    Unlike the training step, no dropout value is added to the feed dict here.

    Returns:
        A tuple (loss, accuracy, elapsed_seconds); elapsed_seconds is the
        wall-clock time spent building the feed dict and running the session.
    """
    started_at = time.time()
    eval_feed = construct_feed_dict(features, support, labels, mask, placeholders)
    loss_value, accuracy_value = sess.run([model.loss, model.accuracy], feed_dict=eval_feed)
    elapsed = time.time() - started_at
    return loss_value, accuracy_value, elapsed

# Init variables

In [21]:
# Run the initializer op for all global variables in the session before any training or evaluation.
sess.run(tf.global_variables_initializer())

In [22]:
# History of validation losses, appended once per epoch by the training loop and read by its early-stopping check.
cost_val = []

# Train model

In [23]:
def print_epoch_summary(epoch, train_outs, val_cost, val_acc, epoch_start):
    """Print one line of training and validation metrics for an epoch.

    Args:
        epoch: zero-based epoch index (printed one-based).
        train_outs: result list of the training `sess.run` call; index 1 is the
            training loss and index 2 the training accuracy.
        val_cost: validation loss for this epoch.
        val_acc: validation accuracy for this epoch.
        epoch_start: `time.time()` value taken at the start of the epoch; the
            printed time is the wall-clock time elapsed since then.
    """
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_outs[1]),
          "train_acc=", "{:.5f}".format(train_outs[2]), "val_loss=", "{:.5f}".format(val_cost),
          "val_acc=", "{:.5f}".format(val_acc), "time=", "{:.5f}".format(time.time() - epoch_start))


# `epoch` stays None when FLAGS.epochs is 0, so the final summary below can be
# skipped instead of failing on names that were never assigned.
epoch = None
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary (training feeds the dropout rate explicitly)
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step: outs[1] is the training loss, outs[2] the training accuracy
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    # Print results for every 10th epoch
    if epoch % 10 == 0:
        print_epoch_summary(epoch, outs, cost, acc, t)

    # Early stopping: after the first FLAGS.early_stopping epochs, stop as soon as the
    # latest validation loss exceeds the mean of the FLAGS.early_stopping losses before it.
    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

# Report the metrics of the last epoch that actually ran (if any).
if epoch is not None:
    print_epoch_summary(epoch, outs, cost, acc, t)

Epoch: 0001 train_loss= 1.95399 train_acc= 0.07143 val_loss= 1.95070 val_acc= 0.20600 time= 0.05210
Epoch: 0011 train_loss= 1.87589 train_acc= 0.72143 val_loss= 1.91122 val_acc= 0.52000 time= 0.02722
Epoch: 0021 train_loss= 1.76760 train_acc= 0.77857 val_loss= 1.86662 val_acc= 0.59200 time= 0.03230
Epoch: 0031 train_loss= 1.62944 train_acc= 0.80000 val_loss= 1.80741 val_acc= 0.64400 time= 0.02754
Epoch: 0041 train_loss= 1.49882 train_acc= 0.85000 val_loss= 1.73166 val_acc= 0.69200 time= 0.03075
Epoch: 0051 train_loss= 1.38851 train_acc= 0.85714 val_loss= 1.64305 val_acc= 0.73600 time= 0.03151
Epoch: 0061 train_loss= 1.18911 train_acc= 0.89286 val_loss= 1.55086 val_acc= 0.75600 time= 0.02384
Epoch: 0071 train_loss= 1.09616 train_acc= 0.88571 val_loss= 1.46357 val_acc= 0.77400 time= 0.02704
Epoch: 0081 train_loss= 0.97463 train_acc= 0.91429 val_loss= 1.39013 val_acc= 0.77600 time= 0.02587
Epoch: 0091 train_loss= 0.92480 train_acc= 0.95000 val_loss= 1.32596 val_acc= 0.78000 time= 0.02873


# Testing

In [24]:
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))

Test set results: cost= 1.01263 accuracy= 0.81400 time= 0.01390


# Conclusions

   * I could use preprocessing as well, but I will have different features (not word occurrences) - except if I make counters about the number of given degree neighbours ... _(That is cheating... for semi-classification)_
   * If my features will be various centrality scores of the given node, then they should not be row normalized!!! Rather,
   column normalized...
   
### Target
   * binary (active-offline node)
   * multiclass (the number of incoming mentions of a node - in logscale groups)
   
### TODO: What features should I generate?
   * **simple:** node activity in train period (binary)
   * **simple:** (the number of incoming mentions of a node - in logscale groups) in the train period
   * **complex:** set various node centrality measure values (calculated for train period) as features for nodes
   * **complex:** inter-event time statistics for nodes in the training period
   * **hard:** textual context based features...
   