In [None]:
from __future__ import division
from __future__ import print_function

import time, sys
import tensorflow as tf

In [None]:
from gcn.utils import *
from gcn.models import GCN, MLP

# Set random seed

In [None]:
# Fix the NumPy and TensorFlow random seeds so that repeated runs are comparable.
# NOTE(review): `np` is never imported explicitly in this notebook; presumably it
# comes in through the star import from gcn.utils — TODO confirm.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings

In [None]:
# Identifier of the dataset to run on. It is the default of the 'dataset' flag
# and also selects the input directory and the feature normalisation type.
dataset_id = '15o'

In [None]:
# Hyperparameters and run settings, exposed through TensorFlow's flag registry.
# NOTE(review): defining a flag twice is expected to fail, so this cell should be
# executed once per kernel (restart the kernel to change a default).
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', dataset_id, 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
# Jupyter starts the kernel with "-f <connection file>" in sys.argv. Flag parsers
# that reject unknown arguments would fail on the first FLAGS.<name> access, so a
# dummy flag is registered to absorb that argument. It is never read.
flags.DEFINE_string('f', '', 'Jupyter kernel connection file (ignored).')

# Load data

In [None]:
# The three citation benchmarks are read from the relative "data" directory;
# every other dataset id is read from the absolute project data path.
input_pref = (
    "data"
    if dataset_id in ['cora', 'citeseer', 'pubmed']
    else "/mnt/idms/fberes/network/gcn_project/data/"
)

In [None]:
# Load the graph, the node features, the label matrices of the three splits and
# the masks of the three splits for the dataset named by the 'dataset' flag.
(adj, features,
 y_train, y_val, y_test,
 train_mask, val_mask, test_mask) = load_data(FLAGS.dataset, input_prefix=input_pref)

# Some preprocessing (run only once)

In [None]:
# Preprocess the features in place of the raw matrix: norm_type is "col" for the
# '15o' dataset and "row" for every other dataset. Because `features` is
# overwritten, this cell must not be executed twice on the same kernel state.
features = preprocess_features(features, norm_type=("col" if dataset_id == "15o" else "row"))

# Reject unknown model names before any support structure is built.
if FLAGS.model not in ('gcn', 'gcn_cheby', 'dense'):
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Only the Chebyshev variant uses several support matrices; the other two
# models share a single preprocessed adjacency matrix.
if FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
else:
    support = [preprocess_adj(adj)]  # Not used by the 'dense' model
    num_supports = 1

# 'dense' maps to the MLP class; both graph-convolution variants use GCN.
model_func = MLP if FLAGS.model == 'dense' else GCN

# Look at preprocessed data

### Adjacency matrix

adj.shape

### Feature matrix: stored as a sparse matrix in a 3-tuple
   * coordinates
   * values for coordinates
   * shape

In [None]:
# One line per component of the preprocessed feature tuple: the shapes of the
# first two entries, then the third entry itself.
print(features[0].shape, features[1].shape, features[2], sep="\n")

### Labels: are onehot encoded

The number of columns is the number of different groups in the data

In [None]:
# Shapes of the train, test and validation label matrices, one per line.
print(y_train.shape, y_test.shape, y_val.shape, sep="\n")

#### In the training data only 20 entities are revealed from each group (for the [Cora](https://relational.fit.cvut.cz/dataset/CORA) citation network)

In [None]:
# Column sums of the train, test and validation label matrices, one per line.
print(*(np.sum(labels, axis=0) for labels in (y_train, y_test, y_val)), sep="\n")

### Masks
Boolean vectors that indicate where the train, validation and test records are in the data.

In [None]:
# Display the shape of the training mask as the cell output.
train_mask.shape

# Define placeholders

In [None]:
# Graph inputs, keyed by the names used when a feed dict is built from them.
# The entries are created in the order listed below.
placeholders = dict(
    # one sparse input per support matrix
    support=[tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    # sparse feature input with its shape fixed to the preprocessed feature shape
    features=tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    # label rows with as many columns as the training label matrix
    labels=tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    labels_mask=tf.placeholder(tf.int32),
    # defaults to 0. when no dropout value is fed
    dropout=tf.placeholder_with_default(0., shape=()),
    num_features_nonzero=tf.placeholder(tf.int32),  # helper variable for sparse dropout
)

# Create model

In [None]:
# Build the selected model class (GCN or MLP) on top of the placeholders.
# features[2] is the shape entry of the preprocessed features, so features[2][1]
# is its second dimension (presumably the number of input features per node —
# TODO confirm).
model = model_func(placeholders, input_dim=features[2][1], logging=True)

# Initialize session

In [None]:
# Session shared by evaluate() and by the training loop.
sess = tf.Session()

# Define model evaluation function

In [None]:
def evaluate(features, support, labels, mask, placeholders):
    """Measure loss and accuracy of the global `model` on one masked label set.

    A feed dict is built from the arguments with `construct_feed_dict`, then
    `model.loss` and `model.accuracy` are run in the global session `sess`.
    No dropout value is fed here, so the dropout placeholder keeps its default.

    Returns:
        A `(loss, accuracy, elapsed_seconds)` tuple. The elapsed time covers
        both building the feed dict and the session run.
    """
    started = time.time()
    feed = construct_feed_dict(features, support, labels, mask, placeholders)
    loss, accuracy = sess.run([model.loss, model.accuracy], feed_dict=feed)
    return loss, accuracy, (time.time() - started)

# Init variables

In [None]:
# Run the global variable initializer in the session before any training step.
sess.run(tf.global_variables_initializer())

In [None]:
# History of validation losses: the training loop appends one entry per epoch
# and reads the most recent entries for its early-stopping test.
cost_val = []

# Train model

In [None]:
def _print_epoch_summary(epoch, outs, cost, acc, elapsed):
    """Print one line of train/validation metrics for a 0-based epoch index.

    `outs` is the result of the training step: outs[1] is the training loss
    and outs[2] the training accuracy. `elapsed` is the epoch duration in seconds.
    """
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(elapsed))


# Train for at most FLAGS.epochs epochs, validating after every step and
# stopping early once the validation loss rises above its recent average.
outs = None  # stays None only when FLAGS.epochs is 0 and no step is run
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    # Measure the epoch once, so that the progress line and the final summary
    # report the same duration and exclude any work done after the loop.
    epoch_time = time.time() - t

    # Print results
    if epoch % 10 == 0:
        _print_epoch_summary(epoch, outs, cost, acc, epoch_time)

    # Stop when the newest validation loss exceeds the mean of the
    # FLAGS.early_stopping losses that precede it.
    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

# Summarise the last executed epoch. Skipped when no epoch ran, because the
# per-epoch values would then be undefined.
if outs is not None:
    _print_epoch_summary(epoch, outs, cost, acc, epoch_time)

# Testing

In [None]:
# Evaluate once on the test split and report loss, accuracy and elapsed time.
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Test set results: cost= {:.5f} accuracy= {:.5f} time= {:.5f}".format(
    test_cost, test_acc, test_duration))

# Results

   * Train-test split at half time (on first 4 days of 15o)
   * Using default parameters of GCN
   * For the below experiments **only 'frequency'** feature is used!!! Using any other generated feature ('degree','pagerank','time') would decrease the accuracy to 0.16-0.18 on the testing set.

## top_k=3

   * timeframe for edge appearance **60 sec**: **te: 0.70515**, tr: 0.75133, val: 0.73000 (2955 nodes, 13796 edges)
   * timeframe for edge appearance 100 sec: te: 0.68944, tr: 0.73709, val: 0.72000 (3329 nodes, 21852 edges)
   * timeframe for edge appearance 300 sec: te: 0.68245, tr: 0.72145, val: 0.70000 (3835 nodes, 54434 edges)
   
## top_k=5

   * timeframe for edge appearance **60 sec**: **te: 0.67440**, tr: 0.66237, val: 0.68200 (2955 nodes, 13796 edges)
   * timeframe for edge appearance 100 sec: te: 0.62205, tr: 0.65703, val: 0.64800 (3329 nodes, 21852 edges)
   * timeframe for edge appearance 300 sec: te: 0.60065, tr: 0.64590, val: 0.63400 (3835 nodes, 54434 edges)

# TODO

   * train for each entity class separately (how to evaluate mixed groups?)
   * textual features based on tweets
   * change train-test cut_time