In [None]:
from __future__ import division
from __future__ import print_function

import time, sys, os
import tensorflow as tf

In [None]:
# Put ../python/ (relative to the working directory) first on the import path;
# presumably this is where the helper modules imported below live.
sys.path.insert(0,"../python/")
import preprocessing as pp
import baseline_utils as bu
import gcn_utils as gu

In [None]:
from gcn.utils import *
from gcn.models import GCN, MLP

In [None]:
from datawand.parametrization import ParamHelper
# Parameter helper built from the pipeline JSON file and the process arguments;
# every ph.get(...) call below reads its value from this object.
ph = ParamHelper("../pipelines/GcnProject.json",sys.argv)

### Set random seed

In [None]:
# Seed both NumPy and TensorFlow with the same fixed value.
# NOTE(review): `np` is not imported explicitly in the visible cells; presumably it
# arrives through `from gcn.utils import *` — confirm.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# 1. Settings

In [None]:
# Dataset identifier and label type are read from the pipeline parameters.
dataset_id = ph.get("dataset_id")
label_type = ph.get("label_type")

In [None]:
# The "binary" label type maps to the "blended" folder; any other label type
# uses a folder named after itself.
if label_type == "binary":
    label_folder = "blended"
else:
    label_folder = label_type
print(label_folder)

In [None]:
# Split identifier: "<split_type>_<train_ratio>", with the ratio formatted to
# two decimal places.
split_type = ph.get("split_type")
train_ratio = ph.get("train_ratio")
split_id = "%s_%.2f" % (split_type, train_ratio)

In [None]:
# The three citation datasets are loaded from a fixed location later on, so a
# preprocessed directory is only built for other datasets.
if dataset_id not in ['cora', 'citeseer', 'pubmed']:
    preprocessed_dir = "%s/data/%s/%s/%s/%s" % (ph.get("experiment_dir"), dataset_id, pp.get_experiment_dir(ph),split_id,label_folder)
    # A bare expression nested inside an `if` body is never echoed (neither by the
    # notebook nor by a script), so print the path explicitly to make it visible.
    print(preprocessed_dir)

In [None]:
# Model and training hyperparameters are registered as TensorFlow flags and read
# back through FLAGS by the cells below (FLAGS.dataset, FLAGS.model, ...).
# NOTE(review): the DEFINE_* calls presumably raise a duplicate-flag error when this
# cell is executed twice in the same kernel — confirm, and restart the kernel
# before re-running it.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', dataset_id, 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# 2. Load data

In [None]:
# Citation datasets are read from ../gcn/data; every other dataset is read from
# its preprocessed directory (only the selected branch is evaluated).
input_pref = "../gcn/data" if dataset_id in ['cora', 'citeseer', 'pubmed'] else preprocessed_dir
print(input_pref)

In [None]:
# Unpack the eight values returned by load_data for the selected dataset and
# input location.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset,input_prefix=input_pref)

# 3. Preprocessing (run only once)

In [None]:
# Feature normalisation is column-wise for the "15o" dataset and row-wise otherwise.
features = preprocess_features(
    features,
    norm_type = "col" if dataset_id == "15o" else "row"
)

# Choose the support matrices and the model class for the requested model type.
if FLAGS.model in ('gcn', 'dense'):
    # Both variants build the same single support; the 'dense' (MLP) model does not use it.
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = MLP if FLAGS.model == 'dense' else GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

## Look at preprocessed data

### Feature matrix: stored in sparse form as a tuple of three items
   * coordinates
   * values for coordinates
   * shape

for part_idx in (0, 1):  # the first two items are reported by their shape
    print(features[part_idx].shape)
print(features[2])  # the third item is printed as-is

### Labels: are onehot encoded

The number of columns is the number of different groups in the data

In [None]:
# Shapes of the train, test and validation label matrices, in that order.
for label_matrix in (y_train, y_test, y_val):
    print(label_matrix.shape)

#### In the training data only 20 entities are revealed from each group (for the [Cora](https://relational.fit.cvut.cz/dataset/CORA) citation network)

for label_matrix in (y_train, y_test, y_val):  # column sums: train, test, validation
    print(np.sum(label_matrix,axis=0))

### Masks
Boolean vectors that indicate where the train, validation and test records are in the data

In [None]:
# Display the shape of the training mask.
train_mask.shape

# 4. GCN model

In [None]:
def run_gcn(features, y_train, y_test, y_val, train_mask, test_mask, val_mask, num_supports, support, model_func, FLAGS):
    """Run ``gu.run`` once, or once per label column, and collect the results.

    The notebook-level ``label_type`` decides the mode:

    * ``"binary"``: ``gu.run`` is called once for every column index of
      ``y_train`` (passed as ``col_idx``) with ``verbose=False``.
    * anything else: ``gu.run`` is called a single time with its defaults.

    All positional arguments are forwarded to ``gu.run`` unchanged.

    Returns:
        A NumPy array with one entry per ``gu.run`` call.
    """
    shared_args = (features, y_train, y_test, y_val, train_mask, test_mask, val_mask,
                   num_supports, support, model_func, FLAGS)

    # Single run for non-binary label types.
    if label_type != "binary":
        return np.array([gu.run(*shared_args)])

    # Binary labels: one independent run per label column.
    per_column_results = [
        gu.run(*shared_args, col_idx=column, verbose=False)
        for column in range(y_train.shape[1])
    ]
    return np.array(per_column_results)

In [None]:
# Run the model, then average the collected results over the first axis
# (one row per label column for binary labels, a single row otherwise).
accuracies = run_gcn(features, y_train, y_test, y_val, train_mask, test_mask, val_mask, num_supports, support, model_func, FLAGS)
gcn_acc = accuracies.mean(axis=0)
print(gcn_acc)

# TODO: baseline models should also be computed for binary label_type properly!!!

# Baseline predictors

In [None]:
# Number of times each baseline predictor below is repeated before averaging.
num_samples = 5

### Random prediction

In [None]:
# Repeat the baseline prediction num_samples times and average the runs.
rnd_runs = []
for sample_idx in range(num_samples):
    rnd_runs.append(bu.baseline_predict(y_train, y_test, y_val, train_mask, test_mask, val_mask))
rnd_acc_arr = np.array(rnd_runs)
rnd_acc = list(rnd_acc_arr.mean(axis=0))
print(rnd_acc)

### Weighted random prediction based on total training set

In [None]:
# Same repeated baseline as above, but with the "ally" file passed as bin_file_path.
bin_file = "%s/ind.%s.ally" % (input_pref,dataset_id)
print(bin_file)
w_rnd_runs = []
for sample_idx in range(num_samples):
    w_rnd_runs.append(bu.baseline_predict(y_train, y_test, y_val, train_mask, test_mask, val_mask, bin_file_path=bin_file))
w_rnd_acc_arr = np.array(w_rnd_runs)
w_rnd_acc = list(w_rnd_acc_arr.mean(axis=0))
print(w_rnd_acc)

### Weighted random prediction based on partial training set (shown labels)

In [None]:
# Same repeated baseline as above, but with the "y" file passed as bin_file_path.
bin_file = "%s/ind.%s.y" % (input_pref,dataset_id)
print(bin_file)
partial_w_rnd_runs = []
for sample_idx in range(num_samples):
    partial_w_rnd_runs.append(bu.baseline_predict(y_train, y_test, y_val, train_mask, test_mask, val_mask, bin_file_path=bin_file))
partial_w_rnd_acc_arr = np.array(partial_w_rnd_runs)
partial_w_rnd_acc = list(partial_w_rnd_acc_arr.mean(axis=0))
print(partial_w_rnd_acc)

# Write performance to file

In [None]:
# Results are written under <experiment_dir>/data/<dataset>/<experiment>/<split>/<label_type>.
accuracy_dir = "%s/data/%s/%s/%s/%s" % (
    ph.get("experiment_dir"),
    dataset_id,
    pp.get_experiment_dir(ph),
    split_id,
    label_type
)
if not os.path.exists(accuracy_dir):
    os.makedirs(accuracy_dir)

# Only the "15o" dataset gets an acc.csv: one ';'-separated row of three values
# per predictor, written in the order listed below.
if dataset_id == "15o":
    with open("%s/acc.csv" % accuracy_dir, "w+") as f:
        acc_rows = (
            ('"gcn";%f;%f;%f\n', gcn_acc),
            ('"rnd";%f;%f;%f\n', rnd_acc),
            ('"w_rnd";%f;%f;%f\n', w_rnd_acc),
            ('"part_w_rnd";%f;%f;%f\n', partial_w_rnd_acc),
        )
        for row_format, row_values in acc_rows:
            f.write(row_format % tuple(row_values))

# Results

   * Train-test split at half time (on first 4 days of 15o)
   * Using default parameters of GCN
   * For the experiments below, **only the 'frequency'** feature is used!!! Using any other generated feature ('degree','pagerank','time') would decrease the accuracy to 0.16-0.18 on the testing set.

## top_k=3

   * timeframe for edge appearance **60 sec**: **te: 0.70515**, tr: 0.75133, val: 0.73000 (2955 nodes, 13796 edges)
   * timeframe for edge appearance 100 sec: te: 0.68944, tr: 0.73709, val: 0.72000 (3329 nodes, 21852 edges)
   * timeframe for edge appearance 300 sec: te: 0.68245, tr: 0.72145, val: 0.70000 (3835 nodes, 54434 edges)
   
## top_k=5

   * timeframe for edge appearance **60 sec**: **te: 0.67440**, tr: 0.66237, val: 0.68200 (2955 nodes, 13796 edges)
   * timeframe for edge appearance 100 sec: te: 0.62205, tr: 0.65703, val: 0.64800 (3329 nodes, 21852 edges)
   * timeframe for edge appearance 300 sec: te: 0.60065, tr: 0.64590, val: 0.63400 (3835 nodes, 54434 edges)