In [None]:
from __future__ import division
from __future__ import print_function

import time, sys, os
import tensorflow as tf

In [None]:
sys.path.insert(0,"../python/")
import preprocessing as pp
import baseline_utils as bu
import gcn_utils as gu

In [None]:
import pandas as pd

In [None]:
from gcn.utils import *
from gcn.models import GCN, MLP

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
from datawand.parametrization import ParamHelper
# Parameter helper for this notebook; the cells below read their settings through `ph.get(<name>)`.
# NOTE(review): sys.argv is forwarded to ParamHelper - presumably for command-line overrides; confirm.
ph = ParamHelper("../pipelines/GcnProject.json",sys.argv)

### Set random seed

In [None]:
# Seed both NumPy and TensorFlow with the same fixed value.
seed = 123
# NOTE(review): `np` is never imported explicitly here - presumably it arrives via `from gcn.utils import *`; confirm.
np.random.seed(seed)
tf.set_random_seed(seed)

# 1. Settings

In [None]:
# Experiment parameters read from the pipeline configuration.
dataset_id = ph.get("dataset_id")  # compared below against 'cora', 'citeseer', 'pubmed', '15o' and 'occupy'
label_type = ph.get("label_type")  # compared below against "binary", "blended" and "onehot"

In [None]:
# "binary" labels use the "blended" folder; every other label type uses a folder of its own name.
if label_type == "binary":
    label_folder = "blended"
else:
    label_folder = label_type
print(label_folder)

In [None]:
# Identifier of the data split: "<split_type>_<train_ratio>"; used as a directory name below.
split_type = ph.get("split_type")
train_ratio = ph.get("train_ratio")
split_id = "%s_%.2f" % (split_type, train_ratio)  # the ratio is formatted with two decimals

In [None]:
# Datasets other than 'cora', 'citeseer' and 'pubmed' are read from the experiment's preprocessed folder.
# NOTE: `preprocessed_dir` is left undefined for those three datasets.
if dataset_id not in ['cora', 'citeseer', 'pubmed']:
    preprocessed_dir = "%s/data/%s/%s/%s/%s" % (ph.get("experiment_dir"), dataset_id, pp.get_experiment_dir(ph),split_id,label_folder)
    print(preprocessed_dir)

In [None]:
# Model and training hyper-parameters, registered as TensorFlow flags and read back through FLAGS.
# NOTE(review): re-running this cell in the same kernel presumably fails because the flags
# are already defined - confirm before relying on re-execution.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', dataset_id, 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# 2. Load data

In [None]:
# 'cora', 'citeseer' and 'pubmed' are read from ../gcn/data; any other dataset from its preprocessed folder.
input_pref = "../gcn/data" if dataset_id in ['cora', 'citeseer', 'pubmed'] else preprocessed_dir
print(input_pref)

In [None]:
# Load the adjacency matrix, the features, the three label matrices and the three split masks.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset,input_prefix=input_pref)

# 3. Preprocessing (run only once)

In [None]:
# Preprocess the features: norm_type is "col" for the "15o"/"occupy" datasets and "row" otherwise.
features = preprocess_features(
    features,
    norm_type=("col" if dataset_id in ["15o", "occupy"] else "row"),
)

# Pick the graph support(s) and the model class that belong to FLAGS.model.
if FLAGS.model not in ('gcn', 'gcn_cheby', 'dense'):
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

if FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
else:
    # 'gcn' and 'dense' build the same single support (the dense model does not use it).
    support = [preprocess_adj(adj)]
    num_supports = 1

model_func = MLP if FLAGS.model == 'dense' else GCN

## Look at preprocessed data

### Feature matrix: Stored as a sparse matrix in a dict
   * coordinates
   * values for coordinates
   * shape

# features[0], features[1] and features[2]: coordinates, values and shape of the sparse feature matrix.
print(features[0].shape)
print(features[1].shape)
print(features[2])

### Labels: one-hot encoded

The number of columns is the number of different groups in the data

In [None]:
# Shapes of the label matrices of the train, test and validation splits.
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

#### In the training data only 20 entities are revealed from each group (for the [Cora](https://relational.fit.cvut.cz/dataset/CORA) citation network)

# Column sums of each label matrix, i.e. how many revealed labels every group has in the split.
print(np.sum(y_train,axis=0))
print(np.sum(y_test,axis=0))
print(np.sum(y_val,axis=0))

### Masks
Boolean vectors that indicate where the train, validation and test records are in the data.

In [None]:
# Shape of the mask vector of the training split.
train_mask.shape

# 4. GCN model

In [None]:
def run_gcn(features, y_train, y_test, y_val, train_mask, test_mask, val_mask, num_supports, support, model_func, FLAGS):
    """Run the model through ``gu.run`` and collect the resulting accuracies.

    When the notebook-level ``label_type`` is "binary", one run is made per
    label column (``col_idx=<column>``, ``verbose=False``); otherwise a single
    run is made on the full label matrix.

    Returns:
        NumPy array with one entry per run; each entry is the first of the
        two values returned by ``gu.run``.
    """
    shared_args = (features, y_train, y_test, y_val, train_mask, test_mask, val_mask,
                   num_supports, support, model_func, FLAGS)

    if label_type != "binary":
        acc, _ = gu.run(*shared_args)
        return np.array([acc])

    per_column = []
    for column in range(y_train.shape[1]):
        acc, _ = gu.run(*shared_args, col_idx=column, verbose=False)
        per_column.append(acc)
    return np.array(per_column)

In [None]:
# Average over the runs (one per label column for binary labels, a single run otherwise).
accuracies = run_gcn(features, y_train, y_test, y_val, train_mask, test_mask, val_mask, num_supports, support, model_func, FLAGS)
gcn_acc = accuracies.mean(axis=0)
print(gcn_acc)

### Examination of labels

# Single run on the full label matrix; the second return value (`preds`) feeds the confusion matrix below.
acc_vect, preds = gu.run(features, y_train, y_test, y_val, train_mask, test_mask, val_mask, num_supports, support, model_func, FLAGS)

def get_confusions(pred_arr,label_arr,mask,label_type):
    """Accumulate a square confusion matrix over the rows selected by ``mask``.

    Args:
        pred_arr: 2-D array of predictions, one row per record.
        label_arr: 2-D label array; its number of columns sets the matrix size.
        mask: index/boolean mask applied to both arrays.
        label_type: "blended" or "onehot"; selects the update rule.

    Returns:
        ``(dim, dim)`` NumPy array, rows indexed by true label.

    Raises:
        RuntimeError: when a selected row is processed with any other label_type.
    """
    n_classes = label_arr.shape[1]
    selected_preds = pred_arr[mask]
    selected_labels = label_arr[mask]
    confusion = np.zeros((n_classes, n_classes))
    for label_row, pred_row in zip(selected_labels, selected_preds):
        # Positions where the label row holds an exact 1.
        hot = [pos[0] for pos in np.argwhere(label_row == 1)]
        if label_type == "blended":
            update_blended_confusions(confusion, label_row, pred_row, hot)
            continue
        if label_type == "onehot":
            update_onehot_confusions(confusion, label_row, pred_row, hot)
            continue
        raise RuntimeError("Invalid label type!!!")
    return confusion

def update_blended_confusions(conf_mx, label_row, pred_row, one_indices):
    """Add one record's predicted scores to ``conf_mx`` in place (blended labels).

    For every true label ``i`` in ``one_indices``:
      * ``conf_mx[i, i]`` grows by ``pred_row[i]`` (score given to the true label);
      * for every column ``j`` that is not a true label, ``conf_mx[i, j]`` grows
        by ``pred_row[j] / len(one_indices)``: the score given to the wrong
        label ``j`` is shared equally among the record's true labels.

    Each record therefore adds exactly ``sum(pred_row)`` to the matrix.

    Args:
        conf_mx: square confusion matrix, modified in place.
        label_row: label vector of the record (unused; kept for a uniform signature).
        pred_row: predicted score per label.
        one_indices: positions of the record's true labels.

    Returns:
        None.
    """
    num_one = len(one_indices)
    true_labels = set(one_indices)
    wrong_labels = [j for j in range(conf_mx.shape[1]) if j not in true_labels]
    for i in one_indices:
        # correct prediction
        conf_mx[i, i] += pred_row[i]
        # incorrect prediction: the column index picks the score, as on the diagonal
        for j in wrong_labels:
            conf_mx[i, j] += pred_row[j] / num_one

def update_onehot_confusions(conf_mx, label_row, pred_row, one_indices):
    """Add one record's top prediction to ``conf_mx`` in place (one-hot labels).

    The cell ``[true label, argmax(pred_row)]`` grows by the score of the
    top-scoring label.

    Args:
        conf_mx: square confusion matrix, modified in place.
        label_row: label vector of the record (unused; kept for a uniform signature).
        pred_row: predicted score per label.
        one_indices: positions of the record's true labels; must hold exactly one.

    Raises:
        RuntimeError: when ``one_indices`` is empty or holds more than one position.
    """
    if len(one_indices) > 1:
        raise RuntimeError("Label cannot contain more than one 1!")
    if len(one_indices) == 0:
        # An all-zero label row is a different problem from a multi-label row; report it as such.
        raise RuntimeError("Label does not contain any 1!")
    i = one_indices[0]
    j = np.argmax(pred_row)
    conf_mx[i, j] += pred_row[j]
        
def show_confusion_mx(pred_arr,label_arr,mask,label_type):
    """Draw the confusion matrix as a seaborn heatmap and print it as a DataFrame.

    The arguments are handed unchanged to ``get_confusions``.
    """
    confusion = get_confusions(pred_arr, label_arr, mask, label_type)
    sns.heatmap(confusion)
    confusion_table = pd.DataFrame(confusion)
    print(confusion_table)

# Confusion matrix of the predictions restricted to the test split.
show_confusion_mx(preds,y_test,test_mask,label_type)

# BUG?: the first row of y_test is all zeros, yet test_mask is True for that index. Why? This happens with onehot label creation; with blended labels it is not a problem.

# Baseline predictors

In [None]:
def run_baseline(y_train, y_test, y_val, train_mask, test_mask, val_mask, num_samples=5, bin_file_path=None):
    """Evaluate ``bu.baseline_predict`` repeatedly and collect its results.

    Args:
        y_train, y_test, y_val: label matrices, forwarded to ``bu.baseline_predict``.
        train_mask, test_mask, val_mask: split masks, forwarded as well.
        num_samples: number of repeated evaluations (per label column when the
            notebook-level ``label_type`` is "binary").
        bin_file_path: optional path of a pickled label array. When given, its
            content is passed on as ``label_samples`` (for "binary" labels the
            ``i``-th entry, stacked with ``np.vstack``); otherwise
            ``label_samples`` is None.

    Returns:
        NumPy array holding the result of every ``bu.baseline_predict`` call.
    """
    labels_arr = None
    if bin_file_path is not None:
        # NOTE: unpickling can execute arbitrary code - only use trusted dataset files here.
        with open(bin_file_path, 'rb') as f:
            if sys.version_info > (3, 0):
                labels_arr = pkl.load(f, encoding='latin1')
            else:
                labels_arr = pkl.load(f)
    accuracies = []
    if label_type == "binary":
        for i in range(y_train.shape[1]):
            # Identity test on purpose: `!= None` is evaluated element-wise on a NumPy
            # array and the result has no single truth value.
            tmp_label = np.vstack(labels_arr[i]) if labels_arr is not None else None
            accuracies += [bu.baseline_predict(y_train, y_test, y_val, train_mask, test_mask, val_mask, label_samples=tmp_label) for _ in range(num_samples)]
    else:
        accuracies += [bu.baseline_predict(y_train, y_test, y_val, train_mask, test_mask, val_mask, label_samples=labels_arr) for _ in range(num_samples)]
    return np.array(accuracies)

### Random prediction

In [None]:
# Baseline without a label file (label_samples=None); the mean over axis 0 averages the repeated evaluations.
rnd_acc_arr = run_baseline(y_train, y_test, y_val, train_mask, test_mask, val_mask)
rnd_acc = list(rnd_acc_arr.mean(axis=0))
print(rnd_acc)

### Weighted random prediction based on total training set

In [None]:
# Baseline whose label samples come from the pickled "ind.<dataset_id>.ally" file.
bin_file = "%s/ind.%s.ally" % (input_pref,dataset_id)
print(bin_file)
w_rnd_acc_arr = run_baseline(y_train, y_test, y_val, train_mask, test_mask, val_mask, bin_file_path=bin_file)
w_rnd_acc = list(w_rnd_acc_arr.mean(axis=0))
print(w_rnd_acc)

### Weighted random prediction based on partial training set (shown labels)

In [None]:
# Baseline whose label samples come from the pickled "ind.<dataset_id>.y" file.
bin_file = "%s/ind.%s.y" % (input_pref,dataset_id)
print(bin_file)
partial_w_rnd_acc_arr = run_baseline(y_train, y_test, y_val, train_mask, test_mask, val_mask, bin_file_path=bin_file)
partial_w_rnd_acc = list(partial_w_rnd_acc_arr.mean(axis=0))
print(partial_w_rnd_acc)

# Write performance to file

In [None]:
# Results are stored per label type (the last path component is `label_type`, not `label_folder`).
accuracy_dir = "%s/data/%s/%s/%s/%s" % (
    ph.get("experiment_dir"),
    dataset_id,
    pp.get_experiment_dir(ph),
    split_id,
    label_type,
)
print(accuracy_dir)
if not os.path.exists(accuracy_dir):
    os.makedirs(accuracy_dir)

# The accuracy file is only written for the "15o" and "occupy" datasets: one
# semicolon-separated row of three values per method.
if dataset_id in ["15o","occupy"]:
    with open("%s/acc.csv" % accuracy_dir, "w+") as f:
        f.writelines(
            row_format % tuple(acc_values)
            for row_format, acc_values in (
                ('"gcn";%f;%f;%f\n', gcn_acc),
                ('"rnd";%f;%f;%f\n', rnd_acc),
                ('"w_rnd";%f;%f;%f\n', w_rnd_acc),
                ('"part_w_rnd";%f;%f;%f\n', partial_w_rnd_acc),
            )
        )