In [1]:
import pandas as pd
import os
import numpy as np

from stellargraph import StellarGraph
import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

## Data Preprocessing

The dataset for this node classification task is CiteSeer: a citation network of 3312 scientific publications, each classified into one of six classes: Agents, IR, DB, AI, HCI, or ML. The data comes in two tab-separated files from https://deepai.org/dataset/citeseer: citeseer.cites and citeseer.content. These files contain the edge information and the node information, respectively.

First, I read the edge data (citeseer.cites) into a DataFrame. According to the dataset documentation, the first column is the target node and the second column is the source node, so I label the columns accordingly when reading the file.

In [3]:
# Per the dataset documentation, the first column is the target node and the
# second column is the source node; the file itself has no header row.
cites_columns = ['target', 'source']
citeseer_cites = pd.read_csv(
    "../raw/citeseer.cites",
    sep='\t',
    header=None,
    names=cites_columns,
)
citeseer_cites

FileNotFoundError: [Errno 2] No such file or directory: '../raw/citeseer.cites'

Here, I make sure that target and source are strings since some of the target and source names are in integer form.

In [None]:
# Some node names were parsed as integers; cast both edge columns to str so every
# node id has the same type.
for edge_column in ('target', 'source'):
    citeseer_cites[edge_column] = citeseer_cites[edge_column].astype(str)

In [None]:
# One column name per entry of the 3703-value word vector: f0 ... f3702.
citeseer_features = ["f" + str(i) for i in range(3703)]

# Row layout of the file: paper id, then the word-vector values, then the class label.
content_columns = ["id"] + citeseer_features + ["class"]
citeseer_content = pd.read_csv(
    "citeseer.content",
    sep="\t",
    header=None,
    names=content_columns,
)
# Some ids were parsed as integers, so force the whole id column to str.
citeseer_content['id'] = citeseer_content['id'].astype(str)
citeseer_content

Set id as the index since all are unique.

In [None]:
# Re-index the content table by paper id. set_index returns a new frame, so
# citeseer_content itself keeps "id" as an ordinary column.
citeseer_content_str_subject = citeseer_content.set_index("id")
citeseer_content_str_subject

I realized that some targets and sources in citeseer_cites do not appear in citeseer_content at all, because trying to ingest the graph later on threw errors. Luckily, the errors pointed out exactly which papers were missing, so I put them into lists and remove every edge in which one of them is involved.

In [None]:
# Papers that the graph-ingestion errors reported as missing from citeseer_content,
# listed separately for the source side and the target side of an edge.
source_remove = ['ghani01hypertext', 'nielsen00designing', 'kohrs99using', 'wang01process', 'hahn98ontology', '293457', 'gabbard97taxonomy']
target_remove = ['197556', '38137', '95786', 'flach99database', 'khardon99relational', 'kohrs99using', 'raisamo99evaluating', 'tobies99pspace', 'weng95shoslifn']

# Keep an edge only when neither endpoint is flagged. Boolean indexing produces a
# new frame, so citeseer_cites itself is left unchanged.
has_bad_source = citeseer_cites.source.isin(source_remove)
has_bad_target = citeseer_cites.target.isin(target_remove)
citeseer_cites_copy = citeseer_cites[~(has_bad_source | has_bad_target)]
citeseer_cites_copy

In [None]:
# Filtering left gaps in the row labels; renumber the rows from 0 and discard the
# old labels (drop=True) instead of keeping them as a column.
citeseer_cites_copy = citeseer_cites_copy.reset_index(drop=True)
citeseer_cites_copy

In [None]:
# Keep only the word-vector columns, selected as the label slice from the first
# feature column to the last one (both ends inclusive).
first_feature, last_feature = 'f0', 'f3702'
citeseer_content_feats = citeseer_content_str_subject.loc[:, first_feature:last_feature]
citeseer_content_feats

In [None]:
# NOTE: in the script version, call transform_data(citeseer_cites, citeseer_content) and assign its result before running the next step.

In [None]:
# Build a graph by hand from the downloaded files: one "paper" node type carrying
# the word-vector features and one "cites" edge type. The filtered edge list
# (citeseer_cites_copy) is used rather than the raw citeseer_cites, because the raw
# edges reference papers that have no feature row and make graph construction fail.
graph = StellarGraph({"paper": citeseer_content_feats}, {"cites": citeseer_cites_copy})
print(graph.info())

# From here on the notebook works with StellarGraph's bundled CiteSeer dataset.
# load() rebinds `graph` and also returns `nodes`, which the following cells use as
# the per-node class labels.
dataset = sg.datasets.CiteSeer()
graph, nodes = dataset.load()
nodes.value_counts()

Split the dataset into train/validation/test splits. I will be replicating the same splits as in the paper "Semi-Supervised Classification with Graph Convolutional Networks": 1000 nodes in the test set, 500 in the validation set, and everything else in the training set.

In [None]:
# Fixed seed so the stratified splits (and every number derived from them) are the
# same on each Restart & Run All.
SPLIT_SEED = 42

# Step 1: carve off 1812 training nodes. test_size=None means "everything that was
# not taken for training", i.e. the pool later divided into validation and test.
train_subjects, holdout_subjects = model_selection.train_test_split(
    nodes,
    train_size=1812,
    test_size=None,
    stratify=nodes,
    random_state=SPLIT_SEED,
)

# Step 2: take 500 validation nodes from that pool; whatever remains is the test set.
val_subjects, test_subjects = model_selection.train_test_split(
    holdout_subjects,
    train_size=500,
    test_size=None,
    stratify=holdout_subjects,
    random_state=SPLIT_SEED,
)

train_subjects.value_counts()

In [None]:
# Take a look at the training split: its index and the label stored for each entry.
train_subjects

The next cell turns the target into one-hot encoding, as the current targets are in a string form. We can convert everything into numeric arrays.

In [None]:
# Learn the label -> one-hot mapping from the training labels only, then apply that
# same fitted mapping to all three splits.
target_encoding = preprocessing.LabelBinarizer()
target_encoding.fit(train_subjects)
train_targets, val_targets, test_targets = (
    target_encoding.transform(split)
    for split in (train_subjects, val_subjects, test_subjects)
)

## Creating Graph Convolutional Network layers


In [None]:
# GCN is a full-batch model, so the data is fed through a FullBatchNodeGenerator
# configured for the "gcn" method.
generator = FullBatchNodeGenerator(graph, method="gcn")

# Training flow: the ids of the training nodes paired with their one-hot targets.
train_node_ids = train_subjects.index
train_gen = generator.flow(train_node_ids, train_targets)

Similar to the paper, I create a two-layer GCN with 32 units per layer, softmax activations, and a dropout of 0.5.

In [None]:
# Two graph-convolution layers of 32 units each, with dropout 0.5.
# NOTE(review): softmax is used as the activation of both GCN layers, which is
# unusual for hidden layers - confirm this is intentional.
gcn_layer_sizes = [32, 32]
gcn_activations = ["softmax", "softmax"]
gcn = GCN(
    layer_sizes=gcn_layer_sizes,
    activations=gcn_activations,
    generator=generator,
    dropout=0.5,
)
# Input and output tensors of the GCN stack, used to assemble the Keras model.
x_inp, x_out = gcn.in_out_tensors()

In [None]:
# Classification head: one softmax unit per column of the one-hot training targets.
n_classes = train_targets.shape[1]
output_layer = layers.Dense(units=n_classes, activation="softmax")
predictions = output_layer(x_out)
predictions

The learning rate is 0.01 with a cross entropy loss.

In [None]:
# Assemble the Keras model from the GCN input tensors and the softmax prediction head.
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    # `learning_rate` is the supported argument name; the `lr` alias is deprecated in
    # tf.keras and is not accepted by newer Keras releases.
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],  # the early-stopping callback monitors this metric as "val_acc"
)

In [None]:
# Validation flow: the validation node ids paired with their one-hot targets,
# produced by the same full-batch generator as the training flow.
val_gen = generator.flow(val_subjects.index, val_targets)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Watch validation accuracy; stop once it has gone 50 epochs without improving and
# restore the weights from the best epoch seen.
es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

In [None]:
# Train for at most 200 epochs; the early-stopping callback may end training sooner.
max_epochs = 200
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=max_epochs,
    callbacks=[es_callback],
    # Must stay False: shuffling the data would mean shuffling the whole graph.
    shuffle=False,
    verbose=2,
)

In [None]:
# Plot the metrics recorded in the training history returned by model.fit.
sg.utils.plot_history(history)

In [None]:
# Test flow: the held-out test node ids paired with their one-hot targets.
test_gen = generator.flow(test_subjects.index, test_targets)

In [None]:
# Evaluate on the test flow, then print each result next to the matching entry of
# model.metrics_names, formatted to four decimal places.
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
metric_line = "\t{}: {:0.4f}"
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(metric_line.format(metric_name, metric_value))

In [None]:
# Predict for every node in `nodes` (training, validation and test alike); no
# targets are passed to the flow because it is only used for inference.
all_nodes = nodes.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)

In [None]:
# Map the predicted class scores back to class labels with the fitted binarizer.
# squeeze() drops size-1 axes first - presumably a leading batch axis of size 1
# from the full-batch model; TODO confirm the prediction shape.
node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())


In [None]:
# Put predictions next to the true labels and compute the fraction that agree.
# This is measured over all nodes in `nodes`, including those used for training.
df = pd.DataFrame({"Predicted": node_predictions, "Actual": nodes})
is_correct = df["Predicted"] == df["Actual"]
accuracy = int(is_correct.sum()) / len(df)
accuracy

In [None]:
# Show the predicted vs. actual label table.
df