<a href="https://colab.research.google.com/github/claudiocapanema/minicurso_gnn_sbrc2022/blob/main/Classificacao_semantica_de_poi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spektral
!pip install pandas

Collecting spektral
  Downloading spektral-1.1.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 5.3 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 52.6 MB/s 
Installing collected packages: tf-estimator-nightly, spektral
Successfully installed spektral-1.1.0 tf-estimator-nightly-2.8.0.dev2021122109


In [2]:
import json
import os

import numpy as np
import pandas as pd
import spektral as sk
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from spektral.data import BatchLoader, PackedBatchLoader
from spektral.data import Dataset, Graph
from tensorflow.keras import utils as np_utils
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [3]:
class PoiCategoryDataset(Dataset):
    """
    The PoICategory Dataset.

    Builds one `spektral.data.Graph` per row of `adjacency.csv`, pairing it
    with the row at the same position in `node_features.csv`. Labels are
    node-level, one-hot encoded categories.

    **Arguments**
    - `name`: str, name of the dataset to load (only used in log messages).
    - `data_dir`: str or None, directory containing `adjacency.csv` and
      `node_features.csv`. When None, `DEFAULT_DATA_DIR` is used.
    - `**kwargs`: forwarded unchanged to `spektral.data.Dataset`.

    Note: `read()` always calls `poi_dataset()` with its default arguments, so
    keyword arguments such as `n_samples` passed to the constructor do not
    change how many graphs are loaded.
    """

    # Fallback location used when no `data_dir` is supplied. This is a path on
    # the original author's machine and will not exist elsewhere.
    DEFAULT_DATA_DIR = "/home/claudio/Documentos/pycharm_projects/minicurso_gnn_sbrc2022/datasets"

    def __init__(self, name, data_dir=None, **kwargs):
        # Both attributes are set before calling the base constructor because
        # `read()` uses them -- presumably the base class invokes `read()`
        # during initialisation (TODO confirm against the Spektral version in use).
        self.name = name
        self.data_dir = data_dir if data_dir is not None else self.DEFAULT_DATA_DIR
        super().__init__(**kwargs)

    @staticmethod
    def _as_graph_array(items):
        """
        Convert a list of per-graph arrays into a single NumPy array.

        When every item has the same shape this is a regular stacked array.
        When shapes differ (graphs with different numbers of nodes), recent
        NumPy versions raise `ValueError` instead of building an object array,
        so fall back to an explicit 1-D object array holding one item per graph.
        """
        try:
            return np.array(items)
        except ValueError:
            ragged = np.empty(len(items), dtype=object)
            for position, item in enumerate(items):
                ragged[position] = item
            return ragged

    def poi_dataset(self, max_samples=5000, num_classes=7):
        """
        Load and pre-process the raw CSV files.

        **Arguments**
        - `max_samples`: int or None, maximum number of rows of `adjacency.csv`
          to consider (None means all rows). Rows whose adjacency matrix has
          fewer than 2 nodes are skipped, so fewer graphs may be returned.
        - `num_classes`: int, number of categories used for one-hot encoding
          the node labels.

        **Returns**
        A tuple `(A_list, X_list, labels_list)` of NumPy arrays with one entry
        per kept graph: pre-processed adjacency matrix, node features and
        one-hot node labels.

        **Raises**
        - `FileNotFoundError`: if the CSV files are not found in `data_dir`.
        """
        # `getattr` keeps the method usable on instances created without __init__.
        data_dir = getattr(self, "data_dir", self.DEFAULT_DATA_DIR)
        adjacency_path = os.path.join(data_dir, "adjacency.csv")
        features_path = os.path.join(data_dir, "node_features.csv")

        try:
            A_df = pd.read_csv(adjacency_path).dropna(how='any', axis=0)
            X_df = pd.read_csv(features_path).dropna(how='any', axis=0)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Could not find 'adjacency.csv' / 'node_features.csv' in '{}'. "
                "Pass the directory that contains them via "
                "PoiCategoryDataset(name, data_dir=...).".format(data_dir)
            ) from err

        print("Original number of graphs", len(A_df))
        userid = A_df['user_id'].tolist()[:max_samples]
        matrix_df = A_df['matrices'].tolist()
        temporal_df = X_df['matrices'].tolist()
        category_df = A_df['category'].tolist()
        # NOTE(review): rows of the two CSV files are paired purely by position
        # after each file had its NaN rows dropped independently -- this assumes
        # both files keep the same rows in the same order. TODO confirm.

        A_list = []
        X_list = []
        labels_per_graph = []
        count_nodes = 0

        for i in range(len(userid)):
            adjacency = json.loads(matrix_df[i])
            # Skip graphs that have fewer than two nodes.
            if len(adjacency) < 2:
                continue

            labels = np.array(json.loads(category_df[i]))

            # `float` replaces the `np.float` alias, which was removed from NumPy.
            node_features = np.array(json.loads(temporal_df[i])).astype(float)
            node_features = _normalize(node_features)
            adjacency = np.array(adjacency).astype(float)

            labels = np.array(np_utils.to_categorical(labels, num_classes=num_classes))
            labels_per_graph.append(labels)

            # Change the pre-processing based on the used message passing layer.
            adjacency = sk.layers.ARMAConv.preprocess(adjacency)
            count_nodes += len(adjacency)
            A_list.append(adjacency)
            X_list.append(node_features)

        print("Total of nodes: ", count_nodes)

        A_list = self._as_graph_array(A_list)
        X_list = self._as_graph_array(X_list)
        labels_list = self._as_graph_array(labels_per_graph)

        print("A: ", A_list.shape, " X: ", X_list.shape, " Labels: ", labels_list.shape)

        return A_list, X_list, labels_list

    def read(self):
        """
        Load the data and wrap every sample in a `spektral.data.Graph`.

        Graphs carry node features `x`, adjacency `a` and node labels `y`;
        there are no edge features, so `e` is always None.
        """
        a_list, x_list, labels = self.poi_dataset()
        print("Successfully loaded {}.".format(self.name))
        return [
            Graph(x=x, a=a, e=None, y=y)
            for x, a, y in zip(x_list, a_list, labels)
        ]


def _normalize(x, norm=None):
    """
    Apply one-hot encoding or z-score to a list of node features
    """
    if norm == "ohe":
        fnorm = OneHotEncoder(sparse=False, categories="auto")
    elif norm == "zscore":
        fnorm = StandardScaler()
    else:
        return x
    return fnorm.fit_transform(x)

In [4]:
from spektral.layers import ARMAConv, GraphMasking, GCNConv

class GNN(Model):
    """
    Two-layer ARMA graph network for node-level classification.

    The model expects inputs `[X, A]` as produced by a `BatchLoader` created
    with `mask=True`: the `GraphMasking` layer is applied to the node features
    first (it is meant for features that carry such a padding mask -- see the
    Spektral documentation), then two `ARMAConv` layers with dropout in
    between produce a softmax distribution over classes for every node.

    **Arguments**
    - `n_labels`: int, number of output classes (default 7).
    - `hidden_channels`: int, width of the first ARMA layer (default 16).
    - `**kwargs`: forwarded to `tensorflow.keras.Model`.
    """

    def __init__(self, n_labels=7, hidden_channels=16, **kwargs):
        super().__init__(**kwargs)
        self.mask = GraphMasking()
        self.conv1 = ARMAConv(
            hidden_channels,
            iterations=1,
            order=2,
            share_weights=True,
            dropout_rate=0.75,
            activation="elu",
            gcn_activation="elu",
            kernel_regularizer=l2(5e-5),
        )
        self.dropout = Dropout(0.6)
        # Output layer: one channel per class, softmax over classes.
        self.conv2 = ARMAConv(
            n_labels,
            iterations=1,
            order=1,
            share_weights=True,
            dropout_rate=0.75,
            activation="softmax",
            gcn_activation=None,
            kernel_regularizer=l2(5e-5),
        )

    def call(self, inputs):
        """Run the forward pass on `inputs = (X, A)` and return per-node class probabilities."""
        X_input, A_input = inputs
        X = self.mask(X_input)
        X = self.conv1([X, A_input])
        X = self.dropout(X)
        output = self.conv2([X, A_input])
        return output

if __name__ == '__main__':

    # NOTE(review): `n_samples` is passed on to the dataset constructor, but
    # `PoiCategoryDataset.read()` calls `poi_dataset()` with its default
    # `max_samples`, so this value does not change how many graphs are loaded.
    dataset = PoiCategoryDataset("PoICategoryDataset", n_samples=10000)

    # Parameters
    N = max(g.n_nodes for g in dataset)
    D = dataset.n_node_features  # Dimension of node features
    S = dataset.n_edge_features  # Dimension of edge features
    n_out = dataset.n_labels  # Dimension of the target

    print("Parameters")
    print(N, D, S, n_out)

    # Only NumPy's generator is seeded here: the split below is reproducible,
    # weight initialisation and dropout are not.
    np.random.seed(seed=1)
    # shuffle data, then split 80% / 10% / 10% into train / validation / test
    idxs = np.random.permutation(len(dataset))
    split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
    idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
    dataset_train = dataset[idx_tr]
    dataset_validation = dataset[idx_va]
    dataset_test = dataset[idx_te]

    # Training hyper-parameters, defined once and reused below.
    batch_size = 5  # Batch size
    epochs = 10
    learning_rate = 0.0001
    patience = 3

    # The data have already been shuffled, so the loaders keep the order.
    # Every loader yields `epochs` passes so that it never runs dry during fit.
    loader_tr = BatchLoader(dataset_train, epochs=epochs, batch_size=batch_size, mask=True, node_level=True,
                            shuffle=False)
    loader_va = BatchLoader(dataset_validation, epochs=epochs, batch_size=batch_size, mask=True, node_level=True,
                            shuffle=False)
    loader_te = BatchLoader(dataset_test, epochs=epochs, batch_size=batch_size, mask=True, node_level=True,
                            shuffle=False)

    model = GNN()
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt,
                  loss="categorical_crossentropy",
                  metrics=["acc"])

    model.fit(
        loader_tr.load(),
        steps_per_epoch=loader_tr.steps_per_epoch,
        epochs=epochs,
        validation_data=loader_va.load(),
        validation_steps=loader_va.steps_per_epoch,
        callbacks=[EarlyStopping(patience=patience,
                                 restore_best_weights=True)],
    )

    ################################################################################
    # Evaluate model
    ################################################################################
    print("Testing model")
    loss, acc = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch)
    print("Done. Test loss: {}. Test acc: {}".format(loss, acc))

FileNotFoundError: ignored