In [33]:
import pandas as pd
import numpy as np
import torch
import scipy.sparse as scsp
from sklearn.cluster import KMeans
from torch_geometric.data import Data
import torch.nn.functional as F
import torch.nn as nn
import torch_scatter

from torch_geometric.nn.conv import MessagePassing
import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax, degree

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score

# EDA

In [2]:
df_classes = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
df_edges = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
df_nodes = pd.read_csv('elliptic_bitcoin_dataset/elliptic_txs_features.csv', header=None)
df_nodes.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792


In [3]:
df_classes.head(3)

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown


In [4]:
df_classes['class'].unique()

array(['unknown', '2', '1'], dtype=object)

In [5]:
print(df_edges.shape)
df_edges.head(3)

(234355, 2)


Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870


In [6]:
df_nodes = df_nodes.rename(columns={1: "time step", 0:'txId'})
df_nodes.head(3)

Unnamed: 0,txId,time step,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792


In [20]:
selected_ids = df_classes.loc[(df_classes['class'] != 'unknown'), 'txId']
df_edges = df_edges.loc[df_edges['txId1'].isin(selected_ids)]
df_edges = df_edges.loc[df_edges['txId2'].isin(selected_ids)]
df_classes = df_classes.loc[df_classes['txId'].isin(selected_ids)]
df_nodes = df_nodes.loc[df_nodes['txId'].isin(selected_ids)]

In [8]:
df_nodes = pd.merge(df_classes, df_nodes)
df_nodes.head()

Unnamed: 0,txId,class,time step,2,3,4,5,6,7,8,...,157,158,159,160,161,162,163,164,165,166
0,232438397,2,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
1,232029206,2,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,...,-0.577099,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792
2,232344069,2,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
3,27553029,2,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
4,3881097,2,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984


In [21]:
df_edges.shape

(36624, 2)

In [16]:
df_nodes['class'] = df_nodes['class'].astype(int)

In [9]:
# Check if GPU is avaiable
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# Train

## Data Prep

In [10]:
# helper function that support time_step_split and community_split_transd function
def time_step_split_support(nodes, edges):
    """
    Split the graph and store node features, edges (represented by adjacency list),
    and labels separately by timestamp t (from 1 to 49).

    Args:
        nodes         A dataframe of the node features
        edges         A dataframe of the graph's adjacency list

    Returns:
        features_t    A list of (|N_t|, d) feature matrices by timestamp
        edge_indices  A list of (2, |E_t|) adjacency list by timestamp
        labels_t      A list of (|N_t|) labels by timestamp
    """

    features =  torch.FloatTensor(nodes.iloc[:, 2:].to_numpy())
    times = nodes.iloc[:, 2].to_numpy()
    times = torch.LongTensor(times.reshape(len(times),))
    labels = nodes.iloc[:, 1].to_numpy()
    labels = torch.LongTensor(labels.reshape(len(labels),))

    nodes_id = nodes.iloc[:, 0].to_numpy()
    nodes_id = torch.LongTensor(nodes_id.reshape(len(nodes_id),))

    min_t = torch.min(times) # 1
    max_t = torch.max(times) # 49
    
    # Construct nodes of the directed graph for each time step;
    # features by time step are stored in "features_t"; labels by
    # time step are stored in "labels_t"
    features_t = []
    labels_t = []
    
    # Create a dictionary where
    # <key, value> = <node_id, <<idx, node_index_in_time_t_subgraph>, <t, time_t>>>.
    id2idx = {}
    for t in range(min_t, max_t + 1):
        features_t.append(features[times == t, :])
        labels_t.append(labels[times == t])
        nodes_t = nodes_id[times == t]
        for i in range(nodes_t.shape[0]):
            id2idx[nodes_t[i].item()] = {}
            id2idx[nodes_t[i].item()]['idx'] = i
            id2idx[nodes_t[i].item()]['t'] = t

    # Construct adjacency lists of the directed graph (non-symmetric) for each time step;
    # adjacency lists for each time step are stored in "edge_indices".
    edge_idx_t = [[] for _ in range(min_t, max_t + 1)]
    for index in range(edges.shape[0]):   
        node1_t = id2idx[edges.iloc[index, 0]]['t']
        node1_idx = id2idx[edges.iloc[index, 0]]['idx']
        node2_t = id2idx[edges.iloc[index, 1]]['t']
        node2_idx = id2idx[edges.iloc[index, 1]]['idx']
        edge_idx_t[node1_t - 1].append([node1_idx, node2_idx]) # time_step starts from 1

    edge_indices = [torch.LongTensor(edge_idx_t[i]).t() for i in range(len(edge_idx_t))]
    return features_t, edge_indices, labels_t

In [11]:
def time_step_split(nodes, edges, device, train_lt = 31, val_lt = 36, test_lt = 49):
    """
    Create and return the training, validation, and test set, splitted by time step,
    where each subgraph at time t is considered as an input of GCN model.

    Args:
        nodes         A dataframe of the node features
        edges         A dataframe of the graph's adjacency list
        device        Computing device
        train_lt      The last time step index of training set
        val_lt        The last time step index of validation set
        test_lt       The last time step index of test set

    Returns:
        data          A dictionary that stores training, validation, and test set,
                        each value is a list of Data object
        graph_info    A matrix where each row contains information of the time-step subgraph
                      [time_step, num_of_nodes, num_of_edges, num_of_illicit_nodes]
    """
    features_t, edge_indices, labels_t = time_step_split_support(nodes, edges)

    graph_info = np.zeros((len(labels_t), 4), dtype = np.int64)
    for t in range(len(labels_t)):
        graph_info[t, :] = np.array([t, features_t[t].shape[0], edge_indices[t].shape[1],
                                     labels_t[t][labels_t[t] == 1].shape[0]])

    train_idx, val_idx, test_idx = [np.arange(train_lt), np.arange(train_lt, val_lt),
                                    np.arange(val_lt, test_lt)]
    train_list = [Data(x = features_t[idx], edge_index = edge_indices[idx],
                       y = labels_t[idx]).to(device) for idx in train_idx ]
    val_list = [Data(x = features_t[idx], edge_index = edge_indices[idx],
                     y = labels_t[idx]).to(device) for idx in val_idx ]
    test_list = [Data(x = features_t[idx], edge_index = edge_indices[idx],
                      y = labels_t[idx]).to(device) for idx in test_idx ]
    data = {}
    data['train'] = train_list
    data['val'] = val_list
    data['test'] = test_list

    return data, graph_info

In [22]:
node_sum = df_nodes.shape[0]
data, graph_info = time_step_split(df_nodes, df_edges, device)
train_node_size = np.sum(graph_info[0: 31, 1]) / node_sum
test_node_size = np.sum(graph_info[36:, 1]) / node_sum

In [23]:
def laplacian(A, alpha = 0.1):
    """
    Returns the Laplacian matrix of the given adjacency matrix. For the directed
    acyclic graph (not connected) with adjacency matrix A, we define a modified
    Laplacian matrix as follows:
            A_tilde = (1 - alpha) * (A + A^T) + alpha * 11^T
            L = I - D_tilde^{-1/2} A_tilde D_tilde^{-1/2}
    Args:
        A             Adjacency matrix of certain graph
        alpha         Smoothing constant that prevents isolated nodes

    Returns:
        L             Modified Laplacian matrix of the adjacency matrix A
    """
    # A is sparse, csr format
    A = (1 - alpha) * (A + A.T) + alpha * scsp.csr_matrix(np.outer(np.ones(A.shape[0]), np.ones(A.shape[0])))
    D = scsp.diags(np.asarray(np.sum(A, axis = 0)).reshape(-1) ** (-1/2))
    L = scsp.diags(np.ones(A.shape[0])) - D @ A @ D
    return L

def adj_list_to_mtx(n, edge_index):
    """
    Create a csr-format adjacency matrix by the given adjacency list.
    Args:
        n             The number of nodes in the graph
        edge_index    The (2, |E|) adjacency list of the graph

    Returns:
        a csr-format adjacency matrix
    """
    edge_mtx = np.zeros((n, n))
    for i in range(edge_index.shape[1]):
        node1 = int(edge_index[0, i])
        node2 = int(edge_index[1, i])
        edge_mtx[node1, node2] = 1

    return scsp.csr_matrix(edge_mtx)

def nearest_sum(arr, target):
    """ 
    Get a combination of numbers in the given array that sums nearest to 
    the target number.
    
    Args:
        arr      The given array
        target   The target number

    Returns:
        resid           The residual between the summation and the target number
        elt_idx_list    The indices of the subarray for summation
    """

    n = len(arr)
    opt_arr = np.zeros((n + 1, target + 1))
    for i in range(1, n + 1):
        opt_arr[i, :] = opt_arr[i-1, :]
        for j in np.arange(target, 0, step = -1):
            if opt_arr[i, j] > 0 and j + arr[i - 1] <= target:
                opt_arr[i, j + arr[i - 1]] += 1
        opt_arr[i, arr[i - 1]] += 1
    
    elt_list = []
    elt_idx_list = []
    target_sum = target
    idx = n
    if (opt_arr[idx, target] == 0):
        while opt_arr[idx, target_sum] == 0:
            print(target_sum)
            target_sum -= 1
    resid = target - target_sum

    while (idx > 0 and target_sum != 0):
        if (opt_arr[idx, target_sum] - opt_arr[idx - 1, target_sum] > 0):
            elt_list.append(arr[idx - 1])
            elt_idx_list.append(idx - 1)
            target_sum -= arr[idx - 1]
        idx = idx - 1
    return resid, elt_idx_list

In [24]:
def community_split_transd(nodes, edges, train_size, test_size, device):
    """
    Create and return the training, validation, and test set by merging small
    clusters of the graphs. Keep edge_index known for all sets.

    Args:
        nodes         A dataframe of the node features
        edges         A dataframe of the graph's adjacency list
        train_size    The node size proportion in training set
        test_size     The node size proportion in test set

    Returns:
        data          A Data object that stores node features, edge_index, and labels
        dict          A dictionary that stores training, validation, test set node indices
    """
    cluster_num = 500
    features_t, edge_indices, labels_t = time_step_split_support(nodes, edges)

     # Construct the features, labels, and edge_index
    features =  torch.FloatTensor(nodes.iloc[:, 2:].to_numpy())
    labels = nodes.iloc[:, 1].to_numpy()
    labels = torch.LongTensor(labels.reshape(len(labels),))
    nodes_id = nodes.iloc[:, 0].to_numpy()
    # Create a dictionary that maps nodeId to index in the dataframe.
    id2idx = {}
    for i in range(nodes.shape[0]):
        id2idx[nodes.iloc[i, 0]] = i
    # Construct edge_index with same node indexing as in features and labels
    edge_idx = np.zeros((2, edges.shape[0]), dtype = np.int64)
    for index in range(edges.shape[0]):   
        node1 = id2idx[edges.iloc[index, 0]]
        node2 = id2idx[edges.iloc[index, 1]]
        edge_idx[:, index] = [node1, node2]
    edge_index = torch.LongTensor(edge_idx)

    # Perform spectral clustering on the entire graph.
    # Since the entire graph's adjacency matrix A can be written as a block
    # diagonal matrix (blocked by time steps), we can recreate the eigenvalues
    # and eigenvectors of A by the eigenvalues and eigenvectors of blocks A_1,
    # A_2, ..., A_49 of A.
    t = 0
    eval_dict = {}
    node_num = []

    for t in range(49): # max_t = 49
        n = features_t[t].shape[0]
        A = adj_list_to_mtx(n, edge_indices[t])
        L = laplacian(A)
        evals, evecs = scsp.linalg.eigsh(L, k = n // 40, which = 'SM')

        for i in range(evals.shape[0]):
            eval_dict[evals[i]] = [t, i, evecs[:, i]]
        node_num.append(n)
    
    # 'node_blk' store node indices that mark time group separation.
    node_blk = np.insert(np.cumsum(node_num), 0, 0)
    # Block diagonal matrix has the first number-of-block (i.e., 49) smallest
    # eigenvalues to be 0.
    small_evals = np.sort(np.array([*eval_dict]))[49: (49 + cluster_num)]

    node_mtx = np.zeros((node_blk[-1], cluster_num))
    for i in range(small_evals.shape[0]):
        eval = small_evals[i]
        t, _, evec = eval_dict[eval]
        node_mtx[node_blk[t]: node_blk[t + 1], i] = evec
    
    # Use K-means algorithm to create certain number of clusters (e.g., 500).
    kmeans = KMeans(n_clusters = cluster_num, init = 'random', random_state = 42,
                    n_init = 3, max_iter = 10).fit(node_mtx)
    
    comm_count = np.bincount(kmeans.labels_)

    # Split and merge the clusters into three sets by the given number of nodes
    # in each set.
    node_num = nodes.shape[0]
    train_num = int(np.round(node_sum * train_size))
    val_num = int(np.round(node_sum * (1 - train_size - test_size)))

    train_num_resid, train_clust_idx = nearest_sum(comm_count, train_num)
    val_test_comm_count = np.delete(comm_count, train_clust_idx)
    val_test_clust_idx = np.delete(np.arange(500), train_clust_idx)

    val_num_resid, val_clust_idx_tmp = nearest_sum(val_test_comm_count, val_num)
    val_clust_idx = val_test_clust_idx[val_clust_idx_tmp]
    test_clust_idx = np.delete(np.arange(500), np.hstack((train_clust_idx, val_clust_idx)))

    # Split node indices by their clusters into three datasets.
    train_idx = []
    val_idx = []
    test_idx = []

    train_clust_set = set(train_clust_idx)
    val_clust_set = set(val_clust_idx)
    test_clust_set = set(test_clust_idx)
    for i in range(len(kmeans.labels_)):
        if kmeans.labels_[i] in train_clust_set:
            train_idx.append(i)
        elif kmeans.labels_[i] in val_clust_set:
            val_idx.append(i)
        else:
            test_idx.append(i)

    if train_num_resid > 0:
        train_idx.append(test_idx[-train_num_resid:])
        test_idx = test_idx[:-train_num_resid]
    if val_num_resid > 0:
        val_idx.append(test_idx[-val_num_resid:])
        test_idx = test_idx[:-val_num_resid]

    train_idx = np.array(train_idx)
    val_idx = np.array(val_idx)
    test_idx = np.array(test_idx)
    
    # Construct the training, validation, test set and store
    # in a dictionary, 'data'.
    data = Data(x = features, edge_index = edge_index, y = labels).to(device)
    
    return data, {'train': train_idx, 'val': val_idx, 'test': test_idx}

In [25]:
data5, split_idx5 = community_split_transd(df_nodes, df_edges, train_size = train_node_size,
                                    test_size = test_node_size, device = device)
data5

Data(x=[46564, 166], edge_index=[2, 36624], y=[46564])

In [37]:
def random_split_transd(nodes, edges, train_size, test_size, device, seed = 42):
    """
    Create and return the training, validation, and test set by randomly splitting
    the node indices to these three sets. Keep edge_index known for all sets.

    Args:
        nodes         A dataframe of the node features
        edges         A dataframe of the graph's adjacency list
        train_size    The node size proportion in training set
        test_size     The node size proportion in test set
        device        Computing device
        seed          Random seed for data splitting

    Returns:
        data          A Data object that stores node features, edge_index, and labels
        dict          A dictionary that stores training, validation, test set node indices
    """
    features =  torch.FloatTensor(nodes.iloc[:, 2:].to_numpy())
    labels = nodes.iloc[:, 1].to_numpy()
    labels = torch.LongTensor(labels.reshape(len(labels),))
    nodes_id = nodes.iloc[:, 0].to_numpy()

    # Create a dictionary that maps nodeId to index in the dataframe.
    id2idx = {}
    for i in range(nodes.shape[0]):
        id2idx[nodes.iloc[i, 0]] = i

    # Construct edge_index with same node indexing as in features and labels
    edge_idx = np.zeros((2, edges.shape[0]), dtype = np.int64)
    for index in range(edges.shape[0]):   
        node1 = id2idx[edges.iloc[index, 0]]
        node2 = id2idx[edges.iloc[index, 1]]
        edge_idx[:, index] = [node1, node2]
    edge_index = torch.LongTensor(edge_idx)

    train_index, test_index = train_test_split(np.arange(labels.shape[0]),
                                               test_size = 1 - train_size,
                                               random_state = 42)
    val_index, test_index = train_test_split(test_index,
                                             test_size = test_size / (1 - train_size),
                                             random_state = 42)

    # Construct the training, validation, test set and store
    # in a dictionary, 'data'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data = Data(x = features, edge_index = edge_index, y = labels).to(device)
                  
    return data, {'train': train_index, 'val': val_index, 'test': test_index}

In [38]:
train_node_size = np.sum(graph_info[0: 31, 1]) / node_sum
test_node_size = np.sum(graph_info[36:, 1]) / node_sum
data3, split_idx3 = random_split_transd(df_nodes, df_edges, train_size = train_node_size,
                                        test_size = test_node_size, device = device)
data3

Data(x=[46564, 166], edge_index=[2, 36624], y=[46564])

## GNN

In [28]:
class GCNLayer(MessagePassing):
    def __init__(self, in_channels, out_channels, bias = True, 
                 directed = False, self_loop = True, **kwargs):
        """
        Initialize a GCN layer.
        Args:
            in_channels      In-channel dimension of node embeddings
            out_channels     Out-channel dimension of node embeddings
            bias             A boolean value determining whether we add a
                                learnable bias term in linear transformation
            directed         A boolean value determining whether we use directed
                                message passing D^{-1}A or use symmetric normalized
                                adjacency matrix D^{-1/2}AD^{-1/2}
            self_loop        A boolean value determining whether we add a self-
                                loop for each node
        """
        super(GCNLayer, self).__init__(**kwargs, aggr = 'add')

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.directed = directed
        self.self_loop = self_loop

        # Define the layers needed for the message and update functions below.
        # self.lin is the linear transformation that we apply to the embedding.
        self.lin = nn.Linear(self.in_channels, self.out_channels, bias = bias)
        
        self.reset_parameters()

    def reset_parameters(self):
        """
        Reset all learnable parameters in the linear transformation.
        """
        self.lin.reset_parameters()

    def forward(self, x, edge_index, edge_weight):
        """
        Produce a forward propagation of GCN layer.
        
        Args:
            x             The node embedding
            edge_index    The (2, |E|) adjacency list of the graph
            edge_weight   The (|E|) vector specifying the edge weights in the graph
                            (for unweighted graph, edge weight is 1)
        
        Returns:
            An updated node embedding
        """
        # Add self-loops to the adjacency matrix.
        if self.self_loop:
            edge_index, _ = add_self_loops(edge_index, num_nodes = x.size(0))
            edge_weight = torch.cat((edge_weight, torch.ones(x.size(0))), dim = -1)
        
        # Apply linear transformation on node features.
        x = self.lin(x)

        # Compute normalization by updated node degree.
        if self.directed:
            row , _ = edge_index
            deg = degree(row, x.size(0), dtype = x.dtype) # only out-degree
            deg_inv = deg.pow(-1)
            deg_inv[deg_inv == float('inf')] = 0
            norm = deg_inv[row]
        else:
            row, col = edge_index
            deg = degree(col, x.size(0), dtype = x.dtype)
            deg_inv_sqrt = deg.pow(-0.5)
            deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
            norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        return self.propagate(edge_index, x = (x, x), norm = norm, edge_weight = edge_weight)

    def message(self, x_j, edge_weight, norm):
        """
        Send the message of the neighboring node (i.e., x_j) to the source node (i.e., x_i).
        
        Args:
            x_j           The embedding of the neighboring node of source node x_i
            edge_weight   The edge weight of certain edge
            norm          Normalization constant determined by self.directed
        
        Returns:
            A message sending from the neighboring node to the source node
        """
        return norm.view(-1, 1) * x_j * edge_weight.view(-1, 1)

In [29]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds = False):
        """
        Initialize a GCN model.
        Args:
            input_dim       Input dimension of node embeddings
            hidden_dim      Hidden dimension of node embeddings
            output_dim      Output dimension of node embeddings
            num_layers      The number of GCN layers
            dropout         The dropout ratio in (0, 1]
                              (dropout: the probability of an element getting zeroed)
            return_embeds   A boolean value determining whether we skip the
                              classification layer and return node embeddings
        """

        super(GCN, self).__init__()

        # Construct all convs
        self.num_layers = num_layers
        self.convs = torch.nn.ModuleList([GCNLayer(hidden_dim, hidden_dim, directed = False) 
                                                        for i in range(self.num_layers-1)])

        # Construct batch normalization
        self.bns = torch.nn.ModuleList([torch.nn.BatchNorm1d(hidden_dim)
                                        for i in range(self.num_layers-1)])
        # First GCN layer
        self.convs[0] = GCNLayer(input_dim, hidden_dim, directed = False)
        # Last GCN layer
        self.last_conv = GCNLayer(hidden_dim, output_dim, directed = False)
        self.softmax = torch.nn.LogSoftmax(dim = -1)
        
        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """
        Reset all learnable parameters in GCN layers and Batch Normalization
        Layers.
        """
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, edge_index):
        """
        Produce a forward propagation of GCN model. Before the last GCN layer,
        we transform the embedding (x) in the following sequence:
          x -> GCN_Layer -> Batch_Norm -> ReLU -> Dropout.
        At the last GCN layer, the following sequence is applied:
          x -> GCN Layer -> Softmax -> output.
        
        Args:
            x             The node embedding
            edge_index    The adjacency list of the graph
        
        Returns:
            out           The predictions of labels / the updated node embedding
        """
        x = torch.clone(x.detach())
        for l in range(self.num_layers - 1):
            # Unweighted graph has weight 1.
            x = self.convs[l](x, edge_index, torch.ones(edge_index.shape[1]))
            x = self.bns[l](x)
            x = F.relu(x)
            x = F.dropout(x, p = self.dropout, training = self.training)

        x = self.last_conv(x, edge_index, torch.ones(edge_index.shape[1]))
        if self.return_embeds:
            out = x
        else:
            out = self.softmax(x)

        return out

In [30]:
args = {
    'device': device,
    'num_layers': 2,
    'hidden_dim': 256,
    'dropout': 0.5,
    'lr': 0.01,
    'epochs': 50,
    'label_weight': torch.Tensor([0.5, 0.5])
}
args

{'device': 'cpu',
 'num_layers': 2,
 'hidden_dim': 256,
 'dropout': 0.5,
 'lr': 0.01,
 'epochs': 50,
 'label_weight': tensor([0.5000, 0.5000])}

In [31]:
model = GCN(data5.x.shape[1], args['hidden_dim'],
            2, args['num_layers'], args['dropout']).to(device)

In [34]:
def train_transd(model, data, train_idx, optimizer, loss_fn):
    """
    Train the model by using the given optimizer and loss_fn.
    
    Args:
        model       The GCN model
        data        The Data object that stores x, edge_index, and labels
        train_idx   The node indices in the training set
        optimizer   The optimizer
        loss_fn     The loss function

    Returns
        The prediction loss by the given loss function
    """
    model.train()
    loss = 0

    optimizer.zero_grad()
    train_slice = model.forward(data.x, data.edge_index)[train_idx]
    train_label = data.y[train_idx]
    loss = loss_fn(train_slice, train_label)

    loss.backward()
    optimizer.step()
    return loss.item()

In [35]:
@torch.no_grad()
def test_transd(model, data, split_idx, save_model_results=False):
    """
    Test the model by using the given split_idx.

    Args:
        model                 The GCN model
        data                  The Data object that stores x, edge_index, and labels
        split_idx             A dictionary that stores node indices for three sets
        save_model_results    A boolean determining whether we save the model results

    Returns
        The accuracy and auc-roc score of training, validation, and test set
    """

    model.eval()
    # The output of model on all data
    out = model.forward(data.x, data.edge_index)
    
    train_index = split_idx['train']
    val_index = split_idx['val']
    test_index = split_idx['test']

    y_pred = out.argmax(dim=-1, keepdim=True)
    train_acc = classification_report(torch.unsqueeze(data.y[train_index], -1),
                                      y_pred[train_index], zero_division=0)
    valid_acc = classification_report(torch.unsqueeze(data.y[val_index], -1),
                                      y_pred[val_index], zero_division=0)
    valid_accuracy = classification_report(torch.unsqueeze(data.y[val_index], -1),
                                      y_pred[val_index], output_dict=True,
                                      zero_division=0)['accuracy']
    test_acc = classification_report(torch.unsqueeze(data.y[test_index], -1),
                                     y_pred[test_index], zero_division=0)
    train_auc_roc = roc_auc_score(torch.unsqueeze(data.y[train_index], -1),
                                      y_pred[train_index])
    val_auc_roc = roc_auc_score(torch.unsqueeze(data.y[val_index], -1),
                                      y_pred[val_index])
    test_auc_roc = roc_auc_score(torch.unsqueeze(data.y[test_index], -1),
                                      y_pred[test_index])
    
    if save_model_results:
        print ("Saving Model Predictions")

        data = {}
        data['y_pred'] = y_pred.view(-1).cpu().detach().numpy()

        df = pd.DataFrame(data=data)
        # Save locally as csv
        df.to_csv('gcn_transd.csv', sep=',', index=False)
    
    return train_acc, valid_acc, test_acc, \
           train_auc_roc, val_auc_roc, test_auc_roc

In [40]:
import copy

model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = torch.nn.NLLLoss(weight=args['label_weight'])

best_model = None
best_valid_auc = 0
best_result = None
losses = []

for epoch in range(1, 1 + args["epochs"]):
    # train with random split
    # loss = train_transd(model, data3, split_idx3['train'], optimizer, loss_fn)
    # losses.append(loss)
    result = test_transd(model, data3, split_idx3)
    train_acc, val_acc, test_acc, train_auc, val_auc, test_auc = result 
    if val_auc > best_valid_auc:
        best_valid_auc = val_auc
        best_model = copy.deepcopy(model)
        best_result = [train_acc, val_acc, test_acc, train_auc, val_auc, test_auc]

    print('Epoch: {:02},'.format(epoch),
        #   'Loss:{:.4f}'.format(loss),
          'Train:\n{}\n'.format(train_acc),
          'Train_auc_roc: {}'.format(train_auc),
          '\n\n'
          'Valid:\n{}\n'.format(val_acc),
          'Val_auc_roc: {}'.format(val_auc),
          '\n\n'
          'Test:\n{}\n'.format(test_acc),
          'Test_auc_roc: {}'.format(test_auc),
          '\n'
          )
  
  

Epoch: 01, Train:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.04      0.03      0.03      2700
           2       0.00      0.00      0.00     24915

    accuracy                           0.00     27615
   macro avg       0.01      0.01      0.01     27615
weighted avg       0.00      0.00      0.00     27615

 Train_auc_roc: 0.5217677882578544 

Valid:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.06      0.04      0.05       507
           2       0.00      0.00      0.00      4821

    accuracy                           0.00      5328
   macro avg       0.02      0.01      0.02      5328
weighted avg       0.01      0.00      0.00      5328

 Val_auc_roc: 0.5162626772171552 

Test:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.05   