In [1]:
%matplotlib inline

## 0. Dependencies installation

In [None]:
!pip install numpy
!pip install matplotlib
!pip install torch
!pip install networkx
!pip install gensim

DGL installation: https://www.dgl.ai/pages/start.html

## 1. NetworkX

NetworkX allows users to create, manipulate, and analyze graph structures using a variety of graph types, including directed, undirected, weighted, and bipartite graphs. It also provides a variety of algorithms for analyzing graph properties, such as centrality, clustering, connectivity, and paths.

In [2]:
import networkx as nx 

In [16]:
G = nx.Graph()
# create an empty undirected graph (no nodes or edges have been added yet)
print(G)
# print the graph's string representation

In [17]:
nx.draw(G)
# draw the graph; G has no nodes yet at this point, so there is nothing to show

In [14]:
# nodes can be added one at a time (add_node) or in bulk from a list (add_nodes_from)
G.add_node(1)
G.add_nodes_from([2, 3, 4])
# edges follow the same pattern: a single (u, v) pair or a list of pairs
G.add_edge(1, 2)
G.add_edges_from([(1, 3), (2, 3), (4, 2)])
# after this cell the graph has nodes 1-4 and edges 1-2, 1-3, 2-3, 4-2

In [18]:
# print the graph again now that nodes and edges have been added
print(G)

In [20]:
# draw the populated graph with the default options
nx.draw(G)

In [22]:
nx.draw(G, with_labels=True)
# draw graph with node labels (with_labels=True writes each node id on its marker)

In [26]:
# G.nodes() and G.edges() return views over the current nodes and edges of the graph
print(f"Graph nodes: {G.nodes()}")
print(f"Graph edges: {G.edges()}")

In [30]:
# neighbors() returns an iterator, so it is materialised into a list before printing
neighbors = list(G.neighbors(1))
print(f"Neighbors of node 1: {neighbors}")
# get the neighbors of a particular node using the neighbors() method

In [35]:
# G.graph is a plain dict of graph-level attributes
G.graph['name'] = 'Graph'
print(f"Graph attributes: {G.graph}")
# adding attributes to the graph

In [41]:
# G.nodes[n] gives the attribute dict of node n
G.nodes[1]['label'] = 'Node 1'
print(f"Node 1 attributes: {G.nodes[1]}")
# adding attributes to the node

In [52]:
# G.edges[u, v] gives the attribute dict of the edge between u and v
G.edges[1,3]['label'] = 'Edge 1,3'
print(f"Edge attributes: {G.edges[1,3]}")
# adding attributes to the edge

In [53]:
nx.write_edgelist(G, 'graph.txt')
# create a text file graph.txt (in the current working directory) with one graph edge per line

In [54]:
G = nx.read_edgelist('graph.txt', nodetype=int)
# import the graph from the file using read_edgelist(); nodetype=int converts the node ids
# back to integers, which would otherwise be read as strings ('1', '2', ...).
# Only the edges stored in the file are restored, so the graph- and node-level
# attributes set in the earlier cells are not part of the reloaded graph.

## 2. Dataset

This section uses the CiteSeer citation graph. Nodes represent papers and edges represent citation relationships. Each node has a predefined feature vector with 3703 dimensions. The dataset is designed for the node classification task: predicting the category of a given paper.

**Statistics:**

- Nodes: 3327
- Edges: 9228
- Number of Classes: 6

**Label split:**

- Train: 120
- Valid: 500
- Test: 1000

In [9]:
import dgl
from dgl.data import CiteseerGraphDataset

citeseer_dataset = CiteseerGraphDataset()
# take the graph stored at index 0 of the dataset
citeseer_graph = citeseer_dataset[0]
num_class = citeseer_dataset.num_classes

# get node feature (one row per node)
feat = citeseer_graph.ndata['feat']

# get data split: per-node masks, used below to index node ids, labels and features
train_mask = citeseer_graph.ndata['train_mask']
valid_mask = citeseer_graph.ndata['val_mask']
test_mask = citeseer_graph.ndata['test_mask']

# get labels (one class id per node)
label = citeseer_graph.ndata['label']

In [6]:
# node ids and their class labels, taken from the DGL graph
node_list = citeseer_graph.nodes()
node_labels = citeseer_graph.ndata['label']

# index ids, labels and features with the train / validation / test masks
# (the loop variable of a generator expression does not leak into the notebook namespace)
train_node_list, valid_node_list, test_node_list = (
    node_list[mask] for mask in (train_mask, valid_mask, test_mask)
)

train_node_labels, valid_node_labels, test_node_labels = (
    node_labels[mask] for mask in (train_mask, valid_mask, test_mask)
)

train_feat, valid_feat, test_feat = (
    feat[mask] for mask in (train_mask, valid_mask, test_mask)
)

Converting the DGL graph to a NetworkX graph

In [7]:
# convert the DGL graph to a NetworkX graph; no node/edge attribute lists are passed,
# so only the structure is requested here
nx_citeseer_graph = dgl.to_networkx(citeseer_graph)

In [10]:
# print the string representation of the converted graph
print(nx_citeseer_graph)

## 3. Baseline

In [401]:
from catboost import CatBoostClassifier

# create a CatBoostClassifier model (default hyperparameters); this baseline uses
# only the node features and ignores the graph structure
model = CatBoostClassifier()

# train the model with the train set; the validation set is passed as eval_set and
# early_stopping_rounds=10 is requested, with progress logged via verbose=10
model.fit(train_feat.numpy(), train_node_labels.numpy(), 
          eval_set=(valid_feat.numpy(), valid_node_labels.numpy()), 
          early_stopping_rounds=10, verbose=10)

# make predictions on the test data
predicted_labels = model.predict(test_feat.numpy())

In [400]:
from sklearn.metrics import accuracy_score

# fraction of test nodes whose predicted class equals the true class
accuracy = accuracy_score(test_node_labels, predicted_labels)
print(f"Accuracy score: {accuracy}")

In [399]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix of true test labels against the current predictions
cm = confusion_matrix(test_node_labels, predicted_labels)
classes = range(num_class)
# divide every row by its row total so that each row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1).reshape(-1, 1)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(
    cm_normalized,
    xticklabels=classes,
    yticklabels=classes,
    annot=True,
    fmt='.2f',
    cmap='Blues',
)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

plt.show()

## 4. Node2Vec

Node2Vec is a two-step representation learning algorithm:

- Use random walks to generate sentences from a graph. A sentence is a list of node ids. The set of all sentences makes a corpus.

- The corpus is then used to learn an embedding vector for each node in the graph. Each node id is considered a unique word/token in a dictionary that has size equal to the number of nodes in the graph. The Word2Vec algorithm is used for calculating the embedding vectors.

In [11]:
import networkx as nx
import numpy as np
from gensim.models import Word2Vec

  "class": algorithms.Blowfish,


In [14]:
import gensim
# show the installed gensim version (the Word2Vec call further down uses the
# `vector_size` keyword)
gensim.__version__ 

We can start training Node2Vec. We first define the parameters of the algorithm, such as the dimensionality of the embedding, the length of random walks, and the number of walks from each node.

In [13]:
# Node2Vec parameters
walk_length = 100  # maximum number of nodes in a single random walk
num_walks = 10     # number of walks started from every node

In [15]:
import networkx as nx
import numpy as np

from tqdm import tqdm

def generate_random_walks(graph, num_walks, walk_length, p=1.0, q=1.0):
    """Generate node2vec-style biased random walks over ``graph``.

    Args:
        graph: object exposing ``nodes()`` and ``neighbors(node)``
            (a NetworkX graph in this notebook).
        num_walks: number of passes over all nodes; every pass starts one walk per node.
        walk_length: maximum number of nodes in a walk, the start node included.
        p: return parameter, forwarded to ``_select_next_node``.
        q: in-out parameter, forwarded to ``_select_next_node``.

    Returns:
        A list of walks; each walk is a list of node ids. A walk is shorter than
        ``walk_length`` when it reaches a node without neighbors.
    """
    all_walks = []

    for _ in tqdm(range(num_walks)):
        for start in graph.nodes():
            path = [start]
            while len(path) < walk_length:
                here = path[-1]
                candidates = list(graph.neighbors(here))

                # dead end: nothing to step to, so this walk ends early
                if not candidates:
                    break

                if len(path) == 1:
                    # there is no previous node yet, so the first step is uniform
                    step = np.random.choice(candidates)
                else:
                    step = _select_next_node(graph, here, path[-2], candidates, p, q)
                path.append(step)

            all_walks.append(path)

    return all_walks

def _select_next_node(graph, current_node, previous_node, neighbors, p, q):
    weights = []
    for neighbor in neighbors:
        if neighbor == previous_node:
            weights.append(1 / p)
        elif graph.has_edge(neighbor, previous_node):
            weights.append(1)
        else:
            weights.append(1 / q)
    
    probabilities = np.array(weights) / sum(weights)
    next_node = np.random.choice(neighbors, p=probabilities)
    
    return next_node



# random walk generation: num_walks walks per node with p = 0.5 and q = 2.0, i.e.
# stepping back to the previous node gets weight 1/p = 2 and moving to a node that
# has no edge to the previous node gets weight 1/q = 0.5
walks = generate_random_walks(nx_citeseer_graph, num_walks, walk_length, p = 0.5, q = 2.0)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:05<00:00,  6.53s/it]


In [16]:
# fit Word2Vec on the walks (each walk is one "sentence", each node id one "word");
# sg = 1 selects skip-gram and min_count = 0 keeps every node in the vocabulary
model = Word2Vec(
    walks,
    vector_size = 128,
    window = 5,
    min_count = 0,
    sg = 1,
    workers = 2,
)

# stack the learned vectors into one array, row i being the embedding of node id i
node_embeddings = np.array(
    [model.wv[node_id] for node_id in range(nx_citeseer_graph.number_of_nodes())]
)

In [17]:
# pick the embedding rows of the train / validation / test node ids
train_embeddings, valid_embeddings, test_embeddings = (
    node_embeddings[node_ids]
    for node_ids in (train_node_list, valid_node_list, test_node_list)
)

In [18]:
from sklearn.linear_model import LogisticRegressionCV

# create a cross-validated logistic-regression classifier
# (Cs = 10 regularisation strengths, cv = 10 folds, accuracy as the selection metric)
classifier = LogisticRegressionCV(Cs = 10, cv = 10, 
                                  scoring = "accuracy", 
                                  verbose = False,
                                  multi_class="ovr", 
                                  max_iter=200)

# train the classifier on the train set (Node2Vec embeddings as features)
classifier.fit(train_embeddings, train_node_labels.numpy())

# predict the labels for the test set
predicted_labels = classifier.predict(test_embeddings)

In [165]:
from sklearn.metrics import accuracy_score

# test accuracy of the logistic regression trained on the Node2Vec embeddings
accuracy = accuracy_score(test_node_labels, predicted_labels)
print(f"Accuracy score: {accuracy}")

In [164]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix of true test labels against the current predictions
cm = confusion_matrix(test_node_labels, predicted_labels)
classes = range(num_class)
# divide every row by its row total so that each row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1).reshape(-1, 1)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(
    cm_normalized,
    xticklabels=classes,
    yticklabels=classes,
    annot=True,
    fmt='.2f',
    cmap='Blues',
)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

plt.show()

Implementation of node2vec: https://github.com/eliorc/node2vec

## 5. DGL

DGL (Deep Graph Library) is a library for deep graph learning. It provides a high-level API for creating, processing and training graph neural networks.

In [5]:
import dgl

In DGL, a graph is created from an adjacency list given as two arrays of node IDs (the source and the destination of each edge). Node and edge features are stored in the dictionary-like attributes `ndata` and `edata`.

In [6]:
import dgl
import torch

# create a graph with 3 nodes from (source ids, destination ids): edges 0->1, 1->2, 2->0
g = dgl.graph(([0, 1, 2], [1, 2, 0]))
# add node features (one row per node, 2 values each)
g.ndata['feat'] = torch.tensor([[1.0, 2.0], [2.0, 4.0], [3.0, 5.0]])
# add edge features (one scalar per edge, in the order the edges were given)
g.edata['weight'] = torch.tensor([0.5, 1.0, 2.0])

In [10]:
# display the graph object (the last expression of a cell is rendered as its output)
g

In [11]:
# display the node feature tensor stored under 'feat'
g.ndata['feat']

In [12]:
# display the edge feature tensor stored under 'weight'
g.edata['weight']

DGL provides various convolution operations on graphs to handle node and edge information:

In [40]:
# random features for the 5-node / 4-edge graph built in the next cell
node_features = torch.randn(5, 10)  # 5 rows of 10 values
edge_features = torch.randn(4, 5)   # 4 rows of 5 values

In [41]:
# directed edges 0->1, 2->3, 4->1, 3->0; nodes 2 and 4 have no incoming edge
g = dgl.graph(([0, 2, 4, 3], [1, 3, 1, 0]))

# add node features
g.ndata['feat'] = node_features

# add edge features
g.edata['weight'] = edge_features

In [45]:
# display the node feature tensor stored under 'feat'
g.ndata['feat']

In [46]:
# display the edge feature tensor stored under 'weight'
g.edata['weight']

In [47]:
import dgl.nn as dglnn

# creating a GraphConv layer (10 input features -> 16 output features)
conv = dglnn.GraphConv(in_feats=10, out_feats=16)

# applying convolution to a graph
features = g.ndata['feat']
try:
    g.ndata['h'] = conv(g, features)
except dgl.DGLError as err:
    # Nodes 2 and 4 of this graph have no incoming edges, and GraphConv rejects such
    # graphs by default. The error is the point of this demonstration, so it is
    # printed instead of aborting a full "Run All"; the following cells fix the graph
    # by adding self-loops or reverse edges.
    print(f"GraphConv failed: {err}")

<img src="gcn.png" width="600" height="240">

In [48]:
# add self-loops (an edge from every node to itself), so that every node has at
# least one incoming edge
g = dgl.add_self_loop(g)

# applying convolution to a graph
features = g.ndata['feat']
g.ndata['h'] = conv(g, features)

In [53]:
# display the convolution output stored under 'h' (one row per node)
g.ndata['h']

In [50]:
# rebuild the original graph (without self-loops): edges 0->1, 2->3, 4->1, 3->0
g = dgl.graph(([0, 2, 4, 3], [1, 3, 1, 0]))

# add node features
g.ndata['feat'] = node_features

# add edge features
g.edata['weight'] = edge_features

# add reverse edges (1->0, 3->2, 1->4, 0->3), after which every node has an incoming edge
# NOTE(review): check whether the 'weight' edge feature is kept by this call - confirm
g = dgl.add_reverse_edges(g)

# applying convolution to a graph
features = g.ndata['feat']
g.ndata['h'] = conv(g, features)

In [52]:
# display the convolution output stored under 'h' (one row per node)
g.ndata['h']

## 6. Graph Convolution Networks


Graph Convolutional Networks (GCNs) are a type of neural network architecture specifically designed for graph-structured data. Unlike traditional neural networks that operate on grid-like data such as images or sequences, GCNs can directly handle and learn from graph data, which represents relationships between entities.

The core idea behind GCNs is to generalize the convolution operation, typically used in grid-based domains, to operate on graph structures. GCNs leverage the local connectivity and graph structure to perform node feature aggregation and information propagation across the graph. This allows GCNs to capture both the local and global dependencies present in the graph.

In [715]:
import dgl
import dgl.function as fn

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from dgl.nn import GraphConv
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class GraphConvolution(nn.Module):
    """Graph convolution layer: mean of the in-neighbours' features followed by a linear map.

    Args:
        in_feats: size of each input node feature vector.
        out_feats: size of each output node feature vector.

    Note:
        A node's own features take part in the aggregation only if the graph has a
        self-loop on that node, because messages are copied from edge sources only.
    """

    def __init__(self, in_feats, out_feats):
        super(GraphConvolution, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, graph, features):
        """Return the transformed node features for ``graph``.

        Args:
            graph: DGL graph whose edges define the neighbourhoods.
            features: node feature tensor with one row per node of ``graph``.

        Returns:
            Tensor with one row per node and ``out_feats`` columns.
        """
        # local_scope() drops the temporary 'h' / 'h_neigh' fields on exit, so the
        # caller's graph is not left carrying intermediate tensors of this forward pass
        with graph.local_scope():
            # node features
            graph.ndata['h'] = features
            # aggregation: copy every source node's 'h' along its out-edges and average
            # the incoming messages on each destination node
            # (fn.mean can be swapped for another reducer, e.g. fn.sum or fn.max)
            graph.update_all(fn.copy_u('h', 'm'), fn.mean(msg='m', out='h_neigh'))
            h_neigh = graph.ndata['h_neigh']

        # linear layer
        return self.linear(h_neigh)

    
# define the GCN model
class GCN(nn.Module):
    """Two-layer graph convolutional classifier built from ``GraphConvolution`` layers.

    The first layer maps ``in_feats`` to ``hidden_size``; dropout and ELU are applied
    to its output; the second layer maps ``hidden_size`` to ``num_classes``.
    ``forward`` returns per-node log-probabilities (log-softmax over dim 1).
    """

    def __init__(self, in_feats, hidden_size, num_classes, dropout):
        super().__init__()
        self.conv1 = GraphConvolution(in_feats, hidden_size)
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = GraphConvolution(hidden_size, num_classes)

    def forward(self, g, features):
        # first layer -> dropout -> ELU
        hidden = F.elu(self.dropout1(self.conv1(g, features)))
        # second layer yields one score per class for every node
        logits = self.conv2(g, hidden)
        return F.log_softmax(logits, dim=1)

In [716]:
in_feats = feat.shape[1]  # input size = number of columns of the node feature matrix
hidden_size = 16
dropout = 0.6

# create model
model_gcn = GCN(in_feats, hidden_size, num_class, dropout).to(device)

# NOTE(review): GCN.forward already returns log_softmax output and CrossEntropyLoss
# applies log-softmax again; nn.NLLLoss would be the matching criterion - confirm intent
criterion = nn.CrossEntropyLoss()

# create optimizer
# NOTE: `criterion` and `optimizer` are rebound by the later model sections, so this
# cell must be re-run before re-running the GCN training cell
optimizer = optim.Adam(model_gcn.parameters(), lr=0.01, weight_decay=5e-4)

In [725]:
# print the module structure (sub-layers and their sizes)
print(model_gcn)

In [724]:
import copy

best_valid_loss = np.inf
best_model = None
num_epochs = 300

# full-batch training: every epoch runs the whole graph through the model and the
# loss is computed on the training nodes only
for epoch in range(1, num_epochs + 1):
    # train
    model_gcn.train()
    optimizer.zero_grad()
    
    output = model_gcn(citeseer_graph.to(device), feat.to(device))
    loss = criterion(output[train_mask], train_node_labels.to(device))
    loss.backward()
    optimizer.step()
    
    # validation (eval mode disables dropout)
    model_gcn.eval()
    with torch.no_grad():
        output = model_gcn(citeseer_graph.to(device), feat.to(device))
        val_loss = criterion(output[valid_mask], valid_node_labels.to(device))
        
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # state_dict() returns references to the live parameter tensors, which later
        # optimizer steps keep updating; a deep copy is needed for a real snapshot
        best_model = copy.deepcopy(model_gcn.state_dict())
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch} | Train loss: {loss:.4f} | Valid loss: {val_loss:.4f}")
        
# restore the weights with the lowest validation loss (if any epoch produced one)
if best_model is not None:
    model_gcn.load_state_dict(best_model)

In [723]:
from sklearn.metrics import accuracy_score

# evaluate with dropout disabled
model_gcn.eval()
with torch.no_grad():
    # run the (restored) model again; predictions must come from this fresh output,
    # not from whatever `output` was left behind by the training loop
    output = model_gcn(citeseer_graph.to(device), feat.to(device))
    predicted_labels = output[test_mask].max(1)[1].to('cpu')

accuracy = accuracy_score(test_node_labels, predicted_labels)
print(f"Accuracy score: {accuracy}")

In [721]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix of true test labels against the current predictions
cm = confusion_matrix(test_node_labels, predicted_labels)
classes = range(num_class)
# divide every row by its row total so that each row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1).reshape(-1, 1)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(
    cm_normalized,
    xticklabels=classes,
    yticklabels=classes,
    annot=True,
    fmt='.2f',
    cmap='Blues',
)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

plt.show()

## 7. Graph Attention Networks

Graph Attention Network (GAT) is a neural network architecture designed for processing graph-structured data. It leverages attention mechanisms to enable each node in the graph to selectively attend to its neighbors during information propagation. GAT employs self-attention mechanisms to compute attention coefficients, which determine the importance of neighboring nodes for each node in the graph. By assigning different weights to different neighbors, GAT can effectively aggregate information from the relevant nodes while suppressing noise and irrelevant information. 

In [677]:
import dgl
import dgl.function as fn

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from dgl.nn import GATv2Conv
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import torch
import torch.nn as nn
import torch.nn.functional as F

# define the GAT model
class GAT(nn.Module):
    """Two-layer graph attention classifier built from ``GATv2Conv`` layers.

    Args:
        in_dim: size of each input node feature vector.
        hidden_dim: output size of each attention head in the first layer.
        num_classes: number of output classes.
        num_heads: number of attention heads in the first layer (the second layer uses 1).
        dropout: dropout probability applied between the two layers.
        feat_drop: feature dropout passed to both ``GATv2Conv`` layers (default 0.6).
        attn_drop: attention dropout passed to both ``GATv2Conv`` layers (default 0.6).
    """

    def __init__(self, in_dim, hidden_dim, num_classes, num_heads, dropout,
                 feat_drop=0.6, attn_drop=0.6):
        super(GAT, self).__init__()
        self.conv1 = GATv2Conv(in_dim, hidden_dim, num_heads,
                               feat_drop=feat_drop, attn_drop=attn_drop)
        self.dropout1 = nn.Dropout(dropout)
        # the heads of the first layer are concatenated, hence hidden_dim * num_heads inputs
        self.conv2 = GATv2Conv(hidden_dim * num_heads, num_classes,
                               feat_drop=feat_drop, num_heads=1, attn_drop=attn_drop)

    def forward(self, g, features):
        """Return per-node log-probabilities (log-softmax over dim 1)."""
        # flatten(1) merges the head dimension into the feature dimension
        x = self.conv1(g, features).flatten(1)
        x = self.dropout1(x)
        x = F.elu(x)
        
        x = self.conv2(g, x).flatten(1)
        return F.log_softmax(x, dim=1)


In [678]:
in_feats = feat.shape[1]  # input size = number of columns of the node feature matrix
hidden_size = 16          # output size of each attention head in the first layer
dropout = 0.6
num_heads = 4

# create model
model_gat = GAT(in_feats, hidden_size, num_class, num_heads, dropout).to(device)

criterion = nn.CrossEntropyLoss()

# create optimizer
# NOTE: this rebinds `criterion` and `optimizer`, which the other model sections also
# use; re-run this cell before re-running the GAT training cell
optimizer = optim.Adam(model_gat.parameters(), lr=0.005, weight_decay=5e-4)

In [687]:
# print the module structure (sub-layers and their sizes)
print(model_gat)

In [686]:
import copy

best_valid_loss = np.inf
best_model = None
num_epochs = 200

# full-batch training: every epoch runs the whole graph through the model and the
# loss is computed on the training nodes only
for epoch in range(1, num_epochs + 1):
    # train
    model_gat.train()
    optimizer.zero_grad()
    
    output = model_gat(citeseer_graph.to(device), feat.to(device))
    loss = criterion(output[train_mask], train_node_labels.to(device))
    loss.backward()
    optimizer.step()
    
    # validation (eval mode disables dropout)
    model_gat.eval()
    with torch.no_grad():
        output = model_gat(citeseer_graph.to(device), feat.to(device))
        val_loss = criterion(output[valid_mask], valid_node_labels.to(device))
        
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # state_dict() returns references to the live parameter tensors, which later
        # optimizer steps keep updating; a deep copy is needed for a real snapshot
        best_model = copy.deepcopy(model_gat.state_dict())
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch} | Train loss: {loss:.4f} | Valid loss: {val_loss:.4f}")
        
# restore the weights with the lowest validation loss (if any epoch produced one)
if best_model is not None:
    model_gat.load_state_dict(best_model)

In [685]:
from sklearn.metrics import accuracy_score

# forward pass without gradient tracking; keep only the rows of the test nodes
with torch.no_grad():
    output = model_gat(citeseer_graph.to(device), feat.to(device))[test_mask]
    # index of the highest score in every row = predicted class
    predicted_labels = output.max(dim=1).indices.cpu()

accuracy = accuracy_score(y_true=test_node_labels, y_pred=predicted_labels)
print(f"Accuracy score: {accuracy}")

In [684]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix of true test labels against the current predictions
cm = confusion_matrix(test_node_labels, predicted_labels)
classes = range(num_class)
# divide every row by its row total so that each row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1).reshape(-1, 1)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(
    cm_normalized,
    xticklabels=classes,
    yticklabels=classes,
    annot=True,
    fmt='.2f',
    cmap='Blues',
)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

plt.show()

## 8. GraphSAGE

In [726]:
import dgl
import dgl.function as fn

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from dgl.nn import SAGEConv
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define the GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE classifier using ``SAGEConv`` with the 'mean' aggregator.

    The first layer maps ``in_feats`` to ``hidden_size``; dropout and ELU are applied
    to its output; the second layer maps ``hidden_size`` to ``num_classes``.
    ``forward`` returns per-node log-probabilities (log-softmax over dim 1).
    """

    def __init__(self, in_feats, hidden_size, num_classes, dropout):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, hidden_size, aggregator_type='mean')
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = SAGEConv(hidden_size, num_classes, aggregator_type='mean')

    def forward(self, g, features):
        # first layer -> dropout -> ELU
        hidden = F.elu(self.dropout1(self.conv1(g, features)))
        # second layer yields one score per class for every node
        logits = self.conv2(g, hidden)
        return F.log_softmax(logits, dim=1)

In [727]:
in_feats = feat.shape[1]  # input size = number of columns of the node feature matrix
hidden_size = 16
dropout = 0.6

# create model
model_sage = GraphSAGE(in_feats, hidden_size, num_class, dropout).to(device)

criterion = nn.CrossEntropyLoss()

# create optimizer
# NOTE: this rebinds `criterion` and `optimizer`, which the other model sections also
# use; re-run this cell before re-running the GraphSAGE training cell
optimizer = optim.Adam(model_sage.parameters(), lr=0.01, weight_decay=5e-4)

In [730]:
import copy

best_valid_loss = np.inf
best_model = None
num_epochs = 300

# full-batch training: every epoch runs the whole graph through the model and the
# loss is computed on the training nodes only
for epoch in range(1, num_epochs + 1):
    # train
    model_sage.train()
    optimizer.zero_grad()
    
    output = model_sage(citeseer_graph.to(device), feat.to(device))
    loss = criterion(output[train_mask], train_node_labels.to(device))
    loss.backward()
    optimizer.step()
    
    # validation (eval mode disables dropout)
    model_sage.eval()
    with torch.no_grad():
        output = model_sage(citeseer_graph.to(device), feat.to(device))
        val_loss = criterion(output[valid_mask], valid_node_labels.to(device))
        
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # state_dict() returns references to the live parameter tensors, which later
        # optimizer steps keep updating; a deep copy is needed for a real snapshot
        best_model = copy.deepcopy(model_sage.state_dict())
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch} | Train loss: {loss:.4f} | Valid loss: {val_loss:.4f}")
        
# restore the weights with the lowest validation loss (if any epoch produced one)
if best_model is not None:
    model_sage.load_state_dict(best_model)

In [731]:
from sklearn.metrics import accuracy_score

# evaluate with dropout disabled
model_sage.eval()
with torch.no_grad():
    # run the (restored) model again; predictions must come from this fresh output,
    # not from whatever `output` was left behind by the training loop
    output = model_sage(citeseer_graph.to(device), feat.to(device))
    predicted_labels = output[test_mask].max(1)[1].to('cpu')

accuracy = accuracy_score(test_node_labels, predicted_labels)
print(f"Accuracy score: {accuracy}")

In [733]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix of true test labels against the current predictions
cm = confusion_matrix(test_node_labels, predicted_labels)
classes = range(num_class)
# divide every row by its row total so that each row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1).reshape(-1, 1)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(
    cm_normalized,
    xticklabels=classes,
    yticklabels=classes,
    annot=True,
    fmt='.2f',
    cmap='Blues',
)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

plt.show()

## 9. PyG

### 9.1. GCN

In [688]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# load the CiteSeer dataset through PyG; files are kept under the relative
# directory 'data/CiteSeer'
dataset = Planetoid(root='data/CiteSeer', name='CiteSeer')

In [689]:
# display the Data object at index 0 (features x, edge_index, labels y and the split masks)
dataset[0]

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])

In [693]:
class GCN_pyg(nn.Module):
    """Two-layer GCN classifier built from PyG ``GCNConv`` layers.

    ``forward`` takes a PyG data object, reads its ``x`` and ``edge_index``, applies
    conv -> ReLU -> dropout -> conv and returns per-node log-probabilities
    (log-softmax over dim 1).
    """

    def __init__(self, in_features, hidden_features, num_classes, dropout):
        super().__init__()
        self.conv1 = GCNConv(in_features, hidden_features)
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = GCNConv(hidden_features, num_classes)

    def forward(self, data):
        # first layer -> ReLU -> dropout
        hidden = self.conv1(data.x, data.edge_index)
        hidden = self.dropout1(F.relu(hidden))
        # second layer yields one score per class for every node
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

    
dropout = 0.6

# create model (hidden size 16; input and output sizes come from the PyG dataset)
model_pyg = GCN_pyg(dataset.num_features, 16, dataset.num_classes, dropout).to(device)
criterion = nn.CrossEntropyLoss()

# create optimizer
# NOTE(review): lr = 0.0005 is much smaller than the 0.01 / 0.005 used for the DGL
# models above - confirm this is intentional
optimizer = torch.optim.Adam(model_pyg.parameters(), lr = 0.0005, weight_decay = 5e-4)

In [696]:
import copy

best_valid_loss = np.inf
best_model = None
num_epochs = 200

# take the PyG data object once and use its own masks and labels, so that the split
# always matches the node order of `dataset`
pyg_data = dataset[0].to(device)

# training and validation
for epoch in range(1, num_epochs + 1):
    model_pyg.train()
    optimizer.zero_grad()
    
    output = model_pyg(pyg_data)
    loss = criterion(output[pyg_data.train_mask], pyg_data.y[pyg_data.train_mask])
    loss.backward()
    optimizer.step()
    
    # validation (eval mode disables dropout)
    model_pyg.eval()
    with torch.no_grad():
        output = model_pyg(pyg_data)
        val_loss = criterion(output[pyg_data.val_mask], pyg_data.y[pyg_data.val_mask])
        
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # state_dict() returns references to the live parameter tensors, which later
        # optimizer steps keep updating; a deep copy is needed for a real snapshot
        best_model = copy.deepcopy(model_pyg.state_dict())
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch} | Train loss: {loss:.4f} | Valid loss: {val_loss:.4f}")

# restore the weights with the lowest validation loss, as the DGL training loops do
if best_model is not None:
    model_pyg.load_state_dict(best_model)

In [697]:
from sklearn.metrics import accuracy_score

# evaluate with dropout disabled
model_pyg.eval()
with torch.no_grad():
    pyg_data = dataset[0].to(device)
    # keep only the rows of the test nodes, selected with the PyG data's own mask
    output = model_pyg(pyg_data)[pyg_data.test_mask]
    predicted_labels = output.max(1)[1].to('cpu')

# compare against the labels stored in the same PyG data object
accuracy = accuracy_score(pyg_data.y[pyg_data.test_mask].to('cpu'), predicted_labels)
print(f"Accuracy score: {accuracy}")