<a href="https://colab.research.google.com/github/dgl2000/COMP-559_ML-with-Graphs/blob/main/COMP_559_HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP 559 Homework 4
@author: Gaole Dai (gd25)

## Quesiton 1
Compare the following node classification approaches in terms of accuracy using the Cora dataset: (1) A neural network for classification using only the node
features, (2) a Graph Convolutional Network with node attributes as the n × n identity matrix,(3) a GCN with original attributes, and (4) a GCN with deepwalk or node2vec embeddings as node features. Hints: (1) use PyTorch Geometric1 or the Deep Graph Library2; (2) in case you are unable to train
the second model, use a random vector of fixed length as the attributes of each node.

### (1) A neural network for classification using only the node features

In [259]:
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [193]:
import torch_geometric.datasets as datasets

dataset = datasets.Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]

print(data.num_node_features)
print(dataset.num_node_features)

1433
1433


In [260]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Linear

class NodeFeatureNN(torch.nn.Module):
    def __init__(self):
        super(NodeFeatureNN, self).__init__()
        self.layer1 = Linear(dataset.num_node_features, 64)
        self.layer2 = Linear(64, dataset.num_classes)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.layer2(x)
        return F.log_softmax(x, dim=1)

In [195]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NodeFeatureNN().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data.x)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [196]:
model.eval()
_, pred = model(data.x).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Test Accuracy: {:.4f}'.format(acc))

Test Accuracy: 0.5660


### (2) a Graph Convolutional Network with node attributes as the n × n identity matrix

In [253]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]

# Create identity matrix
num_nodes = data.num_nodes
data.x = torch.eye(num_nodes)

In [254]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_nodes, 64)
        self.conv2 = GCNConv(64, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [255]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [256]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Test Accuracy using Identity Matrix: {:.4f}'.format(acc))

Test Accuracy using Identity Matrix: 0.6420


### (3) a GCN with original attributes

In [202]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [203]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 64)
        self.conv2 = GCNConv(64, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [204]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [205]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Test Accuracy with Original Attributes: {:.4f}'.format(acc))

Test Accuracy with Original Attributes: 0.8130


### (4) a GCN with deepwalk or node2vec embeddings as node features

In [257]:
!pip install node2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [258]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import DataLoader
from node2vec import Node2Vec
import networkx as nx

# Load the Cora dataset
dataset = Planetoid(root='data/Cora', name='Cora')

# Convert the dataset into a networkx graph and compute node2vec embeddings
data = dataset[0]
G = nx.Graph()
G.add_nodes_from(range(data.num_nodes))
G.add_edges_from(torch.tensor(data.edge_index).T.tolist())
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Extract node2vec embeddings as node features
data.x = torch.tensor([model.wv[str(i)] for i in range(data.num_nodes)], dtype=torch.float)

# Define the GCN model
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Initialize and train the GCN model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(in_channels=data.num_features, hidden_channels=64, out_channels=dataset.num_classes).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test(data):
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    correct = (pred == data.y).sum()
    return correct / data.num_nodes

# Train and test the model
for epoch in range(200):
    loss = train(data)
    if epoch % 10 == 0:
        accuracy = test(data)
        print(f"Epoch: {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

  G.add_edges_from(torch.tensor(data.edge_index).T.tolist())


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

  data.x = torch.tensor([model.wv[str(i)] for i in range(data.num_nodes)], dtype=torch.float)


Epoch: 0, Loss: 1.9197, Accuracy: 0.2980
Epoch: 10, Loss: 0.6067, Accuracy: 0.7234
Epoch: 20, Loss: 0.2657, Accuracy: 0.7168
Epoch: 30, Loss: 0.0825, Accuracy: 0.6950
Epoch: 40, Loss: 0.0562, Accuracy: 0.7020
Epoch: 50, Loss: 0.0361, Accuracy: 0.7049
Epoch: 60, Loss: 0.0254, Accuracy: 0.7038
Epoch: 70, Loss: 0.0362, Accuracy: 0.7053
Epoch: 80, Loss: 0.0235, Accuracy: 0.7061
Epoch: 90, Loss: 0.0253, Accuracy: 0.6983
Epoch: 100, Loss: 0.0193, Accuracy: 0.7061
Epoch: 110, Loss: 0.0118, Accuracy: 0.6976
Epoch: 120, Loss: 0.0125, Accuracy: 0.6994
Epoch: 130, Loss: 0.0144, Accuracy: 0.7042
Epoch: 140, Loss: 0.0120, Accuracy: 0.7001
Epoch: 150, Loss: 0.0120, Accuracy: 0.7064
Epoch: 160, Loss: 0.0147, Accuracy: 0.7053
Epoch: 170, Loss: 0.0160, Accuracy: 0.7020
Epoch: 180, Loss: 0.0095, Accuracy: 0.7042
Epoch: 190, Loss: 0.0124, Accuracy: 0.7001
