# Recommending Amazon Products using Graph Neural Networks in PyTorch Geometric

- Read README.MD to install the dependencies.

Based on https://wandb.ai/manan-goel/gnn-recommender/reports/Recommending-Amazon-Products-using-Graph-Neural-Networks-in-PyTorch-Geometric--VmlldzozMTA3MzYw

In [1]:
from tqdm import tqdm
import numpy as np
from IPython.display import IFrame

import torch
from torch import nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric import utils
from torch_geometric.transforms import RandomLinkSplit
import torch_geometric as pyg


from pyvis.network import Network

In [2]:
from torch_geometric.explain import Explainer, GNNExplainer
from torch_geometric.explain import Explainer, PGExplainer

# download and format data

- uses data from Snap
- ref. : https://snap.stanford.edu/data/amazon0302.html
        

We then read all the lines in the file and initialize a NumPy array and a list to keep track of, respectively, the in- and out-degree of each node and the edges.

Then all the lines are read one by one and processed: the lines with metadata are ignored and the lines with the start node and end node are processed. The in-degree of the end node and the out-degree of the start node are incremented, and the edge is added to edge_index.

We use the in- and out-degree of each node (as the node features) and the edge_index to create a PyG graph using the Data class.

In [3]:
# Read the edge-list file into memory, one list entry per text line
# (trailing newlines are kept; they are stripped when the lines are parsed).
with open('amazon0302.txt', 'r') as edge_file:
    edges = edge_file.readlines()

In [4]:
# Limit the dataset: keep only the first 10% of the lines that were read.
subset_size = int(len(edges) * 0.1)
edges = edges[:subset_size]

In [5]:
# Number of lines kept after truncation. This counts raw lines, so any '#'
# comment lines inside the slice are included (the graph-building cell skips them).
print(len(edges))

123488


In [6]:
# create graph

# Per-node feature table: column 0 accumulates the in-degree, column 1 the
# out-degree. 262111 rows are allocated up front and node ids index straight
# into it, so every node id in the file must be smaller than that.
in_out_degrees = np.zeros((262111, 2))
edge_index = []

for line in tqdm(edges):
    # Lines starting with '#' carry no edge; skip them.
    if line.startswith('#'):
        continue
    start, end = (int(token) for token in line.strip().split())
    in_out_degrees[end, 0] += 1    # one more edge arriving at "end"
    in_out_degrees[start, 1] += 1  # one more edge leaving "start"

    edge_index.append([start, end])

# List of [start, end] pairs -> tensor with one row of sources and one of targets.
edge_index = torch.tensor(edge_index).t().contiguous()
# NOTE: x is stored as a NumPy array; the later cells convert it with
# torch.tensor(...).float() before feeding it to the model.
graph = Data(x=in_out_degrees, edge_index=edge_index)

torch.save(graph, 'amazon0302.pt')

100%|███████████████████████████████████████████████████████████████████████████████████| 123488/123488 [00:00<00:00, 368551.08it/s]


It is incredibly hard and resource intensive to visualize hundreds of thousands of nodes so we sampled the first 100 nodes from the graph using the subgraph utility from PyTorch Geometric.

In [7]:
# Boolean node mask: True for the first 100 nodes (retained), False for every other node (removed)
mask = torch.zeros(graph.x.shape[0], dtype=torch.bool)
mask[:100] = True

# Create and save the smaller graph: node features selected by the mask plus
# the edges that utils.subgraph returns for the masked node set
kept_edge_index = utils.subgraph(mask, graph.edge_index)[0]
g = Data(x=graph.x[mask], edge_index=kept_edge_index)
torch.save(g, 'smaller_graph.pt')

In [8]:
# Initialize the PyVis network
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True)

# Per-node display info. No product metadata is loaded in this notebook, so
# each entry is a placeholder: the node id as title and an empty category list.
metadata = dict()
# Add the edges from the PyG graph to the PyVis network
for e in tqdm(g.edge_index.T):

    src = e[0].item()
    dst = e[1].item()
    # Edges touching node 0 are left out of the visualisation.
    if src == 0 or dst == 0:
        continue
    for node in (src, dst):
        if node not in metadata:
            metadata[node] = {"title": str(node), "categories": []}
    src_title = "Title:" + metadata[src]['title'] + "\n\n" + "Categories:\n" + "\n".join(list(metadata[src]['categories'])[:3])
    dst_title = "Title:" + metadata[dst]['title'] + "\n\n" + "Categories:\n" + "\n".join(list(metadata[dst]['categories'])[:3])
    # Each node is labelled with its OWN caption. The captions used to be
    # swapped, so every node displayed the title of its neighbour.
    net.add_node(dst, label=dst_title, title=dst_title)
    net.add_node(src, label=src_title, title=src_title)
    net.add_edge(src, dst, value=0.1)

# Save the PyVis visualisation to a HTML file
# NOTE: this call has been seen to fail with
# AttributeError: 'NoneType' object has no attribute 'render'

net.show("graph.html")



100%|██████████████████████████████████████████████████████████████████████████████████████████| 353/353 [00:00<00:00, 27154.32it/s]

graph.html





The first thing we need to do is create a train, test and validation split of the edges in the dataset. The tutorial this notebook is based on starts by creating a smaller graph with 20,000 nodes using the same script shown in the previous sections; the cell below instead applies the split directly to the `graph` built above. The following script randomly splits the edges into 3 sections, with 5000 edges each in the validation and test sets.

In [9]:
# Hold out 5000 edges each for the validation and the test set
transform = RandomLinkSplit(
    num_val=5000,
    num_test=5000,
    is_undirected=True,
    split_labels=True,
)
train_graph, val_graph, test_graph = transform(graph)

# Save each split to its own file
for split, filename in (
    (train_graph, 'train.pt'),
    (val_graph, 'val.pt'),
    (test_graph, 'test.pt'),
):
    torch.save(split, filename)

## Model

In [10]:
class GNN(torch.nn.Module):
    """Stack of GraphSAGE convolutions followed by a small MLP head.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of every convolution and of the first
            linear layer of the head.
        output_dim: Size of the vector returned for each node.
        num_layers: Number of SAGEConv layers; asserted to be at least 1.
        dropout: Dropout probability, applied after every convolution and
            between the two linear layers of the head.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super().__init__()
        self.dropout = dropout
        self.num_layers = num_layers

        # The first convolution maps the input features to the hidden size.
        self.convs = nn.ModuleList([pyg.nn.SAGEConv(input_dim, hidden_dim)])

        assert (self.num_layers >= 1), 'Number of layers is not >=1'
        # The remaining num_layers - 1 convolutions stay at the hidden size.
        self.convs.extend(
            [pyg.nn.SAGEConv(hidden_dim, hidden_dim)
             for _ in range(self.num_layers - 1)]
        )

        # Post-message-passing head: Linear -> Dropout -> Linear.
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(self.dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x, edge_index):
        """Run every convolution (each followed by ReLU and dropout), then the head.

        Returns the head's output for every node; no activation is applied
        after the last linear layer.
        """
        for layer_idx in range(self.num_layers):
            conv = self.convs[layer_idx]
            x = F.dropout(
                F.relu(conv(x, edge_index)),
                p=self.dropout,
                training=self.training,
            )

        return self.post_mp(x)

# Link Prediction

For a pair of nodes, the previous module provides an embedding for both of them. This module is responsible for combining the two embeddings and making a binary prediction.

In [11]:
class LinkPredictor(nn.Module):
    """MLP that scores a pair of node embeddings as a link probability.

    Args:
        in_channels: Size of each incoming node embedding.
        hidden_channels: Width of the hidden linear layers.
        out_channels: Size of the prediction produced per pair.
        num_layers: Requested number of linear layers. An input layer and an
            output layer are always created, so values below 2 still yield
            two layers.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()
        self.dropout = dropout

        # Input layer, then (num_layers - 2) hidden layers, then output layer.
        # The layers are instantiated left to right, in that order.
        self.lins = nn.ModuleList([
            nn.Linear(in_channels, hidden_channels),
            *(nn.Linear(hidden_channels, hidden_channels)
              for _ in range(num_layers - 2)),
            nn.Linear(hidden_channels, out_channels),
        ])

    def reset_parameters(self):
        """Re-initialise the parameters of every linear layer."""
        for layer in self.lins:
            layer.reset_parameters()

    def forward(self, x_i, x_j):
        """Combine two embedding batches element-wise and return sigmoid scores.

        x_i and x_j are multiplied element-wise, so they must have compatible
        shapes (the caller passes one row per candidate edge).
        """
        *hidden_layers, output_layer = self.lins
        h = x_i * x_j
        for layer in hidden_layers:
            h = F.dropout(F.relu(layer(h)), p=self.dropout,
                          training=self.training)
        return torch.sigmoid(output_layer(h))

# Training the Model

Training a link prediction model brings up a very interesting problem: the dataset we possess is a list of edges in the graph and when you think about it as a binary classification problem, this means we only have positive samples. Hence, there exists a concept called 'negative edges' i.e. edges that do not actually exist in the graph which we consider as negative samples.

In [12]:
def train(model, link_predictor, emb, edge_index, pos_train_edge, batch_size, optimizer):
    """Run one pass over the positive training edges and return the mean batch loss.

    Args:
        model: GNN called as ``model(emb, edge_index)`` to produce node embeddings.
        link_predictor: Module called with two batches of node embeddings;
            its outputs are treated as probabilities.
        emb: Initial node features, one row per node.
        edge_index: Edges used for message passing and as the set of existing
            edges for negative sampling.
        pos_train_edge: Positive edges, indexed along the first dimension
            (one edge per row).
        batch_size: Number of positive edges per optimisation step.
        optimizer: Optimiser stepped once per batch.

    Returns:
        The average of the per-batch losses.

    Raises:
        ZeroDivisionError: If there are no positive edges, so no batch is run.
    """
    model.train()
    link_predictor.train()

    batch_losses = []
    # Batches of shuffled row indices into pos_train_edge.
    edge_id_loader = pyg.loader.DataLoader(
        range(pos_train_edge.shape[0]), batch_size, shuffle=True)

    for edge_id in tqdm(edge_id_loader, leave=True):
        optimizer.zero_grad()

        # Message passing over the whole graph refreshes every node embedding.
        node_emb = model(emb, edge_index)

        # Score the positive edges of this batch; after the transpose, row 0
        # holds the source nodes and row 1 the target nodes.
        batch_pos = pos_train_edge[edge_id].T
        pos_pred = link_predictor(node_emb[batch_pos[0]], node_emb[batch_pos[1]])

        # Sample as many negative edges as there are positives and score them;
        # the result is indexed the same way (row 0 sources, row 1 targets).
        batch_neg = utils.negative_sampling(
            edge_index,
            num_nodes=emb.shape[0],
            num_neg_samples=edge_id.shape[0],
            method='sparse',
        )
        neg_pred = link_predictor(node_emb[batch_neg[0]], node_emb[batch_neg[1]])

        # Negative log-likelihood over both edge sets; 1e-15 guards log(0).
        pos_loss = -torch.log(pos_pred + 1e-15).mean()
        neg_loss = -torch.log(1 - neg_pred + 1e-15).mean()
        loss = pos_loss + neg_loss

        # Backpropagate and update parameters.
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)

### Configure the training

In [13]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Weight decay handed to the Adam optimizer.
optim_wd = 0

# NOTE(review): `epochs` is not referenced by the training cell below, which
# calls train() exactly once — confirm whether an epoch loop was intended.
#epochs = 300
epochs = 3

# Hidden size; also used as the GNN output size and the LinkPredictor input size.
hidden_dim = 1024
# Dropout probability shared by the GNN and the LinkPredictor.
dropout = 0.3
# Number of GNN convolutions; the LinkPredictor is built with num_layers + 1.
num_layers = 2
# Learning rate handed to the Adam optimizer.
lr = 1e-5
node_emb_dim = 2  # 2 features: in/out-degrees
# Number of positive edges per training step.
batch_size = 1024

In [14]:
# Move the training and validation splits to the selected device.
train_graph = train_graph.to(device)
val_graph = val_graph.to(device)

# The graph neural network that takes all the node features as input,
# passes messages and aggregates them.
model = GNN(
    input_dim=node_emb_dim,
    hidden_dim=hidden_dim,
    output_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
).to(device)
# Scores pairs of node embeddings; one layer deeper than the GNN.
link_predictor = LinkPredictor(
    in_channels=hidden_dim,
    hidden_channels=hidden_dim,
    out_channels=1,
    num_layers=num_layers + 1,
    dropout=dropout,
).to(device)

# A single optimiser updates both networks.
trainable_params = [*model.parameters(), *link_predictor.parameters()]
optimizer = torch.optim.Adam(trainable_params, lr=lr, weight_decay=optim_wd)

# The node features are converted to a float tensor on the device here.
train_features = torch.tensor(train_graph.x).float().to(device)

# One pass over the positive training edges.
train_loss = train(
    model,
    link_predictor,
    train_features,
    train_graph.edge_index,
    train_graph.pos_edge_label_index.T,
    batch_size,
    optimizer,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [1:08:18<00:00, 85.38s/it]


## Explain model using GNNExplainer

### Set the number of epochs used to train the explainer in the cell below!

In [15]:
# Number of training epochs shared by the GNNExplainer and PGExplainer cells.
# The spelling of the name is kept as-is because those later cells reference it.
epoxhs_explainer = 10

In [16]:
# GNNExplainer set-up: explain the model's own prediction ('model'), learning
# a mask over node attributes and a mask over edges.
explainer = Explainer(
    model=model,
    algorithm=GNNExplainer(epochs=epoxhs_explainer),
    explanation_type='model',
    node_mask_type='attributes',
    edge_mask_type='object',
    model_config=dict(
        mode='multiclass_classification',
        task_level='node',
        # GNN.forward returns the output of its last Linear layer with no
        # (log-)softmax applied, so the model's return type is 'raw' rather
        # than 'log_probs'; the explainer needs this to normalise correctly.
        return_type='raw',
    ),
)

In [17]:
# Node whose prediction is explained.
node_index = 10
explanation = explainer(
    torch.tensor(test_graph.x).float().to(device),
    # test_graph itself is never moved to `device`, so the edges are moved
    # explicitly here to sit on the same device as the model and features.
    test_graph.edge_index.to(device),
    index=node_index,
)
print(f'Generated explanations in {explanation.available_explanations}')

Generated explanations in ['node_mask', 'edge_mask']


In [18]:
# Ask the explanation to draw its graph visualisation into `path`.
path = 'subgraph.pdf'
explanation.visualize_graph(path)
print(f"Subgraph visualization plot has been saved to '{path}'")

Subgraph visualization plot has been saved to 'subgraph.pdf'


In [19]:
# Embed the PDF written by the previous cell in the notebook output.
IFrame('subgraph.pdf', width=800, height=500)

In [20]:
test_graph.x.shape  # (number of nodes, number of features); shape[1] is the feature count

(262111, 2)

In [21]:
# If there is only one feature, will raise
# ValueError: Cannot compute feature importance for object-level 'node_mask' (got shape torch.Size([262111, 1]))

In [22]:
# Plot the feature-importance scores of the explanation into `path`.
# top_k=10 is requested, although the node features here have only 2 columns.
path = 'feature_importance.png'
explanation.visualize_feature_importance(path, top_k=10)
print(f"Feature importance plot has been saved to '{path}'")

Feature importance plot has been saved to 'feature_importance.png'


<img src="./feature_importance.png" alt="Computed feature importance" />

## Explain model using PGExplainer

In [26]:
# from torch_geometric.explain.config import ModelConfig

In [27]:
# PGExplainer set-up: explain a supplied target ('phenomenon') with a mask
# over edges only. This explainer has to be trained before it is used.
explainer = Explainer(
    model=model,
    algorithm=PGExplainer(epochs=epoxhs_explainer, lr=0.003),
    explanation_type='phenomenon',
    edge_mask_type='object',
    model_config=dict(
        mode='multiclass_classification',
        task_level='node',
        # GNN.forward returns the output of its last Linear layer with no
        # (log-)softmax applied, so the model's return type is 'raw' rather
        # than 'log_probs'; the explainer needs this to normalise correctly.
        return_type='raw',
    ),
)

In [35]:
# Train against a variety of node-level or graph-level predictions:
N = test_graph.x.shape[0]

# Train only on a random 10% of the nodes, sampled without replacement.
indeces = np.random.choice(N, replace=False, size=int(.1 * N))
# TODO: open question — does `node_index` need to be among `indeces`?

# only positive examples: every node gets the class label 1
target = torch.ones(N, dtype=torch.long, device=device)

# Loop-invariant inputs are built once instead of on every training step, and
# placed on the same device as the model (test_graph is never moved there).
x_test = torch.tensor(test_graph.x).float().to(device)
edge_index_test = test_graph.edge_index.to(device)

# NOTE(review): each call below passes the full feature matrix and edge list
# to the model. With N = 262111 nodes and hidden_dim = 1024, a single float32
# activation is 262111 * 1024 * 4 = 1073606656 bytes, which matches the
# allocation reported when this cell ran out of memory — consider a smaller
# graph or hidden size.
for epoch in range(epoxhs_explainer):
    for index in indeces:  # Indices to train against.
        loss = explainer.algorithm.train(
            epoch,
            model,
            x_test,
            edge_index_test,
            target=target,
            index=torch.tensor(index, device=device),
        )

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1073606656 bytes.

In [None]:
# Get the final explanations:
explanation_pge = explainer(
    x, 
    torch.tensor(test_graph.x).float().to(device), 
    test_graph.edge_index, 
    index=node_index
)

In [None]:
# Ask the PGExplainer explanation to draw its graph visualisation into `path_pge`.
path_pge = 'subgraph-pge.pdf'
explanation_pge.visualize_graph(path_pge)
print(f"Subgraph visualization plot has been saved to '{path_pge}'")

In [None]:
# Embed the PDF written by the previous cell in the notebook output.
IFrame('subgraph-pge.pdf', width=800, height=500)