In [1]:
import os.path as osp

import torch
from tqdm import tqdm

import torch_geometric as PyG
from torch_geometric.datasets import AmazonBook
from torch_geometric.nn import LightGCN
from torch_geometric.utils import degree

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
path = osp.join(osp.dirname('..'), 'data', 'Amazon')
dataset = AmazonBook(path)
data = dataset[0]
print(data)
num_users, num_books = data['user'].num_nodes, data['book'].num_nodes
heterodata = data
data = heterodata.to_homogeneous().to(device)
data

HeteroData(
  user={ num_nodes=52643 },
  book={ num_nodes=91599 },
  (user, rates, book)={
    edge_index=[2, 2380730],
    edge_label_index=[2, 603378],
  },
  (book, rated_by, user)={ edge_index=[2, 2380730] }
)


Data(edge_index=[2, 4761460], edge_label_index=[2, 603378], node_type=[144242], edge_type=[4761460])

In [3]:
assert (heterodata['user', 'rates', 'book'].edge_index[0] < heterodata['user'].num_nodes).all()
assert (heterodata['book', 'rated_by', 'user'].edge_index[1] < heterodata['book'].num_nodes).all()
print("Total nodes:", heterodata.num_nodes)

Total nodes: 144242


In [4]:
# book -> user
def _min_max(x):
    print('0. min:', min(x[0]), 'max:', max(x[0]))
    print('1. min:', min(x[1]), 'max:', max(x[1]))
    print()

_min_max(heterodata['user', 'rates', 'book'].edge_index)
_min_max(data.edge_index)
_min_max(data.edge_index[:, :2380730])
_min_max(data.edge_index[:, 2380730:])

0. min: tensor(0) max: tensor(52642)
1. min: tensor(0) max: tensor(91598)

0. min: tensor(0) max: tensor(144241)
1. min: tensor(0) max: tensor(144241)

0. min: tensor(0) max: tensor(52642)
1. min: tensor(52643) max: tensor(144241)

0. min: tensor(52643) max: tensor(144241)
1. min: tensor(0) max: tensor(52642)



In [5]:
# Use all message passing edges as training labels:
batch_size = 8192
# Convert to onedirectional edge
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]
train_loader = torch.utils.data.DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=batch_size,
)

In [6]:
_min_max(train_edge_label_index)
assert (data.edge_index[:, data.edge_type == 0] == train_edge_label_index).all()
assert (train_edge_label_index[0] < 52643).all()
assert (train_edge_label_index[1] >= 52643).all()

0. min: tensor(0) max: tensor(52642)
1. min: tensor(52643) max: tensor(144241)



In [7]:
model = LightGCN(
    num_nodes=data.num_nodes,
    embedding_dim=64,
    num_layers=3,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [8]:
print(PyG.nn.summary(model, data.edge_index[:, :batch_size]))

+------------------------+-------------------------+----------------+-----------+
| Layer                  | Input Shape             | Output Shape   | #Param    |
|------------------------+-------------------------+----------------+-----------|
| LightGCN               | [2, 8192]               | [8192]         | 9,231,488 |
| ├─(embedding)Embedding | --                      | --             | 9,231,488 |
| ├─(convs)ModuleList    | --                      | --             | --        |
| │    └─(0)LGConv       | [144242, 64], [2, 8192] | [144242, 64]   | --        |
| │    └─(1)LGConv       | [144242, 64], [2, 8192] | [144242, 64]   | --        |
| │    └─(2)LGConv       | [144242, 64], [2, 8192] | [144242, 64]   | --        |
+------------------------+-------------------------+----------------+-----------+


In [9]:
def train():
    total_loss = total_examples = 0

    for index in tqdm(train_loader):
        # Sample positive and negative labels.
        pos_edge_label_index = train_edge_label_index[:, index]
        neg_edge_label_index = torch.stack([
            pos_edge_label_index[0],
            torch.randint(num_users, num_users + num_books,
                          (index.numel(), ), device=device)
        ], dim=0)
        edge_label_index = torch.cat([
            pos_edge_label_index,
            neg_edge_label_index,
        ], dim=1)

        optimizer.zero_grad()

        # WHY DOES IT USE data.edge_index here??
        # but don't worry, it should be homogeneized
        pos_rank, neg_rank = model(data.edge_index, edge_label_index).chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    return total_loss / total_examples

In [10]:
@torch.no_grad()
def test(k: int):
    # WHY DOES IT USE DATA.EDGE INDEX NOW??? AND NOT 
    emb = model.get_embedding(data.edge_index)
    user_emb, book_emb = emb[:num_users], emb[num_users:]

    print((model.convs[0](model.embedding.weight, data.edge_index) == 0).all())

    precision = recall = total_examples = 0
    for start in range(0, num_users, batch_size):
        end = start + batch_size
        logits = user_emb[start:end] @ book_emb.t()

        # print('start:', start, 'end:', end)
        # print('logits.size()', logits.size())

        # Exclude training edges:
        mask = ((train_edge_label_index[0] >= start) &
                (train_edge_label_index[0] < end))
        logits[train_edge_label_index[0, mask] - start,
               train_edge_label_index[1, mask] - num_users] = float('-inf')

        # print('mask.size()', mask.size())

        # Computing precision and recall:
        ground_truth = torch.zeros_like(logits, dtype=torch.bool)
        mask = ((data.edge_label_index[0] >= start) &
                (data.edge_label_index[0] < end))
        ground_truth[data.edge_label_index[0, mask] - start,
                     data.edge_label_index[1, mask] - num_users] = True
        node_count = degree(data.edge_label_index[0, mask] - start,
                            num_nodes=logits.size(0))

        # print('ground_truth.size():', ground_truth.size())
        # print('node_count.size():', node_count.size())

        topk_index = logits.topk(k, dim=-1).indices
        # print('topk_index.size()', topk_index.size())
        isin_mat = ground_truth.gather(1, topk_index)

        precision += float((isin_mat.sum(dim=-1) / k).sum())
        recall += float((isin_mat.sum(dim=-1) / node_count.clamp(1e-6)).sum())
        total_examples += int((node_count > 0).sum())
        print(20*'=')

    return precision / total_examples, recall / total_examples

precision, recall = test(20)

tensor(False)


In [None]:
for epoch in range(1, 5):
    loss = train()
    precision, recall = test(k=20)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Precision@20: '
          f'{precision:.4f}, Recall@20: {recall:.4f}')

100%|████████████████████████████████████████████████████████████████████████████████| 291/291 [24:21<00:00,  5.02s/it]


tensor(False)
Epoch: 001, Loss: 0.5014, Precision@20: 0.0049, Recall@20: 0.0100


  0%|                                                                                          | 0/291 [00:00<?, ?it/s]