In [1]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt


def visualize_graph(G, color):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos=nx.spring_layout(G, seed=42), with_labels=False,
                     node_color=color, cmap="Set2")
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    h = h.detach().cpu().numpy()
    plt.scatter(h[:, 0], h[:, 1], s=140, c=color, cmap="Set2")
    if epoch is not None and loss is not None:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

2.4.0+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
from torch_geometric.datasets import Planetoid
#from torch_geometric.datasets import KarateClub
dataset = Planetoid("","Cora")
#dataset=KarateClub()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7


Done!


In [3]:
data = dataset[0]  # Get the first graph object.

print(data)
print('==============================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.05
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [4]:
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

edge_index = data.edge_index
print(edge_index)

<IPython.core.display.Javascript object>

tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])


In [5]:
#create adjacency matrics
import numpy as np
x=np.zeros([data.num_nodes,data.num_nodes])
for i in range(edge_index.shape[1]):
    x[edge_index[0,i],edge_index[1,i]]=1
    x[edge_index[1,i],edge_index[0,i]]=1
print(x)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]]


second version of cosine similarity vectorised

In [6]:
#this only works if the matrix x is symmetric, meaning we cannot use it with the embedd
def cosine_similarity(x):
  # Step 1: Normalize each row to have unit norm
  norms = np.linalg.norm(x, axis=1, keepdims=True)
  x_normalized = x / norms

  # Step 2: Compute cosine similarity matrix
  csim = np.dot(x_normalized, x_normalized.T)
  return csim
csim=cosine_similarity(x)
print(csim)

[[1.  0.  0.  ... 0.  0.  0. ]
 [0.  1.  0.  ... 0.  0.  0. ]
 [0.  0.  1.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 1.  0.  0. ]
 [0.  0.  0.  ... 0.  1.  0.5]
 [0.  0.  0.  ... 0.  0.5 1. ]]


In [7]:
def suggestions(node,csim):
    x = csim  # Assuming csim is defined elsewhere as a cosine similarity matrix
    # Use argsort to get indices sorted by their cosine similarity values in ascending order
    indices = np.argsort(x[node])

    # Select the last 10 indices (the ones with the highest values) and reverse to make highest first
    top_indices = indices[-10:][::-1]

    # Print top indices for debug purposes
    for i in top_indices:  # Using 'i' directly to iterate over top_indices
        print(f'node: {i}, cosine similarity: {x[node][i]:.4f}')  # formatted to 4 decimal places

suggestions(10,csim)

node: 10, cosine similarity: 1.0000
node: 1140, cosine similarity: 0.7071
node: 1800, cosine similarity: 0.4082
node: 1986, cosine similarity: 0.0877
node: 306, cosine similarity: 0.0801
node: 890, cosine similarity: 0.0000
node: 906, cosine similarity: 0.0000
node: 905, cosine similarity: 0.0000
node: 904, cosine similarity: 0.0000
node: 903, cosine similarity: 0.0000


Find the suggested nodes using the embed

In [8]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(1234)
        self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        self.classifier = Linear(2, dataset.num_classes)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = h.tanh()
        h = self.conv2(h, edge_index)
        h = h.tanh()
        h = self.conv3(h, edge_index)
        h = h.tanh()  # Final GNN embedding space.

        # Apply a final (linear) classifier.
        out = self.classifier(h)

        return out, h

model = GCN()
print(model)

GCN(
  (conv1): GCNConv(1433, 4)
  (conv2): GCNConv(4, 4)
  (conv3): GCNConv(4, 2)
  (classifier): Linear(in_features=2, out_features=7, bias=True)
)


In [9]:
model = GCN()
_, h = model(data.x, data.edge_index)

In [10]:
import time
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 430})'''))

model = GCN()
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Define optimizer.

def train(data):
    optimizer.zero_grad()  # Clear gradients.
    out, h = model(data.x, data.edge_index)  # Perform a single forward pass.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss, h

for epoch in range(2000):
    loss, h = train(data)

<IPython.core.display.Javascript object>

In [11]:

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim

# Assuming h is the matrix of vectors where each row is a vector
h = h.detach().numpy()  # Make sure h is converted to a numpy array

# Initialize a dictionary to hold cosine similarities
csim = {}
for i in range(h.shape[0]):  # Iterate over each vector (row)
    for j in range(h.shape[0]):
        if i != j:  # Avoid self-similarity (diagonal)
            csim[(i, j)] = cosine_similarity(h[i, :], h[j, :])
        else:
            csim[(i, j)] = 0.0  # Set self-similarity to 0

# Create a dictionary to store the 10 most similar rows for each row
top = {}
for i in range(h.shape[0]):
    # Get all similarities for row i
    similarities = [(j, csim[(i, j)]) for j in range(h.shape[0])]
    # Sort by similarity score in descending order and select top 10 (excluding the row itself)
    top[i] = sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

# Example: print the top 10 most similar rows for row 10
results = top[10]  # List of the 10 most similar rows for row 10
for node, similarity in results:
    print(f'node: {int(node)}, cosine similarity: {similarity:.4f}')  # formatted to 4 decimal places


node: 1692, cosine similarity: 1.0000
node: 1848, cosine similarity: 1.0000
node: 2671, cosine similarity: 1.0000
node: 62, cosine similarity: 1.0000
node: 65, cosine similarity: 1.0000
node: 94, cosine similarity: 1.0000
node: 415, cosine similarity: 1.0000
node: 603, cosine similarity: 1.0000
node: 716, cosine similarity: 1.0000
node: 851, cosine similarity: 1.0000
