In [None]:
# Install required packages.
import os
import torch
# Export the installed torch version as an environment variable so the shell
# commands below can read it as ${TORCH} when building the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are resolved against the extra index given by
# -f, whose URL embeds the torch version exported above.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from its GitHub repository, unpinned.
# NOTE(review): consider pinning a release/commit so re-runs are reproducible.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt


def visualize_graph(G, color):
    """Draw graph ``G`` on a 7x7 figure without axis ticks and show it.

    Node positions come from a spring layout with a fixed seed (42), so the
    layout is reproducible for a given graph. ``color`` is forwarded as the
    node colour specification and mapped through the "Set2" colormap; node
    labels are not drawn.
    """
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])

    layout = nx.spring_layout(G, seed=42)
    draw_options = dict(pos=layout, with_labels=False, node_color=color, cmap="Set2")
    nx.draw_networkx(G, **draw_options)

    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    """Scatter-plot the first two columns of embedding tensor ``h`` and show it.

    ``h`` is detached, moved to the CPU and converted to NumPy before
    plotting. ``color`` is forwarded as the point colours ("Set2" colormap).
    When both ``epoch`` and ``loss`` are given, they are written below the
    plot as the x-axis label; ``loss`` must then expose ``.item()``.
    """
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])

    points = h.detach().cpu().numpy()
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=140, c=color, cmap="Set2")

    show_caption = not (epoch is None or loss is None)
    if show_caption:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)

    plt.show()

2.4.0+cu121
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from torch_geometric.datasets import KarateClub

# Alternative dataset, kept for reference (uncomment both lines to use Cora):
#from torch_geometric.datasets import Planetoid
#dataset = Planetoid("","Cora")

dataset = KarateClub()

# Report the dataset summary, one item per line, in a single print call.
print('\n'.join((
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
)))

Dataset: KarateClub():
Number of graphs: 1
Number of features: 34
Number of classes: 4


In [None]:
# Work with the first graph object of the dataset.
data = dataset[0]

print(data)
print('==============================================================')

# Summary statistics about the graph, one per line in a single print call.
print('\n'.join((
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Number of training nodes: {data.train_mask.sum()}',
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
)))

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])
Number of nodes: 34
Number of edges: 156
Average node degree: 4.59
Number of training nodes: 4
Training node label rate: 0.12
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [None]:
from IPython.display import Javascript

# Restrict the height of this cell's output (uses the google.colab JS API).
display(
    Javascript(
        '''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''
    )
)

# Edge list of the graph; it is reused below to build the adjacency matrix.
edge_index = data.edge_index
print(edge_index)

<IPython.core.display.Javascript object>

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,
          3,  3,  3,  3,  3,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,
          7,  7,  8,  8,  8,  8,  8,  9,  9, 10, 10, 10, 11, 12, 12, 13, 13, 13,
         13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21,
         21, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27,
         27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
         31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
         33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
        [ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 17, 19, 21, 31,  0,  2,
          3,  7, 13, 17, 19, 21, 30,  0,  1,  3,  7,  8,  9, 13, 27, 28, 32,  0,
          1,  2,  7, 12, 13,  0,  6, 10,  0,  6, 10, 16,  0,  4,  5, 16,  0,  1,
          2,  3,  0,  2, 30, 32, 33,  2, 33,  0,  4

In [None]:
# Create the dense adjacency matrix from the edge list.
import numpy as np

# Row 0 of edge_index holds one endpoint of every edge, row 1 the other.
src_nodes, dst_nodes = edge_index.tolist()

x = np.zeros([data.num_nodes, data.num_nodes])
# Mark every edge in both directions so the matrix is symmetric.
x[src_nodes, dst_nodes] = 1
x[dst_nodes, src_nodes] = 1
print(x)

[[0. 1. 1. ... 1. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 1. 0.]
 ...
 [1. 0. 0. ... 0. 1. 1.]
 [0. 0. 1. ... 1. 0. 1.]
 [0. 0. 0. ... 1. 1. 0.]]


Second version of cosine similarity, vectorised with NumPy.

In [None]:
def cosine_similarity(x):
    """Return the pairwise cosine-similarity matrix of the rows of ``x``.

    Every row of ``x`` is treated as one vector, so ``x`` can be any 2-D
    array of shape ``(n, d)``; it does not need to be square or symmetric.

    Rows with zero norm have no direction. Their similarity to every row
    (themselves included) is reported as 0 instead of the NaN that a plain
    division by the norm would produce.

    NOTE: a later cell defines another ``cosine_similarity`` that takes two
    vectors. Once that cell has run, this matrix version is shadowed until
    this cell is executed again.

    Parameters
    ----------
    x : array_like, shape (n, d)
        One vector per row.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        Entry ``[i, j]`` is the cosine similarity between rows ``i`` and ``j``.
    """
    x = np.asarray(x)

    # Step 1: scale each row to unit length. Zero norms are replaced by 1 so
    # that all-zero rows stay all-zero instead of turning into NaN.
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    safe_norms = np.where(norms == 0, 1.0, norms)
    x_normalized = x / safe_norms

    # Step 2: dot products of unit vectors are the cosine similarities.
    csim = np.dot(x_normalized, x_normalized.T)
    return csim
# Similarity between every pair of rows of the adjacency matrix x built above.
csim=cosine_similarity(x)
print(csim)

[[1.         0.58333333 0.39528471 ... 0.         0.21650635 0.24253563]
 [0.58333333 1.         0.42163702 ... 0.13608276 0.19245009 0.24253563]
 [0.39528471 0.42163702 1.         ... 0.38729833 0.09128709 0.46017899]
 ...
 [0.         0.13608276 0.38729833 ... 1.         0.11785113 0.19802951]
 [0.21650635 0.19245009 0.09128709 ... 0.11785113 1.         0.70014004]
 [0.24253563 0.24253563 0.46017899 ... 0.19802951 0.70014004 1.        ]]


In [None]:
def suggestions(node, csim, k=10):
    """Print the ``k`` entries most similar to ``node``, most similar first.

    Parameters
    ----------
    node : int
        Row of ``csim`` to rank.
    csim : 2-D array
        Similarity matrix; ``csim[node][i]`` is read as the similarity
        between ``node`` and ``i``.
    k : int, default 10
        How many entries to print. If ``k`` exceeds the row length, every
        entry is printed; ``k <= 0`` prints nothing.

    Notes
    -----
    ``node`` itself is not filtered out, so with a cosine-similarity matrix
    it normally shows up as the first line with similarity 1.
    """
    if k <= 0:
        # Guard: a slice of [-0:] would select the whole row, not nothing.
        return

    scores = np.asarray(csim[node])

    # argsort is ascending, so the k best sit at the end; reverse them to put
    # the highest similarity first.
    top_indices = np.argsort(scores)[-k:][::-1]

    for i in top_indices:
        print(f'node: {i}, cosine similarity: {scores[i]:.4f}')  # formatted to 4 decimal places

# Rank the nodes most similar to node 10 using the adjacency-based similarity.
suggestions(node=10, csim=csim)

node: 10, cosine similarity: 1.0000
node: 6, cosine similarity: 0.8660
node: 11, cosine similarity: 0.5774
node: 17, cosine similarity: 0.4082
node: 12, cosine similarity: 0.4082
node: 16, cosine similarity: 0.4082
node: 21, cosine similarity: 0.4082
node: 19, cosine similarity: 0.3333
node: 4, cosine similarity: 0.3333
node: 0, cosine similarity: 0.2887


Find the suggested nodes using the GCN embedding.

In [None]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Three stacked GCN layers followed by a linear classifier.

    Layer widths are ``dataset.num_features -> 4 -> 4 -> 2``; the 2-D output
    of the last GCN layer is the node embedding, and the classifier maps it
    to ``dataset.num_classes`` scores. Both sizes are read from the
    notebook-level ``dataset``.
    """

    def __init__(self):
        super().__init__()
        # Re-seed the global RNG so every instance starts from the same weights.
        torch.manual_seed(1234)

        hidden_dim, embed_dim = 4, 2
        # Layers are created in the same order as they are applied.
        self.conv1 = GCNConv(dataset.num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, embed_dim)
        self.classifier = Linear(embed_dim, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return ``(out, h)``: classifier scores and the final GNN embedding."""
        h = x
        # Each GCN layer is followed by tanh; after the last one h is the
        # final GNN embedding space.
        for conv in (self.conv1, self.conv2, self.conv3):
            h = conv(h, edge_index).tanh()

        # Apply a final (linear) classifier on top of the embedding.
        return self.classifier(h), h

# Build the network and print its layer structure.
model = GCN()
print(model)

GCN(
  (conv1): GCNConv(34, 4)
  (conv2): GCNConv(4, 4)
  (conv3): GCNConv(4, 2)
  (classifier): Linear(in_features=2, out_features=4, bias=True)
)


In [None]:
# Fresh, untrained model: run one forward pass and keep only the embedding.
model = GCN()

_, h = model(x=data.x, edge_index=data.edge_index)

In [None]:
import time
from IPython.display import Javascript

# Restrict the height of this cell's output (uses the google.colab JS API).
display(
    Javascript(
        '''google.colab.output.setIframeHeight(0, true, {maxHeight: 430})'''
    )
)

# Loss criterion, a freshly initialised model, and its optimizer.
criterion = torch.nn.CrossEntropyLoss()
model = GCN()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.1)

def train(data):
    """Run one optimisation step on ``data`` and return ``(loss, h)``.

    Uses the notebook-level ``model``, ``criterion`` and ``optimizer``. The
    loss is computed only on the nodes selected by ``data.train_mask``. The
    returned embedding ``h`` comes from the forward pass made before the
    parameter update.
    """
    optimizer.zero_grad()  # Start from clean gradients.

    logits, embedding = model(data.x, data.edge_index)  # Single forward pass.

    labelled = data.train_mask
    step_loss = criterion(logits[labelled], data.y[labelled])  # Training nodes only.

    step_loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return step_loss, embedding

# Run 400 training steps. After the loop, `loss` and `h` hold the values
# returned by the last call to train().
for epoch in range(400):
    loss, h = train(data)

<IPython.core.display.Javascript object>

In [None]:
# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between ``vec1`` and ``vec2``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN, so callers that compare or sort
    similarities are not affected by NaN values.

    NOTE: this definition replaces the earlier single-argument (matrix)
    ``cosine_similarity`` defined in a previous cell.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# The training loop leaves `h` as a torch tensor. Convert it only when it still
# is one, so this cell can be re-run without failing on an ndarray.
if isinstance(h, torch.Tensor):
    h = h.detach().cpu().numpy()

TOP_K = 10       # number of most-similar nodes kept for every node
QUERY_NODE = 10  # node whose suggestions are printed at the end of the cell

num_nodes = h.shape[0]  # each row of h is one node's embedding vector

# Cosine similarity of every ordered pair of rows, keyed by (i, j).
csim = {
    (i, j): cosine_similarity(h[i, :], h[j, :])
    for i in range(num_nodes)
    for j in range(num_nodes)
}

# For every node, store [node_index, similarity] rows for its TOP_K most
# similar nodes, best first. No sentinel rows are used: if there are fewer than
# TOP_K nodes, the array simply has fewer rows.
top = {}
for i in range(num_nodes):
    row = np.array([csim[(i, j)] for j in range(num_nodes)], dtype=float)
    # Stable sort on the negated scores: descending similarity, ties broken by
    # the lower node index, NaN values (if any) placed last.
    order = np.argsort(-row, kind="stable")[:TOP_K]
    top[i] = np.column_stack([order, row[order]])

# Example: print the most similar rows for QUERY_NODE.
results = np.array(top[QUERY_NODE])
sorted_results = results  # rows are already ordered from most to least similar

for node, similarity in sorted_results:
    print(f'node: {int(node)}, cosine similarity: {similarity:.4f}')  # formatted to 4 decimal places


node: 10, cosine similarity: 1.0000
node: 4, cosine similarity: 1.0000
node: 5, cosine similarity: 1.0000
node: 6, cosine similarity: 1.0000
node: 16, cosine similarity: 1.0000
node: 11, cosine similarity: 0.0119
node: 0, cosine similarity: 0.0007
node: 24, cosine similarity: -0.0007
node: 25, cosine similarity: -0.0009
node: 3, cosine similarity: -0.0423
