In [9]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GraphSAGE
from torch_geometric.loader import NeighborSampler

# Imports
import torch
import torch_cluster
from torch_geometric.nn import Node2Vec
from torch_geometric.datasets import Planetoid
from torchsummary import summary
from tqdm.notebook import tqdm

import ogb
# print(ogb.__version__)

import matplotlib.pyplot as plt
import plotly.express as px
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

import time
import json
import copy
import random
import os.path as osp
from pprint import pprint

import importlib
import sys

sys.path.append('../')

import training
importlib.reload(training)
import graph_vis
importlib.reload(graph_vis)
import graph_creation
importlib.reload(graph_creation)
import utils
importlib.reload(utils)

from ogb.nodeproppred import Evaluator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [10]:
device = utils.set_seeds_and_device() 
dataset,data = training.load_dataset('Cora', "../training_data/datasets")

Using device: cpu
Random seed set to: 42


# 3 aggrs, at the end create illsutration and get embeddings
for i in ['mean','max','lstm']:
    # Define the GraphSAGE model
    model = GraphSAGE(
        in_channels=dataset.num_features,
        hidden_channels=64,
        num_layers=2,
        aggr=i,
        out_channels=dataset.num_classes,
    ).to(device)

In [11]:

# GraphSAGE encoder
encoder = GraphSAGE(
    in_channels=dataset.num_features,
    hidden_channels=64,
    num_layers=2,
    aggr ='lstm',
    out_channels=32,
    dropout=0.5
).to('cpu')

optimizer = torch.optim.Adam(encoder.parameters(), lr=0.005)
sampler = NeighborSampler(data.edge_index, sizes=[10, 10], batch_size=128, shuffle=True, num_nodes=data.num_nodes)

# Manual skip-gram style loss
def unsup_loss(embeddings, pos_edge_index, num_neg=5):
    i, j = pos_edge_index
    z_i = embeddings[i]
    z_j = embeddings[j]
    pos_score = (z_i * z_j).sum(dim=-1)
    pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-15).mean()

    # Negative sampling
    neg_j = torch.randint(0, embeddings.size(0), (i.size(0) * num_neg,))
    z_k = embeddings[neg_j]
    z_i_rep = z_i.repeat_interleave(num_neg, dim=0)
    neg_score = (z_i_rep * z_k).sum(dim=-1)
    neg_loss = -torch.log(torch.sigmoid(-neg_score) + 1e-15).mean()
    return pos_loss + neg_loss

# Training
encoder.train()
for epoch in range(1, 100):
    optimizer.zero_grad()
    out = encoder(data.x, data.edge_index)
    loss = unsup_loss(out, data.edge_index)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Get node embeddings from the trained encoder
encoder.eval()
with torch.no_grad():
    embeddings = encoder(data.x, data.edge_index)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 49927094272 bytes.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Prepare embeddings and labels
X = embeddings.cpu().numpy() if hasattr(embeddings, 'cpu') else embeddings
y = data.y.cpu().numpy() if hasattr(data.y, 'cpu') else data.y

# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=500)
}

results = {}

for name, model in models.items():
    try:
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Test accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
        results[name] = acc
    except Exception as e:
        print(f"{name} failed: {e}")

# Summary table
results_df = pd.DataFrame(list(results.items()), columns=["Model", "Test Accuracy"])
print("\nModel Comparison Summary:")
print(results_df.sort_values(by="Test Accuracy", ascending=False).to_string(index=False))


Training Logistic Regression...
Logistic Regression Test accuracy: 0.4170
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.65      0.77      0.70        43
           2       0.26      0.85      0.40        84
           3       0.53      0.55      0.54       164
           4       0.67      0.34      0.45        85
           5       0.38      0.05      0.09        60
           6       0.00      0.00      0.00        36

    accuracy                           0.42       542
   macro avg       0.36      0.36      0.31       542
weighted avg       0.40      0.42      0.36       542


Training SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Test accuracy: 0.4151
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.65      0.74      0.70        43
           2       0.27      0.90      0.41        84
           3       0.56      0.57      0.56       164
           4       0.81      0.20      0.32        85
           5       0.33      0.10      0.15        60
           6       0.00      0.00      0.00        36

    accuracy                           0.42       542
   macro avg       0.37      0.36      0.31       542
weighted avg       0.43      0.42      0.36       542


Training KNN...
KNN Test accuracy: 0.3875
              precision    recall  f1-score   support

           0       0.19      0.59      0.28        70
           1       0.66      0.77      0.71        43
           2       0.39      0.13      0.20        84
           3       0.54      0.43      0.48       164
           4       0.57      0.48      0.52        85
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Test accuracy: 0.4133
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.68      0.63      0.65        43
           2       0.30      0.79      0.43        84
           3       0.53      0.43      0.48       164
           4       0.52      0.52      0.52        85
           5       0.29      0.15      0.20        60
           6       0.44      0.19      0.27        36

    accuracy                           0.41       542
   macro avg       0.39      0.39      0.36       542
weighted avg       0.40      0.41      0.38       542


Training Gradient Boosting...
Gradient Boosting Test accuracy: 0.4317
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.74      0.74      0.74        43
           2       0.30      0.79      0.43        84
           3       0.55      0.45      0.50       164
           4       0.56  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
