In [35]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GraphSAGE
import torch.nn.functional as F

import importlib

import training
importlib.reload(training)
import graph_vis
importlib.reload(graph_vis)
import graph_creation
importlib.reload(graph_creation)
import utils
importlib.reload(utils)

<module 'utils' from 'c:\\Users\\csaba\\Documents\\Coding\\git_own\\thesis_coding\\utils.py'>

In [36]:
device = utils.set_seeds_and_device() 
dataset,data = training.load_dataset('Cora', "./training_data/datasets")


Using device: cpu
Random seed set to: 42


In [37]:
# Train function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    # cross entroy loss makes this model supervised 
    loss.backward()
    optimizer.step()
    return loss.item()

# Test function
@torch.no_grad()
def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        correct = pred[mask] == data.y[mask]
        accs.append(int(correct.sum()) / int(mask.sum()))
    return accs


In [None]:
# Define the GraphSAGE model
model = GraphSAGE(
    in_channels=dataset.num_features,
    hidden_channels=64,
    num_layers=2,
    aggr ='mean',
    out_channels=32,
    dropout=0.5
).to('cpu')

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 101):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, '
          f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch 001, Loss: 3.4512, Train: 0.2786, Val: 0.2700, Test: 0.2440
Epoch 002, Loss: 3.2432, Train: 0.2929, Val: 0.2820, Test: 0.2520
Epoch 003, Loss: 2.4522, Train: 0.2571, Val: 0.1280, Test: 0.1410
Epoch 004, Loss: 2.3658, Train: 0.1429, Val: 0.0720, Test: 0.0910
Epoch 005, Loss: 2.3237, Train: 0.4000, Val: 0.3720, Test: 0.3950
Epoch 006, Loss: 2.0291, Train: 0.5071, Val: 0.4860, Test: 0.5020
Epoch 007, Loss: 1.8088, Train: 0.4429, Val: 0.4260, Test: 0.4310
Epoch 008, Loss: 1.6611, Train: 0.4143, Val: 0.3480, Test: 0.3260
Epoch 009, Loss: 1.6332, Train: 0.4571, Val: 0.3720, Test: 0.3540
Epoch 010, Loss: 1.4882, Train: 0.4357, Val: 0.3520, Test: 0.3370
Epoch 011, Loss: 1.3094, Train: 0.5643, Val: 0.3680, Test: 0.3620
Epoch 012, Loss: 1.2371, Train: 0.8214, Val: 0.5740, Test: 0.5500
Epoch 013, Loss: 0.9345, Train: 0.9000, Val: 0.6340, Test: 0.6330
Epoch 014, Loss: 0.7152, Train: 0.9500, Val: 0.6580, Test: 0.6800
Epoch 015, Loss: 0.5963, Train: 0.9857, Val: 0.7340, Test: 0.7450
Epoch 016,

In [39]:
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index)

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Prepare embeddings and labels
X = embeddings.cpu().numpy() if hasattr(embeddings, 'cpu') else embeddings
y = data.y.cpu().numpy() if hasattr(data.y, 'cpu') else data.y

# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=500)
}

results = {}

for name, model in models.items():
    try:
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Test accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
        results[name] = acc
    except Exception as e:
        print(f"{name} failed: {e}")

# Summary table
results_df = pd.DataFrame(list(results.items()), columns=["Model", "Test Accuracy"])
print("\nModel Comparison Summary:")
print(results_df.sort_values(by="Test Accuracy", ascending=False).to_string(index=False))


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Test accuracy: 0.8358
              precision    recall  f1-score   support

           0       0.74      0.64      0.69        70
           1       0.88      0.84      0.86        43
           2       0.89      0.94      0.91        84
           3       0.83      0.87      0.85       164
           4       0.85      0.84      0.84        85
           5       0.89      0.85      0.87        60
           6       0.76      0.81      0.78        36

    accuracy                           0.84       542
   macro avg       0.83      0.83      0.83       542
weighted avg       0.83      0.84      0.83       542


Training SVM...
SVM Test accuracy: 0.8100
              precision    recall  f1-score   support

           0       0.59      0.51      0.55        70
           1       0.88      0.81      0.84        43
           2       0.94      0.90      0.92        84
           3       0.76      0.90      0.82       164
           4       0.84      0.81      0.83    