In [None]:
import importlib
import sys
sys.path.append('../')

import modules.training as training
importlib.reload(training)
import modules.graph_vis as graph_vis
importlib.reload(graph_vis)
import modules.graph_creation as graph_creation
importlib.reload(graph_creation)
import modules.utils as utils
importlib.reload(utils)

import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv


In [12]:
# Pick the compute device (the helper also reports the random seed it set)
# and load the 'Cora' dataset from the local dataset directory.
device = utils.set_seeds_and_device()
dataset, data = training.load_dataset('Cora', "../training_data/datasets")

# The GCN is moved to `device` when it is built, so the graph tensors must
# live on the same device or the forward pass fails on anything but CPU.
# NOTE(review): assumes `data` is a torch_geometric `Data` object (it exposes
# x / edge_index / y / *_mask below) — confirm against training.load_dataset.
data = data.to(device)

Using device: cpu
Random seed set to: 42


In [13]:
# Two-layer graph convolutional network
class GCN(torch.nn.Module):
    """Two stacked GCNConv layers producing per-node class log-probabilities."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Build the layers.

        Args:
            in_channels: size of each input node feature vector.
            hidden_channels: output size of the first convolution.
            out_channels: output size of the second convolution.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return log-softmax scores (over dim 1) for every node.

        Args:
            x: node feature tensor passed to the first convolution.
            edge_index: graph connectivity passed to both convolutions.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

# Build the network (16 hidden units) on the selected device, then attach an
# Adam optimizer with L2 weight decay to its parameters.
model = GCN(
    in_channels=dataset.num_node_features,
    hidden_channels=16,
    out_channels=dataset.num_classes,
).to(device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# One optimisation step over the full graph
def train():
    """Run a single training step and return the training loss as a float.

    Uses the module-level `model`, `optimizer` and `data`. The loss is the
    negative log-likelihood computed only on nodes selected by
    `data.train_mask`.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    log_probs = model(data.x, data.edge_index)
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Evaluation function
def test():
    """Evaluate the module-level `model` on the train, validation and test masks.

    Returns:
        list[float]: accuracies in the order [train, val, test], each computed
        as (correct predictions in the mask) / (number of nodes in the mask).
    """
    model.eval()
    # Evaluation never back-propagates, so skip autograd bookkeeping; this
    # avoids building a computation graph on every call.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    return [
        int((pred[mask] == data.y[mask]).sum()) / int(mask.sum())
        for mask in (data.train_mask, data.val_mask, data.test_mask)
    ]


In [14]:

# Train for 101 epochs (0..100), evaluating after every step and logging
# the metrics on every tenth epoch.
for epoch in range(101):
    loss = train()
    train_acc, val_acc, test_acc = test()
    if epoch % 10 != 0:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')


Epoch: 000, Loss: 1.9457, Train Acc: 0.5571, Val Acc: 0.3680, Test Acc: 0.3790


Epoch: 010, Loss: 0.7768, Train Acc: 0.9786, Val Acc: 0.7100, Test Acc: 0.7440
Epoch: 020, Loss: 0.2544, Train Acc: 1.0000, Val Acc: 0.7540, Test Acc: 0.7960
Epoch: 030, Loss: 0.1034, Train Acc: 1.0000, Val Acc: 0.7580, Test Acc: 0.7920
Epoch: 040, Loss: 0.0938, Train Acc: 1.0000, Val Acc: 0.7560, Test Acc: 0.7950
Epoch: 050, Loss: 0.0659, Train Acc: 1.0000, Val Acc: 0.7700, Test Acc: 0.7950
Epoch: 060, Loss: 0.0556, Train Acc: 1.0000, Val Acc: 0.7720, Test Acc: 0.7960
Epoch: 070, Loss: 0.0300, Train Acc: 1.0000, Val Acc: 0.7700, Test Acc: 0.8000
Epoch: 080, Loss: 0.0410, Train Acc: 1.0000, Val Acc: 0.7700, Test Acc: 0.7940
Epoch: 090, Loss: 0.0446, Train Acc: 1.0000, Val Acc: 0.7680, Test Acc: 0.7910
Epoch: 100, Loss: 0.0406, Train Acc: 1.0000, Val Acc: 0.7760, Test Acc: 0.7970


In [15]:
# Run the trained GCN once in eval mode, without gradient tracking, and keep
# its output as the per-node representation.
# NOTE: GCN.forward ends in log_softmax, so `embeddings` holds per-node class
# log-probabilities rather than hidden-layer features.
model.eval()
with torch.no_grad():
    embeddings = model(x=data.x, edge_index=data.edge_index)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Prepare the feature matrix and labels as NumPy arrays (tensors are moved to
# CPU first; anything without a `.cpu` attribute is used as-is).
X = embeddings.cpu().numpy() if hasattr(embeddings, 'cpu') else embeddings
y = data.y.cpu().numpy() if hasattr(data.y, 'cpu') else data.y

# Stratified 80/20 train/test split.
# NOTE(review): this split is random over all nodes and ignores
# data.train_mask / val_mask / test_mask, so held-out rows can include nodes
# whose labels the GCN was trained on — treat the accuracies as optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate downstream classifiers. Every estimator with internal randomness
# gets a fixed random_state so Restart & Run All reproduces the same table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=500, random_state=42)
}

results = {}

# The loop variable is deliberately NOT called `model`: that name is the
# trained GCN used by train() / test() and the embedding cell, and rebinding
# it here would break any re-run of those cells.
for name, clf in models.items():
    try:
        print(f"\nTraining {name}...")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Test accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
        results[name] = acc
    except Exception as e:
        # Best-effort comparison: report the failure and continue with the
        # remaining classifiers instead of aborting the whole cell.
        print(f"{name} failed: {e}")

# Summary table, best classifier first
results_df = pd.DataFrame(list(results.items()), columns=["Model", "Test Accuracy"])
print("\nModel Comparison Summary:")
print(results_df.sort_values(by="Test Accuracy", ascending=False).to_string(index=False))



Training Logistic Regression...
Logistic Regression Test accuracy: 0.8358
              precision    recall  f1-score   support

           0       0.80      0.63      0.70        70
           1       0.81      0.81      0.81        43
           2       0.86      0.93      0.89        84
           3       0.81      0.90      0.85       164
           4       0.93      0.82      0.88        85
           5       0.84      0.78      0.81        60
           6       0.79      0.86      0.83        36

    accuracy                           0.84       542
   macro avg       0.84      0.82      0.82       542
weighted avg       0.84      0.84      0.83       542


Training SVM...
SVM Test accuracy: 0.8339
              precision    recall  f1-score   support

           0       0.71      0.70      0.71        70
           1       0.84      0.84      0.84        43
           2       0.87      0.93      0.90        84
           3       0.83      0.88      0.85       164
           4  