In [1]:
import scipy.sparse as sp
import numpy as np
import json
from pathlib import Path
path = Path('data_2024')
adj = sp.load_npz(path/'adj.npz')
feat  = np.load(path/'features.npy')
labels = np.load(path/'labels.npy')
splits = json.load(open(path/'splits.json'))
idx_train, idx_test = splits['idx_train'], splits['idx_test']

In [2]:
idx_test

[2119,
 2206,
 2165,
 1994,
 520,
 521,
 2103,
 1838,
 1840,
 116,
 134,
 2477,
 1522,
 821,
 1393,
 1983,
 2136,
 2478,
 1182,
 1617,
 899,
 1898,
 1592,
 903,
 815,
 1395,
 871,
 1596,
 1188,
 513,
 791,
 1296,
 1608,
 1149,
 1568,
 1625,
 1101,
 829,
 1595,
 1912,
 1856,
 308,
 603,
 1122,
 566,
 2175,
 2066,
 1556,
 339,
 1264,
 531,
 2266,
 3,
 191,
 423,
 2054,
 1098,
 338,
 2215,
 2184,
 302,
 982,
 725,
 362,
 1931,
 2178,
 745,
 1248,
 376,
 171,
 646,
 1569,
 1778,
 1974,
 321,
 156,
 2265,
 1814,
 1050,
 868,
 846,
 1969,
 2382,
 1508,
 636,
 428,
 2164,
 1044,
 1378,
 2259,
 1265,
 809,
 2387,
 1056,
 268,
 1459,
 1509,
 1158,
 1107,
 1024,
 755,
 293,
 2218,
 194,
 770,
 661,
 757,
 1450,
 1328,
 1151,
 986,
 1256,
 325,
 827,
 1300,
 1890,
 607,
 76,
 292,
 2237,
 236,
 1474,
 203,
 1065,
 2317,
 724,
 2308,
 2236,
 100,
 1423,
 1824,
 140,
 1641,
 148,
 1672,
 1540,
 240,
 2145,
 364,
 1291,
 921,
 637,
 1085,
 1484,
 139,
 1587,
 758,
 974,
 1257,
 1476,
 43,
 878,
 226

In [6]:
idx_test_filtered = [idx for idx in idx_test if idx < len(data.y)]
idx_test_filtered

[116,
 134,
 308,
 339,
 3,
 191,
 423,
 338,
 302,
 362,
 376,
 171,
 321,
 156,
 428,
 268,
 293,
 194,
 325,
 76,
 292,
 236,
 203,
 100,
 140,
 148,
 240,
 364,
 139,
 43,
 391,
 283,
 444,
 48,
 220,
 402,
 495,
 92,
 381,
 73,
 252,
 375,
 59,
 434,
 417,
 132,
 397,
 144,
 427,
 479,
 291,
 58,
 347,
 89,
 52,
 154,
 84,
 87,
 195,
 424,
 482,
 259,
 377,
 324,
 305,
 36,
 164,
 463,
 290,
 301,
 88,
 95,
 13,
 342,
 60,
 173,
 404,
 68,
 411,
 416,
 24,
 146,
 26,
 329,
 189,
 300,
 178,
 12,
 165,
 65,
 449,
 387,
 368,
 288,
 216,
 30,
 208,
 136,
 118,
 465,
 142,
 420,
 255,
 408,
 281,
 179,
 390,
 235,
 201,
 232,
 109,
 326,
 320,
 488,
 63,
 273,
 222,
 241,
 137,
 275,
 262,
 307,
 61,
 374,
 72,
 448,
 166,
 353,
 248,
 454,
 57,
 101,
 401,
 238,
 67,
 441,
 117,
 27,
 344,
 224,
 168,
 299,
 340,
 352,
 229,
 373,
 108,
 93,
 199,
 50,
 162,
 192,
 97,
 357,
 243,
 276,
 271,
 227,
 456,
 193,
 486,
 251,
 246,
 187,
 16,
 438,
 221,
 367,
 429,
 372,
 230,
 494,
 3

In [2]:
from torch_geometric.utils import from_scipy_sparse_matrix
edge_index = from_scipy_sparse_matrix(adj)
edge_index

(tensor([[   0,    0,    0,  ..., 2478, 2478, 2479],
         [1084, 1104, 1288,  ...,  931,  933,  999]]),
 tensor([1., 1., 1.,  ..., 1., 1., 1.]))

In [4]:
len(splits['idx_train']), len(splits['idx_test'])

(496, 1984)

In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

data = Data(x=torch.tensor(feat, dtype=torch.float),
            edge_index=edge_index[0],
            y=torch.tensor(labels, dtype=torch.long))

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import APPNP

class GraphSage(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes):
        super().__init__()
        self.appnp = APPNP(K=100, alpha=0.8, dropout=0.6, cached=True, add_self_loops=True, normalize=True)
        self.conv1 = SAGEConv(num_node_features, num_hidden)
        self.conv2 = SAGEConv(num_hidden, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.message_passing(x, edge_index, self.conv1)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.message_passing(x, edge_index, self.conv2)
        
        return F.log_softmax(x, dim=1)
    
    def message_passing(self, x, edge_index, conv):
        # Perform message passing with the given convolutional layer
        return conv(x, edge_index)

In [14]:
data.y

tensor([5, 2, 2, 0, 1, 2, 2, 3, 1, 2, 2, 2, 0, 2, 1, 1, 3, 1, 2, 1, 0, 4, 2, 1,
        3, 2, 2, 3, 1, 2, 6, 2, 5, 2, 2, 3, 0, 1, 0, 2, 5, 3, 6, 4, 2, 2, 3, 2,
        3, 6, 4, 4, 1, 2, 3, 6, 2, 1, 2, 5, 2, 1, 5, 5, 0, 3, 2, 2, 6, 2, 1, 2,
        4, 3, 3, 3, 2, 3, 2, 6, 1, 0, 1, 0, 1, 3, 0, 2, 2, 1, 0, 6, 6, 6, 2, 1,
        6, 3, 3, 2, 3, 2, 6, 3, 1, 3, 0, 3, 0, 2, 0, 3, 6, 4, 1, 2, 6, 3, 2, 1,
        2, 4, 1, 2, 0, 0, 2, 2, 4, 2, 6, 1, 6, 3, 3, 2, 3, 1, 4, 0, 6, 5, 3, 5,
        1, 2, 0, 1, 1, 3, 3, 2, 3, 3, 3, 1, 0, 3, 3, 1, 1, 4, 4, 1, 2, 1, 3, 2,
        2, 6, 1, 2, 3, 6, 6, 3, 6, 2, 4, 2, 5, 0, 3, 6, 0, 6, 3, 6, 2, 1, 2, 4,
        1, 3, 1, 0, 2, 2, 2, 2, 3, 2, 0, 0, 0, 2, 4, 2, 0, 1, 2, 2, 4, 0, 3, 2,
        3, 2, 2, 4, 1, 6, 6, 6, 4, 6, 6, 1, 3, 3, 2, 1, 6, 6, 2, 3, 1, 0, 3, 2,
        2, 2, 2, 6, 2, 4, 4, 5, 4, 2, 2, 1, 2, 0, 4, 1, 1, 3, 2, 1, 2, 5, 2, 2,
        0, 4, 6, 1, 2, 3, 2, 5, 2, 4, 4, 6, 1, 5, 2, 5, 6, 1, 2, 3, 1, 3, 1, 4,
        2, 2, 1, 1, 3, 2, 3, 3, 2, 4, 6,

In [13]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, KFold
import numpy as np

best_val_acc = 0.0
best_model = None
best_test_acc = 0.0
best_test_predictions = None

k_folds = 8
kf = KFold(n_splits=k_folds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

accuracies = []
test_accuracies = []

for fold, (train_fold_idx, val_idx) in enumerate(kf.split(data.y)):
    print(f"Fold {fold + 1}/{k_folds}")

    data = data.to(device)

    model = GraphSage(num_node_features=data.x.shape[1],
                      num_hidden=128,
                      num_classes=(data.y.max() + 1).item()
                      ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=.01, weight_decay=5e-3)

    best_epoch = 0
    for epoch in range(200):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[train_fold_idx], data.y[train_fold_idx])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            pred = model(data).argmax(dim=1)
            val_acc = pred[val_idx].eq(data.y[val_idx]).sum().item() / len(val_idx)

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_epoch = epoch
                best_model = model.state_dict().copy()

    print(f'Validation Accuracy: {val_acc} at Epoch {best_epoch}')

    # Load the best model for inference
    model.load_state_dict(best_model)
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
        acc = pred[idx_test_filtered].eq(data.y[idx_test_filtered]).sum().item() / len(idx_test_filtered)
      #  print(f"Test Accuracy: {acc}")
        if acc > best_test_acc:
            best_test_acc = acc
            best_test_predictions = pred[idx_test]
          #  np.savetxt('submissiontest.txt', best_test_predictions, fmt='%d')

Fold 1/8
Validation Accuracy: 0.1935483870967742 at Epoch 0
Fold 2/8
Validation Accuracy: 0.27419354838709675 at Epoch 0
Fold 3/8
Validation Accuracy: 0.16129032258064516 at Epoch 0
Fold 4/8
Validation Accuracy: 0.16129032258064516 at Epoch 0
Fold 5/8
Validation Accuracy: 0.22580645161290322 at Epoch 0
Fold 6/8
Validation Accuracy: 0.24193548387096775 at Epoch 0
Fold 7/8
Validation Accuracy: 0.0967741935483871 at Epoch 0
Fold 8/8
Validation Accuracy: 0.1935483870967742 at Epoch 0


# Hyperparameter testing

In [27]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, KFold
import numpy as np

best_val_acc = 0.0
best_model = None
best_test_acc = 0.0
best_test_predictions = None

k_folds = 15
kf = KFold(n_splits=k_folds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

accuracies = []
test_accuracies = []

for fold, (train_fold_idx, val_idx) in enumerate(kf.split(data.y)):
    print(f"Fold {fold + 1}/{k_folds}")
    data = data.to(device)

    model = GraphSage(num_node_features=data.x.shape[1],
                      num_hidden=128,
                      num_classes=(data.y.max() + 1).item()
                      ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=3.5, weight_decay=5e-3)

    best_epoch = 0
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[train_fold_idx], data.y[train_fold_idx])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            pred = model(data).argmax(dim=1)
            val_acc = pred[val_idx].eq(data.y[val_idx]).sum().item() / len(val_idx)

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_epoch = epoch
                best_model = model.state_dict().copy()

    print(f"Validation Accuracy: {val_acc} at Epoch {best_epoch}")

    # Load the best model for inference
    model.load_state_dict(best_model)
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
        acc = pred[idx_test_filtered].eq(data.y[idx_test_filtered]).sum().item() / len(idx_test_filtered)
        print(f"Test Accuracy: {acc}")
        if acc > best_test_acc:
            best_test_acc = acc
            best_test_predictions = pred[idx_test]
          #  np.savetxt('submissiontest.txt', best_test_predictions, fmt='%d')
    print(f"Best Validation Accuracy: {best_val_acc} at Epoch {best_epoch}")

Fold 1/15
Validation Accuracy: 0.35294117647058826 at Epoch 70
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 70
Fold 2/15
Validation Accuracy: 0.21212121212121213 at Epoch 0
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 0
Fold 3/15
Validation Accuracy: 0.24242424242424243 at Epoch 0
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 0
Fold 4/15
Validation Accuracy: 0.2727272727272727 at Epoch 0
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 0
Fold 5/15
Validation Accuracy: 0.18181818181818182 at Epoch 0
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 0
Fold 6/15
Validation Accuracy: 0.12121212121212122 at Epoch 0
Test Accuracy: 0.3012345679012346
Best Validation Accuracy: 0.47058823529411764 at Epoch 0
Fold 7/15
Validation Accuracy: 0.24242424242424243 at Epoch 0
Test Accuracy: 0.30

In [14]:
pred[idx_test_filtered]

tensor([2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 2, 2, 0, 2, 2, 3, 1, 1, 1, 2, 2, 6, 1, 2, 0, 2, 1, 6, 2, 0,
        2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 0, 3, 0, 0, 2, 2, 6, 2, 2, 2, 2, 3, 1,
        2, 2, 2, 2, 3, 2, 6, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 1, 2, 2, 2, 2, 2, 1,
        2, 2, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 2, 1, 2, 6, 3, 2, 1, 2, 2, 1,
        6, 2, 2, 2, 1, 0, 2, 0, 4, 6, 2, 2, 1, 2, 0, 2, 3, 2, 2, 1, 2, 6, 3, 2,
        1, 2, 0, 1, 0, 6, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 1, 6, 2, 2, 2, 6, 3,
        6, 2, 6, 2, 1, 3, 2, 2, 2, 2, 1, 6, 3, 1, 2, 0, 2, 2, 1, 4, 2, 1, 2, 6,
        2, 2, 4, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 3, 2, 2, 6,
        2, 2, 1, 3, 3, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 6, 2, 3, 2, 2, 0, 0, 0,
        2, 6, 2, 2, 0, 2, 1, 2, 0, 1, 2, 2, 2, 2, 6, 2, 2, 2, 6, 0, 2, 6, 2, 2,
        2, 6, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 0, 6, 2, 2, 2, 2, 2, 2,
        2, 2, 6, 4, 2, 2, 1, 2, 1, 2, 6,

In [18]:
data.y

tensor([5, 2, 2, 0, 1, 2, 2, 3, 1, 2, 2, 2, 0, 2, 1, 1, 3, 1, 2, 1, 0, 4, 2, 1,
        3, 2, 2, 3, 1, 2, 6, 2, 5, 2, 2, 3, 0, 1, 0, 2, 5, 3, 6, 4, 2, 2, 3, 2,
        3, 6, 4, 4, 1, 2, 3, 6, 2, 1, 2, 5, 2, 1, 5, 5, 0, 3, 2, 2, 6, 2, 1, 2,
        4, 3, 3, 3, 2, 3, 2, 6, 1, 0, 1, 0, 1, 3, 0, 2, 2, 1, 0, 6, 6, 6, 2, 1,
        6, 3, 3, 2, 3, 2, 6, 3, 1, 3, 0, 3, 0, 2, 0, 3, 6, 4, 1, 2, 6, 3, 2, 1,
        2, 4, 1, 2, 0, 0, 2, 2, 4, 2, 6, 1, 6, 3, 3, 2, 3, 1, 4, 0, 6, 5, 3, 5,
        1, 2, 0, 1, 1, 3, 3, 2, 3, 3, 3, 1, 0, 3, 3, 1, 1, 4, 4, 1, 2, 1, 3, 2,
        2, 6, 1, 2, 3, 6, 6, 3, 6, 2, 4, 2, 5, 0, 3, 6, 0, 6, 3, 6, 2, 1, 2, 4,
        1, 3, 1, 0, 2, 2, 2, 2, 3, 2, 0, 0, 0, 2, 4, 2, 0, 1, 2, 2, 4, 0, 3, 2,
        3, 2, 2, 4, 1, 6, 6, 6, 4, 6, 6, 1, 3, 3, 2, 1, 6, 6, 2, 3, 1, 0, 3, 2,
        2, 2, 2, 6, 2, 4, 4, 5, 4, 2, 2, 1, 2, 0, 4, 1, 1, 3, 2, 1, 2, 5, 2, 2,
        0, 4, 6, 1, 2, 3, 2, 5, 2, 4, 4, 6, 1, 5, 2, 5, 6, 1, 2, 3, 1, 3, 1, 4,
        2, 2, 1, 1, 3, 2, 3, 3, 2, 4, 6,

In [17]:
data.y[idx_test_filtered].shape

torch.Size([405])

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, KFold
import numpy as np

best_val_acc = 0.0
best_model = None
best_test_acc = 0.0
best_test_predictions = None

k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

accuracies = []
test_accuracies = []

for fold, (train_fold_idx, val_idx) in enumerate(kf.split(data.y)):
    print(f"Fold {fold + 1}/{k_folds}")

    data = data.to(device)

    model = GAT(num_node_features=data.x.shape[1],
                      num_hidden=100,
                      num_classes=(data.y.max() + 1).item()
                      ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=2.0, weight_decay=5e-3)

    best_epoch = 0
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss =  F.nll_loss(out[train_idx], data.y[train_idx])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            pred = model(data).argmax(dim=1)
            val_acc = pred[val_idx].eq(data.y[val_idx]).sum().item() / len(val_idx)
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_epoch = epoch
                best_model = model.state_dict().copy()

    print(f"Best Validation Accuracy: {best_val_acc} at Epoch {best_epoch}")

    # Load the best model for inference
    model.load_state_dict(best_model)
    model.eval()
    with torch.no_grad():
        if acc > best_test_acc:
            best_test_acc = acc
            best_test_predictions = pred[idx_test]
            np.savetxt('submissiontest.txt', best_test_predictions, fmt='%d')
