In [1]:
import scipy.sparse as sp
import numpy as np
import json
from pathlib import Path

path = Path('data_2024')
adj = sp.load_npz(path/'adj.npz')
feat  = np.load(path/'features.npy')
labels = np.load(path/'labels.npy')
splits = json.load(open(path/'splits.json'))
idx_train, idx_test = splits['idx_train'], splits['idx_test']

In [2]:
from torch_geometric.utils import from_scipy_sparse_matrix
edge_index = from_scipy_sparse_matrix(adj)
edge_index

(tensor([[   0,    0,    0,  ..., 2478, 2478, 2479],
         [1084, 1104, 1288,  ...,  931,  933,  999]]),
 tensor([1., 1., 1.,  ..., 1., 1., 1.]))

In [3]:
labels.shape

(496,)

In [4]:
feat.shape

(2480, 1390)

In [5]:
len(splits['idx_train']), len(splits['idx_test'])

(496, 1984)

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

data = Data(x=torch.tensor(feat, dtype=torch.float),
            edge_index=edge_index[0],
            y=torch.tensor(labels, dtype=torch.long))

In [18]:
data

Data(x=[2480, 1390], edge_index=[2, 10100], y=[496])

In [7]:
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[idx_train] = 1
data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.test_mask[idx_test] = 1

In [20]:
data

Data(x=[2480, 1390], edge_index=[2, 10100], y=[496], train_mask=[2480], test_mask=[2480])

In [8]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class GraphSage(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes):
        super().__init__()
        self.conv1 = SAGEConv(num_node_features, num_hidden)
        self.conv2 = SAGEConv(num_hidden, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


In [9]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, KFold
import numpy as np

train_idx, test_idx = train_test_split(np.arange(len(data.y)), test_size=0.2, random_state=42)

k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

accuracies = []
test_accuracies = []

for fold, (train_fold_idx, val_idx) in enumerate(kf.split(train_idx)):
    print(f"Fold {fold + 1}/{k_folds}")

    data = data.to(device)

    model = GraphSage(num_node_features=data.x.shape[1],
                num_hidden=32,
                num_classes=(data.y.max() + 1).item()
                ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[train_idx[train_fold_idx]], data.y[train_idx[train_fold_idx]])
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
        acc = pred[train_idx[val_idx]].eq(data.y[train_idx[val_idx]]).sum().item() / len(val_idx)
        accuracies.append(acc)
        print(f"Accuracy: {acc}")
        
    with torch.no_grad():
        test_acc = pred[test_idx].eq(data.y[test_idx]).sum().item() / len(test_idx)
        test_accuracies.append(test_acc)
        print(f"Test Accuracy: {test_acc}")


avg_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {avg_accuracy}")
avg_test_accuracy = np.mean(test_accuracies)
print(f"Average Test Accuracy: {avg_test_accuracy}")


Fold 1/5
Accuracy: 0.25
Test Accuracy: 0.26
Fold 2/5
Accuracy: 0.22784810126582278
Test Accuracy: 0.22
Fold 3/5
Accuracy: 0.21518987341772153
Test Accuracy: 0.25
Fold 4/5
Accuracy: 0.189873417721519
Test Accuracy: 0.29
Fold 5/5
Accuracy: 0.11392405063291139
Test Accuracy: 0.25
Average Accuracy: 0.19936708860759494
Average Test Accuracy: 0.254


In [10]:
model.eval()
pred = model(data).argmax(dim=1)
acc =  pred[test_idx].eq(data.y[test_idx]).sum().item() / len(test_idx)
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.2500


In [11]:
#try normalizing 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
feat_normalized = scaler.fit_transform(feat)

In [12]:
data1 = Data(x=torch.tensor(feat_normalized, dtype=torch.float),
            edge_index=edge_index[0],
            y=torch.tensor(labels, dtype=torch.long))

In [13]:
data1.train_mask = torch.zeros(data1.num_nodes, dtype=torch.bool)
data1.train_mask[idx_train] = 1
data1.test_mask = torch.zeros(data1.num_nodes, dtype=torch.bool)
data1.test_mask[idx_test] = 1

In [14]:
data1

Data(x=[2480, 1390], edge_index=[2, 10100], y=[496], train_mask=[2480], test_mask=[2480])

In [15]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, KFold
import numpy as np

train_idx, test_idx = train_test_split(np.arange(len(data1.y)), test_size=0.2, random_state=42)

k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

accuracies = []
test_accuracies = []

for fold, (train_fold_idx, val_idx) in enumerate(kf.split(train_idx)):
    print(f"Fold {fold + 1}/{k_folds}")

    data1 = data1.to(device)

    model = GraphSage(num_node_features=data1.x.shape[1],
                num_hidden=32,
                num_classes=(data1.y.max() + 1).item()
                ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data1)
        loss = F.nll_loss(out[train_idx[train_fold_idx]], data1.y[train_idx[train_fold_idx]])
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        pred = model(data1).argmax(dim=1)
        acc = pred[train_idx[val_idx]].eq(data1.y[train_idx[val_idx]]).sum().item() / len(val_idx)
        accuracies.append(acc)
        print(f"Accuracy: {acc}")
        
    with torch.no_grad():
        test_acc = pred[test_idx].eq(data1.y[test_idx]).sum().item() / len(test_idx)
        test_accuracies.append(test_acc)
        print(f"Test Accuracy: {test_acc}")


avg_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {avg_accuracy}")
avg_test_accuracy = np.mean(test_accuracies)
print(f"Average Test Accuracy: {avg_test_accuracy}")


Fold 1/5
Accuracy: 0.25
Test Accuracy: 0.24
Fold 2/5
Accuracy: 0.1518987341772152
Test Accuracy: 0.22
Fold 3/5
Accuracy: 0.16455696202531644
Test Accuracy: 0.28
Fold 4/5
Accuracy: 0.11392405063291139
Test Accuracy: 0.2
Fold 5/5
Accuracy: 0.13924050632911392
Test Accuracy: 0.27
Average Accuracy: 0.1639240506329114
Average Test Accuracy: 0.242


In [16]:
model.eval()
pred = model(data).argmax(dim=1)
acc =  pred[test_idx].eq(data1.y[test_idx]).sum().item() / len(test_idx)
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.1900
