In [23]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix
import scipy.sparse as sp
import numpy as np
import json
from sklearn.decomposition import PCA


adj = sp.load_npz('./data_2024/adj.npz')
feat  = np.load('./data_2024/features.npy')
labels = np.load('./data_2024/labels.npy')
splits = json.load(open('./data_2024/splits.json'))
idx_train, idx_test = splits['idx_train'], splits['idx_test']


# Dimensionality Reduction
n_components = 256
pca = PCA(n_components=n_components)
reduced_feat = pca.fit_transform(feat)


# Converting the reduced features and other arrays to torch tensors
reduced_feat = torch.tensor(reduced_feat, dtype=torch.float)
full_labels = -1 * np.ones(shape=(reduced_feat.shape[0],), dtype=np.int64)
full_labels[idx_train] = labels
labels = torch.tensor(full_labels, dtype=torch.long)



edge_index, _ = from_scipy_sparse_matrix(adj)

# Converting numpy arrays to torch tensors
# feat = torch.tensor(feat, dtype=torch.float)
# full_labels = -1 * np.ones(shape=(feat.shape[0],), dtype=np.int64)
# full_labels[idx_train] = labels
# labels = torch.tensor(full_labels, dtype=torch.long)

data = Data(x=reduced_feat, edge_index=edge_index, y=labels)
# data = Data(x=feat, edge_index=edge_index, y=labels)

train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[idx_train] = True
test_mask[idx_test] = True
data.train_mask = train_mask
data.test_mask = test_mask

idx_train =idx_train + idx_test
train_mask[idx_test] = True
data.train_mask = train_mask
data.train_mask.tolist().count(True)

2480

In [24]:
data

Data(x=[2480, 256], edge_index=[2, 10100], y=[2480], train_mask=[2480], test_mask=[2480])

In [25]:
set(data.y.tolist())

{-1, 0, 1, 2, 3, 4, 5, 6}

In [26]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, num_hidden)
        self.hid1 = GCNConv(num_hidden, 16)
        self.hid2 = GCNConv(16, num_hidden)
        self.conv2 = GCNConv(num_hidden, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.hid1(x, edge_index)
        x = F.dropout(x, training=self.training)
        x = self.hid2(x, edge_index)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

## GCN Model

In [27]:
from sklearn.model_selection import KFold, StratifiedKFold
import torch.nn as nn

device = torch.device('mps' if torch.cuda.is_available() else 'cpu')
k = 10
kf = StratifiedKFold(n_splits=k)
idx_train_np = np.array(idx_train)
labels = data.y.numpy()[idx_train_np]

for fold, (train_idx, val_idx) in enumerate(kf.split(idx_train_np, labels)):
    print(f"Fold {fold+1}/{k}")

    model = GCN(num_node_features=data.x.shape[1], 
                num_hidden=64,
                num_classes=(data.y.max()+1).item()
               ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.09, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    
    
    data.train_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.val_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.train_mask[idx_train_np[train_idx]] = True
    data.val_mask[idx_train_np[val_idx]] = True
    
    best_val_acc = 0 
    best_model_state = None 
    
    for epoch in range(250):
        model.train()
        optimizer.zero_grad()
        out = model(data)
#         loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], ignore_index=-1)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                model.eval()
                pred = model(data).argmax(dim=1)
                
                valid_labels_mask = data.y[data.val_mask] != -1  # Mask to select valid labels not equal to -1
                correct_predictions = pred[data.val_mask][valid_labels_mask] == data.y[data.val_mask][valid_labels_mask]
                correct = correct_predictions.sum()
                acc = int(correct) / int(valid_labels_mask.sum())
                
                if acc > best_val_acc and acc >= 0.84:
                    best_val_acc = acc
                    best_model_state = model.state_dict()
                    
                val_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask], ignore_index=-1)
                print(f'Epoch {epoch}: Train Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val Acc: {acc:.4f}')
            
    if best_model_state is not None:
        torch.save(best_model_state, f'gcn_best_fold_{fold+1}.pt')


Fold 1/10
Epoch 0: Train Loss: 1.9373828172683716, Val Loss: 1.9410227537155151, Val Acc: 0.3000
Epoch 10: Train Loss: 0.39241471886634827, Val Loss: 0.8774828910827637, Val Acc: 0.8400
Epoch 20: Train Loss: 0.1726866066455841, Val Loss: 0.7101258635520935, Val Acc: 0.8600
Epoch 30: Train Loss: 0.10583116114139557, Val Loss: 1.1731590032577515, Val Acc: 0.8200
Epoch 40: Train Loss: 0.1497054100036621, Val Loss: 0.5822405815124512, Val Acc: 0.8400
Epoch 50: Train Loss: 0.05215824767947197, Val Loss: 1.2186845541000366, Val Acc: 0.7800
Epoch 60: Train Loss: 0.04691096022725105, Val Loss: 1.3242735862731934, Val Acc: 0.8000
Epoch 70: Train Loss: 0.04846368357539177, Val Loss: 1.4133754968643188, Val Acc: 0.8000
Epoch 80: Train Loss: 0.058618318289518356, Val Loss: 1.4088554382324219, Val Acc: 0.7800
Epoch 90: Train Loss: 0.05487129092216492, Val Loss: 2.0155463218688965, Val Acc: 0.7600
Epoch 100: Train Loss: 0.0745745450258255, Val Loss: 1.0297975540161133, Val Acc: 0.7800
Epoch 110: Tra

KeyboardInterrupt: 

## GAT Model

In [15]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F

class GAT(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes, heads=12, output_heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_node_features, num_hidden, heads=heads, dropout=0.2)
        self.conv2 = GATConv(num_hidden*heads, num_classes, heads=output_heads, concat=False, dropout=0.1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First Graph Attention Layer
        x = F.dropout(x,training=self.training)
        x = self.conv1(x, edge_index)
        x = F.relu(x)

        # Second Graph Attention Layer
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
    


device = torch.device('mps' if torch.cuda.is_available() else 'cpu')
k = 10

kf = StratifiedKFold(n_splits=k)
idx_train_np = np.array(idx_train)
labels = data.y.numpy()[idx_train_np]

for fold, (train_idx, val_idx) in enumerate(kf.split(idx_train_np, labels)):
    print(f"Fold {fold+1}/{k}")
    model = GAT(num_node_features=data.x.shape[1], 
                num_hidden=128,
                num_classes=(data.y.max()+1).item()
               ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.009, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    
    data.train_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.val_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.train_mask[idx_train_np[train_idx]] = True
    data.val_mask[idx_train_np[val_idx]] = True
    
    best_val_acc = 0 
    best_model_state = None 
    
    for epoch in range(200):
        model.train()
        optimizer.zero_grad()
        out = model(data)
#         loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], ignore_index=-1)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        if epoch % 50 == 0:
            model.eval()
            with torch.no_grad():
                model.eval()
                pred = model(data).argmax(dim=1)
                correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
                acc = int(correct) / int(data.val_mask.sum())
                
                valid_labels_mask = data.y[data.val_mask] != -1  # Mask to select valid labels not equal to -1
                correct_predictions = pred[data.val_mask][valid_labels_mask] == data.y[data.val_mask][valid_labels_mask]
                correct = correct_predictions.sum()
                acc = int(correct) / int(valid_labels_mask.sum())
                
                if acc > best_val_acc and acc >= 0.86:
                    best_val_acc = acc
                    best_model_state = model.state_dict()
                    
                val_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask], ignore_index=-1)
                print(f'Epoch {epoch}: Train Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val Acc: {acc:.4f}')
            
    if best_model_state is not None:
        torch.save(best_model_state, f'gat_best_fold_{fold+1}.pt')


Fold 1/10
Epoch 0: Train Loss: 1.9459551572799683, Val Loss: 1.9481924772262573, Val Acc: 0.5800
Epoch 50: Train Loss: 0.08833102881908417, Val Loss: 0.7390847206115723, Val Acc: 0.8400
Epoch 100: Train Loss: 0.09454435855150223, Val Loss: 0.5776011943817139, Val Acc: 0.8400
Epoch 150: Train Loss: 0.06947901844978333, Val Loss: 0.6622166633605957, Val Acc: 0.9000
Fold 2/10
Epoch 0: Train Loss: 1.9357792139053345, Val Loss: 1.9523398876190186, Val Acc: 0.5000
Epoch 50: Train Loss: 0.11293189972639084, Val Loss: 1.430882215499878, Val Acc: 0.7400
Epoch 100: Train Loss: 0.06969195604324341, Val Loss: 1.5973516702651978, Val Acc: 0.7200
Epoch 150: Train Loss: 0.06445493549108505, Val Loss: 1.8129699230194092, Val Acc: 0.7400
Fold 3/10
Epoch 0: Train Loss: 1.9636907577514648, Val Loss: 1.9586172103881836, Val Acc: 0.5600
Epoch 50: Train Loss: 0.10136140882968903, Val Loss: 1.3195126056671143, Val Acc: 0.7600
Epoch 100: Train Loss: 0.07852602750062943, Val Loss: 1.0999146699905396, Val Acc: 

KeyboardInterrupt: 

## GSage Model

In [21]:
import torch
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F

class GraphSAGE(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(num_node_features, num_hidden)
        self.conv2 = SAGEConv(num_hidden, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First GraphSAGE Layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        # Second GraphSAGE Layer
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

    
from sklearn.model_selection import KFold
import torch.nn as nn

device = torch.device('mps' if torch.cuda.is_available() else 'cpu')
k = 9
kf = StratifiedKFold(n_splits=k)
idx_train_np = np.array(idx_train)
labels = data.y.numpy()[idx_train_np]

for fold, (train_idx, val_idx) in enumerate(kf.split(idx_train_np, labels)):
    print(f"Fold {fold+1}/{k}")
    model = GraphSAGE(num_node_features=data.x.shape[1], 
                num_hidden=128,
                num_classes=(data.y.max()+1).item()
               ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.009, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    
    data.train_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.val_mask = torch.zeros(data.y.size(0), dtype=torch.bool)
    data.train_mask[idx_train_np[train_idx]] = True
    data.val_mask[idx_train_np[val_idx]] = True
    
    best_val_acc = 0 
    best_model_state = None 
    
    for epoch in range(200):
        model.train()
        optimizer.zero_grad()
        out = model(data)
#         loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], ignore_index=-1)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        if epoch % 50 == 0:
            model.eval()
            with torch.no_grad():
                model.eval()
                pred = model(data).argmax(dim=1)
                correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
                acc = int(correct) / int(data.val_mask.sum())
                
                valid_labels_mask = data.y[data.val_mask] != -1  # Mask to select valid labels not equal to -1
                correct_predictions = pred[data.val_mask][valid_labels_mask] == data.y[data.val_mask][valid_labels_mask]
                correct = correct_predictions.sum()
                acc = int(correct) / int(valid_labels_mask.sum())
                
                if acc > best_val_acc and acc >= 0.85:
                    best_val_acc = acc
                    best_model_state = model.state_dict()
                    
                val_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask], ignore_index=-1)
                print(f'Epoch {epoch}: Train Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val Acc: {acc:.4f}')
            
    if best_model_state is not None:
        torch.save(best_model_state, f'gsage_best_fold_{fold+1}.pt')


Fold 1/9
Epoch 0: Train Loss: 1.9196665287017822, Val Loss: 1.9170328378677368, Val Acc: 0.4286
Epoch 50: Train Loss: 0.0027849439065903425, Val Loss: 0.5366899371147156, Val Acc: 0.8393
Epoch 100: Train Loss: 0.00939025916159153, Val Loss: 0.5010973811149597, Val Acc: 0.8393
Epoch 150: Train Loss: 0.007293348200619221, Val Loss: 0.49514517188072205, Val Acc: 0.8393
Fold 2/9
Epoch 0: Train Loss: 1.9309641122817993, Val Loss: 1.931994915008545, Val Acc: 0.4545
Epoch 50: Train Loss: 0.003553807269781828, Val Loss: 0.9710485935211182, Val Acc: 0.7636
Epoch 100: Train Loss: 0.008319416083395481, Val Loss: 0.7978211641311646, Val Acc: 0.7091
Epoch 150: Train Loss: 0.00715416157618165, Val Loss: 0.8832887411117554, Val Acc: 0.6727
Fold 3/9
Epoch 0: Train Loss: 1.943326711654663, Val Loss: 1.9446052312850952, Val Acc: 0.4727
Epoch 50: Train Loss: 0.0032829379197210073, Val Loss: 0.8310878872871399, Val Acc: 0.7818
Epoch 100: Train Loss: 0.008050698786973953, Val Loss: 0.7070969343185425, Val 

KeyboardInterrupt: 

## Stacking GCN, GAT & GSage Models

In [28]:
from collections import defaultdict
from scipy.stats import mode

device = torch.device('mps' if torch.cuda.is_available() else 'cpu')
num_classes = (data.y.max() + 1).item() 


all_predictions = []

model_types = ['gcn', 'gat', 'gsage'] 
num_folds = 10

for model_type in model_types:
    for fold in range(1, num_folds + 1):
        if model_type == 'gcn':
            model = GCN(num_node_features=data.x.shape[1], 
                num_hidden=64,
                num_classes=(data.y.max()+1).item()
               ).to(device)
        elif model_type == 'gat':
            model = GAT(num_node_features=data.x.shape[1], 
                num_hidden=128,
                num_classes=(data.y.max()+1).item()
               ).to(device)
        elif model_type == 'gsage':
            model = GraphSAGE(num_node_features=data.x.shape[1], 
                num_hidden=128,
                num_classes=(data.y.max()+1).item()
               ).to(device)
            
        try:
            model_path = f'./{model_type}_best_fold_{fold}.pt' 
            model.load_state_dict(torch.load(model_path))
            model.eval()

            with torch.no_grad():
                out = model(data.to(device))
                preds = out.argmax(dim=1)
                all_predictions.append(preds.cpu().numpy())
        except:
            continue

all_predictions = np.array(all_predictions)
majority_votes, _ = mode(all_predictions, axis=0)
majority_votes = torch.tensor(majority_votes.squeeze(), dtype=torch.long)
correct = (majority_votes[data.val_mask] == data.y[data.val_mask]).sum()

  majority_votes, _ = mode(all_predictions, axis=0)


In [19]:
# model.eval()
# pred = model(data).argmax(dim=1)
# correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
# acc = int(correct) / int(data.val_mask.sum())
# print(f'Accuracy: {acc:.4f}')

## submitting the result

In [20]:
preds = majority_votes[idx_test]
np.savetxt('submission.txt', preds, fmt='%d')