In [None]:
# Colab only: mount Google Drive at /content/drive so files stored under
# /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the graph-learning dependencies into the Colab runtime.
# The torch-scatter / torch-sparse wheels are fetched from the index built for
# torch 1.13.1 + CUDA 11.6.
# NOTE(review): that index presumably has to match the torch build installed
# in the runtime, and none of the packages are version-pinned -- confirm.
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
!pip install torch-geometric
!pip install ogb

In [None]:
import torch
import torch.nn as nn

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import os

In [None]:
class GraphDataset(Dataset):
  """In-memory collection of graphs with a random train/valid/test split.

  Every entry of the ``root`` directory is passed to ``convert_to_Data``
  (defined outside this cell) and the results are kept in ``self.data``.
  The split is stored as a dict of index tensors under the keys
  ``'train'``, ``'valid'`` and ``'test'``.
  """

  def __init__(self, root, transform=None):
    """Convert every entry under ``root`` and create the index split.

    Args:
      root: Directory whose entries are each converted to one dataset item.
      transform: Accepted for interface compatibility; currently unused.
    """
    self.data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the item
    # order (and therefore the meaning of every index) stable across runs.
    for graph_folder in tqdm(sorted(os.listdir(root))):
      graph_path = os.path.join(root, graph_folder)
      self.data.append(convert_to_Data(graph_path))

    self.create_idx_split()

    self.task_type = "regression"

    self.eval_metric = "rmse"

  def __len__(self):
    """Return the number of stored graphs."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return one item, or a list of items when ``idx`` is an index tensor.

    A 0-d tensor is treated like a plain integer index; any other tensor
    yields a list with one item per contained index.
    """
    if torch.is_tensor(idx):
      if idx.dim() == 0:
        return self.data[int(idx)]
      return [self.data[i] for i in idx.tolist()]
    return self.data[idx]

  def create_idx_split(self, seed=None):
    """Randomly partition all indices into 70% train / 15% valid / rest test.

    Args:
      seed: Optional seed for a private random generator so the split can be
        reproduced. When ``None`` the global ``random`` module state is used.
    """
    # Local import: the notebook's import cell does not import ``random``.
    import random

    train_prop = 0.7
    val_prop = 0.15

    num_graphs = len(self.data)
    num_train = int(train_prop * num_graphs)
    num_val = int(val_prop * num_graphs)

    rng = random if seed is None else random.Random(seed)
    order = list(range(num_graphs))
    # One shuffle followed by slicing gives disjoint, exhaustive splits.
    rng.shuffle(order)

    # Explicit dtype so an empty split is still a valid (long) index tensor.
    self.split = {
        'train': torch.tensor(order[:num_train], dtype=torch.long),
        'valid': torch.tensor(order[num_train:num_train + num_val], dtype=torch.long),
        'test': torch.tensor(order[num_train + num_val:], dtype=torch.long),
    }

  def get_idx_split(self):
    """Return the dict of index tensors created by ``create_idx_split``."""
    return self.split


In [None]:
# Load the dataset
# Load the pre-processed dataset object that was saved to Google Drive.
# NOTE(review): torch.load unpickles the file, so the class of the stored
# object (presumably GraphDataset from the cell above) must already be defined
# in this kernel, and the file must come from a trusted source -- confirm.
dataset = torch.load("/content/drive/MyDrive/Summer_Invitational_2023_Datathon_Datasets/Test-Data-Processed/NK.pt")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: {}'.format(device))

# Index tensors for the splits stored on the dataset object; indexed with the
# keys "train", "valid" and "test" when the data loaders are built.
split_idx = dataset.get_idx_split()

# Check task type
print('Task type: {}'.format(dataset.task_type))

Device: cpu
Task type: regression


In [None]:
import math

# Indices of the graphs in which at least one NaN value was found.
bad_graphs = set()

# Walk over every graph (not a hard-coded count) and replace NaNs in the node
# features, targets and edge attributes with 0, in place.
for graph_idx in range(len(dataset)):
  data = dataset[graph_idx]
  for attr_name in ("x", "y", "edge_attr"):
    values = getattr(data, attr_name)
    if values is None:
      continue
    nan_mask = torch.isnan(values)
    if nan_mask.any():
      bad_graphs.add(graph_idx)
      print("noo", graph_idx, attr_name, int(nan_mask.sum()))
      # Masked assignment writes through to the tensor stored on the graph.
      values[nan_mask] = 0

# The affected graphs stay in their train/valid/test splits: their NaNs have
# been zeroed above, so they no longer need to be removed from split_idx.
print(bad_graphs)

noo 86
noo 87
noo 149
noo 250
noo 251
noo 160
noo 96
noo 97
noo 271
noo 272
noo 92
noo 93
noo 146
noo 147
noo 150
noo 192
noo 193
noo 247
noo 248
noo 149
noo 147
noo 90
noo 91
noo 143
noo 144
noo 88
noo 89
noo 144
noo 145
noo 160
noo 87
noo 88
noo 148
noo 249
noo 250
noo 152
noo 90
noo 91
noo 143
noo 144
noo 94
noo 95
noo 147
noo 148
noo 157
noo 158
noo 89
noo 90
noo 142
noo 143
noo 155
noo 156
noo 192
noo 193
noo 247
noo 248
noo 88
noo 89
noo 143
noo 144
noo 86
noo 87
noo 149
noo 248
noo 249
noo 88
noo 89
noo 151
noo 258
noo 259
noo 92
noo 93
noo 145
noo 146
noo 96
noo 97
noo 162
noo 270
noo 271
noo 161
noo 96
noo 97
noo 162
noo 270
noo 271
noo 94
noo 95
noo 160
noo 268
noo 269
noo 92
noo 93
noo 147
noo 148
noo 94
noo 95
noo 148
noo 149
noo 154
noo 150
noo 152
noo 160
noo 92
noo 93
noo 145
noo 146
noo 160
noo 160
noo 161
noo 94
noo 95
noo 147
noo 148
noo 94
noo 95
noo 160
noo 268
noo 269
noo 92
noo 93
noo 147
noo 148
noo 153
noo 86
noo 87
noo 149
noo 252
noo 253
noo 90
noo 91
noo 144


In [None]:
# One loader per split, built in train/valid/test order; only the training
# loader shuffles its graphs.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset[split_idx[split_name]], batch_size=2,
               shuffle=(split_name == "train"), num_workers=0)
    for split_name in ("train", "valid", "test")
)

In [None]:
from torch_geometric.nn import GCNConv, GATConv

class GCN(torch.nn.Module):
    """Stack of graph convolutions with batch norm, ReLU and dropout.

    Every convolution except the last is followed by BatchNorm1d, ReLU and
    dropout. The last convolution's output is returned directly when
    ``return_embeds`` is true, otherwise it is passed through a log-softmax
    over the last (feature) dimension.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Size of every intermediate node representation.
        output_dim: Size of the final node representation.
        num_layers: Number of convolution layers (at least 1).
        dropout: Probability of zeroing an element during training.
        return_embeds: Return raw node embeddings instead of log-probabilities.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):
        super(GCN, self).__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1, got {}".format(num_layers))

        self.conv_layer = GCNConv

        # Feature sizes at the boundaries of the layers: for num_layers >= 2
        # this is input -> hidden -> ... -> hidden -> output; a single layer
        # maps input -> output directly so the shapes stay consistent.
        layer_dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]

        # A list of GCNConv layers
        self.convs = torch.nn.ModuleList([
            self.conv_layer(in_channels=dim_in, out_channels=dim_out)
            for dim_in, dim_out in zip(layer_dims[:-1], layer_dims[1:])
        ])

        # A list of 1D batch normalization layers (one per non-final conv)
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(num_features=hidden_dim)
            for _ in range(num_layers - 1)
        ])

        # Log-softmax over the feature dimension. The dimension is explicit
        # because the implicit choice is deprecated and emits a warning.
        self.softmax = torch.nn.LogSoftmax(dim=-1)

        # Probability of an element to be zeroed
        self.dropout = dropout

        # Skip classification layer and return node embeddings
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """Re-initialise every convolution and batch-norm layer."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        """Compute node outputs.

        Args:
            x: Node feature tensor.
            adj_t: Graph connectivity, passed unchanged to every convolution.

        Returns:
            The last convolution's output, or its log-softmax when
            ``return_embeds`` is false.
        """
        for conv, bn in zip(self.convs[:-1], self.bns):
            x = F.relu(bn(conv(x, adj_t)))
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, adj_t)
        return x if self.return_embeds else self.softmax(x)


In [None]:
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool

### GCN to predict graph property
class GCN_Graph(torch.nn.Module):
    """Graph-level regressor: GCN node embeddings, mean pooling, MLP head.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Size of the node embeddings and of the hidden head layers.
        output_dim: Number of values predicted per graph.
        num_layers: Number of convolution layers in the node model.
        dropout: Dropout probability used inside the node model.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(GCN_Graph, self).__init__()

        # Node embedding model: maps input_dim features to hidden_dim
        # embeddings (return_embeds=True skips its log-softmax).
        self.gnn_node = GCN(input_dim, hidden_dim,
            hidden_dim, num_layers, dropout, return_embeds=True)

        # Node pooling layer: averages node embeddings per graph
        self.pool = global_mean_pool

        # Output head
        self.linear1 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = torch.nn.Linear(hidden_dim, output_dim)

        self.input_dim = input_dim

    def reset_parameters(self):
        """Re-initialise the node model and the three head layers."""
        self.gnn_node.reset_parameters()
        self.linear1.reset_parameters()
        self.linear2.reset_parameters()
        self.linear3.reset_parameters()

    def forward(self, batched_data):
        """Predict one output vector per graph in the mini-batch.

        Args:
            batched_data: Object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tensor with one row of ``output_dim`` raw predictions per graph.
        """
        # Extract important attributes of our mini-batch
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

        embed = self.gnn_node(x, edge_index)
        features = self.pool(embed, batch)

        # ReLU between the linear layers: without a non-linearity the three
        # layers would collapse into a single affine map.
        out = F.relu(self.linear1(features))
        out = F.relu(self.linear2(out))
        out = self.linear3(out)

        return out

In [None]:
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool

### GCN to predict graph property
class GCN_Graph_Classification(torch.nn.Module):
    """Graph-level classifier: GCN embeddings, mean pooling, MLP, sigmoid.

    ``forward`` returns sigmoid outputs, i.e. values in [0, 1] rather than
    logits, so a probability-based loss such as ``torch.nn.BCELoss`` is the
    matching criterion (``BCEWithLogitsLoss`` would apply a second sigmoid).

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Size of the node embeddings and of the hidden head layers.
        output_dim: Number of values predicted per graph.
        num_layers: Number of convolution layers in the node model.
        dropout: Dropout probability used inside the node model.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(GCN_Graph_Classification, self).__init__()

        # Node embedding model: maps input_dim features to hidden_dim
        # embeddings (return_embeds=True skips its log-softmax).
        self.gnn_node = GCN(input_dim, hidden_dim,
            hidden_dim, num_layers, dropout, return_embeds=True)

        # Node pooling layer: averages node embeddings per graph
        self.pool = global_mean_pool

        # Output head
        self.linear1 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = torch.nn.Linear(hidden_dim, output_dim)

        self.sigmoid = torch.nn.Sigmoid()

        self.input_dim = input_dim

    def reset_parameters(self):
        """Re-initialise the node model and the three head layers."""
        self.gnn_node.reset_parameters()
        self.linear1.reset_parameters()
        self.linear2.reset_parameters()
        self.linear3.reset_parameters()

    def forward(self, batched_data):
        """Predict one probability vector per graph in the mini-batch.

        Args:
            batched_data: Object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tensor with one row of ``output_dim`` sigmoid outputs per graph.
        """
        # Extract important attributes of our mini-batch
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

        embed = self.gnn_node(x, edge_index)
        features = self.pool(embed, batch)

        # ReLU between the linear layers: without a non-linearity the three
        # layers would collapse into a single affine map.
        out = F.relu(self.linear1(features))
        out = F.relu(self.linear2(out))
        out = self.linear3(out)
        out = self.sigmoid(out)

        return out

In [None]:
from pandas.core.series import LossySetitemError
def train(model, device, data_loader, optimizer, loss_fn, task="classification"):
    """Run one training epoch and return the mean loss of the trained batches.

    Args:
        model: Module called as ``model(batch)``.
        device: Device every batch is moved to.
        data_loader: Iterable of mini-batches exposing ``x``, ``y`` and
            ``batch`` attributes and a ``to(device)`` method.
        optimizer: Optimizer stepped once per trained batch.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar.
        task: When ``"classification"``, targets are binarised to 1 where
            ``y > 0`` and 0 elsewhere before the loss is computed.

    Returns:
        Python float: the loss averaged over the batches that were actually
        trained on, or ``0.0`` when every batch was skipped.
    """
    model.train()
    total_loss = 0.0
    trained_batches = 0

    for batch in tqdm(data_loader, desc="Iteration"):
        batch = batch.to(device)

        # Skip batches holding a single node or a single graph.
        # NOTE(review): presumably because batch norm cannot be trained on a
        # single row -- confirm.
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            continue

        ## ignore nan targets (unlabeled) when computing training loss.
        is_labeled = batch.y == batch.y

        target = batch.y
        if task == "classification":
            # Vectorised so it works for any batch size, and leaves the
            # batch's own target tensor untouched.
            target = (target > 0).to(target.dtype)

        optimizer.zero_grad()
        out = model(batch)
        loss = loss_fn(out[is_labeled], target[is_labeled].float())
        loss.backward()
        optimizer.step()

        # .item() detaches the value so no autograd state is kept per batch.
        total_loss += loss.item()
        trained_batches += 1

    if trained_batches == 0:
        return 0.0
    return total_loss / trained_batches

In [None]:
# The evaluation function
def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
    """Collect predictions over ``loader`` and score them with ``evaluator``.

    Batches that contain a single node are skipped. When
    ``save_model_results`` is true the flattened predictions and targets are
    also written to a CSV file whose name is built from ``save_file``.

    Returns:
        Whatever ``evaluator.eval`` returns for the dict with the keys
        ``"y_true"`` and ``"y_pred"`` (both numpy arrays).
    """
    model.eval()
    targets, predictions = [], []

    for graph_batch in tqdm(loader, desc="Iteration"):
        graph_batch = graph_batch.to(device)

        if graph_batch.x.shape[0] == 1:
            continue

        with torch.no_grad():
            batch_pred = model(graph_batch)

        # Targets are reshaped to line up with the predictions.
        targets.append(graph_batch.y.view(batch_pred.shape).detach().cpu())
        predictions.append(batch_pred.detach().cpu())

    y_true = torch.cat(targets, dim = 0).numpy()
    y_pred = torch.cat(predictions, dim = 0).numpy()

    if save_model_results:
        print ("Saving Model Predictions")

        # Two columns: y_pred | y_true
        frame = pd.DataFrame(data={
            'y_pred': y_pred.reshape(-1),
            'y_true': y_true.reshape(-1),
        })
        frame.to_csv('ogbg-molhiv_graph_' + save_file + '.csv', sep=',', index=False)

    return evaluator.eval({"y_true": y_true, "y_pred": y_pred})

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

class RegressionEvaluator():
  """Scores regression predictions with RMSE, MSE and R^2."""

  def __init__(self):
    pass

  def eval(self, input_dict):
    """Compute the regression metrics.

    Args:
      input_dict: Dict with array-like ``'y_true'`` and ``'y_pred'`` entries
        of matching shape.

    Returns:
      Dict with ``'rmse'`` (root of the mean squared error), ``'mse'`` and
      ``'r2'``. The error metrics are plain Python floats.
    """
    y_true = input_dict['y_true']
    y_pred = input_dict['y_pred']

    residual = np.asarray(y_true, dtype=np.float64) - np.asarray(y_pred, dtype=np.float64)
    mse = float(np.mean(residual ** 2))
    # The value reported as 'rmse' must be the square root of the MSE.
    rmse = float(np.sqrt(mse))

    r2 = r2_score(y_true, y_pred)
    print(r2)
    return {'rmse': rmse, 'mse': mse, 'r2': r2}

class ClassificationEvaluator():
  """Scores binary predictions with accuracy and R^2."""

  def __init__(self):
    pass

  def eval(self, input_dict):
    """Compute the classification metrics.

    Targets are binarised to 1 where ``y_true > 0`` and 0 elsewhere;
    predictions are rounded before being compared with them. The caller's
    arrays are not modified.

    Args:
      input_dict: Dict with array-like ``'y_true'`` and ``'y_pred'`` entries
        of matching shape.

    Returns:
      Dict with the accuracy under both ``'acc'`` and ``'rmse'`` (the latter
      key is kept because the training loop looks the metric up by that
      name) and ``'r2'``.
    """
    y_true = np.asarray(input_dict['y_true'])
    y_pred = np.asarray(input_dict['y_pred'])

    # Vectorised binarisation into a new array instead of an in-place loop.
    labels = (y_true > 0).astype(y_true.dtype)

    acc = float((y_pred.round() == labels).mean())

    r2 = r2_score(labels, y_pred)
    print(r2)
    return {'rmse': acc, 'acc': acc, 'r2': r2}

In [None]:
# Hyper-parameters and settings read by the model-construction and training
# cells.
args = dict(
    device=device,
    num_layers=10,
    hidden_dim=256,
    dropout=0.5,
    lr=0.001,
    epochs=20,
    num_node_features=7,
)
args

{'device': 'cpu',
 'num_layers': 10,
 'hidden_dim': 256,
 'dropout': 0.5,
 'lr': 0.001,
 'epochs': 20,
 'num_node_features': 7}

In [None]:
# Selects which model / evaluator pair the following cells work with.
task_type = "classification"

# Both models take the same constructor arguments and predict a single value
# per graph (output_dim=1).
if task_type == "regression":
  model = GCN_Graph(
      input_dim=args['num_node_features'],
      hidden_dim=args['hidden_dim'],
      output_dim=1,
      num_layers=args['num_layers'],
      dropout=args['dropout'],
  ).to(device)
  evaluator = RegressionEvaluator()
elif task_type == "classification":
  model = GCN_Graph_Classification(
      input_dim=args['num_node_features'],
      hidden_dim=args['hidden_dim'],
      output_dim=1,
      num_layers=args['num_layers'],
      dropout=args['dropout'],
  ).to(device)
  evaluator = ClassificationEvaluator()

In [None]:
import copy

# Train for args["epochs"] epochs, evaluate on all three splits after every
# epoch and keep a copy of the model with the best validation score.
if task_type in ("classification", "regression"):
  is_classification = task_type == "classification"

  model.reset_parameters()

  optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
  # The classification model already ends in a sigmoid, so its outputs are
  # probabilities: BCELoss is the matching criterion (BCEWithLogitsLoss would
  # apply a second sigmoid on top).
  loss_fn = torch.nn.BCELoss() if is_classification else torch.nn.MSELoss()

  best_model = None
  # Accuracy is maximised, the regression error is minimised. Starting from
  # +/- infinity guarantees best_model is set after the first epoch.
  best_valid_score = float('-inf') if is_classification else float('inf')

  # Training-set R^2 per completed epoch (read by the plotting cell).
  r2_list = []

  for epoch in range(1, 1 + args["epochs"]):
    print('Training...')
    # task_type is passed explicitly: train() defaults to "classification",
    # which would binarise regression targets.
    loss = train(model, device, train_loader, optimizer, loss_fn, task_type)

    print('Evaluating...')
    train_result = eval(model, device, train_loader, evaluator)
    val_result = eval(model, device, valid_loader, evaluator)
    test_result = eval(model, device, test_loader, evaluator)

    metric = dataset.eval_metric
    train_score, valid_score, test_score = train_result[metric], val_result[metric], test_result[metric]

    r2_list.append(train_result['r2'])

    if is_classification:
      improved = valid_score > best_valid_score
    else:
      improved = valid_score < best_valid_score

    if improved:
        best_valid_score = valid_score
        best_model = copy.deepcopy(model)
    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {train_score:.4f}, '
          f'Valid: {valid_score:.4f}, '
          f'Test: {test_score:.4f}')

  # Keep the names earlier versions of this cell exposed.
  if is_classification:
    best_valid_acc = best_valid_score
  else:
    best_valid_error = best_valid_score


Training...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

Evaluating...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.7692354013114404


Iteration:   0%|          | 0/27 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.9268254699529648


Iteration:   0%|          | 0/28 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.5985609049300582
Epoch: 01, Loss: 0.6927, Train: 0.5647, Valid: 0.5185, Test: 0.6250
Training...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

Evaluating...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.7691827745254078


Iteration:   0%|          | 0/27 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.9267677113374224


Iteration:   0%|          | 0/28 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.5985116731030498
Epoch: 02, Loss: 0.6877, Train: 0.5647, Valid: 0.5185, Test: 0.6250
Training...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

Evaluating...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.769331836851417


Iteration:   0%|          | 0/27 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.9269275120692226


Iteration:   0%|          | 0/28 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
-0.5986468628342285
Epoch: 03, Loss: 0.6877, Train: 0.5647, Valid: 0.5185, Test: 0.6250
Training...


Iteration:   0%|          | 0/128 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
import matplotlib.pyplot as plt
# Plot the R^2 score recorded after each completed epoch. The x-axis is
# derived from len(r2_list) rather than args['epochs'] so the plot still works
# when training stopped early (for example after a manual interrupt).
completed_epochs = range(1, len(r2_list) + 1)
plt.plot(completed_epochs, r2_list, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Epochs')
plt.ylabel('R^2 Score')
plt.title('R^2 Score vs Number of Epochs')
plt.grid(True)
plt.show()