<a href="https://colab.research.google.com/github/erickuo5124/MLG_HW2/blob/main/GNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the PyTorch Geometric stack for this runtime.
# torch-scatter and torch-sparse are resolved through the PyG wheel index
# published for torch 1.8.0 + CUDA 10.1 (see the `-f` find-links URL).
# NOTE(review): this assumes the runtime's installed torch matches 1.8.0+cu101 — confirm.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
# NOTE(review): torch-geometric is not version-pinned here, so a fresh run may pull a
# release newer than the one this notebook was written against.
!pip install -q torch-geometric

In [None]:
# Hyperparameters — everything a reader might tune lives in this cell.
LR = 4e-7                             # learning rate handed to the optimizer
HIDDEN_LAYER = EMBEDDING_DIM = 4096   # hidden width and final embedding size of the GCN
NUM_EPOCH = 60                        # number of passes of the training loop
DATASET = 3                           # which `dataset{N}` folder to read the CSVs from

In [None]:
import pandas as pd

# Folder holding the CSVs for the selected dataset. Kept in one place so the
# notebook can be re-pointed at another copy of the data with a single edit.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no cell visible in this notebook performs the mount.
DATA_DIR = f'/content/drive/MyDrive/MLG/hw2/hw2_data/dataset{DATASET}'

# train.csv: edges with `from`, `to` and a `label` column (1 marks a positive edge).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
# test.csv: edges to score for the submission (`id`, `from`, `to`).
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# content.csv: tab-separated, no header row; the first column is used as the index
# and rows are sorted by it before being turned into the node-feature matrix.
content = pd.read_csv(f'{DATA_DIR}/content.csv', delimiter='\t', header=None, index_col=0).sort_index()

In [None]:
import torch
from torch_geometric.data import Data

# Positive (label == 1) training rows. Despite the name this is still a pandas
# DataFrame, not a tensor; it is converted to an edge tensor inside the model.
edge_index = train[train['label'].eq(1)]

# Node-feature matrix: one row per row of `content`, stored as 32-bit floats.
x = torch.tensor(content.to_numpy(), dtype=torch.float32)

# Wrap the features in a PyG `Data` object and show its summary as the cell output.
data = Data(x=x)
data

Data(x=[877, 1703])

In [None]:
from torch.nn import Linear, CosineSimilarity, Softmax
from torch_geometric.nn import GCNConv
from torch_geometric.utils import dropout_adj
import torch.nn.functional as F

class Net(torch.nn.Module):
  """Link predictor: GCN node encoder followed by a cosine-similarity edge decoder.

  The encoder projects the raw node features with a linear layer, then applies
  two graph convolutions (`conv1`, `conv3`) with tanh activations. The decoder
  scores each candidate edge by the cosine similarity of its two endpoint
  embeddings.

  Args:
    hidden_layer: width of the linear projection and of the hidden GCN layers.
    embedding_dim: size of the final node embedding produced by `conv3`.
  """

  def __init__(self, hidden_layer=HIDDEN_LAYER, embedding_dim=EMBEDDING_DIM):
    super(Net, self).__init__()
    # Seeds torch's global RNG so the parameter initialisation below is repeatable.
    torch.manual_seed(12345)
    self.classifier = Linear(data.num_features, hidden_layer)
    self.conv1 = GCNConv(hidden_layer, hidden_layer)
    # NOTE: conv2 is never called in forward(). It is kept so that the parameter
    # initialisation order and the state_dict keys stay the same.
    self.conv2 = GCNConv(hidden_layer, hidden_layer)
    # Fix: honour the `embedding_dim` argument instead of the global EMBEDDING_DIM.
    self.conv3 = GCNConv(hidden_layer, embedding_dim)
    self.similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

  def forward(self, x, train_edges, pred_edges):
    """Score `pred_edges` using message passing over the positive `train_edges`.

    Args:
      x: node-feature tensor; row i holds the features of node id i (the `from` /
        `to` values are used as row indices into the embeddings).
      train_edges: DataFrame with `from`, `to` and `label` columns; only rows with
        label == 1 are used as graph edges.
      pred_edges: DataFrame with `from` and `to` columns; one score is returned
        per row, in row order.

    Returns:
      1-D tensor of edge scores, one per row of `pred_edges`.
    """
    # Run on whatever device the module's parameters live on (was hard-coded .cuda()).
    device = self.classifier.weight.device

    positive = train_edges.loc[train_edges['label'] == 1]
    # Shape (2, num_edges), dtype long, as expected by GCNConv. Built from a single
    # 2-D array rather than a Python list of numpy arrays.
    edge_index = torch.tensor(
        positive[['from', 'to']].values, dtype=torch.long, device=device
    ).t().contiguous()

    # Encoder
    h = self.classifier(x.to(device))
    h = self.conv1(h, edge_index)
    h = torch.tanh(h)
    # Fix: dropout_adj returns a new (edge_index, edge_attr) pair and does not modify
    # its input, so the result must be assigned for the dropout to have any effect.
    # Gated on self.training so that model.eval() gives deterministic predictions.
    edge_index, _ = dropout_adj(edge_index, p=0.4, training=self.training)
    h = self.conv3(h, edge_index)
    h = torch.tanh(h)
    nodes = torch.squeeze(h)

    # Decoder
    ids_from = torch.tensor(pred_edges['from'].values, dtype=torch.long, device=device)
    ids_to = torch.tensor(pred_edges['to'].values, dtype=torch.long, device=device)
    preds = self.similarity(torch.index_select(nodes, 0, ids_from), torch.index_select(nodes, 0, ids_to))
    # |cos| squashed by tanh, then the whole score vector is divided by its L100 norm
    # (for such a large p this is close to dividing by the largest score).
    preds = torch.tanh(torch.abs(preds))
    preds = F.normalize(preds,dim=0,p=100)

    return preds

In [None]:
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

model = Net().cuda()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Rprop(model.parameters(), lr=LR)
torch.set_printoptions(precision=8, sci_mode=False)

# Fixed 80/20 hold-out split of the labelled edges, used to report metrics each epoch.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_data, test_data = train_test_split(train, shuffle=True, test_size=0.2)

# Hold-out labels never change, so build them once instead of once per epoch.
y = torch.tensor(test_data['label'].values, dtype=torch.float).cuda()
y_true = y.cpu().numpy()

for epoch in range(NUM_EPOCH):
  # --- one optimisation step ---
  model.train()
  optimizer.zero_grad()
  # Re-split every epoch: one part supplies the message-passing edges, the other
  # part supplies the edges whose labels the model has to predict.
  train_edges, test_edges = train_test_split(train_data, shuffle=True)
  out = model(x, train_edges, test_edges)
  label = torch.tensor(test_edges['label'].values, dtype=torch.float, device=out.device)
  loss = criterion(out, label)

  print(f'Epoch: {epoch:03d}, Loss: {loss.item():.6f}')
  loss.backward()
  optimizer.step()

  # --- hold-out evaluation ---
  # eval mode + no_grad: no autograd graph is built for a pass that is never
  # back-propagated, and any train-only behaviour in the model is switched off.
  model.eval()
  with torch.no_grad():
    pred = model(x, train_data, test_data)
  y_score = pred.detach().cpu().numpy()
  print(f'roc_auc_score: {roc_auc_score(y_true, y_score)}, average_precision_score: {average_precision_score(y_true, y_score)}')


<IPython.core.display.Javascript object>

Epoch: 000, Loss: 0.502727
roc_auc_score: 0.8594513851167843, average_precision_score: 0.8775211398176048
Epoch: 001, Loss: 0.522951
roc_auc_score: 0.8603566902045989, average_precision_score: 0.8782061233406864
Epoch: 002, Loss: 0.514393
roc_auc_score: 0.8614883215643672, average_precision_score: 0.8791726074641626
Epoch: 003, Loss: 0.491055
roc_auc_score: 0.8626953950147866, average_precision_score: 0.8800394940912433
Epoch: 004, Loss: 0.511394
roc_auc_score: 0.8639477337195969, average_precision_score: 0.8808084209472744
Epoch: 005, Loss: 0.525319
roc_auc_score: 0.8654113102782306, average_precision_score: 0.8819554126514614
Epoch: 006, Loss: 0.511169
roc_auc_score: 0.8672520972901201, average_precision_score: 0.8832246033470358
Epoch: 007, Loss: 0.511457
roc_auc_score: 0.8693041221558331, average_precision_score: 0.8849421599402322
Epoch: 008, Loss: 0.513558
roc_auc_score: 0.8720049490011467, average_precision_score: 0.8871443233240144
Epoch: 009, Loss: 0.502690
roc_auc_score: 0.87

In [None]:

import csv

# Score the test edges using every positive training edge for message passing.
# eval mode + no_grad: this is pure inference, so no autograd graph is needed and
# any train-only behaviour in the model is switched off.
model.eval()
with torch.no_grad():
  upload = model(data.x, edge_index, test)
upload = upload.tolist()

# newline='' is what the csv module documentation requires for files handed to
# csv.writer; without it, extra blank lines can appear on some platforms.
with open('/content/drive/MyDrive/MLG/hw2/hw2_data/upload.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(['id', 'prob'])
  # Read the ids straight from the column instead of iterrows(): it is faster and
  # the ids keep the column's own dtype instead of a per-row upcast.
  writer.writerows(zip(test['id'], upload))

In [None]:
# TODO — ideas to try next:
# - new features
# - softmax
# - xgboost
# - random forest

In [None]:
# TODO — training tweaks to try:
# - dropout
# - ranger
# - optim (other optimizers)
