In [None]:
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-geometric

In [None]:
# Hyperparameters and run configuration.

# Which dataset folder (dataset{DATASET}) the CSV files are read from.
DATASET = 2

# Optimisation: learning rate handed to the Adam optimiser and number of training epochs.
LR = 1e-3
NUM_EPOCH = 3_000

# Model size: width of the hidden layers and size of the final node embedding.
HIDDEN_LAYER = 64
EMBEDDING_DIM = 16

In [None]:
import pandas as pd

# Folder holding the files of the selected dataset; defined once so the
# location only has to be changed in a single place.
DATA_DIR = f'/content/drive/MyDrive/MLG/hw2/hw2_data/dataset{DATASET}'

# Node pairs with a 'label' column (train) and node pairs to be scored (test).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# Tab-separated, header-less feature table indexed by its first column and
# sorted by that index (presumably the node id, so that row i belongs to
# node i - verify against the data).
content = pd.read_csv(f'{DATA_DIR}/content.csv', delimiter='\t', header=None, index_col=0).sort_index()

In [None]:
import torch
from torch_geometric.data import Data

# Rows of the training table that describe existing links (label == 1).
# Note: despite its name this is a DataFrame, not an index tensor.
is_positive = train['label'].eq(1)
edge_index = train[is_positive]

# Node feature matrix as a float tensor, wrapped in a PyG Data object.
x = torch.tensor(content.to_numpy(), dtype=torch.float)
data = Data(x=x)
data

Data(x=[3312, 3703])

In [None]:
from torch.nn import Linear, CosineSimilarity, Softmax
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class Net(torch.nn.Module):
  """GCN encoder with a cosine-similarity decoder for link prediction.

  Node features go through a linear projection followed by three GCNConv
  layers with tanh activations. A candidate edge is scored from the cosine
  similarity between the embeddings of its two endpoints.
  """

  def __init__(self, hidden_layer=HIDDEN_LAYER, embedding_dim=EMBEDDING_DIM):
    """Create the layers.

    Args:
      hidden_layer: output width of the linear projection and of the first
        two graph convolutions.
      embedding_dim: size of the node embedding produced by the last graph
        convolution.
    """
    super(Net, self).__init__()
    # Fixed seed so the weight initialisation is repeatable. This resets the
    # global torch RNG every time a Net is constructed.
    torch.manual_seed(12345)
    self.classifier = Linear(data.num_features, hidden_layer)
    self.conv1 = GCNConv(hidden_layer, hidden_layer)
    self.conv2 = GCNConv(hidden_layer, hidden_layer)
    # Use the constructor argument rather than the module-level constant so
    # that a non-default embedding_dim actually takes effect.
    self.conv3 = GCNConv(hidden_layer, embedding_dim)
    self.similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

  def forward(self, x, train_edges, pred_edges):
    """Score the node pairs listed in pred_edges.

    Args:
      x: node feature tensor; rows are looked up by the node ids found in
        the 'from' / 'to' columns.
      train_edges: DataFrame with 'from', 'to' and 'label' columns. Only rows
        with label == 1 are used, as the edges for message passing.
      pred_edges: DataFrame with 'from' and 'to' columns, one row per pair
        to score.

    Returns:
      1-D tensor with one non-negative score per row of pred_edges. The
      scores are divided by their p=100 norm over the whole batch, so the
      score of a pair depends on the other pairs scored in the same call.
    """
    # Run on whatever device the module's weights live on instead of
    # assuming CUDA.
    device = self.classifier.weight.device

    positive_edges = train_edges.loc[train_edges['label'] == 1]
    # Build the 2 x E index from a single array; a Python list of arrays
    # takes a slow conversion path in torch.tensor.
    message_edges = torch.tensor(
        positive_edges[['from', 'to']].to_numpy().T, dtype=torch.long, device=device)

    # Encoder
    h = self.classifier(x.to(device))
    h = torch.tanh(self.conv1(h, message_edges))
    h = torch.tanh(self.conv2(h, message_edges))
    nodes = torch.tanh(self.conv3(h, message_edges))

    # Decoder
    ids_from = torch.tensor(pred_edges['from'].to_numpy(), dtype=torch.long, device=device)
    ids_to = torch.tensor(pred_edges['to'].to_numpy(), dtype=torch.long, device=device)
    preds = self.similarity(torch.index_select(nodes, 0, ids_from), torch.index_select(nodes, 0, ids_to))
    preds = torch.tanh(torch.abs(preds))
    # NOTE(review): with p=100 the norm may underflow in float32 when every
    # score in the batch is small - confirm this cannot push values above 1.
    preds = F.normalize(preds, dim=0, p=100)

    return preds

In [None]:
from IPython.display import Javascript, display  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

model = Net().cuda()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
torch.set_printoptions(precision=8, sci_mode=False)

model.train()
for epoch in range(NUM_EPOCH):
  optimizer.zero_grad()
  # A fresh random split every epoch: the first part supplies the edges used
  # for message passing, the second part supplies the pairs that are scored
  # and compared against their labels.
  train_edges, test_edges = train_test_split(train, shuffle=True)
  out = model(x, train_edges, test_edges)
  label = torch.tensor(test_edges['label'].values, dtype=torch.float, device=out.device)
  loss = criterion(out, label)
  loss.backward()
  optimizer.step()

  # Move the scores to the host once and reuse them for both metrics. The raw
  # prediction tensor is no longer printed each epoch; it flooded the cell
  # output without adding information beyond the metrics below.
  y_true = test_edges['label'].values
  y_score = out.detach().cpu().numpy()
  print(f'Epoch: {epoch:03d}, Loss: {loss.item():.6f}')
  print(f'roc_auc_score: {roc_auc_score(y_true, y_score)}, average_precision_score: {average_precision_score(y_true, y_score)}')

<IPython.core.display.Javascript object>

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
        0.00439103], device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 1767, Loss: 0.101682
roc_auc_score: 0.9990902425420131, average_precision_score: 0.9990987097357158
tensor([0.87793291, 0.77550662, 0.03320405,  ..., 0.01720970, 0.04546427,
        0.91248375], device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 1768, Loss: 0.098976
roc_auc_score: 0.9993814560524525, average_precision_score: 0.999371804933529
tensor([0.00348452, 0.04607078, 0.02006158,  ..., 0.01290058, 0.87272483,
        0.93461996], device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 1769, Loss: 0.103905
roc_auc_score: 0.9990655520842129, average_precision_score: 0.9990357477844498
tensor([0.00508036, 0.01452826, 0.64877135,  ..., 0.90933758, 0.91823870,
        0.92474395], device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 1770, Loss: 0.096473
roc_auc_score: 0.9993800685862414, average_precision_score: 0.999370877699513
tensor([0.02811126, 0.94048375, 0.00423921,  ..., 0.01628706, 0.44607

In [None]:
import csv

# Score the test pairs using the positive training edges as the graph.
# Inference needs no gradients, so autograd tracking is switched off.
model.eval()
with torch.no_grad():
  upload = model(data.x, edge_index, test).tolist()

# newline='' lets the csv module control line endings itself.
with open('/content/drive/MyDrive/MLG/hw2/hw2_data/upload.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(['id', 'prob'])
  # Pair ids and scores column-wise; iterating rows with iterrows() is slow
  # and can change the dtype of the id when columns have mixed types.
  writer.writerows(zip(test['id'].tolist(), upload))

In [None]:
# Ideas to try next:
# - new features
# - softmax
# - xgboost
# - random forest