In [None]:
%matplotlib inline

Error: Session cannot generate requests


Link Prediction using Graph Neural Networks
===========================================

In the :doc:`introduction <1_introduction>`, you have already learned
the basic workflow of using GNNs for node classification,
i.e. predicting the category of a node in a graph. This tutorial will
teach you how to train a GNN for link prediction, i.e. predicting the
existence of an edge between two arbitrary nodes in a graph.

By the end of this tutorial you will be able to

-  Build a GNN-based link prediction model.
-  Train and evaluate the model on a small DGL-provided dataset.

(Time estimate: 28 minutes)


In [150]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
import scipy.sparse as sp

Overview of Link Prediction with GNN
------------------------------------

Many applications such as social recommendation, item recommendation,
knowledge graph completion, etc., can be formulated as link prediction,
which predicts whether an edge exists between two particular nodes. This
tutorial shows an example of predicting whether a citation relationship,
either citing or being cited, between two papers exists in a citation
network.

This tutorial follows a relatively simple practice from
`SEAL <https://papers.nips.cc/paper/2018/file/53f0d7c537d99b3824f0f99d62ea2428-Paper.pdf>`__.
It formulates the link prediction problem as a binary classification
problem as follows:

-  Treat the edges in the graph as *positive examples*.
-  Sample a number of non-existent edges (i.e. node pairs with no edges
   between them) as *negative* examples.
-  Divide the positive examples and negative examples into a training
   set and a test set.
-  Evaluate the model with any binary classification metric such as Area
   Under Curve (AUC).

In some domains such as large-scale recommender systems or information
retrieval, you may favor metrics that emphasize good performance of
top-K predictions. In these cases you may want to consider other metrics
such as mean average precision, and use other negative sampling methods,
which are beyond the scope of this tutorial.

Loading graph and features
--------------------------

Following the :doc:`introduction <1_introduction>`, this tutorial
first loads the Cora dataset.




In [151]:
# NOTE(review): in the visible notebook `df_node` is first assigned in the
# data-loading cell further down, so this display cell raises NameError when the
# notebook is run top to bottom on a fresh kernel.
df_node

Unnamed: 0,Label,Id,업종,company_id
0,(주)지에이,2248138829,일반용 전기 조명장치 제조업,0
1,(주)시안,1248192047,플라스틱 창호 제조업,1
2,(주)우룡,2258102381,석회 및 플라스터 제조업,2
3,한라시멘트(주),2268122191,시멘트 제조업,3
4,(주)네오콘크리트,3108700548,일반 화물자동차 운송업,4
...,...,...,...,...
4045,창원레미콘,6098620864,제조업,4045
4046,송효,6088129192,제조업,4046
4047,신흥흄관,6038135739,제조업,4047
4048,주식회사거산개발,5368100115,건설용 석제품 제조업,4048


In [520]:

# Location of the raw node/edge CSV exports.
DATA_DIR = "/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data"
df_node = pd.read_csv(f"{DATA_DIR}/강원_node.csv")
df_edge = pd.read_csv(f"{DATA_DIR}/강원_edge.csv")

# Preprocessing: keep only companies that appear as an endpoint of some edge,
# one row per company Id, and only the first three columns (the industry column
# is renamed to "sector").
df_node = df_node[df_node['Id'].isin(df_edge.Source) | df_node['Id'].isin(df_edge.Target)]
df_node = df_node.drop_duplicates('Id').rename(columns={"from_업종명10차": "sector"}).iloc[:, :3]
# reset_index() keeps the old row labels as an 'index' column and gives the rows
# consecutive positions 0..N-1, which are used as graph node ids below.
df_node = df_node.reset_index()

# Translate company Ids in the edge list into node positions with a single
# Id -> position lookup table instead of scanning the node frame once per edge.
df_edge = df_edge[['Source', 'Target']].astype(int)
id_to_pos = pd.Series(df_node.index, index=df_node['Id'])
for endpoint in ('Source', 'Target'):
    mapped = df_edge[endpoint].map(id_to_pos)
    if mapped.isna().any():
        # Fail loudly (as the per-row lookup did) when an edge references an
        # Id that has no row in the node table.
        unknown = df_edge.loc[mapped.isna(), endpoint].unique()[:5].tolist()
        raise KeyError(f"{endpoint} ids missing from the node table: {unknown}")
    df_edge[endpoint] = mapped.astype('int64')

# Self-loops: add one (i, i) edge per node.
# DataFrame.append was removed in pandas 2.0, so use pd.concat.
self_loops = pd.DataFrame({'Source': df_node.index.to_numpy(), 'Target': df_node.index.to_numpy()})
df_edge = pd.concat([df_edge, self_loops], ignore_index=True)

# One-hot encode the sector; the resulting columns (from position 3 onwards)
# become the node feature matrix. The dtype is pinned so the encoding is numeric
# regardless of the pandas version's default.
df_node = pd.get_dummies(df_node, prefix='', prefix_sep='', columns=['sector'], dtype=np.uint8)

# Number of nodes that carry a feature row.
len(df_node.iloc[:, 3:])

3230

In [521]:
# Inspect the node table after the `sector` column has been one-hot encoded by
# the preprocessing cell.
df_node

Unnamed: 0,index,Label,Id,1차 금속제품 도매업,"1차 유리제품, 유리섬유 및 광학용 유리 제조업",가금류 가공 및 저장 처리업,가금류 도축업,가방 및 기타 보호용 케이스 제조업,가전제품 및 부품 도매업,가정용 비전기식 조리 및 난방 기구 제조업,...,합성섬유 제조업,합성수지 및 기타 플라스틱 물질 제조업,합성수지선 건조업,혼성 및 재생 플라스틱 소재 물질 제조업,"화물운송 중개, 대리 및 관련 서비스업",화물자동차 및 특수목적용 자동차 제조업,화약 및 불꽃제품 제조업,화장품 제조업,화학 살균·살충제 및 농업용 약제 제조업,화학섬유 방적업
0,0,(주)지에이,2248138829,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,(주)시안,1248192047,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,(주)우룡,2258102381,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,한라시멘트(주),2268122191,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,(주)네오콘크리트,3108700548,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3225,4045,창원레미콘,6098620864,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3226,4046,송효,6088129192,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3227,4047,신흥흄관,6038135739,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3228,4048,주식회사거산개발,5368100115,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [522]:

# Edge endpoints as int64 tensors (positions into the node table).
u = torch.LongTensor(df_edge['Source'].to_numpy())
v = torch.LongTensor(df_edge['Target'].to_numpy())

# Build the graph and attach the one-hot sector columns (everything from the
# fourth column of df_node onwards) as the float node feature 'sector'.
g = dgl.graph((u, v))
g.ndata['sector'] = torch.FloatTensor(df_node.iloc[:, 3:].to_numpy())
g

Graph(num_nodes=3230, num_edges=6125,
      ndata_schemes={'sector': Scheme(shape=(398,), dtype=torch.float32)}
      edata_schemes={})

In [410]:
import dgl.data

# Reference only: the Cora citation graph used by the original DGL tutorial.
# It is bound to its own name so that it does not replace the company graph `g`
# built above; the later cells read the 'sector' node feature, which the Cora
# graph does not have.
dataset = dgl.data.CoraGraphDataset()
cora_g = dataset[0]
cora_g

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

Prepare training and testing sets
---------------------------------

This tutorial randomly picks 10% of the edges as positive examples for
the test set and leaves the rest for the training set. It then samples
the same number of edges for negative examples in both sets.




In [523]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)  # 10% of the edges are held out as test positives
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing.
# The adjacency shape is pinned to the node count so it does not depend on the
# largest node id that happens to occur in an edge.
num_nodes = g.number_of_nodes()
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(num_nodes, num_nodes))
# A pair (i, j) is a negative candidate only if there is no edge i -> j and
# i != j. Testing the adjacency for zero (instead of `1 - adj - eye != 0`) keeps
# self-loops and duplicated edges, whose entries would come out as -1, from
# being sampled as negatives.
non_edge = adj.toarray() == 0
np.fill_diagonal(non_edge, False)
neg_u, neg_v = np.nonzero(non_edge)

# NOTE(review): number_of_edges() // 2 negatives are drawn in total, so the
# training negatives are fewer than the training positives - confirm this is
# intended. np.random.choice samples with replacement by default.
neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

When training, you will need to remove the edges in the test set from
the original graph. You can do this via ``dgl.remove_edges``.

<div class="alert alert-info"><h4>Note</h4><p>``dgl.remove_edges`` works by creating a subgraph from the
   original graph, resulting in a copy and therefore could be slow for
   large graphs. If so, you could save the training and test graph to
   disk, as you would do for preprocessing.</p></div>




In [524]:
# Message passing during training must not see the held-out edges, so the
# first `test_size` shuffled edge ids are removed from a copy of the graph.
test_eids = eids[:test_size]
train_g = dgl.remove_edges(g, test_eids)
train_g

Graph(num_nodes=3230, num_edges=5513,
      ndata_schemes={'sector': Scheme(shape=(398,), dtype=torch.float32)}
      edata_schemes={})

Define a GraphSAGE model
------------------------

This tutorial builds a model consisting of two
`GraphSAGE <https://arxiv.org/abs/1706.02216>`__ layers, each of which
computes new node representations by averaging neighbor information. DGL
provides ``dgl.nn.SAGEConv``, which conveniently creates a GraphSAGE layer.




In [525]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder using 'mean' aggregation.

    Parameters
    ----------
    in_feats : int
        Width of the input node features.
    h_feats : int
        Width of the hidden layer and of the output node representations.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Return node representations for graph ``g`` given features ``in_feat``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

The model then predicts the probability of existence of an edge by
computing a score between the representations of both incident nodes
with a function (e.g. an MLP or a dot product), which you will see in
the next section.

\begin{align}\hat{y}_{u\sim v} = f(h_u, h_v)\end{align}




Positive graph, negative graph, and ``apply_edges``
---------------------------------------------------

In previous tutorials you have learned how to compute node
representations with a GNN. However, link prediction requires you to
compute representation of *pairs of nodes*.

DGL recommends you to treat the pairs of nodes as another graph, since
you can describe a pair of nodes with an edge. In link prediction, you
will have a *positive graph* consisting of all the positive examples as
edges, and a *negative graph* consisting of all the negative examples.
The *positive graph* and the *negative graph* will contain the same set
of nodes as the original graph.  This makes it easier to pass node
features among multiple graphs for computation.  As you will see later,
you can directly feed the node representations computed on the entire
graph to the positive and the negative graphs for computing pair-wise
scores.

The following code constructs the positive graph and the negative graph
for the training set and the test set respectively.




In [526]:
# All four example graphs share the node set of the full graph so that node
# representations computed on it can be used on them directly.
num_nodes = g.number_of_nodes()

train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

The benefit of treating the pairs of nodes as a graph is that you can
use the ``DGLGraph.apply_edges`` method, which conveniently computes new
edge features based on the incident nodes’ features and the original
edge features (if applicable).

DGL provides a set of optimized builtin functions to compute new
edge features based on the original node/edge features. For example,
``dgl.function.u_dot_v`` computes a dot product of the incident nodes’
representations for each edge.




In [527]:
import dgl.function as fn

class DotPredictor(nn.Module):
    """Scores each edge by the dot product of its two endpoint representations."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` from node representations ``h``."""
        # local_scope() keeps the temporary 'h' / 'score' entries from
        # persisting on the graph after the block exits.
        with g.local_scope():
            g.ndata['h'] = h
            # Writes edge feature 'score' = <h[src], h[dst]> for every edge.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
        # u_dot_v yields a 1-element vector per edge; take its only column.
        return edge_scores[:, 0]

You can also write your own function if it is complex.
For instance, the following module produces a scalar score on each edge
by concatenating the incident nodes’ features and passing it to an MLP.




In [528]:
class MLPPredictor(nn.Module):
    """Scores each edge with a two-layer MLP over the concatenated endpoint features.

    Parameters
    ----------
    h_feats : int
        Width of the node representations fed to ``forward``.
    """

    def __init__(self, h_feats):
        super().__init__()
        # Input is [h_src ; h_dst], hence twice the node representation width.
        self.W1 = nn.Linear(2 * h_feats, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Edge-wise function: produce one scalar score per edge.

        Parameters
        ----------
        edges :
            Batch of edges exposing ``src``, ``dst`` and ``data``, each a
            dictionary of features of the source nodes, the destination
            nodes, and the edges themselves.

        Returns
        -------
        dict
            ``{'score': tensor}`` with one entry per edge.
        """
        pair_feats = torch.cat((edges.src['h'], edges.dst['h']), dim=1)
        hidden = F.relu(self.W1(pair_feats))
        return {'score': self.W2(hidden).squeeze(1)}

    def forward(self, g, h):
        """Return one score per edge of ``g`` from node representations ``h``."""
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            scores = g.edata['score']
        return scores

<div class="alert alert-info"><h4>Note</h4><p>The builtin functions are optimized for both speed and memory.
   We recommend using builtin functions whenever possible.</p></div>

<div class="alert alert-info"><h4>Note</h4><p>If you have read the :doc:`message passing
   tutorial <3_message_passing>`, you will notice that the
   argument ``apply_edges`` takes has exactly the same form as a message
   function in ``update_all``.</p></div>




Training loop
-------------

After you have defined the node representation computation and the edge
score computation, you can go ahead and define the overall model, loss
function, and evaluation metric.

The loss function is simply binary cross entropy loss.

\begin{align}\mathcal{L} = -\sum_{u\sim v\in \mathcal{D}}\left( y_{u\sim v}\log(\hat{y}_{u\sim v}) + (1-y_{u\sim v})\log(1-\hat{y}_{u\sim v}) \right)\end{align}

The evaluation metric in this tutorial is AUC.




In [591]:
# Encoder: the input width is the width of the 'sector' node feature; 16 hidden units.
model = GraphSAGE(in_feats=train_g.ndata['sector'].shape[1], h_feats=16)
# Edge scorer. You can replace DotPredictor with MLPPredictor:
# pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) over positive and negative edge scores.

    Parameters
    ----------
    pos_score : torch.Tensor
        Raw scores of the positive examples (target 1).
    neg_score : torch.Tensor
        Raw scores of the negative examples (target 0).

    Returns
    -------
    torch.Tensor
        Scalar mean loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # Build the targets from the score tensors themselves so that they share
    # their device, dtype and shape instead of always being CPU float32.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """Area under the ROC curve for positive vs. negative edge scores.

    Parameters
    ----------
    pos_score : torch.Tensor
        Scores of the positive examples (label 1).
    neg_score : torch.Tensor
        Scores of the negative examples (label 0).

    Returns
    -------
    float
        The value returned by ``sklearn.metrics.roc_auc_score``.
    """
    # Imported here so the function does not depend on a later cell having
    # brought roc_auc_score into the notebook namespace.
    from sklearn.metrics import roc_auc_score

    # detach()/cpu() make the conversion work for tensors that require grad or
    # live on an accelerator; a bare .numpy() raises for those.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = np.concatenate([np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])
    return roc_auc_score(labels, scores)

The training loop goes as follows:

<div class="alert alert-info"><h4>Note</h4><p>This tutorial does not include evaluation on a validation
   set. In practice you should save and evaluate the best model based on
   performance on the validation set.</p></div>




In [593]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss is computed inside the training loop (see compute_loss).
# Parameters of both the encoder and the edge predictor are optimized together.
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.005)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(4000):
    # forward: node representations on the training graph, then edge scores
    h = model(train_g, train_g.ndata['sector'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 100 == 0:
        print('In epoch {}, loss: {}'.format(e, loss.item()))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    # Recompute the representations with the final weights: the `h` left over
    # from the loop was produced before the last optimizer step and is still
    # attached to the autograd graph.
    h = model(train_g, train_g.ndata['sector'])
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 0.0780123695731163
In epoch 100, loss: 0.07802049070596695
In epoch 200, loss: 0.07800426334142685
In epoch 300, loss: 0.07800045609474182
In epoch 400, loss: 0.07799781113862991
In epoch 500, loss: 0.07799553126096725
In epoch 600, loss: 0.07799343019723892
In epoch 700, loss: 0.07799173146486282
In epoch 800, loss: 0.07798992842435837
In epoch 900, loss: 0.07798810303211212
In epoch 1000, loss: 0.07798633724451065
In epoch 1100, loss: 0.07798434793949127
In epoch 1200, loss: 0.0779823511838913
In epoch 1300, loss: 0.07798025757074356
In epoch 1400, loss: 0.07797809690237045
In epoch 1500, loss: 0.07798545807600021
In epoch 1600, loss: 0.07797832787036896
In epoch 1700, loss: 0.07803481817245483
In epoch 1800, loss: 0.07799342274665833
In epoch 1900, loss: 0.07796847075223923
In epoch 2000, loss: 0.07797038555145264
In epoch 2100, loss: 0.07796832174062729
In epoch 2200, loss: 0.07798602432012558
In epoch 2300, loss: 0.07800441235303879
In epoch 2400, loss: 0.0780642

In [531]:
# for test

array([   3,    2,  978,  979, 3168, 2377, 1315, 3217, 1432, 2039])

In [594]:
# Re-read the raw tables to get the node table back with its readable `sector`
# column (the earlier preprocessing cell replaced it with one-hot columns).
df_edge = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/강원_edge.csv")
df_node = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/강원_node.csv")

# Preprocessing: same filtering as before - companies that occur in an edge,
# one row per Id, first three columns, consecutive row positions.
df_node = (
    df_node
    .loc[lambda nodes: nodes['Id'].isin(df_edge.Source) | nodes['Id'].isin(df_edge.Target)]
    .drop_duplicates('Id')
    .rename(columns={"from_업종명10차": "sector"})
    .iloc[:, :3]
    .drop_duplicates('Id')
    .reset_index()
)


In [595]:
# Query
from scipy import spatial

# Index all node embeddings in a KD-tree, then look up the 10 nearest
# neighbours of node 2186.
tree = spatial.KDTree(h.tolist())
neighbor_dists, index_ids = tree.query(h[2186].tolist(), k=10)

index_ids

array([2186,  701, 2959, 1113, 2485, 2371, 1999,  116, 2090, 2547])

In [596]:
# Look up the node-table row at position 2485, one of the positions present in
# the nearest-neighbour result shown above.
df_node.iloc[2485]

index                          3028
Label                 농업회사법인 바른돈(주)
Id                       2248166359
sector    육류 기타 가공 및 저장처리업 (가금류 제외)
Name: 2485, dtype: object

In [597]:
# Find the row(s) of the company whose Label is exactly "썬테크".
df_node[df_node['Label'] == "썬테크"]

Unnamed: 0,index,Label,Id,sector
96,104,썬테크,6068105423,제조업


## 시각화

In [584]:
# Detach the node representations from the autograd graph and convert them to a
# NumPy array for the t-SNE projection below.
h_item =  h.detach().numpy()
h_item

array([[ 8.986108  , -2.2076368 , -0.8030824 , ..., -0.89777654,
        -1.4314513 , -0.35052428],
       [-1.6831468 ,  0.6553683 , -2.7194645 , ...,  0.5242904 ,
         0.27856055,  1.8463794 ],
       [ 1.1049949 , -0.3580565 , -0.26305506, ..., -3.8180847 ,
        -3.091681  , -2.783371  ],
       ...,
       [ 3.0306036 , -3.2818565 , -0.97863245, ..., -1.2619554 ,
        -5.142031  , -1.5927371 ],
       [ 1.6045824 ,  0.33844247, -1.1228217 , ..., -4.6500573 ,
         0.1652442 , -2.4617867 ],
       [-0.19553982,  0.18960921, -0.16223857, ..., -0.56367284,
        -0.16317903, -0.34244335]], dtype=float32)

In [585]:
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

from bokeh.models import *
from bokeh.plotting import *
from bokeh.io import *
from bokeh.tile_providers import *
from bokeh.palettes import *
from bokeh.transform import *
from bokeh.layouts import *

from bokeh.plotting import figure, show
from bokeh.sampledata.iris import flowers
from bokeh.models import HoverTool

# Project the node embeddings with t-SNE. The estimator gets its own name so it
# does not overwrite `model`, the trained GraphSAGE encoder used by the
# training cell.
tsne = TSNE(learning_rate=300)
transformed = tsne.fit_transform(h_item)

In [586]:
# Only the last expression of a cell is displayed, so the len(...) value on the
# next line is computed but not shown.
len(transformed)
df_node

Unnamed: 0,index,Label,Id,sector
0,0,(주)지에이,2248138829,일반용 전기 조명장치 제조업
1,1,(주)시안,1248192047,플라스틱 창호 제조업
2,2,(주)우룡,2258102381,석회 및 플라스터 제조업
3,3,한라시멘트(주),2268122191,시멘트 제조업
4,4,(주)네오콘크리트,3108700548,일반 화물자동차 운송업
...,...,...,...,...
3225,4045,창원레미콘,6098620864,제조업
3226,4046,송효,6088129192,제조업
3227,4047,신흥흄관,6038135739,제조업
3228,4048,주식회사거산개발,5368100115,건설용 석제품 제조업


In [588]:
# Attach the two t-SNE coordinates to the node table ...
df_node['x'], df_node['y'] = transformed[:, 0], transformed[:, 1]
# ... and keep only the columns the plot needs.
movies = df_node.loc[:, ['index', 'x', 'y', 'Label', 'sector']]

In [589]:
output_notebook()

# The plotted points are companies coloured by sector.
p = figure(title="Company t-SNE by GNN")
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'y'

# Distinct sector names in order of first appearance; they are the factors of
# the categorical colour map.
color_column = movies['sector'].unique().tolist()

# Base layer: every point in translucent red.
p.circle(x='x', y='y', color='red', fill_alpha=0.2, size=3, source=movies)

# Top layer: the same points coloured by sector, with one legend entry per sector.
# NOTE(review): the palette has 20 colours; if there are more sectors than
# that, confirm how the remaining factors should be coloured.
c = p.circle(x='x',
             y='y',
             legend_field="sector",
             color=factor_cmap('sector', d3['Category20'][20], color_column),
             fill_alpha=1,
             size=3,
             source=movies)

# Hover tooltip bound to the coloured layer only. The renderer is passed once
# through `renderers`; it is not appended a second time.
circle_hover = HoverTool(tooltips=[('company', '@Label'), ('sector', '@sector')],
                         mode='mouse',
                         point_policy='follow_mouse',
                         renderers=[c])

# Attach the hover tool and configure the legend.
p.tools.append(circle_hover)
p.legend.label_text_font_size = '5pt'
p.legend.location = 'top_left'

show(p)

