# Link Prediction with GNN
---

## Overview
* Graph에 Edge가 존재하는 경우 positive, 존재하지 않을 경우 negative한 데이터로 학습

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
import scipy.sparse as sp
%matplotlib inline

Using backend: pytorch


## dgl(Deep Graph Library) graph data type에 맞추기 위한 데이터 전처리

1. dgl은 node index의 최댓값(+1)을 graph의 node 개수로 간주한다. 
   -> node index를 edge에 연결된 node만으로 0부터 연속되게 다시 매겨야 node feature의 dimension이 맞게 된다. 

2. dgl.ndata, dgl.edata 를 통해 node 와 edge의 feature 등록이 가능하다. 



In [4]:

# Load the raw company node and edge tables.
df_node = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/company_node.csv")
df_edge = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/company_edge.csv")

# Missing ids are replaced by 0 so the id columns can be cast to int.
df_node['Id'] = df_node['Id'].fillna(0).astype(int)
df_edge = df_edge[['Source', 'Target']].fillna(0).astype(int)

# Preprocessing: keep only nodes that are an endpoint of at least one edge,
# one row per Id, and only the first three columns. reset_index() then gives
# every remaining node a dense 0..N-1 row position (the old index is kept as
# the 'index' column).
endpoint_ids = pd.concat([df_edge['Source'], df_edge['Target']]).unique()
df_node = df_node[df_node['Id'].isin(endpoint_ids)]
df_node = (
    df_node.drop_duplicates('Id')
    .rename(columns={"from_업종명10차": "sector"})
    .iloc[:, :3]
    .reset_index()
)

# Re-label edge endpoints from company Id to node row position. A single
# Id -> position lookup table replaces a full scan of df_node per edge.
id_to_row = dict(zip(df_node['Id'], df_node.index))
for endpoint_col in ('Source', 'Target'):
    mapped = df_edge[endpoint_col].map(id_to_row)
    if mapped.isna().any():
        # Fail loudly instead of silently producing NaN node ids.
        raise ValueError(
            "{} edge(s) reference a {} id that is missing from the node table".format(
                int(mapped.isna().sum()), endpoint_col
            )
        )
    df_edge[endpoint_col] = mapped.astype(int)

# Add a self-loop for every node (pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0).
node_rows = df_node.index.to_numpy()
self_loops = pd.DataFrame({'Source': node_rows, 'Target': node_rows})
df_edge = pd.concat([df_edge, self_loops])

# Optional one-hot encoding of the sector column (currently disabled):
# df_node = pd.get_dummies(df_node, prefix='', prefix_sep='', columns=['sector'])

# Show the cleaned node table as the cell output.
df_node

Unnamed: 0,index,Label,Id,업종명10차
0,0,한화글로벌에셋(주),2028163842,지주회사
1,1,미래스틸(주),2448701051,1차 금속제품 도매업
2,2,(주)명성중공업,4188139383,육상 금속 골조 구조재 제조업
3,3,동신종합철강(주),4018155158,그 외 기타 1차 철강 제조업
4,4,(주)탑에너지,5768700852,일반전기 공사업
...,...,...,...,...
352790,352790,코로나냉열기상사,6090793964,"철물, 금속 파스너 및 수공구 도매업"
352791,352791,도무스,6082062526,기타 일반 및 생활 숙박시설 운영업
352792,352792,주식회사 디일공이,6058625381,시계 및 귀금속 소매업
352793,352793,(주)동성택시,6098103123,택시 운송업


In [5]:
# Persist the cleaned node and edge tables; the row index is not written out.
for frame, out_path in (
    (df_node, "/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/company_node_complete.csv"),
    (df_edge, "/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/company_edge_complete.csv"),
):
    frame.to_csv(out_path, index=False)

In [3]:

# Build the DGL graph from the re-indexed edge list. num_nodes is pinned to the
# node table so the node count always matches the number of feature rows, even
# if the highest-numbered node had no edge.
u = torch.as_tensor(df_edge['Source'].to_numpy(), dtype=torch.long)
v = torch.as_tensor(df_edge['Target'].to_numpy(), dtype=torch.long)
g = dgl.graph((u, v), num_nodes=len(df_node))

# Node features must be numeric. If 'sector' still holds raw category labels,
# one-hot encode it here; otherwise assume the table was already one-hot
# encoded upstream and the feature columns start at position 3
# (after index / Label / Id) -- TODO confirm against the preprocessing cell.
if 'sector' in df_node.columns:
    sector_feats = pd.get_dummies(df_node['sector'])
else:
    sector_feats = df_node.iloc[:, 3:]
g.ndata['sector'] = torch.as_tensor(sector_feats.to_numpy(dtype=np.float32))
g

Graph(num_nodes=3230, num_edges=6125,
      ndata_schemes={'sector': Scheme(shape=(398,), dtype=torch.float32)}
      edata_schemes={})

In [4]:
# Split edge set for training and testing
u, v = g.edges()
num_nodes = g.number_of_nodes()
num_edges = g.number_of_edges()

# Shuffle the positive edge ids and hold out 10% of them for testing.
eids = np.arange(num_edges)
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = num_edges - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing.
# The explicit shape keeps the matrix square even when the highest node id
# has no edge.
adj = sp.coo_matrix(
    (np.ones(len(u)), (u.numpy(), v.numpy())),
    shape=(num_nodes, num_nodes),
)
# NOTE: this densifies an N x N matrix, so it only scales to small graphs.
adj_neg = 1 - adj.toarray() - np.eye(num_nodes)
# Only entries equal to 1 are true non-edges. Self-loops and duplicated edges
# produce negative values here, which `!= 0` would wrongly treat as negatives.
neg_u, neg_v = np.where(adj_neg > 0)

# Sample without replacement so the same negative pair cannot appear twice
# (and in particular cannot land in both the train and the test split).
num_neg = min(num_edges // 2, len(neg_u))
neg_eids = np.random.choice(len(neg_u), num_neg, replace=False)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

* Test data의 분리 

test 데이터를 분리하기 위해서 `dgl.remove_edges`를 사용하여 제거 한다. 


In [5]:
# Display the sparse adjacency matrix built from the positive edges.
adj

<3230x3230 sparse matrix of type '<class 'numpy.float64'>'
	with 6125 stored elements in COOrdinate format>

In [6]:
# Training graph: the full graph with the held-out test edges removed.
held_out_eids = eids[:test_size]
train_g = dgl.remove_edges(g, held_out_eids)
train_g

Graph(num_nodes=3230, num_edges=5513,
      ndata_schemes={'sector': Scheme(shape=(398,), dtype=torch.float32)}
      edata_schemes={})

# Define a GraphSAGE model
------------------------

* input : graph with positive / negative edges
* output: graph with edge scores

Task 에 따라서 loss를 달리 잡을 수 있다. 이 경우 link prediction 이기 때문에 Graph의 edge의 연결 여부를 score 로 계산하여 loss를 계산

## 1. Embedding (GraphSAGE) 
Node의 feature들을 SageConv 방식으로 Embedding
1. conv1 (SAGEConv) 적용
2. 활성화 함수(relu)
3. conv2 (SAGEConv) 적용


## 2. Prediction (DotPredictor)
Embedding 된 h 를 통해 edge 들의 연결 여부를 score로 등록


## 3. Train(loss)
Predict 된 score(pos + neg)를 가진 graph와 실제 graph(label)를 loss function(binary cross entropy)을 사용하여 학습


In [7]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder built from mean-aggregating SAGEConv layers.

    Args:
        in_feats: size of the input node features.
        h_feats: size of the hidden and output node representations.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Return the output of conv2 applied to relu(conv1(g, in_feat))."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [8]:
# Positive/negative edge sets are wrapped as graphs over the same node set as
# g, so that node embeddings can be attached to them for scoring.
n_nodes = g.number_of_nodes()

train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=n_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=n_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=n_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=n_nodes)

In [9]:
import dgl.function as fn

class DotPredictor(nn.Module):
    """Score every edge of a graph by the dot product of its endpoint features."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node features ``h``."""
        # local_scope keeps the temporary 'h' / 'score' fields from leaking
        # onto the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            # 'score' = <source 'h', destination 'h'> for each edge.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
        # u_dot_v yields a 1-element vector per edge; take that single column.
        return edge_scores[:, 0]

In [10]:
# The encoder input width is the per-node 'sector' feature size; embeddings
# are 16-dimensional.
in_dim = train_g.ndata['sector'].shape[1]
model = GraphSAGE(in_dim, 16)
# You can replace DotPredictor with MLPPredictor.
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (with logits) over positive and negative edge scores.

    Args:
        pos_score: 1-D tensor of raw scores for edges labelled 1.
        neg_score: 1-D tensor of raw scores for edges labelled 0.

    Returns:
        Scalar loss tensor.
    """
    scores = torch.cat([pos_score, neg_score])
    # ones_like / zeros_like inherit dtype and device from the scores, so the
    # labels also match when the scores are float64 or live on a GPU.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])

    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """ROC AUC of edge scores, with positives labelled 1 and negatives 0.

    Args:
        pos_score: 1-D tensor of scores for positive edges.
        neg_score: 1-D tensor of scores for negative edges.

    Returns:
        The value returned by sklearn's ``roc_auc_score``.
    """
    # Imported here so the function does not depend on a later cell having
    # imported roc_auc_score first.
    from sklearn.metrics import roc_auc_score

    # detach().cpu() makes the conversion valid for tensors that track
    # gradients or live on another device.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = np.concatenate(
        [np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])
    return roc_auc_score(labels, scores)

In [11]:
# Scratch cell for inspecting the node features on the training graph.
# NOTE(review): msg_fn is not used anywhere in the visible notebook; newer DGL
# releases appear to name this builtin fn.copy_u -- confirm before reusing it.
msg_fn = fn.copy_src('h', 'm')

# The result of this check is discarded (it is not the cell's last expression).
isinstance(train_g.ndata['sector'], tuple)
# Shape of the 'sector' features as seen through the destination-node view.
train_g.dstdata['sector'].shape

torch.Size([3230, 398])

In [14]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss itself is computed inside the training loop (compute_loss).
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.005)

# ----------- 4. training -------------------------------- #
all_logits = []  # never populated; kept so the name stays defined
num_epochs = 500
node_feats = train_g.ndata['sector']

model.train()
for e in range(num_epochs):
    # forward: encode nodes on the training graph, then score both edge sets
    h = model(train_g, node_feats)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 100 == 0:
        print('In epoch {}, loss: {}'.format(e, loss.item()))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    # Re-encode with the final weights: the h left over from the loop was
    # computed before the last optimizer step. This h is also what the later
    # query / visualisation cells read.
    h = model(train_g, node_feats)
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 0.1457456648349762
In epoch 100, loss: 0.08935333043336868
In epoch 200, loss: 0.07932838797569275
In epoch 300, loss: 0.07555355876684189
In epoch 400, loss: 0.07356900721788406
In epoch 500, loss: 0.07229695469141006
In epoch 600, loss: 0.0714627206325531
In epoch 700, loss: 0.07081196457147598
In epoch 800, loss: 0.07016823440790176
In epoch 900, loss: 0.06988298147916794
AUC 0.7490374962621213


# Test section

데이터를 학습된 graph 데이터(h_item)와 맞춰주기 위하여, 학습 때와 동일한 node 전처리를 적용한다.

In [15]:
# Reload the 강원 edge/node CSVs and repeat the node clean-up steps: keep nodes
# that are an edge endpoint, one row per Id, first three columns, dense index.
df_edge = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/강원_edge.csv")
df_node = pd.read_csv("/Users/choeseung-won/Deep_Study/Business_Partner_Project/Data/강원_node.csv")

is_source = df_node['Id'].isin(df_edge.Source)
is_target = df_node['Id'].isin(df_edge.Target)
df_node = df_node[is_source | is_target]

df_node = df_node.drop_duplicates('Id')
df_node = df_node.rename(columns={"from_업종명10차": "sector"})
df_node = df_node.iloc[:, :3]
# reset_index() keeps the previous index as an 'index' column.
df_node = df_node.drop_duplicates('Id').reset_index()


In [24]:
# Query
from scipy import spatial

target_idx = 1701

# Index all node embeddings in a KD-tree and look up the 10 nearest rows to
# the target node's embedding; only the neighbour indices are kept.
embedding_rows = h.tolist()
tree = spatial.KDTree(embedding_rows)
neighbor_dists, index_ids = tree.query(embedding_rows[target_idx], 10)

df_node.iloc[target_idx]

index                   1969
Label               한일시멘트(주)
Id                4638701012
sector    건설용 석재 채굴 및 쇄석 생산업
Name: 1701, dtype: object

In [25]:

# Show the node rows at the positions returned by the nearest-neighbour query.
df_node.iloc[index_ids]

Unnamed: 0,index,Label,Id,sector
1701,1969,한일시멘트(주),4638701012,건설용 석재 채굴 및 쇄석 생산업
1228,1374,안동석재산업(주),5088112765,건설용 석재 채굴 및 쇄석 생산업
1584,1825,(합자)태광레미콘,2228111203,건설용 석재 채굴 및 쇄석 생산업
2216,2651,정선엔지니어링,2298131379,제조업
2577,3150,메카,6458600020,제조업
1790,2083,신우하이텍,1378122945,제조업
251,267,대호이앤지,8108700709,제조업
1542,1772,케이투앤,2248153266,제조업
1942,2289,원광산업,2248137951,제조업
1444,1644,일광,1408152881,제조업


# Visualize section

In [26]:
# Detach the embeddings from autograd and expose them as a NumPy array, which
# is what the t-SNE step below consumes.
h_item =  h.detach().numpy()
# Shape of the embedding matrix.
h_item.shape

(3230, 16)

In [27]:
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

from bokeh.models import *
from bokeh.plotting import *
from bokeh.io import *
from bokeh.tile_providers import *
from bokeh.palettes import *
from bokeh.transform import *
from bokeh.layouts import *

from bokeh.plotting import figure, show
from bokeh.sampledata.iris import flowers
from bokeh.models import HoverTool

# Project the node embeddings to 2-D with t-SNE. The estimator gets its own
# name so the trained GraphSAGE `model` is not overwritten by this cell.
tsne = TSNE(n_components=2, learning_rate=300)
transformed = tsne.fit_transform(h_item)

In [None]:
# Attach the two t-SNE coordinates to the node table, then select the columns
# used by the plot.
df_node['x'], df_node['y'] = transformed[:, 0], transformed[:, 1]
movies = df_node[['index', 'x', 'y', 'Label', 'sector']]

Error: Session cannot generate requests

In [None]:
output_notebook()

p = figure(title="Company t-SNE by GNN")
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'y'

# Distinct sector labels are the categorical factors of the colour mapping.
# Missing sectors are dropped because factors must be strings.
color_column = movies['sector'].dropna().unique().tolist()

# Category20 has only 20 colours. Factors beyond the palette length would all
# fall back to the mapper's nan_color, so the palette is repeated until every
# sector has an entry (colours repeat once there are more than 20 sectors).
base_palette = d3['Category20'][20]
sector_palette = list(
    itertools.islice(itertools.cycle(base_palette), len(color_column))
)

# One glyph per company, coloured by sector. The earlier plain-red layer was
# fully covered by this opaque layer and has been dropped.
c = p.circle(x='x',
             y='y',
             legend_field="sector",
             color=factor_cmap('sector', sector_palette, color_column),
             fill_alpha=1,
             size=3,
             source=movies)

# The renderer is passed once via the constructor; appending it again would
# register it twice.
circle_hover = HoverTool(tooltips=[('Label:', '@Label'), ('sector:', '@sector')],
                         mode='mouse',
                         point_policy='follow_mouse',
                         renderers=[c])

# Attach the mouse-hover tool and configure the legend.
p.tools.append(circle_hover)
p.legend.label_text_font_size = '5pt'
p.legend.location = 'top_left'

show(p)

Error: Session cannot generate requests