In [1]:
import urllib.request
import pandas as pd
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/members.csv', './members.csv')
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/interactions.csv', './interactions.csv')

members = pd.read_csv('./members.csv')
members.head()

interactions = pd.read_csv('./interactions.csv')
interactions.head()


Unnamed: 0,Src,Dst,Weight
0,0,1,0.043591
1,0,2,0.282119
2,0,3,0.370293
3,0,4,0.73057
4,0,5,0.821187


In [3]:
import dgl
from dgl.data import DGLDataset
import torch
import os

class KarateClubDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='karate_club')

    def process(self):
        nodes_data = pd.read_csv('./members.csv')
        edges_data = pd.read_csv('./interactions.csv')
        node_features = torch.from_numpy(nodes_data['Age'].to_numpy())
        print(node_features)
        node_labels = torch.from_numpy(nodes_data['Club'].astype('category').cat.codes.to_numpy())
        edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())
        edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
        edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())
        print(edge_features)
        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels
        self.graph.edata['weight'] = edge_features

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

dataset = KarateClubDataset()
graph = dataset[0]

print(graph)

tensor([44, 37, 37, 40, 30, 32, 36, 47, 35, 37, 35, 46, 46, 48, 41, 49, 46, 38,
        44, 41, 48, 34, 43, 41, 40, 34, 38, 42, 42, 44, 48, 41, 35, 46])
tensor([0.0436, 0.2821, 0.3703, 0.7306, 0.8212, 0.0592, 0.1290, 0.5208, 0.9413,
        0.8049, 0.5622, 0.1401, 0.3370, 0.0937, 0.1479, 0.5500, 0.5330, 0.8024,
        0.5267, 0.2724, 0.8261, 0.8021, 0.5614, 0.7074, 0.4958, 0.0016, 0.7004,
        0.9994, 0.8923, 0.5914, 0.7833, 0.0370, 0.3245, 0.2271, 0.4633, 0.2147,
        0.2377, 0.7345, 0.1060, 0.0365, 0.1786, 0.0978, 0.6268, 0.5527, 0.9900,
        0.9305, 0.9693, 0.6085, 0.2851, 0.7237, 0.8807, 0.3584, 0.4228, 0.5872,
        0.4046, 0.4230, 0.2269, 0.2452, 0.9186, 0.7828, 0.6702, 0.9842, 0.6230,
        0.8770, 0.7705, 0.0792, 0.2061, 0.1296, 0.2526, 0.0133, 0.3873, 0.9578,
        0.3104, 0.6005, 0.9137, 0.9917, 0.6545, 0.9800, 0.0474, 0.6714, 0.7191,
        0.5558, 0.3882, 0.2935, 0.2078, 0.0057, 0.2992, 0.8579, 0.1052, 0.1317,
        0.8670, 0.2297, 0.1604, 0.0749, 0.8305,

In [1]:
import dgl.data

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

Using backend: pytorch


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [4]:
print(g.ndata['feat'].shape)

torch.Size([2708, 1433])


In [75]:
import networkx as nx

with open('./graphvite/data/edges.txt', 'r') as f:
    edges = f.readlines()
    for line in edges:
        line = line[:len(line)-1]
    #print(len(edges))
    
edge_tuple_list = []
for line in edges:
    source, sink = line.split(" ")
    edge_tuple_list.append((int(source), int(sink)))
G = nx.Graph()
G.add_edges_from(edge_tuple_list)
print(len(edge_tuple_list))
print(len(G.edges))

59294
16085


In [71]:
print(len(G.edges))

16085


In [26]:
graph = dgl.from_networkx(G)

In [27]:
print(graph)

Graph(num_nodes=3816, num_edges=32119,
      ndata_schemes={}
      edata_schemes={})


In [42]:
nodes_list = list(G.nodes)
print(nodes_list[1])


356


In [48]:
import numpy as np
feature_matrix = np.zeros((3816, 4093))
print(feature_matrix.shape)


(3816, 4093)


In [49]:
for idx in range(len(nodes_list)):
    #print(nodes_list[idx])
    feature_matrix[idx][nodes_list[idx]] = 1

In [51]:
import torch
graph.ndata['feat'] = torch.tensor(feature_matrix)

In [55]:
print(graph.ndata['feat'].shape)
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

torch.Size([3816, 4093])


In [56]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [57]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [58]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [59]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [60]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [61]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [62]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [70]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail Courtesy: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 2.2543563318322413e-05
In epoch 5, loss: 0.0005897232331335545
In epoch 10, loss: 0.000199247631826438
In epoch 15, loss: 0.0005901537369936705
In epoch 20, loss: 8.533140498911962e-05
In epoch 25, loss: 3.924851989722811e-05
In epoch 30, loss: 4.098360659554601e-05
In epoch 35, loss: 4.3439391447464004e-05
In epoch 40, loss: 3.518634184729308e-05
In epoch 45, loss: 2.4735683837207034e-05
In epoch 50, loss: 1.82808144018054e-05
In epoch 55, loss: 1.4667443792859558e-05
In epoch 60, loss: 1.2528727893368341e-05
In epoch 65, loss: 1.1160030226164963e-05
In epoch 70, loss: 1.0218993338639848e-05
In epoch 75, loss: 9.525849236524664e-06
In epoch 80, loss: 8.991590220830403e-06
In epoch 85, loss: 8.567641998524778e-06
In epoch 90, loss: 8.222795258916449e-06
In epoch 95, loss: 7.933307642815635e-06
AUC 0.8469432402686373
