In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

import torch as th
import json

Using backend: pytorch


In [2]:
# graph

In [15]:
class GraphConstructor(object):
    def __init__(self):
        self.num_editor_features = 10
        self.num_page_features = 15
        self.num_editor_page_features = 8
        self.num_collab_dir_feat = 2
        self.num_collab_undir_feat = 2

    def construct_graph(self, sample):
        self.editor_nodes = [d['event_user_id'] for d in sample['user_article']]
        self.editor_to_ind = {j:i for i,j in enumerate(self.editor_nodes)}
        self.ind_to_editor = {i:j for j,i in self.editor_to_ind.items()}
        self.editor_page_links = [(self.editor_to_ind[i],0) for i in self.editor_nodes]
        self.collab_links_directed = [(self.editor_to_ind[pair['event_user_id']],self.editor_to_ind[pair['event_user_id_r']]) 
                                     for pair in sample['collaboration']['directed']]
        self.collab_links_undirected = [(self.editor_to_ind[d['pair'][0]],self.editor_to_ind[d['pair'][1]]) 
                                    for d in sample['collaboration']['undirected']]
        
        g = dgl.heterograph({
             ('editor', 'edits', 'page'): self.editor_page_links,
             ('editor', 'collab-dir', 'editor'): self.collab_links_directed,
             ('editor', 'collab-undir', 'editor'): self.collab_links_undirected + \
                            [(j,i) for i,j in self.collab_links_undirected]})
        return g
    
    def format_page_features(self,sample):
        page_features = [list(sample['article'].values())]
        page_features = th.tensor([[i if not np.isnan(i) else 0 for i in sample_feat]
                                    for sample_feat in page_features])
        return page_features

    def format_editor_features(self, sample):
        editor_features_lookup = pd.DataFrame(sample['editor'])\
                                            .set_index('event_user_id').to_dict('index')
        null_dict = {i: 0 for i in range(self.num_editor_features)}
        editor_features = [list(editor_features_lookup.get(i,null_dict).values()) 
                                     for i in self.editor_nodes]
        editor_features = th.tensor([[i if not np.isnan(i) else 0 for i in sample_feat]
                                    for sample_feat in editor_features])
        return editor_features
    
    def format_node_features(self,sample):
        page_feat = self.format_page_features(sample)
        editor_feat = self.format_editor_features(sample)
        return {'page':page_feat,'editor':editor_feat}
    
    def format_edge_features(self,sample):
        features = {}
        # directed
        if len(sample['collaboration']['directed']) > 0:
            collab_dir_lookup = pd.DataFrame(sample['collaboration']['directed'])\
                                    .set_index(['event_user_id','event_user_id_r']).to_dict('index')
            null_dict = {i: 0 for i in range(self.num_collab_dir_feat)}
            ft = [list(collab_dir_lookup.get((self.ind_to_editor[i],self.ind_to_editor[j]),null_dict).values()) 
                         for i,j in self.collab_links_directed]
            features['collab-dir'] = th.tensor([[i if not np.isnan(i) else 0 for i in sample_feat]
                                    for sample_feat in ft])

            collab_undir_lookup = pd.DataFrame(sample['collaboration']['undirected'])
            collab_undir_lookup['id_1'],collab_undir_lookup['id_2'] = zip(*list(collab_undir_lookup['pair']))
            collab_undir_lookup = collab_undir_lookup.set_index(['id_1','id_2']).drop('pair',axis=1).to_dict('index')
            null_dict = {i: 0 for i in range(self.num_collab_undir_feat)}
            collab_undir_features = [list(
                        collab_undir_lookup.get((self.ind_to_editor[i],self.ind_to_editor[j]),null_dict).values()) 
                                     for i,j in self.collab_links_undirected]
            ft = collab_undir_features + collab_undir_features
            features['collab-undir'] = th.tensor([[i if not np.isnan(i) else 0 for i in sample_feat]
                                    for sample_feat in ft])
            
        # edits
        editor_article_features_lookup = pd.DataFrame(sample['user_article'])\
                                                .set_index('event_user_id').to_dict('index')
        null_dict = {i: 0 for i in range(self.num_editor_page_features)}
        ft = [list(editor_article_features_lookup.get(i,null_dict).values()) 
                                             for i in self.editor_nodes]
        features['edits'] = th.tensor([[i if not np.isnan(i) else 0 for i in sample_feat]
                                    for sample_feat in ft])
        
        return features
    
    def make_graph(self, sample):
        graph = self.construct_graph(sample)        
        graph.ndata['features'] = self.format_node_features(sample)
        graph.edata['features'] = self.format_edge_features(sample)
        return graph

    #def format_features(graph):
        

In [4]:
with open('/srv/home/christinedk/wp_internship/features/activity_fanpov.json','rb') as f:
    page_history = json.load(f)
with open('/srv/home/christinedk/wp_internship/features/editorsfanpov_v2.json','rb') as f:
    editor_history = json.load(f)
    
pos_samples = [{**e,**p} for e,p in zip(page_history, editor_history)]

In [5]:
pos = []
for sample in pos_samples:
    graphmaker = GraphConstructor()
    g = graphmaker.make_graph(sample)
    pos.append((g,1))

In [6]:
with open('/srv/home/christinedk/wp_internship/negative_features/activity_fanpov.json','rb') as f:
    page_history = json.load(f) 
with open('/srv/home/christinedk/wp_internship/negative_features/editorsfanpov_v2.json','rb') as f:
    editor_history = json.load(f)
    
neg_samples = [{**e,**p} for e,p in zip(page_history, editor_history)]

In [7]:
neg = []
for sample in neg_samples:
    graphmaker = GraphConstructor()
    g = graphmaker.make_graph(sample)
    neg.append((g,0))

In [8]:
dataset = pos + neg

In [9]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_test = int(num_examples * 0.2)

test_sampler = SubsetRandomSampler(torch.arange(num_test))
train_sampler = SubsetRandomSampler(torch.arange(num_test, num_examples))

BATCH_SIZE=64
train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=BATCH_SIZE, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=BATCH_SIZE, drop_last=False)

In [10]:
import dgl.nn.pytorch as dglnn
import torch.nn as nn

class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        # inputs is features of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

class HeteroClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, rel_names):
        super().__init__()

        self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g):
        h = g.ndata['features']
        h = self.rgcn(g, h)
        with g.local_scope():
            g.ndata['h'] = h
            # Calculate graph representation by average readout.
            hg = 0
            for ntype in g.ntypes:
                hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype)
            return self.classify(hg)

In [12]:
from tqdm import tqdm

In [16]:
# etypes is the list of edge types as strings.
model = HeteroClassifier(10, 256, 2, etypes)
opt = torch.optim.Adam(model.parameters())
num_correct = 0
num_tests = 0

for epoch in range(30):
    for batched_graph, labels in tqdm(train_dataloader):
        logits = model(batched_graph)
        loss = F.cross_entropy(logits, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        num_correct += (logits.argmax(1) == labels).sum().item()
        num_tests += len(labels)
    print('Epoch {}    Train accuracy: {}'.format(epoch, num_correct / num_tests))

100%|██████████| 17/17 [00:01<00:00, 14.00it/s]
 12%|█▏        | 2/17 [00:00<00:01, 14.40it/s]

Epoch 0    Train accuracy: 0.527643064985451


100%|██████████| 17/17 [00:01<00:00, 15.65it/s]
 12%|█▏        | 2/17 [00:00<00:01, 12.38it/s]

Epoch 1    Train accuracy: 0.5407371483996121


100%|██████████| 17/17 [00:01<00:00, 15.24it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.58it/s]

Epoch 2    Train accuracy: 0.553831231813773


100%|██████████| 17/17 [00:01<00:00, 16.48it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.44it/s]

Epoch 3    Train accuracy: 0.5521338506304558


100%|██████████| 17/17 [00:01<00:00, 13.29it/s]
 12%|█▏        | 2/17 [00:00<00:01, 10.54it/s]

Epoch 4    Train accuracy: 0.5524733268671193


100%|██████████| 17/17 [00:01<00:00, 15.80it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.89it/s]

Epoch 5    Train accuracy: 0.5478499838344649


100%|██████████| 17/17 [00:01<00:00, 15.66it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.82it/s]

Epoch 6    Train accuracy: 0.5492586947485104


100%|██████████| 17/17 [00:01<00:00, 15.52it/s]
 12%|█▏        | 2/17 [00:00<00:00, 16.81it/s]

Epoch 7    Train accuracy: 0.5493452958292919


100%|██████████| 17/17 [00:01<00:00, 15.71it/s]
 12%|█▏        | 2/17 [00:00<00:00, 16.40it/s]

Epoch 8    Train accuracy: 0.5455329238064447


100%|██████████| 17/17 [00:01<00:00, 16.26it/s]
 12%|█▏        | 2/17 [00:00<00:00, 15.05it/s]

Epoch 9    Train accuracy: 0.5431619786614937


100%|██████████| 17/17 [00:01<00:00, 15.19it/s]
 12%|█▏        | 2/17 [00:00<00:00, 18.97it/s]

Epoch 10    Train accuracy: 0.5467771801428446


100%|██████████| 17/17 [00:01<00:00, 13.97it/s]
 12%|█▏        | 2/17 [00:00<00:01, 12.84it/s]

Epoch 11    Train accuracy: 0.545021015195603


100%|██████████| 17/17 [00:01<00:00, 15.27it/s]
 12%|█▏        | 2/17 [00:00<00:01, 14.65it/s]

Epoch 12    Train accuracy: 0.5476385883757368


100%|██████████| 17/17 [00:01<00:00, 13.74it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.46it/s]

Epoch 13    Train accuracy: 0.5478037965913815


100%|██████████| 17/17 [00:01<00:00, 13.77it/s]
 12%|█▏        | 2/17 [00:00<00:01, 10.85it/s]

Epoch 14    Train accuracy: 0.5485935984481086


100%|██████████| 17/17 [00:01<00:00, 13.97it/s]
 12%|█▏        | 2/17 [00:00<00:00, 16.41it/s]

Epoch 15    Train accuracy: 0.5462536372453928


100%|██████████| 17/17 [00:01<00:00, 15.59it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

Epoch 16    Train accuracy: 0.546984652250813


100%|██████████| 17/17 [00:01<00:00, 14.44it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.84it/s]

Epoch 17    Train accuracy: 0.5433236340122858


100%|██████████| 17/17 [00:01<00:00, 14.68it/s]
  6%|▌         | 1/17 [00:00<00:02,  6.55it/s]

Epoch 18    Train accuracy: 0.543417223952218


100%|██████████| 17/17 [00:01<00:00, 12.59it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

Epoch 19    Train accuracy: 0.5432589718719689


100%|██████████| 17/17 [00:02<00:00,  6.72it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

Epoch 20    Train accuracy: 0.5440395362800794


100%|██████████| 17/17 [00:01<00:00, 14.57it/s]
 12%|█▏        | 2/17 [00:00<00:00, 15.28it/s]

Epoch 21    Train accuracy: 0.5428092760779473


100%|██████████| 17/17 [00:02<00:00,  8.10it/s]
 12%|█▏        | 2/17 [00:00<00:01, 14.44it/s]

Epoch 22    Train accuracy: 0.5443427655716274


100%|██████████| 17/17 [00:01<00:00, 14.15it/s]
 12%|█▏        | 2/17 [00:00<00:01, 14.47it/s]

Epoch 23    Train accuracy: 0.5439702554154543


100%|██████████| 17/17 [00:01<00:00, 15.38it/s]
 12%|█▏        | 2/17 [00:00<00:00, 16.33it/s]

Epoch 24    Train accuracy: 0.5441707080504364


100%|██████████| 17/17 [00:01<00:00, 13.93it/s]
 12%|█▏        | 2/17 [00:00<00:00, 16.09it/s]

Epoch 25    Train accuracy: 0.545064537790047


100%|██████████| 17/17 [00:02<00:00,  8.07it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.31it/s]

Epoch 26    Train accuracy: 0.5440959873549592


100%|██████████| 17/17 [00:01<00:00, 13.22it/s]
 12%|█▏        | 2/17 [00:00<00:01, 13.46it/s]

Epoch 27    Train accuracy: 0.5442011916308716


100%|██████████| 17/17 [00:01<00:00, 13.78it/s]
  6%|▌         | 1/17 [00:00<00:01,  9.25it/s]

Epoch 28    Train accuracy: 0.5442991404394796


100%|██████████| 17/17 [00:01<00:00, 13.73it/s]

Epoch 29    Train accuracy: 0.543873262204979





In [25]:
import dgl.function as fn

class HeteroRGCNLayer(nn.Module):
    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
                name : nn.Linear(in_size, out_size) for name in etypes
            })

    def forward(self, G, feat_dict):
        # The input is a dictionary of node features for each type
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            # Compute W_r * h
            Wh = self.weight[etype](feat_dict[srctype])
            # Save it in graph for message passing
            G.nodes[srctype].data['Wh_%s' % etype] = Wh
            # Specify per-relation message passing functions: (message_func, reduce_func).
            # Note that the results are saved to the same destination feature 'h', which
            # hints the type wise reducer for aggregation.
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
        # Trigger message passing of multiple types.
        # The first argument is the message passing functions for each relation.
        # The second one is the type wise reducer, could be "sum", "max",
        # "min", "mean", "stack"
        G.multi_update_all(funcs, 'sum')
        # return the updated node feature dictionary
        return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}

In [26]:
class HeteroRGCN(nn.Module):
    def __init__(self, G, in_size, hidden_size, out_size):
        super(HeteroRGCN, self).__init__()
        # Use trainable node embeddings as featureless inputs.
        embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size))
                      for ntype in G.ntypes}
        for key, embed in embed_dict.items():
            nn.init.xavier_uniform_(embed)
        self.embed = nn.ParameterDict(embed_dict)
        # create layers
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)

    def forward(self, G):
        h_dict = self.layer1(G, self.embed)
        h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()}
        h_dict = self.layer2(G, h_dict)
        # get paper logits
        return h_dict['paper']

In [27]:
model = HeteroRGCN(G, 10, 10, 3)

opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

best_val_acc = 0
best_test_acc = 0

for epoch in range(100):
    logits = model(G)
    # The loss is computed only for labeled nodes.
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])

    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
    val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
    test_acc = (pred[test_idx] == labels[test_idx]).float().mean()

    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc

    opt.zero_grad()
    loss.backward()
    opt.step()

    if epoch % 5 == 0:
        print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc.item(),
            val_acc.item(),
            best_val_acc.item(),
            test_acc.item(),
            best_test_acc.item(),
        ))

NameError: name 'G' is not defined

In [23]:
import dgl
import numpy as np
import torch as th
from dgl.nn import RelGraphConv

g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
feat = th.ones(6, 10)
conv = RelGraphConv(10, 2, 3, regularizer='basis', num_bases=2)
etype = th.tensor(np.array([0,1,2,0,1,2]).astype(np.int64))
res = conv(g, feat, etype)

In [None]:
u = [0, 1, 0, 0, 1]
v = [0, 1, 2, 3, 2]
g = dgl.heterograph({('_U', '_E', '_V') : (u, v)})
u_fea = th.rand(2, 5)
v_fea = th.rand(4, 5)
conv = GraphConv(5, 2, norm='both', weight=True, bias=True)
res = conv(g, (u_fea, v_fea))

In [30]:
from dgl.contrib.data import load_data
data = load_data(dataset='aifb')
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
# split training and validation set
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]

# edge type and normalization factor
edge_type = torch.from_numpy(data.edge_type)
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)

labels = torch.from_numpy(labels).view(-1)

Downloading /home/christinedk/.dgl/aifb.tgz from https://data.dgl.ai/dataset/aifb.tgz...
Extracting file to /home/christinedk/.dgl/aifb
Loading dataset aifb
Graph loaded, frequencies counted.
Number of nodes:  8285
Number of relations:  91
Number of edges:  66371
4 classes: {'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance'}
Loading training set
Loading test set
Number of classes:  4
removing nodes that are more than 3 hops away
