In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl.data

In [5]:
# Generate a synthetic dataset with 10000 graphs, ranging from 10 to 500 nodes.
dataset = dgl.data.GINDataset('PROTEINS', self_loop=True)

Downloading /home/christinedk/.dgl/GINDataset.zip from https://raw.githubusercontent.com/weihua916/powerful-gnns/master/dataset.zip...
Extracting file to /home/christinedk/.dgl/GINDataset


In [6]:
print('Node feature dimensionality:', dataset.dim_nfeats)
print('Number of graph categories:', dataset.gclasses)

Node feature dimensionality: 3
Number of graph categories: 2


In [7]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)


In [8]:
it = iter(train_dataloader)
batch = next(it)
print(batch)

[Graph(num_nodes=279, num_edges=1319,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={}), tensor([0, 0, 0, 0, 0])]


In [9]:
batched_graph, labels = batch
print('Number of nodes for each graph element in the batch:', batched_graph.batch_num_nodes())
print('Number of edges for each graph element in the batch:', batched_graph.batch_num_edges())

# Recover the original graph elements from the minibatch
graphs = dgl.unbatch(batched_graph)
print('The original graphs in the minibatch:')
print(graphs)

Number of nodes for each graph element in the batch: tensor([84,  7, 68, 36, 84])
Number of edges for each graph element in the batch: tensor([438,  33, 354, 174, 320])
The original graphs in the minibatch:
[Graph(num_nodes=84, num_edges=438,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=7, num_edges=33,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=68, num_edges=354,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=36, num_edges=174,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=84, num_edges=320,
      ndata_schemes={'label': Scheme(shape=(), dtype=to

In [10]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')

In [12]:
# Create the model with given dimensions
model = GCN(dataset.dim_nfeats, 16, dataset.gclasses)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    for batched_graph, labels in train_dataloader:
        pred = model(batched_graph, batched_graph.ndata['attr'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    pred = model(batched_graph, batched_graph.ndata['attr'].float())
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

Test accuracy: 0.2914798206278027


In [None]:
# making my own dataset

In [13]:
import urllib.request
import pandas as pd
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/members.csv', './members.csv')
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/interactions.csv', './interactions.csv')

members = pd.read_csv('./members.csv')
members.head()

interactions = pd.read_csv('./interactions.csv')
interactions.head()


Unnamed: 0,Src,Dst,Weight
0,0,1,0.043591
1,0,2,0.282119
2,0,3,0.370293
3,0,4,0.73057
4,0,5,0.821187


In [14]:
import dgl
from dgl.data import DGLDataset
import torch
import os

class KarateClubDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='karate_club')

    def process(self):
        nodes_data = pd.read_csv('./members.csv')
        edges_data = pd.read_csv('./interactions.csv')
        node_features = torch.from_numpy(nodes_data['Age'].to_numpy())
        node_labels = torch.from_numpy(nodes_data['Club'].astype('category').cat.codes.to_numpy())
        edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())
        edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
        edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels
        self.graph.edata['weight'] = edge_features

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

dataset = KarateClubDataset()
graph = dataset[0]

print(graph)

Graph(num_nodes=34, num_edges=156,
      ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64), 'label': Scheme(shape=(), dtype=torch.int8), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})


  


In [15]:
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/graph_edges.csv', './graph_edges.csv')
urllib.request.urlretrieve(
    'https://data.dgl.ai/tutorial/dataset/graph_properties.csv', './graph_properties.csv')
edges = pd.read_csv('./graph_edges.csv')
properties = pd.read_csv('./graph_properties.csv')

edges.head()

properties.head()

class SyntheticDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='synthetic')

    def process(self):
        edges = pd.read_csv('./graph_edges.csv')
        properties = pd.read_csv('./graph_properties.csv')
        self.graphs = []
        self.labels = []

        # Create a graph for each graph ID from the edges table.
        # First process the properties table into two dictionaries with graph IDs as keys.
        # The label and number of nodes are values.
        label_dict = {}
        num_nodes_dict = {}
        for _, row in properties.iterrows():
            label_dict[row['graph_id']] = row['label']
            num_nodes_dict[row['graph_id']] = row['num_nodes']

        # For the edges, first group the table by graph IDs.
        edges_group = edges.groupby('graph_id')

        # For each graph ID...
        for graph_id in edges_group.groups:
            # Find the edges as well as the number of nodes and its label.
            edges_of_id = edges_group.get_group(graph_id)
            src = edges_of_id['src'].to_numpy()
            dst = edges_of_id['dst'].to_numpy()
            num_nodes = num_nodes_dict[graph_id]
            label = label_dict[graph_id]

            # Create a graph and add it to the list of graphs and labels.
            g = dgl.graph((src, dst), num_nodes=num_nodes)
            self.graphs.append(g)
            self.labels.append(label)

        # Convert the label list to tensor for saving.
        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)

dataset = SyntheticDataset()
graph, label = dataset[0]
print(graph, label)

Graph(num_nodes=15, num_edges=45,
      ndata_schemes={}
      edata_schemes={}) tensor(0)


In [None]:
# heterograph

In [18]:
import dgl
import numpy as np

ratings = dgl.heterograph(
    {('user', '+1', 'movie') : (np.array([0, 0, 1]), np.array([0, 1, 0])),
     ('user', '-1', 'movie') : (np.array([2]), np.array([1]))})

In [19]:
g = dgl.heterograph({
     ('user', 'follows', 'user'): [(0, 1), (1, 2)],
     ('user', 'plays', 'game'): [(0, 0), (1, 0), (1, 1), (2, 1)],
     ('developer', 'develops', 'game'): [(0, 0), (1, 1)],
     })

In [20]:
g.number_of_nodes('user')

3

In [27]:
import scipy.io
import urllib.request

data_url = 'https://data.dgl.ai/dataset/ACM.mat'
data_file_path = '/tmp/ACM.mat'

urllib.request.urlretrieve(data_url, data_file_path)
data = scipy.io.loadmat(data_file_path)
print(list(data.keys()))

['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']


In [61]:
data['PvsA'].it

In [69]:
nz = [(i, data['PvsA'][i,j], j) for i, j in zip(*data['PvsA'].nonzero())]

In [70]:
nz = [(i, j) for i, j in zip(*data['PvsA'].nonzero())]

In [71]:
graph = dgl.heterograph({('paper', 'written-by', 'author'):nz})

In [None]:
# for today: construct a heterograph with weight features

In [74]:
import torch as th

In [78]:
graph.ndata['x'] = {'paper':th.ones(graph.num_nodes('paper'), 3)}               # node feature of length 3

In [79]:
edges = th.tensor([0, 0, 0, 1]), th.tensor([1, 2, 3, 3])
weights = th.tensor([0.1, 0.6, 0.9, 0.7])  # weight of each edge
g = dgl.graph(edges)
g.edata['w'] = weights  # give it a name 'w'

In [80]:
g

Graph(num_nodes=4, num_edges=4,
      ndata_schemes={}
      edata_schemes={'w': Scheme(shape=(), dtype=torch.float32)})

In [None]:
# with our data

In [88]:
import json

In [186]:
with open('/srv/home/christinedk/wp_internship/features/activity_fanpov.json','rb') as f:
    page_history = json.load(f)

with open('/srv/home/christinedk/wp_internship/features/talk_fanpov.json','rb') as f:
    talk_history = json.load(f)
    c
with open('/srv/home/christinedk/wp_internship/features/editorsfanpov_v2.json','rb') as f:
    editor_history = json.load(f)

In [101]:
sample_page = page_history[0]

In [194]:
sample_collab = editor_history[0]

In [None]:
# graph

In [326]:
editor_nodes = [d['event_user_id'] for d in sample_page['user_article']]
editor_to_ind = {j:i for i,j in enumerate(editor_nodes)}
editor_page_links = [(editor_to_ind[i],0) for i in editor_nodes]
collab_links_directed = [(editor_to_ind[pair['event_user_id']],editor_to_ind[pair['event_user_id_r']]) 
                             for pair in sample_collab['collaboration']['directed']]
collab_links_undirected = [(editor_to_ind[d['pair'][0]],editor_to_ind[d['pair'][1]]) 
                            for d in sample_collab['collaboration']['undirected']]

In [327]:
g = dgl.heterograph({
     ('editor', 'edits', 'page'): editor_page_links,
     ('editor', 'collab-dir', 'editor'): collab_links_directed,
     ('editor', 'collab-undir', 'editor'): collab_links_undirected + [(j,i) for i,j in collab_links_undirected]})

In [None]:
# features

In [312]:
NUM_EDITOR_FEATURES = 10
NUM_PAGE_FEATURES = 15
NUM_EDITOR_PAGE_FEATURES = 8
NUM_COLLAB_DIR_FEAT = 2
NUM_COLLAB_UNDIR_FEAT = 2

In [267]:
page_features = th.tensor([list(sample_page['article'].values())])

In [287]:
editor_article_features_lookup = pd.DataFrame(sample_page['user_article'])\
                                        .set_index('event_user_id').to_dict('index')
null_dict = {i: 0 for i in range(NUM_EDITOR_PAGE_FEATURES)}
editor_article_features = th.tensor([list(editor_article_features_lookup.get(i,null_dict).values()) 
                                     for i,_ in editor_page_links])

In [270]:
editor_features_lookup = pd.DataFrame(sample_collab['editor'])\
                                    .set_index('event_user_id').to_dict('index')
null_dict = {i: 0 for i in range(NUM_EDITOR_FEATURES)}
editor_features = th.tensor([list(editor_features_lookup.get(i,null_dict).values()) for i in editor_nodes])

In [316]:
collab_dir_lookup = pd.DataFrame(sample_collab['collaboration']['directed'])\
                        .set_index(['event_user_id','event_user_id_r']).to_dict('index')
null_dict = {i: 0 for i in range(NUM_COLLAB_DIR_FEAT)}
collab_dir_features = th.tensor([list(collab_dir_lookup.get((i,j),null_dict).values()) 
                                 for i,j in collab_links_directed])

In [332]:
collab_undir_lookup = pd.DataFrame(sample_collab['collaboration']['undirected'])
collab_undir_lookup = pd.concat([collab_undir_lookup,
                                pd.DataFrame(collab_undir_lookup.pair.to_list(),columns=['id_1','id_2'])],axis=1)\
                            .set_index(['id_1','id_2'])\
                            .drop('pair',axis=1)\
                            .to_dict('index')

null_dict = {i: 0 for i in range(NUM_COLLAB_UNDIR_FEAT)}
collab_undir_features = [list(collab_undir_lookup.get((i,j),null_dict).values()) for i,j in collab_links_undirected]
collab_undir_features = th.tensor(collab_undir_features + collab_undir_features)

In [333]:
g.ndata['features'] = {'page':page_features,
                            'editor':editor_features}

g.edata['features'] = {'collab-dir':collab_dir_features,
                       'collab-undir':collab_undir_features,
                       'edits':editor_article_features}