In [1]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint
from sklearn.linear_model import LogisticRegression


## Data Cleaning

## Graph auto encoder

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from preprocessing import read_graph, retrieve_subgraph


device = 'cpu' 

Using backend: pytorch


In [2]:
from gensim.models.doc2vec import Doc2Vec
doc2vec_model= Doc2Vec.load("d2v.model")

In [3]:
G, abstract, _, _ = read_graph()
G = retrieve_subgraph(G, min_nb_nodes=3)
attrs_n = []
for i, node in enumerate(G.nodes()):
    G.nodes[int(node)]['id'] = int(node)
    G.nodes[int(node)]['feat'] = doc2vec_model.dv.get_vector(int(node))

G = dgl.from_networkx(G, node_attrs=['id','feat']) # already undirected
print(G)
G.ndata['_ID'] = torch.arange(G.num_nodes())
node_features = G.ndata['feat']
num_features = node_features.shape[1]

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 115041
Number of edges in subgraph: 928816


  return th.as_tensor(data, dtype=dtype)


Graph(num_nodes=115041, num_edges=1857632,
      ndata_schemes={'id': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(50,), dtype=torch.float32)}
      edata_schemes={})


In [5]:
device = torch.device('cuda')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10])
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to NodeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,    # Batch size
    shuffle=True,       # Whether to shuffle the nodes for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [6]:
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_dataloader))
print('Number of input nodes:', len(input_nodes))
print('Positive graph # nodes:', pos_graph.number_of_nodes(), '# edges:', pos_graph.number_of_edges())
print('Negative graph # nodes:', neg_graph.number_of_nodes(), '# edges:', neg_graph.number_of_edges())
print(mfgs)

Number of input nodes: 72094
Positive graph # nodes: 6900 # edges: 1024
Negative graph # nodes: 6900 # edges: 5120
[Block(num_src_nodes=72094, num_dst_nodes=36956, num_edges=175932), Block(num_src_nodes=36956, num_dst_nodes=6900, num_edges=53451)]


In [7]:
device = torch.device('cpu')
from graph_models import SageModel, inference, DotPredictor
model = SageModel(node_features.shape[1], 128).to(device)
predictor = DotPredictor().to(device)
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))

### Training

In [5]:
import tqdm
import sklearn.metrics

best_accuracy = 0
best_model_path = 'model.pt'
for epoch in range(1):
    with tqdm.tqdm(train_dataloader) as tq:
        for step, (input_nodes, pos_graph, neg_graph, mfgs) in enumerate(tq):
            # feature copy from CPU to GPU takes place here
            inputs = mfgs[0].srcdata['feat']

            outputs = model(mfgs, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)

            opt.zero_grad()
            loss.backward()
            opt.step()

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

            if (step + 1) % 500 == 0:


                # Note that this tutorial do not train the whole model to the end.
                break

NameError: name 'train_dataloader' is not defined

### Loading and getting embeddings

In [None]:
device = torch.device('cpu')

best_model_path = 'model.pt'
model = SageModel(node_features.shape[1], 128).to(device)
model.load_state_dict(torch.load(best_model_path))

<All keys matched successfully>

In [None]:
node_embeddings = model.get_hidden(G, node_features)


### Prediction model

In [196]:
device = torch.device('cuda')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(1)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows
train_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to NodeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      
    device=device,                         
    batch_size=1024,    
    shuffle=True,       # Whether to shuffle the nodes for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [197]:
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_classif_dataloader))
print('Number of input nodes:', len(input_nodes))
print('Positive graph # nodes:', pos_graph.number_of_nodes(), '# edges:', pos_graph.number_of_edges())
print('Negative graph # nodes:', neg_graph.number_of_nodes(), '# edges:', neg_graph.number_of_edges())
print(mfgs)

Number of input nodes: 2957
Positive graph # nodes: 2957 # edges: 1024
Negative graph # nodes: 2957 # edges: 1024
[Block(num_src_nodes=2957, num_dst_nodes=2957, num_edges=0), Block(num_src_nodes=2957, num_dst_nodes=2957, num_edges=0)]


In [209]:
class MLP(nn.Module):
    def __init__(self, n_hidden, n_input) -> None:
        super(MLP, self).__init__()
        self.n_hidden = n_hidden
        self.n_input = n_input
        self.f1 = nn.Linear(n_input, n_hidden)
        self.f2 = nn.Linear(n_hidden, n_hidden)
        self.f3 = nn.Linear(n_hidden, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.relu(self.f1(x))
        x = self.dropout(self.relu(self.f2(x)))
        output = self.sigmoid(self.f3(x))
        return output



In [214]:
device = torch.device('cuda')
batch_size = 1000
epochs = 20
mlp = MLP(n_hidden=100, n_input=256).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [215]:
def train(model, embeddings, dataloader):
    best_accuracy = 0
    best_model_path = 'classif.pt'
    for epoch in range(epochs):
        losses = []
        with tqdm(dataloader) as tq:
            for step, (input_nodes, pos_graph, neg_graph, _) in enumerate(tq):
                with torch.no_grad():
                    src, dst = pos_graph.edges()
                    src_emb = embeddings[pos_graph.nodes[src].data['_ID']]
                    dst_emb = embeddings[pos_graph.nodes[dst].data['_ID']]
                    x = torch.cat([src_emb, dst_emb], dim=1)
                    n_pos = x.shape[0]

                    src_neg, dst_neg = neg_graph.edges()
                    src_emb_neg = embeddings[neg_graph.nodes[src_neg].data['_ID']]
                    dst_emb_neg = embeddings[neg_graph.nodes[dst_neg].data['_ID']]
                    x_neg = torch.cat([src_emb_neg, dst_emb_neg], dim=1)
                    n_neg = x_neg.shape[0]
                    
                x_tot = torch.cat([x, x_neg], dim=0).to(device)
                y = model(x_tot)
                
                pos_label = torch.ones(n_pos)
                target = torch.cat([pos_label, torch.zeros(n_neg)]).to(device)

                loss = F.binary_cross_entropy_with_logits(y.squeeze(), target)
                losses.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)
        print(f'Epoch {epoch} : mean loss {np.mean(losses)}')

In [216]:
train(mlp, node_embeddings, train_classif_dataloader)

100%|██████████| 1815/1815 [00:42<00:00, 42.38it/s, loss=0.536]
  0%|          | 1/1815 [00:00<03:31,  8.57it/s, loss=0.528]

Epoch 0 : mean loss 0.5392503546289176


 35%|███▍      | 628/1815 [00:14<00:27, 43.84it/s, loss=0.529]


KeyboardInterrupt: 

## Doc2Vec

In [48]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from preprocessing import read_graph
from time import time 
from nltk.tokenize import word_tokenize
_, abstracts, _, _ = read_graph()
# ~4 mins
data = list(abstract.values())
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [None]:
max_epochs = 200
vec_size = 50
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=2,
                hs=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    t_start = time()
    
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=1)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    print('Epoch {0}. Elapsed {1} s'.format(epoch, time() - t_start))

model.save("d2v.model")
print("Model Saved")