In [1]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint
from sklearn.linear_model import LogisticRegression


## Data Cleaning

In [1]:
import torch
from torch_geometric.data import Data
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Graph auto encoder

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from preprocessing import read_graph, retrieve_subgraph


# Plain-string device name; the data-loader cell further down rebinds
# `device` to a torch.device before it is first used.
device = "cpu"

In [2]:
from gensim.models.doc2vec import Doc2Vec

# Load the Doc2Vec model from disk; "d2v.model" is the file written by the
# Doc2Vec section at the end of this notebook.
doc2vec_model = Doc2Vec.load("d2v.model")

In [3]:
# Read the graph and keep the sub-graph returned by `retrieve_subgraph`
# (called with min_nb_nodes=3).  The NetworkX graph gets its own name so that
# `G` only ever refers to the DGL graph used by every later cell.
nx_graph, abstract, _, _ = read_graph()
nx_graph = retrieve_subgraph(nx_graph, min_nb_nodes=3)

# Attach each node's Doc2Vec vector as its 'feat' attribute.  Passing `total`
# lets tqdm draw a real progress bar instead of a bare iteration counter.
for node in tqdm(nx_graph.nodes(), total=nx_graph.number_of_nodes()):
    nx_graph.nodes[int(node)]['feat'] = doc2vec_model.dv.get_vector(int(node))

G = dgl.from_networkx(nx_graph, node_attrs=['feat'])  # already undirected
print(G)
node_features = G.ndata['feat']
num_features = node_features.shape[1]

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0


26498it [00:00, 263071.02it/s]

Number of nodes in subgraph: 115041
Number of edges in subgraph: 928816


115041it [00:00, 255764.53it/s]
  return th.as_tensor(data, dtype=dtype)


Graph(num_nodes=115041, num_edges=1857632,
      ndata_schemes={'feat': Scheme(shape=(50,), dtype=torch.float32)}
      edata_schemes={})


In [4]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uniform(5): five negative edges are drawn per positive edge
# (a 1024-edge batch yields 5120 negative edges in the output below).
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
# One fan-out value per message-passing layer.
sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10])
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to EdgeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,    # Batch size
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [5]:
# Pull a single mini-batch to inspect what the edge data loader yields.
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_dataloader))
print(f'Number of input nodes: {len(input_nodes)}')
for kind, sample_graph in (('Positive', pos_graph), ('Negative', neg_graph)):
    print(f'{kind} graph # nodes: {sample_graph.number_of_nodes()} '
          f'# edges: {sample_graph.number_of_edges()}')
print(mfgs)

Number of input nodes: 72174
Positive graph # nodes: 6887 # edges: 1024
Negative graph # nodes: 6887 # edges: 5120
[Block(num_src_nodes=72174, num_dst_nodes=36925, num_edges=175776), Block(num_src_nodes=36925, num_dst_nodes=6887, num_edges=53134)]


In [6]:
from graph_models import SageModel, inference, DotPredictor

# Encoder over the node features (input width taken from the feature matrix,
# 128 passed as the second constructor argument) plus the edge scorer.
in_feats = node_features.size(1)
model = SageModel(in_feats, 128).to(device)
predictor = DotPredictor().to(device)

# A single Adam optimizer updates the parameters of both modules.
opt = torch.optim.Adam([*model.parameters(), *predictor.parameters()])

### Training

In [5]:
# Import the tqdm *function*, not the module: a bare `import tqdm` would rebind
# the name used as `tqdm(...)` by other cells of this notebook and make those
# calls fail with "'module' object is not callable".
from tqdm import tqdm
import sklearn.metrics

best_model_path = 'model.pt'
for epoch in range(1):
    with tqdm(train_dataloader) as tq:
        for step, (input_nodes, pos_graph, neg_graph, mfgs) in enumerate(tq):
            # feature copy from CPU to GPU takes place here
            inputs = mfgs[0].srcdata['feat']

            outputs = model(mfgs, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            # Binary link-prediction loss: positive edges are labelled 1,
            # sampled negative edges are labelled 0.
            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)

            opt.zero_grad()
            loss.backward()
            opt.step()

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

            if (step + 1) % 500 == 0:
                # Training is deliberately cut short after 500 mini-batches;
                # the model is not trained to convergence here.
                break

# Persist the weights so the "Loading and getting embeddings" cell, which
# reads `best_model_path`, has a checkpoint to restore.
torch.save(model.state_dict(), best_model_path)

NameError: name 'train_dataloader' is not defined

### Loading and getting embeddings

In [7]:
device = torch.device('cpu')

best_model_path = 'model.pt'
model = SageModel(node_features.shape[1], 128).to(device)
# map_location lets a checkpoint saved from a GPU run load on a CPU-only machine.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

# The embeddings are fixed inputs for the downstream MLP, so compute them
# without autograd.  Otherwise every MLP `loss.backward()` would also traverse
# this graph and fail on the second batch ("backward through the graph a
# second time").
# NOTE(review): only the first layer (`conv1`) is applied here, not the full
# model — confirm this is intended.
with torch.no_grad():
    embeddings = model.conv1(G, node_features.to(device))

### Prediction model

In [8]:
class MLP(nn.Module):
    """Three-layer perceptron producing one sigmoid-activated value per input row.

    Args:
        n_hidden: width of the two hidden layers.
        n_input: size of each input vector.
    """

    def __init__(self, n_hidden, n_input) -> None:
        super().__init__()
        self.n_hidden, self.n_input = n_hidden, n_input

        # Linear stack: n_input -> n_hidden -> n_hidden -> 1
        self.f1 = nn.Linear(n_input, n_hidden)
        self.f2 = nn.Linear(n_hidden, n_hidden)
        self.f3 = nn.Linear(n_hidden, 1)

        # Parameter-free layers
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Return sigmoid(f3(dropout(relu(f2(relu(f1(x))))))), shape (..., 1)."""
        hidden = self.relu(self.f1(x))
        hidden = self.relu(self.f2(hidden))
        hidden = self.dropout(hidden)  # only the second hidden layer is dropped out
        logits = self.f3(hidden)
        return self.sigmoid(logits)



In [9]:
SPLIT_SEED = 0       # fixed seed so the train/test split is reproducible
TRAIN_FRACTION = 0.6

nodes = G.nodes()
edges = G.edges()

# Convert both endpoint tensors to Python lists in one call each instead of
# calling `.item()` once per endpoint.
src_ids, dst_ids = edges[0].tolist(), edges[1].tolist()
ledges = list(zip(src_ids, dst_ids))

np.random.default_rng(SPLIT_SEED).shuffle(ledges)

# NOTE(review): the DGL graph holds 1857632 edges for 928816 edges in the
# NetworkX sub-graph, i.e. each undirected edge appears in both directions.
# With this split (u, v) can land in train while (v, u) lands in test —
# confirm whether that leakage is acceptable.
split_at = round(TRAIN_FRACTION * len(ledges))
train_edges = ledges[:split_at]
test_edges = ledges[split_at:]

100%|██████████| 1857632/1857632 [00:08<00:00, 209305.30it/s]


In [26]:
from utils import random_edge

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 1000
epochs = 500
# The training loop feeds the MLP the concatenation of two node embeddings,
# hence n_input=256 (assumes each embedding is 128-dimensional, the size
# passed to SageModel — TODO confirm).
mlp = MLP(n_hidden=200, n_input=256).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)
# MLP.forward already applies a sigmoid, so plain BCELoss is the matching loss.
criterion = nn.BCELoss()

In [24]:
# The node embeddings are fixed inputs of the MLP: detach them so that
# `loss.backward()` never reaches into the graph that produced them.
node_embeddings = embeddings.detach()
n_nodes = len(nodes)

for epoch in range(epochs):
    losses = []
    with tqdm(range(len(train_edges)//batch_size)) as tq:
        for n_batch in tq:
            batch = train_edges[n_batch*batch_size:(n_batch+1)*batch_size]

            # Positive pairs are the real edges of the batch; one random pair
            # from `random_edge` is drawn per positive as a negative example.
            pos_pairs = torch.tensor(batch, dtype=torch.long)
            neg_pairs = torch.tensor(
                [tuple(int(v) for v in random_edge(n_nodes)) for _ in batch],
                dtype=torch.long,
            )
            pairs = torch.cat([pos_pairs, neg_pairs]).to(node_embeddings.device)

            # Gather all endpoint embeddings with two indexing operations and
            # concatenate them feature-wise: row k is [emb[src_k], emb[dst_k]].
            x = torch.cat(
                [node_embeddings[pairs[:, 0]], node_embeddings[pairs[:, 1]]],
                dim=1,
            )
            target = torch.cat([torch.ones(len(pos_pairs)), torch.zeros(len(neg_pairs))])

            # The MLP lives on `device`; inputs and targets must be moved there too.
            x = x.to(device)
            target = target.to(device)

            y = mlp(x)
            loss = criterion(y.squeeze(1), target)
            losses.append(loss.item())

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    print(f'Epoch : {epoch} Mean Loss : {np.mean(losses)}')
        

  0%|          | 0/1114 [00:00<?, ?it/s]

## Doc2Vec

In [48]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from preprocessing import read_graph
from time import time
from nltk.tokenize import word_tokenize

_, abstracts, _, _ = read_graph()
# ~4 mins
# Read from `abstracts`, the name bound on the line above, so this cell works
# on a fresh kernel instead of depending on a variable from another cell.
data = list(abstracts.values())
# One TaggedDocument per abstract: lower-cased word tokens, tagged with the
# abstract's position in `data` (as a string).
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(i)])
    for i, text in enumerate(data)
]

In [None]:
max_epochs = 200
vec_size = 50
alpha = 0.025

# Let gensim drive the epochs and the learning-rate schedule: with `epochs`
# set, a single `train()` call decays the rate linearly from `alpha` down to
# `min_alpha`.  Subtracting 0.0002 by hand on each of 200 epochs would push the
# rate below zero after epoch 125 (0.025 / 0.0002 = 125).
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=2,
                hs=1,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

t_start = time()
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)
print('Trained {0} epochs. Elapsed {1} s'.format(model.epochs, time() - t_start))

model.save("d2v.model")
print("Model Saved")