In [1]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from preprocessing import read_graph, retrieve_subgraph
from gensim.models.doc2vec import Doc2Vec


device = 'cpu' 

## Node2Vec

In [39]:
import os.path as osp

import matplotlib.pyplot as plt
import torch
from sklearn.manifold import TSNE

from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec

# Load the Cora Planetoid dataset, stored under ../data/Cora.
dataset_name = 'Cora'
path = osp.join('..', 'data', dataset_name)
dataset = Planetoid(path, dataset_name)
data = dataset[0]  # first (index 0) graph object of the dataset

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Node2Vec embedding model built directly on the graph's edge index.
model = Node2Vec(
    data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
).to(device)

# Batches of (positive, negative) random walks, consumed by `train()`.
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
# SparseAdam is paired with the `sparse=True` embedding above.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [41]:
# Display the edges of `G` (the recorded output is a pair of tensors).
# NOTE(review): `G` is only created in the "Graph auto encoder" section further
# down, so this cell fails on a fresh top-to-bottom run.
G.edges()

(tensor([     0,      0,      1,  ..., 137582, 137584, 138384]),
 tensor([     1,      2,      0,  ..., 137584, 137582, 137738]))

In [40]:
# Display the edge index tensor of the loaded Cora graph.
data.edge_index

tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])

In [None]:


def train():
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Uses the notebook-level `model`, `loader`, `optimizer` and `device`.
    """
    model.train()

    def _step(pos_rw, neg_rw):
        # One gradient step on a batch of positive / negative random walks.
        optimizer.zero_grad()
        batch_loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    running_loss = 0
    for pos_rw, neg_rw in loader:
        running_loss += _step(pos_rw, neg_rw)
    return running_loss / len(loader)

@torch.no_grad()
def test():
    """Score the current embeddings with `model.test` and return its result.

    The embeddings of the nodes selected by `data.train_mask` (with their
    labels) are passed as the training split, those selected by
    `data.test_mask` as the evaluation split. Gradients are disabled.
    """
    model.eval()
    embeddings = model()
    train_idx, test_idx = data.train_mask, data.test_mask
    return model.test(
        embeddings[train_idx], data.y[train_idx],
        embeddings[test_idx], data.y[test_idx],
        max_iter=150,
    )

# 100 epochs: one training pass followed by one evaluation, both logged.
for epoch in range(1, 101):
    loss, acc = train(), test()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Acc: {acc:.4f}')



## Graph auto encoder

In [3]:
# Load the Doc2Vec model from "d2v.model", the file name written by the
# Doc2Vec section at the end of this notebook.
doc2vec_model= Doc2Vec.load("d2v.model")

In [33]:
# Node-feature switch. True reproduces the random-feature run (uniform noise
# with the same shape as the Doc2Vec vector); False uses the Doc2Vec vector
# itself as the node feature.
USE_RANDOM_FEATURES = True

G, abstract, _, _ = read_graph()
G = retrieve_subgraph(G, min_nb_nodes=-1)
for node in G.nodes():
    node_id = int(node)
    G.nodes[node_id]['id'] = node_id
    # Look the vector up once; it supplies the feature shape and, when the
    # flag is off, the feature values.
    feat_vec = doc2vec_model.dv.get_vector(node_id)
    if USE_RANDOM_FEATURES:
        G.nodes[node_id]['feat'] = torch.rand(size=feat_vec.shape)
    else:
        G.nodes[node_id]['feat'] = torch.tensor(feat_vec)

# Convert the networkx graph to DGL, carrying the two node attributes along.
G = dgl.from_networkx(G, node_attrs=['id','feat']) # already undirected
print(G)
# Positional node index, read later when building the test-pair features.
G.ndata['_ID'] = torch.arange(G.num_nodes())

node_features = G.ndata['feat']
num_features = node_features.shape[1]

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955
Graph(num_nodes=138499, num_edges=2183910,
      ndata_schemes={'id': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(50,), dtype=torch.float32)}
      edata_schemes={})


In [34]:
# Use the GPU when available, otherwise the CPU (same fallback as the
# Node2Vec cell) so the notebook also runs on GPU-less machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uniform negative sampler with k=5.
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
# One fan-out entry per GNN layer (the DeepGAT model indexes three MFGs).
sampler = dgl.dataloading.MultiLayerNeighborSampler([20, -1, -1])
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to EdgeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over (all of them)
    sampler,                            # The neighbor sampler
    negative_sampler=negative_sampler,  # The negative sampler
    device=device,                      # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=2*1024,  # Batch size; could be doubled
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [35]:
# Use the GPU when available, otherwise the CPU (same fallback as the
# Node2Vec cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from graph_models import SageModel, inference, DotPredictor, GATModel#, DeepGAT
# NOTE(review): `DeepGAT` is not imported above (it is commented out); it is the
# class defined in a cell further down this notebook, which therefore has to be
# executed before this one.
model = DeepGAT(node_features.shape[1], 64, 4, F.elu).to(device)
predictor = DotPredictor().to(device)
# A single optimizer over the parameters of both the encoder and the predictor.
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))

In [24]:
from dgl.nn import GATConv
class DeepGAT(nn.Module):
    """Three stacked `GATConv` layers producing one vector per node.

    The heads of the first two layers are concatenated (so each following
    layer takes `h_feats * num_heads` inputs); the heads of the third layer
    (`num_heads + 2` of them) are averaged, giving an output of size `h_feats`.

    Args:
        in_feats: size of the input node features.
        h_feats: per-head output size of every layer.
        num_heads: number of attention heads of the first two layers.
        nonlinearity: callable applied after the first and second layer.
    """

    def __init__(self, in_feats, h_feats, num_heads, nonlinearity):
        super().__init__()
        self.gat1 = GATConv(in_feats, h_feats, num_heads)
        self.gat2 = GATConv(h_feats * num_heads, h_feats, num_heads)
        self.gat3 = GATConv(h_feats * num_heads, h_feats, num_heads+2)
        self.h_feats = h_feats
        self.nonlinearity = nonlinearity
        self.num_heads = num_heads

    def forward(self, mfgs, x):
        """Mini-batch forward pass over three message-flow graphs.

        Args:
            mfgs: indexable collection of three MFGs, one per layer.
            x: input features for the source nodes of `mfgs[0]`.

        Returns:
            Tensor with one `h_feats`-sized row per destination node of
            `mfgs[2]`.
        """
        # The destination features are taken as the leading rows of the source
        # features, i.e. this relies on destination nodes coming first.
        h_dst = x[:mfgs[0].num_dst_nodes()]
        h = self.gat1(mfgs[0], (x, h_dst))
        h = self.nonlinearity(h.flatten(1))  # concatenate the heads

        h_dst = h[:mfgs[1].num_dst_nodes()]
        h = self.gat2(mfgs[1], (h, h_dst))
        h = self.nonlinearity(h.flatten(1))  # concatenate the heads

        h_dst = h[:mfgs[2].num_dst_nodes()]
        h = self.gat3(mfgs[2], (h, h_dst))
        return torch.mean(h, dim=1)  # average the heads of the last layer

    def get_hidden(self, graph, x):
        """Full-graph inference without gradients.

        Args:
            graph: the graph every layer is applied to.
            x: input features, one row per node of `graph`.

        Returns:
            Tensor with one `h_feats`-sized row per node.
        """
        with torch.no_grad():
            h = self.gat1(graph, x)
            h = self.nonlinearity(h.flatten(1))

            # The second layer consumes the first layer's output `h`; feeding
            # the raw input `x` here would skip layer 1 and does not match the
            # layer's expected input size.
            h = self.gat2(graph, h)
            h = self.nonlinearity(h.flatten(1))

            h = self.gat3(graph, h)
            h = torch.mean(h, dim=1)
        return h

### Training

In [36]:
import tqdm
import sklearn.metrics
# Use the GPU when available, otherwise the CPU (same fallback as the
# Node2Vec cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_accuracy = 0
min_loss = np.inf
best_model_path = 'last_gnn_r.pt'
epochs = 2

for epoch in range(epochs):
    losses = []
    with tqdm.tqdm(train_dataloader) as tq:
        for step, (input_nodes, pos_graph, neg_graph, mfgs) in enumerate(tq):
            # feature copy from CPU to GPU takes place here
            inputs = mfgs[0].srcdata['feat']

            # Embed the batch nodes, then score the positive and the sampled
            # negative edges with the same embeddings.
            outputs = model(mfgs, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            # Binary classification: label 1 for positive edges, 0 for negatives.
            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)
            losses.append(loss.item())
            opt.zero_grad()
            loss.backward()
            opt.step()

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

    # Checkpoint whenever the epoch's mean training loss improves.
    epoch_loss = np.mean(losses)
    if epoch_loss < min_loss:
        min_loss = epoch_loss
        torch.save(model.state_dict(), best_model_path)
    print(f'Epoch {epoch} : Train mean loss {epoch_loss}')
    


 19%|█▊        | 198/1067 [03:11<13:58,  1.04it/s, loss=0.693]


KeyboardInterrupt: 

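DeepGAT, 1 epoch, doc2vec features: loss 0.615 / 0.622
DeepGAT, 1 epoch, random features: loss 0.693 / 0.693 (≈ ln 2, the binary cross-entropy of a constant 0.5 prediction, i.e. nothing is learned from random features)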

In [9]:
#torch.save(model.state_dict(), 'model3.pt')

### Loading and getting embeddings

In [7]:
# Full-graph inference is done on the CPU.
device = torch.device('cpu')

# NOTE(review): the training cell saves a DeepGAT to 'last_gnn_r.pt', whereas
# this cell loads 'last_gnn.pt' into a GATModel - confirm the checkpoint and
# the architecture match.
best_model_path = 'last_gnn.pt'
model = GATModel(node_features.shape[1], 64, 4, F.elu).to(device)
#model = SageModel(node_features.shape[1], 128).to(device)
# map_location remaps tensors saved from a GPU run onto `device`, so the
# checkpoint also loads on a machine without CUDA.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()  # inference only from here on
node_embeddings = model.get_hidden(G, node_features)

### Prediction model

In [8]:
from utils import edge_train_val_split
eid_train, eid_val = edge_train_val_split(G)
# Use the GPU when available, otherwise the CPU (same fallback as the
# Node2Vec cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(1)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows


def _make_classif_dataloader(eids):
    """Build the edge data loader shared by the train and validation splits.

    Only the edge ids differ between the two loaders; every other setting is
    identical.
    """
    return dgl.dataloading.EdgeDataLoader(
        # The following arguments are specific to EdgeDataLoader.
        G,                                  # The graph
        eids,                               # The edges to iterate over
        sampler,                            # The neighbor sampler
        negative_sampler=negative_sampler,
        device=device,
        # The following arguments are inherited from PyTorch DataLoader.
        batch_size=1024,
        shuffle=True,       # Whether to shuffle the edges for every epoch
        drop_last=False,    # Whether to drop the last incomplete batch
        num_workers=0       # Number of sampler processes
    )


train_classif_dataloader = _make_classif_dataloader(eid_train)
val_classif_dataloader = _make_classif_dataloader(eid_val)

In [9]:
from graph_models import MLP  

# Use the GPU when available, otherwise the CPU (same fallback as the
# Node2Vec cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
# The classifier input is the concatenation of two node embeddings, so its
# width is derived from the embedding size rather than hard-coded.
mlp = MLP(n_hidden=128, n_input=2 * node_embeddings.shape[1]).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)
# The loss takes raw logits; the sigmoid is applied separately at prediction time.
criterion = nn.BCEWithLogitsLoss()

In [32]:
#embed = torch.cat([G.ndata['feat'], node_embeddings], dim=1)

In [11]:
from graph_models import train_classif
# Train the MLP edge classifier on the node embeddings, using the train and
# validation edge loaders built earlier.
train_classif(
    mlp, node_embeddings,
    train_classif_dataloader, val_classif_dataloader,
    criterion, device, optimizer,
    epochs=10, name_model='c1.pt',
)

100%|██████████| 1920/1920 [00:49<00:00, 39.14it/s, loss=0.091]
  0%|          | 1/1920 [00:00<04:56,  6.47it/s, loss=0.116]

Epoch 0 : Train mean loss 0.15259063851941998 : Val mean loss 0.11131948131684945


100%|██████████| 1920/1920 [00:42<00:00, 45.48it/s, loss=0.097]
  0%|          | 1/1920 [00:00<04:52,  6.55it/s, loss=0.095]

Epoch 1 : Train mean loss 0.1061549846509782 : Val mean loss 0.10470481227352241


 16%|█▋        | 316/1920 [00:07<00:38, 41.68it/s, loss=0.099]


KeyboardInterrupt: 

## Load test data

In [24]:
def retrieve_embeddings(G, embeddings, path='test.txt'):
    """Build the classifier input for every node pair listed in a file.

    Args:
        G: DGL graph whose `ndata['_ID']` maps a node id to a row of
            `embeddings`.
        embeddings: 2-D tensor with one embedding per row.
        path: text file with one `src,dst` pair per line (blank lines are
            skipped). Defaults to 'test.txt'.

    Returns:
        CPU tensor of shape (number of pairs, 2 * embeddings.shape[1]); row i
        is the source embedding followed by the destination embedding of the
        i-th pair.
    """
    node_pairs = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            t = line.split(',')
            node_pairs.append((int(t[0]), int(t[1])))

    width = 2 * embeddings.shape[1]
    if not node_pairs:
        return torch.zeros((0, width))

    # Gather all rows at once instead of one node-view lookup per pair.
    node_ids = G.ndata['_ID']
    pairs = torch.tensor(node_pairs, dtype=torch.long, device=node_ids.device)
    src_rows = node_ids[pairs[:, 0]].to(embeddings.device)
    dst_rows = node_ids[pairs[:, 1]].to(embeddings.device)
    x = torch.cat([embeddings[src_rows], embeddings[dst_rows]], dim=1)
    # Keep the return contract: a CPU tensor in the default dtype.
    return x.to(device='cpu', dtype=torch.get_default_dtype())

In [25]:
#from preprocessing import retrieve_embeddings
# Classifier inputs for the test pairs: one row per pair, holding the source
# and destination embeddings side by side.
X_test = retrieve_embeddings(G, node_embeddings)

In [26]:
# Switch to inference mode first, so that any train-only layers the MLP may
# contain (e.g. dropout) do not perturb the predictions.
mlp.eval()
with torch.no_grad():
    # The MLP outputs logits (it is trained with BCEWithLogitsLoss); the
    # sigmoid turns them into probabilities.
    y_tens = torch.sigmoid(mlp(X_test.to(device))).cpu().numpy()
# Keep the first (index 0) output column as the per-pair score.
y_pred = y_tens[:,0]


In [30]:
import csv
# NOTE(review): this function is defined twice in this notebook; whichever
# definition was executed last is the one in effect - consider keeping one.
def retrieve_embeddings(G, embeddings, path='test.txt'):
    """Build the classifier input for every node pair listed in a file.

    Args:
        G: DGL graph whose `ndata['_ID']` maps a node id to a row of
            `embeddings`.
        embeddings: 2-D tensor with one embedding per row.
        path: text file with one `src,dst` pair per line (blank lines are
            skipped). Defaults to 'test.txt'.

    Returns:
        CPU tensor of shape (number of pairs, 2 * embeddings.shape[1]); row i
        is the source embedding followed by the destination embedding of the
        i-th pair.
    """
    node_pairs = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            t = line.split(',')
            node_pairs.append((int(t[0]), int(t[1])))

    width = 2 * embeddings.shape[1]
    if not node_pairs:
        return torch.zeros((0, width))

    # Gather all rows at once instead of one node-view lookup per pair.
    node_ids = G.ndata['_ID']
    pairs = torch.tensor(node_pairs, dtype=torch.long, device=node_ids.device)
    src_rows = node_ids[pairs[:, 0]].to(embeddings.device)
    dst_rows = node_ids[pairs[:, 1]].to(embeddings.device)
    x = torch.cat([embeddings[src_rows], embeddings[dst_rows]], dim=1)
    # Keep the return contract: a CPU tensor in the default dtype.
    return x.to(device='cpu', dtype=torch.get_default_dtype())


In [32]:
# Pair every prediction with its row index, which serves as the submission id.
predictions = zip(range(len(y_pred)), y_pred)
# newline="" is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings emit a blank line after
# every row.
with open("submission_gat.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    csv_out.writerows(predictions)

## Doc2Vec

In [48]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from preprocessing import read_graph
from time import time 
from nltk.tokenize import word_tokenize
_, abstracts, _, _ = read_graph()
# ~4 mins
# Read from the `abstracts` unpacked just above. The previous code read
# `abstract`, a name bound only by another cell, so this cell depended on
# hidden kernel state.
# NOTE(review): this rebinds the notebook-level name `data`, which the
# Node2Vec cells also use for the Cora graph.
data = list(abstracts.values())
# One tagged document per abstract; the tag is the abstract's position in `data`.
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(i)])
    for i, text in enumerate(data)
]

In [None]:
max_epochs = 200
vec_size = 50
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=2,
                hs=1,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Train with a single call and let gensim decay the learning rate from `alpha`
# to `min_alpha` over all epochs. The previous manual loop subtracted 0.0002
# per epoch from 0.025, which makes the learning rate negative after about 125
# of the 200 epochs.
t_start = time()
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)
print('Trained {0} epochs. Elapsed {1} s'.format(max_epochs, time() - t_start))

model.save("d2v.model")
print("Model Saved")