In [1]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
from graph_models import SageModel, inference, DotPredictor, GATModel, DeepGAT
from gensim.models import KeyedVectors

from preprocessing import read_graph, retrieve_subgraph
from gensim.models.doc2vec import Doc2Vec


# Default compute device for the notebook; later cells rebind `device` to
# torch.device('cuda') or torch.device('cpu') before they use it.
device = 'cpu' 

Using backend: pytorch


## Node2Vec

In [None]:
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec


def node2vec_embedding(graph, name):
    """Train node2vec embeddings on ``graph`` and return a lookup function.

    Biased random walks are started from every node of ``graph`` and used as
    the corpus of a skip-gram Word2Vec model. The hyper-parameters
    (``num_walks``, ``walk_length``, ``p``, ``q``, ``dimensions``,
    ``window_size``, ``workers``, ``num_iter``) are read from notebook-level
    variables, so the configuration cell must have been run first.

    Args:
        graph: graph object accepted by ``BiasedRandomWalk`` (presumably a
            StellarGraph -- confirm against the caller).
        name: label used only in the progress message.

    Returns:
        A function mapping a node key ``u`` to its vector ``model.wv[u]``.
    """
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")

    # `vector_size` is the gensim 4 spelling, so the epoch count must use the
    # gensim 4 spelling too: `iter` was renamed to `epochs` and is rejected
    # by versions that accept `vector_size`.
    model = Word2Vec(
        walks,
        vector_size=dimensions,
        window=window_size,
        min_count=0,  # keep every node, even rarely visited ones
        sg=1,         # skip-gram
        workers=workers,
        epochs=num_iter,
    )

    def get_embedding(u):
        """Return the trained vector for node key ``u``."""
        return model.wv[u]

    return get_embedding

In [7]:
import multiprocessing

In [27]:
# node2vec hyper-parameters, read as notebook-level variables by the cells below.
p = 1.0            # `p` argument of the biased random walk
q = 1.0            # `q` argument of the biased random walk
dimensions = 50    # embedding size (Word2Vec vector_size)
num_walks = 10     # walks started from each node
walk_length = 80   # maximum length of each walk
window_size = 10   # Word2Vec context window
num_iter = 1       # Word2Vec training epochs
# Leave four cores free, but never request fewer than one worker: on machines
# with four cores or fewer the plain subtraction would yield 0 or a negative
# count.
workers = max(1, multiprocessing.cpu_count() - 4)

In [18]:
# Load the graph and the abstracts via the project helper, then pass the graph
# through retrieve_subgraph. With min_nb_nodes=-1 the printed statistics show
# the subgraph keeping every node and edge of the full graph.
G, abstract, _, _ = read_graph()
G = retrieve_subgraph(G, min_nb_nodes=-1)

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


In [22]:
# Wrap the networkx graph `G` in a StellarGraph object, bound to `graph`.
from stellargraph import StellarGraph
graph = StellarGraph.from_networkx(G)

In [34]:
# Generate the node2vec corpus in this cell. No other cell defines `walks` at
# notebook level (node2vec_embedding only builds it as a local), so without
# this the cell only works on leftover kernel state.
walks = BiasedRandomWalk(graph).run(
    graph.nodes(), n=num_walks, length=walk_length, p=p, q=q
)

# Skip-gram Word2Vec over the walks; every hyper-parameter comes from the
# configuration cell, including the epoch count (num_iter).
model = Word2Vec(
    walks,
    vector_size=dimensions,
    window=window_size,
    min_count=0,
    sg=1,
    workers=workers,
    epochs=num_iter,
)

word_vectors = model.wv

# Persist only the vectors; the "Graph auto encoder" section reloads this file.
word_vectors.save("word2vec.wordvectors")

## Graph auto encoder

In [2]:
# Reload the node2vec vectors written to "word2vec.wordvectors" by the
# Node2Vec section; mmap='r' asks gensim to memory-map the arrays read-only.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [12]:
# Load the Doc2Vec model from "d2v.model", the file written by the Doc2Vec
# section at the end of this notebook.
doc2vec_model= Doc2Vec.load("d2v.model")

In [13]:
# Build the DGL graph whose node features are the Doc2Vec document vectors.
graph, abstract, _, _ = read_graph()
graph = retrieve_subgraph(graph, min_nb_nodes=-1)
for node in graph.nodes():
    node_id = int(node)
    node_data = graph.nodes[node_id]  # attribute dict of this node, mutated in place
    node_data['id'] = node_id
    # The vector is looked up with the integer node id.
    node_data['feat'] = doc2vec_model.dv.get_vector(node_id)

# The networkx graph is already undirected; 'id' and 'feat' are carried over
# as node data.
graph = dgl.from_networkx(graph, node_attrs=['id', 'feat'])
graph.ndata['_ID'] = torch.arange(graph.num_nodes())

node_features = graph.ndata['feat']

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


In [3]:
# Build the DGL graph whose node features are the node2vec vectors in `wv`.
G, abstract, _, _ = read_graph()
G = retrieve_subgraph(G, min_nb_nodes=-1)
for node in G.nodes():
    node_id = int(node)
    node_data = G.nodes[node_id]  # attribute dict of this node, mutated in place
    node_data['id'] = node_id
    node_data['feat'] = wv[node_id]

# The networkx graph is already undirected; 'id' and 'feat' are carried over
# as node data.
G = dgl.from_networkx(G, node_attrs=['id', 'feat'])
print(G)
G.ndata['_ID'] = torch.arange(G.num_nodes())

node_features = G.ndata['feat']
num_features = node_features.shape[1]

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


  return th.as_tensor(data, dtype=dtype)


Graph(num_nodes=138499, num_edges=2183910,
      ndata_schemes={'id': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(50,), dtype=torch.float32)}
      edata_schemes={})


In [4]:
# Drop the reference to the word vectors: their values were already attached
# to the graph nodes as 'feat' in the previous cell.
del wv

In [12]:
device = torch.device('cuda')

# Edge mini-batch loader used to train the link-prediction encoder on G.
all_edge_ids = torch.arange(G.number_of_edges())                  # iterate over every edge of G
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)    # uniform negative sampler, k=5
sampler = dgl.dataloading.MultiLayerNeighborSampler([-1, -1])     # two layers of neighbor sampling
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # Arguments specific to EdgeDataLoader.
    G,
    all_edge_ids,
    sampler,
    negative_sampler=negative_sampler,
    device=device,          # device on which the sampled MFGs are placed
    # Arguments inherited from the PyTorch DataLoader.
    batch_size=2 * 1024,    # could be doubled
    shuffle=True,           # reshuffle the edges every epoch
    drop_last=False,        # keep the last incomplete batch
    num_workers=0,          # number of sampler processes
)

In [7]:
device = torch.device('cuda')

# Encoder plus dot-product edge scorer, optimised jointly with Adam.
# Alternative encoder: GATModel(node_features.shape[1], 64, 4, F.elu).to(device)
in_feats = node_features.shape[1]
model = SageModel(in_feats, 128).to(device)
predictor = DotPredictor().to(device)
trainable_params = [*model.parameters(), *predictor.parameters()]
opt = torch.optim.Adam(trainable_params)

### Training

In [15]:
# Train the encoder for link prediction: positive edges are labelled 1,
# sampled negative edges 0, scored with binary cross-entropy on logits. The
# encoder weights are checkpointed whenever the epoch's mean loss improves.
device = torch.device('cuda')
min_loss = np.inf
best_model_path = 'sage_model_2.pt'
epochs = 5

for epoch in range(epochs):
    losses = []
    with tqdm(train_dataloader) as tq:
        for input_nodes, pos_graph, neg_graph, mfgs in tq:
            # Input features of the source nodes of the first message-flow graph.
            inputs = mfgs[0].srcdata['feat']

            outputs = model(mfgs, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)
            batch_loss = loss.item()
            losses.append(batch_loss)
            opt.zero_grad()
            loss.backward()
            opt.step()

            tq.set_postfix({'loss': '%.03f' % batch_loss}, refresh=False)
    # Compute the epoch mean once and reuse it for the checkpoint test and the log.
    epoch_loss = np.mean(losses)
    if epoch_loss < min_loss:
        min_loss = epoch_loss
        # Only the encoder's state dict is saved here.
        torch.save(model.state_dict(), best_model_path)
    print(f'Epoch {epoch} : Train mean loss {epoch_loss}')
    


100%|██████████| 1067/1067 [03:15<00:00,  5.47it/s, loss=0.420]


Epoch 0 : Train mean loss 0.5861868947995748


100%|██████████| 1067/1067 [03:17<00:00,  5.41it/s, loss=0.406]


Epoch 1 : Train mean loss 0.40281731105342355


100%|██████████| 1067/1067 [03:16<00:00,  5.42it/s, loss=0.401]


Epoch 2 : Train mean loss 0.3951230390728805


100%|██████████| 1067/1067 [03:15<00:00,  5.44it/s, loss=0.375]


Epoch 3 : Train mean loss 0.3894567143838877


100%|██████████| 1067/1067 [03:16<00:00,  5.43it/s, loss=0.391]

Epoch 4 : Train mean loss 0.38569909364906785





0.615 / 0.622 for DeepGat 1 epoch doc2vec \\
0.693 / 0.693 : for DeepGat 1 epoch random features 

| Model names   |      Model type      |  Hidden features | n_head | embeddings | Train loss |
|----------|:-------------:|------:|--------|----------|---------|
| gat_model_1 |  GATModel | 64 | 4 | node2vec | 0.3995 (2 epochs) 0.3897 (5 epochs) |
| gat_model_2 |    GATModel |   64 | 4 | doc2vec | 0.3991 (5 epochs) |
| sage_model_1 | GraphSage |    64 | | node2vec| 0.4003 (5 epochs)|
| sage_model_2 | GraphSage |    128 | | node2vec| 0.3857 (5 epochs)|

Gat model 1 :


In [9]:
#torch.save(model.state_dict(), 'model3.pt')

### Loading and getting embeddings

In [5]:
device = torch.device('cpu')

# Rebuild the encoder with the same constructor arguments as the checkpoint
# and compute the hidden embedding of every node of G.
# Alternative encoder: SageModel(node_features.shape[1], 128).to(device)
best_model_path = 'gat_model_1.pt'
model = GATModel(node_features.shape[1], 64, 4, F.elu).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run (presumably the case here -- the training cells use CUDA)
# still loads on a CPU-only machine.
model.load_state_dict(torch.load(best_model_path, map_location=device))
# Inference only: switch to eval mode and skip autograd bookkeeping so the
# embeddings do not keep a computation graph alive.
model.eval()
with torch.no_grad():
    node_embeddings = model.get_hidden(G, node_features)

### Prediction model

In [19]:
# Build a DGL graph that holds each edge returned by the networkx graph once
# (source -> destination). Only the edge lists are used, so no node attributes
# need to be set on the networkx graph beforehand.
G_dir, abstract, _, _ = read_graph()
G_dir = retrieve_subgraph(G_dir, min_nb_nodes=-1)

edge_list = list(G_dir.edges())
src = [edge[0] for edge in edge_list]
dst = [edge[1] for edge in edge_list]
G_dir = dgl.graph((src, dst))
del src, dst, edge_list
# Reuse the node index tensor of G; this requires both graphs to have the
# same number of nodes.
G_dir.ndata['_ID'] = G.ndata['_ID']

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


100%|██████████| 1091955/1091955 [00:05<00:00, 199092.76it/s]


In [33]:
from utils import edge_train_val_split
# Split the edge ids of G_dir into training and validation sets (val_size=0.3).
eid_train, eid_val = edge_train_val_split(G_dir, val_size=0.3)
device = torch.device('cuda')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(3)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows

# Settings shared by the training and validation EdgeDataLoaders.
classif_loader_kwargs = dict(
    negative_sampler=negative_sampler,
    device=device,
    batch_size=1024,
    shuffle=True,       # reshuffle the edges every epoch
    drop_last=False,    # keep the last incomplete batch
    num_workers=0,      # number of sampler processes
)
train_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir,        # the graph
    eid_train,    # the edges to iterate over
    sampler,      # the neighbor sampler
    **classif_loader_kwargs,
)
val_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir,        # the graph
    eid_val,      # the edges to iterate over
    sampler,      # the neighbor sampler
    **classif_loader_kwargs,
)

In [34]:
from graph_models import MLP  

# Edge classifier trained on top of the fixed node representations.
device = torch.device('cuda')
# NOTE: this `epochs` is not forwarded to train_classif below, which passes
# its own literal epochs=10.
epochs = 10
# 114 matches embed.shape[1] shown further down; the factor 2 presumably
# accounts for the two endpoints of an edge -- confirm in graph_models.MLP.
mlp = MLP(n_hidden=128, n_input=2*114).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)
criterion = nn.BCEWithLogitsLoss()

In [35]:
# Per-node representation: the input node features concatenated with the
# encoder's hidden embeddings along the feature axis (dim=1).
embed = torch.cat([node_features, node_embeddings], dim=1)

In [36]:
# Sanity check: the second dimension must equal the 114 used in the MLP's n_input=2*114.
embed.shape

torch.Size([138499, 114])

In [37]:
from graph_models import train_classif
# Train the MLP edge classifier with the project helper. The captured output
# reports train and validation mean loss per epoch; `name_model` is presumably
# the checkpoint file name (the same path is loaded in the test section).
train_classif(
    mlp, 
    embed, 
    train_classif_dataloader, 
    val_classif_dataloader, 
    criterion, 
    device,
    optimizer, 
    epochs=10, 
    name_model='clf_gat_model_1_plus_doc.pt'
    )

100%|██████████| 747/747 [00:18<00:00, 39.88it/s, loss=0.123]


Epoch 0 : Train mean loss 0.25260702123643564 : Val mean loss 0.12270529535599053


100%|██████████| 747/747 [00:23<00:00, 31.99it/s, loss=0.093]


Epoch 1 : Train mean loss 0.10443901735416537 : Val mean loss 0.09520953253377229


100%|██████████| 747/747 [00:22<00:00, 33.22it/s, loss=0.082]


Epoch 2 : Train mean loss 0.08971355383694571 : Val mean loss 0.08624967820942402


100%|██████████| 747/747 [00:23<00:00, 31.64it/s, loss=0.091]


Epoch 3 : Train mean loss 0.08344800936848604 : Val mean loss 0.08270398410968482


100%|██████████| 747/747 [00:24<00:00, 30.49it/s, loss=0.076]


Epoch 4 : Train mean loss 0.08010178934059628 : Val mean loss 0.08070748989703133


100%|██████████| 747/747 [00:22<00:00, 32.54it/s, loss=0.085]


Epoch 5 : Train mean loss 0.07768091989289645 : Val mean loss 0.07820165122393519


100%|██████████| 747/747 [00:24<00:00, 30.73it/s, loss=0.068]


Epoch 6 : Train mean loss 0.07591449189058429 : Val mean loss 0.07677419348619878


 29%|██▉       | 217/747 [00:10<00:26, 20.13it/s, loss=0.069]


KeyboardInterrupt: 

| Model names   |      Model type      | Parameters | n layer | embeddings | Score |
|----------|:-------------:|------:|--------|----------|---------|
| gat_model_1 |  GATModel | 64 | 4 | node2vec | 0.25 (10 epochs) |
| gat_model_2 |    GATModel |   64 | 4 | doc2vec | 0.3991 (5 epochs) |
| clf_sage_model_2 | MLP |    128h | 3 layer | gnn_only| 0.26 on test set (10 epochs)|
| sage_model_2 | GraphSage |    128 | | node2vec| 0.3857 (5 epochs)|

## Load test data

In [26]:
from graph_models import MLP
from preprocessing import retrieve_embeddings
import csv

In [29]:
# Rebuild the classifier with the training-time architecture and restore its
# weights. map_location remaps the stored tensors onto the current `device`,
# so the checkpoint loads whether or not it was saved from a GPU run.
mlp = MLP(n_hidden=128, n_input=2*114).to(device)
mlp.load_state_dict(torch.load('clf_gat_model_1_plus_doc.pt', map_location=device))

<All keys matched successfully>

In [31]:
# Build the test-pair inputs with the project helper and predict link
# probabilities.
X_test = retrieve_embeddings(G, embed)
# Put the classifier in eval mode so that any train-only layers (dropout,
# batch-norm -- if the MLP has them) behave deterministically at prediction.
mlp.eval()
with torch.no_grad():
    y_tens = torch.sigmoid(mlp(X_test.to(device))).cpu().numpy()
# Keep the first (only used) output column as the predicted probability.
y_pred = y_tens[:, 0]


In [32]:

# Write the submission file: one (row index, predicted probability) pair per line.
predictions = enumerate(y_pred)
# newline="" is required by the csv module; without it the writer's own line
# endings are translated again and blank lines appear on some platforms.
with open("submission_model_gat_1_with_doc.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id', 'predicted'])
    csv_out.writerows(predictions)

## Doc2Vec

In [48]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from preprocessing import read_graph
from time import time 
from nltk.tokenize import word_tokenize
# Build the Doc2Vec training corpus: one TaggedDocument per abstract, tagged
# with its position in the abstracts mapping (as a string).
_, abstracts, _, _ = read_graph()
# Use the mapping loaded just above. The previous code read `abstract`, a
# variable bound only by other cells, so this cell depended on stale kernel
# state.
data = list(abstracts.values())
# ~4 mins
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(i)])
    for i, text in enumerate(data)
]

In [None]:
max_epochs = 200
vec_size = 50
alpha = 0.025

# Let gensim handle the learning-rate schedule: one train() call decays alpha
# linearly from `alpha` to `min_alpha` over all epochs. The previous manual
# loop subtracted 0.0002 per epoch for 200 epochs, which pushes the learning
# rate below zero after about 125 epochs (0.025 - 200 * 0.0002 = -0.015).
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=2,
                hs=1,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

t_start = time()
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)
print('Trained {0} epochs. Elapsed {1} s'.format(model.epochs, time() - t_start))

model.save("d2v.model")
print("Model Saved")
print("Model Saved")