In [1]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint

## Data Cleaning

## Graph auto encoder

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from preprocessing import read_graph, retrieve_subgraph


# Initial default device; later cells rebind `device` before moving models or data.
device = 'cpu' 

Using backend: pytorch


In [2]:
from gensim.models.doc2vec import Doc2Vec
# Load the Doc2Vec model from "d2v.model"; its vectors are used as node features below.
doc2vec_model= Doc2Vec.load("d2v.model")

In [3]:
# Read the graph, then attach to every node its original id and its Doc2Vec
# vector so that both are carried over into the DGL graph as node data.
G, abstract, _, _ = read_graph()
G = retrieve_subgraph(G, min_nb_nodes=-1)
for node in G.nodes():
    node_id = int(node)
    G.nodes[node_id]['id'] = node_id
    # NOTE(review): the vector is looked up with an int key, while the Doc2Vec
    # section of this notebook tags documents with str(i) -- confirm that the
    # int key resolves to the abstract belonging to this node.
    G.nodes[node_id]['feat'] = doc2vec_model.dv.get_vector(node_id)

G = dgl.from_networkx(G, node_attrs=['id','feat']) # already undirected
print(G)
# '_ID' mirrors the DGL node index; later cells read it to index the embedding matrix.
G.ndata['_ID'] = torch.arange(G.num_nodes())
node_features = G.ndata['feat']
num_features = node_features.shape[1]

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


  return th.as_tensor(data, dtype=dtype)


Graph(num_nodes=138499, num_edges=2183910,
      ndata_schemes={'id': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(50,), dtype=torch.float32)}
      edata_schemes={})


In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
sampler = dgl.dataloading.MultiLayerNeighborSampler([-1, -1])
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to EdgeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=2*1024,    # Batch size (could be doubled)
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [5]:
from graph_models import GATModel

In [7]:
from graph_models import SageModel, inference, DotPredictor

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# GAT encoder (hidden size 64, 4 heads, ELU) plus the edge scorer.
model = GATModel(node_features.shape[1], 64, 4, F.elu).to(device)
predictor = DotPredictor().to(device)
# A single optimizer updates the parameters of both modules.
opt = torch.optim.Adam([*model.parameters(), *predictor.parameters()])

### Training

In [8]:
import tqdm
import sklearn.metrics
# Link-prediction training: every batch provides a graph of positive edges, a
# graph of sampled negative edges and the message-flow graphs (MFGs) needed to
# compute the representations of the nodes involved.
model.train()
predictor.train()
for epoch in range(2):
    with tqdm.tqdm(train_dataloader) as tq:
        for input_nodes, pos_graph, neg_graph, mfgs in tq:
            # feature copy from CPU to GPU takes place here
            inputs = mfgs[0].srcdata['feat']

            outputs = model(mfgs, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            # Binary classification: label 1 for real edges, 0 for sampled ones.
            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)

            opt.zero_grad()
            loss.backward()
            opt.step()

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)


100%|██████████| 1067/1067 [10:05<00:00,  1.76it/s, loss=0.614]
100%|██████████| 1067/1067 [10:00<00:00,  1.78it/s, loss=0.603]
  0%|          | 4/1067 [00:02<12:44,  1.39it/s, loss=0.608]


KeyboardInterrupt: 

In [9]:
# Persist the trained GAT weights; a later cell reloads them from 'model3.pt'.
torch.save(model.state_dict(), 'model3.pt')

In [11]:
from dgl.nn import GATConv

class GATModel(nn.Module):
    """Two stacked GATConv layers producing one `h_feats`-sized vector per node.

    The first layer's attention heads are concatenated and passed through
    `nonlinearity`; the second layer's heads are averaged.

    Args:
        in_feats: size of the input node features.
        h_feats: output size of each attention head.
        num_heads: number of attention heads in both layers.
        nonlinearity: callable applied between the two layers.
    """

    def __init__(self, in_feats, h_feats, num_heads, nonlinearity):
        super().__init__()
        self.h_feats = h_feats
        self.num_heads = num_heads
        self.nonlinearity = nonlinearity
        self.gat1 = GATConv(in_feats, h_feats, num_heads)
        # The second layer consumes the concatenation of the first layer's heads.
        self.gat2 = GATConv(h_feats * num_heads, h_feats, num_heads)

    def _concat_heads(self, h):
        """Flatten the trailing (head, feature) axes into one, then activate."""
        return self.nonlinearity(h.flatten(start_dim=1))

    def forward(self, mfgs, x):
        """Mini-batch forward pass over two message-flow graphs.

        Args:
            mfgs: sequence whose first two items are the blocks for layer 1 and
                layer 2; each must expose `num_dst_nodes()`.
            x: features of the source nodes of `mfgs[0]`.

        Returns:
            Head-averaged output of the second layer.
        """
        block_in, block_out = mfgs[0], mfgs[1]
        # The destination features are the leading rows of the source features.
        hidden = self.gat1(block_in, (x, x[:block_in.num_dst_nodes()]))
        hidden = self._concat_heads(hidden)
        hidden = self.gat2(block_out, (hidden, hidden[:block_out.num_dst_nodes()]))
        return hidden.mean(dim=1)

    @torch.no_grad()
    def get_hidden(self, graph, x):
        """Run both layers on a single graph with gradient tracking disabled.

        Args:
            graph: graph passed to both GATConv layers.
            x: node features for `graph`.

        Returns:
            Head-averaged output of the second layer.
        """
        hidden = self._concat_heads(self.gat1(graph, x))
        return self.gat2(graph, hidden).mean(dim=1)

### Loading and getting embeddings

In [12]:
# Full-graph embedding extraction runs on the CPU.
device = torch.device('cpu')

best_model_path = 'model3.pt'
model = GATModel(node_features.shape[1], 64, 4, F.elu).to(device)
# map_location remaps tensors saved from a CUDA model onto `device`, so the
# checkpoint also loads on a machine without a GPU.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()  # inference only
node_embeddings = model.get_hidden(G, node_features)

### Prediction model

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(1)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows
train_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to EdgeDataLoader.
    G,                                  # The graph
    torch.arange(G.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the sampled graphs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [21]:
from graph_models import MLP  

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `epochs` is not passed to the train(...) call in this notebook,
# which therefore runs with train's default number of epochs.
epochs = 200
# The classifier input is a source embedding concatenated with a destination
# embedding, hence n_input = 2 * 64.
mlp = MLP(n_hidden=128, n_input=2*64).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)
criterion = nn.BCEWithLogitsLoss()

In [22]:
def _edge_pair_features(graph, embeddings):
    """Return one row per edge of `graph`: [source embedding | destination embedding].

    The rows of `embeddings` are selected through the '_ID' node data of `graph`.
    """
    src, dst = graph.edges()
    src_emb = embeddings[graph.nodes[src].data['_ID']]
    dst_emb = embeddings[graph.nodes[dst].data['_ID']]
    return torch.cat([src_emb, dst_emb], dim=1)


def train(model, embeddings, dataloader, criterion, epochs=10):
    """Train an edge classifier on fixed node embeddings.

    Relies on the module-level `optimizer` and `device` defined in the notebook.

    Args:
        model: classifier called on the concatenated pair embeddings.
        embeddings: tensor indexed by the '_ID' node data of the batch graphs.
        dataloader: yields (input_nodes, pos_graph, neg_graph, blocks) tuples.
        criterion: loss called as criterion(logits, target).
        epochs: number of passes over `dataloader`.

    Side effects:
        Writes model.state_dict() to 'classif.pt' each time the mean epoch loss
        reaches a new minimum, and prints the mean loss of every epoch.
    """
    best_model_path = 'classif.pt'
    min_loss = np.inf
    for epoch in range(epochs):
        losses = []
        with tqdm.tqdm(dataloader) as tq:
            for _, pos_graph, neg_graph, _ in tq:
                # The embeddings are fixed inputs: no gradient flows through them.
                with torch.no_grad():
                    x_pos = _edge_pair_features(pos_graph, embeddings)
                    x_neg = _edge_pair_features(neg_graph, embeddings)

                x_tot = torch.cat([x_pos, x_neg], dim=0).to(device)
                y = model(x_tot)

                # Label 1 for real edges, 0 for sampled negatives.
                target = torch.cat(
                    [torch.ones(x_pos.shape[0]), torch.zeros(x_neg.shape[0])]
                ).to(device)

                # Drop only the trailing axis so the batch axis is always kept.
                loss = criterion(y.squeeze(-1), target)
                losses.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

        mean_loss = np.mean(losses)
        if mean_loss < min_loss:
            # Remember the best loss so only improving epochs overwrite the checkpoint.
            min_loss = mean_loss
            torch.save(model.state_dict(), best_model_path)
        print(f'Epoch {epoch} : mean loss {mean_loss}')

In [23]:
# `epochs` is not passed, so train() runs with its default of 10 epochs.
train(mlp, node_embeddings, train_classif_dataloader, criterion)

100%|██████████| 2133/2133 [00:47<00:00, 45.15it/s, loss=0.108]
  0%|          | 0/2133 [00:00<?, ?it/s]

Epoch 0 : mean loss 0.15045701152441146


100%|██████████| 2133/2133 [00:52<00:00, 40.26it/s, loss=0.095]
  0%|          | 0/2133 [00:00<?, ?it/s]

Epoch 1 : mean loss 0.1071010065145801


100%|██████████| 2133/2133 [00:54<00:00, 39.18it/s, loss=0.104]
  0%|          | 1/2133 [00:00<05:33,  6.39it/s, loss=0.094]

Epoch 2 : mean loss 0.1028701452947926


100%|██████████| 2133/2133 [00:50<00:00, 42.40it/s, loss=0.115]
  0%|          | 0/2133 [00:00<?, ?it/s]

Epoch 3 : mean loss 0.10074050194617454


100%|██████████| 2133/2133 [00:50<00:00, 42.56it/s, loss=0.091]
  0%|          | 1/2133 [00:00<06:20,  5.60it/s, loss=0.104]

Epoch 4 : mean loss 0.09960134188040734


100%|██████████| 2133/2133 [00:48<00:00, 43.59it/s, loss=0.110]
  0%|          | 1/2133 [00:00<06:03,  5.86it/s, loss=0.086]

Epoch 5 : mean loss 0.09864201795255641


100%|██████████| 2133/2133 [00:53<00:00, 39.52it/s, loss=0.108]
  0%|          | 0/2133 [00:00<?, ?it/s]

Epoch 6 : mean loss 0.09764063794583264


100%|██████████| 2133/2133 [00:53<00:00, 40.16it/s, loss=0.090]
  0%|          | 1/2133 [00:00<06:18,  5.63it/s, loss=0.101]

Epoch 7 : mean loss 0.0972103982730701


 11%|█▏        | 241/2133 [00:06<00:48, 39.24it/s, loss=0.104]


KeyboardInterrupt: 

## Load test data

In [24]:
def retrieve_embeddings(G, embeddings, path='test.txt'):
    """Build the classifier input matrix for the node pairs listed in a file.

    Each non-blank line of `path` must start with two comma-separated integer
    node ids. Row i of the result is the embedding of the first node of pair i
    followed by the embedding of the second node.

    NOTE(review): this notebook defines `retrieve_embeddings` in two cells;
    keep a single definition.

    Args:
        G: graph whose `ndata['_ID']` maps a node id to a row of `embeddings`.
        embeddings: 2-D tensor with one row per node.
        path: file containing the node pairs (defaults to 'test.txt').

    Returns:
        CPU tensor of shape (number of pairs, 2 * embeddings.shape[1]) in the
        default floating-point dtype.
    """
    node_pairs = []
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            t = line.split(',')
            node_pairs.append((int(t[0]), int(t[1])))

    # reshape(-1, 2) keeps a (0, 2) shape when the file holds no pair.
    pairs = torch.tensor(node_pairs, dtype=torch.long).reshape(-1, 2)
    node_ids = G.ndata['_ID']
    # One indexing operation per column instead of a Python loop over the pairs.
    src_emb = embeddings[node_ids[pairs[:, 0]]]
    dst_emb = embeddings[node_ids[pairs[:, 1]]]
    x = torch.cat([src_emb, dst_emb], dim=1)
    return x.to(device='cpu', dtype=torch.get_default_dtype())

In [25]:
# retrieve_embeddings is defined in this notebook; the import below is kept disabled.
#from preprocessing import retrieve_embeddings
# One row per test node pair: [source embedding | destination embedding].
X_test = retrieve_embeddings(G, node_embeddings)

In [26]:
# Switch the classifier to inference mode so that train-only layers (dropout,
# batch norm), if the MLP has any, do not perturb the predictions.
mlp.eval()
with torch.no_grad():
    y_tens = torch.sigmoid(mlp(X_test.to(device))).cpu().numpy()
# Keep the first output column as the predicted probability.
y_pred = y_tens[:,0]


In [30]:
import csv
def retrieve_embeddings(G, embeddings, path='test.txt'):
    """Build the classifier input matrix for the node pairs listed in a file.

    Each non-blank line of `path` must start with two comma-separated integer
    node ids. Row i of the result is the embedding of the first node of pair i
    followed by the embedding of the second node.

    NOTE(review): this notebook defines `retrieve_embeddings` in two cells;
    keep a single definition.

    Args:
        G: graph whose `ndata['_ID']` maps a node id to a row of `embeddings`.
        embeddings: 2-D tensor with one row per node.
        path: file containing the node pairs (defaults to 'test.txt').

    Returns:
        CPU tensor of shape (number of pairs, 2 * embeddings.shape[1]) in the
        default floating-point dtype.
    """
    node_pairs = []
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            t = line.split(',')
            node_pairs.append((int(t[0]), int(t[1])))

    # reshape(-1, 2) keeps a (0, 2) shape when the file holds no pair.
    pairs = torch.tensor(node_pairs, dtype=torch.long).reshape(-1, 2)
    node_ids = G.ndata['_ID']
    # One indexing operation per column instead of a Python loop over the pairs.
    src_emb = embeddings[node_ids[pairs[:, 0]]]
    dst_emb = embeddings[node_ids[pairs[:, 1]]]
    x = torch.cat([src_emb, dst_emb], dim=1)
    return x.to(device='cpu', dtype=torch.get_default_dtype())


In [32]:
# Write one (row index, predicted probability) line per test pair.
# newline="" lets the csv module control line endings itself; without it extra
# blank lines appear on platforms that translate "\n".
with open("submission_gat.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    csv_out.writerows(enumerate(y_pred))

## Doc2Vec

In [48]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from preprocessing import read_graph
from time import time 
from nltk.tokenize import word_tokenize
_, abstracts, _, _ = read_graph()
# ~4 mins
# Tokenize every abstract and tag it with its position in `abstracts`.
# Reads `abstracts` as loaded by this cell, not a variable left over from another cell.
data = list(abstracts.values())
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(i)])
    for i, text in enumerate(data)
]

In [None]:
max_epochs = 200
vec_size = 50
alpha = 0.025        # initial learning rate
min_alpha = 0.00025  # learning rate reached at the end of training

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=min_alpha,
                min_count=2,
                hs=1,
                dm=1)

model.build_vocab(tagged_data)

# Train in a single call: gensim decays the learning rate linearly from `alpha`
# to `min_alpha` over all epochs. Subtracting 0.0002 by hand after each of 200
# one-epoch calls would push the learning rate below zero after about 125 epochs.
t_start = time()
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=max_epochs)
print('Trained {0} epochs. Elapsed {1} s'.format(max_epochs, time() - t_start))

model.save("d2v.model")
print("Model Saved")