# ALTEGRAD

In [1]:
import csv
import numpy as np
from random import randint
import dgl
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from graph_models import SageModel, inference, DotPredictor, GATModel, DeepGAT, MLP
from gensim.models import KeyedVectors
from utils import edge_train_val_split
from graph_models import train_classif

from preprocessing import read_graph, retrieve_subgraph
from gensim.models.doc2vec import Doc2Vec
from author import load_author_embeddings_avg, load_common_author_embeddings


Using backend: pytorch


## 0. Loading data

In [2]:
# Load the paper graph together with the abstract / authorship metadata
# returned by preprocessing.read_graph().
# NOTE(review): text_per_author / author_per_text are presumably lookups
# between authors and papers - confirm in preprocessing.read_graph.
G, abstracts, text_per_author, author_per_text = read_graph()
# With min_nb_nodes=-1 the subgraph reported in this run's output has the
# same node and edge counts as the full graph.
G = retrieve_subgraph(G, min_nb_nodes=-1)

Number of nodes: 138499
Number of edges: 1091955
Number of authors : 174961
The minimum degree of the nodes in the graph is : 1
The maximum degree of the nodes in the graph is : 3037
The mean degree of the nodes in the graph is : 15.76841710048448
The median degree of the nodes in the graph is : 9.0
Number of nodes in subgraph: 138499
Number of edges in subgraph: 1091955


In [3]:
# Attach the pre-trained vectors to every node of G:
#   'id'            - the node key cast to int
#   'feat'          - the vector looked up in the saved KeyedVectors
#   'abstract_feat' - the vector looked up in the saved Doc2Vec model
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
doc2vec_model = Doc2Vec.load("d2v.model")
for node in G.nodes():
    node_id = int(node)
    G.nodes[node_id]['id'] = node_id
    G.nodes[node_id]['feat'] = wv[node_id]
    G.nodes[node_id]['abstract_feat'] = doc2vec_model.dv.get_vector(node_id)

# Drop the notebook's references to the lookup models; the vectors now live on G.
del wv, doc2vec_model

# Author-based node features computed by the helpers in author.py.
# NOTE(review): these presumably add the 'feat_com_authors' and
# 'avg_authors_feature' node attributes read by later cells - confirm.
G = load_common_author_embeddings(G, text_per_author, author_per_text)
G = load_author_embeddings_avg(G, text_per_author, author_per_text)

In [5]:
# Convert the networkx graph to DGL, carrying the listed node attributes over
# as node data tensors.
graph = dgl.from_networkx(G, node_attrs=['id','feat', 'abstract_feat', 'feat_com_authors', 'avg_authors_feature']) # already undirected
graph.ndata['_ID'] = torch.arange(graph.num_nodes())
node_features = graph.ndata['feat']
num_features = node_features.shape[1]

# Restore the pre-trained GAT on CPU and use it as a frozen feature extractor.
device = torch.device('cpu')
best_model_path = 'gat_model_1.pt'
model = GATModel(num_features, 64, 4, F.elu).to(device)
# map_location lets a checkpoint that was saved from a GPU be restored on a
# machine (or process) that only has the CPU available.
model.load_state_dict(torch.load(best_model_path, map_location=device))
# Inference only: switch off train-time layers such as dropout and skip
# autograd bookkeeping so the embeddings are deterministic and hold no graph.
model.eval()
with torch.no_grad():
    node_embeddings = model.get_hidden(graph, node_features)

In [7]:
# Build a DGL graph in which every edge reported by G.edges() is stored once,
# as edge[0] -> edge[1].
src, dst = [], []
for u, v in tqdm(G.edges()):
    src.append(u)
    dst.append(v)
# Pin the node count to that of `graph`: without num_nodes DGL infers it from
# the largest endpoint id, and the per-node feature copy below would fail if
# the highest-numbered nodes had no edges.
G_dir = dgl.graph((src, dst), num_nodes=graph.num_nodes())
del src, dst
# Reuse the node data already materialised on `graph`.
for feat in ['_ID', 'abstract_feat', 'feat_com_authors', 'avg_authors_feature']:
    G_dir.ndata[feat] = graph.ndata[feat]

100%|██████████| 1091955/1091955 [00:05<00:00, 190817.50it/s]


In [12]:
# Convenience handles on the per-node feature tensors stored on G_dir.
abstract_feat, avg_authors_feature, feat_com_authors = (
    G_dir.ndata[key]
    for key in ('abstract_feat', 'avg_authors_feature', 'feat_com_authors')
)

In [8]:

# Hold out 30% of the edges of G_dir for validation.
eid_train, eid_val = edge_train_val_split(G_dir, val_size=0.3)
# Prefer the GPU but fall back to the CPU so the cell also runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(3)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows

# Options shared by the training and validation edge loaders.
loader_options = dict(
    negative_sampler=negative_sampler,
    device=device,
    batch_size=1024,
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0,      # Number of sampler processes
)
# Positional arguments: the graph, the edge ids to iterate over, the neighbor sampler.
train_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir, eid_train, sampler, **loader_options
)
val_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir, eid_val, sampler, **loader_options
)

In [16]:
# Per-node classifier input: GAT embedding, abstract vector and averaged
# author vector concatenated along the feature dimension.
embedding_parts = (node_embeddings, abstract_feat, avg_authors_feature)
final_embeddings = torch.cat(embedding_parts, dim=1)
input_size = final_embeddings.size(1)

In [19]:
# Prefer the GPU but fall back to the CPU so the cell also runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
# NOTE(review): n_input is twice the per-node embedding size, presumably
# because the classifier is fed both endpoints of a candidate edge - confirm
# in graph_models.MLP / train_classif.
mlp = MLP(n_hidden=2*128, n_input=2*input_size).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)
# The loss works on raw logits; a sigmoid is only applied at prediction time.
criterion = nn.BCEWithLogitsLoss()

In [24]:
# Train the edge classifier on the 70/30 split. The epoch count comes from the
# `epochs` variable set in the model-setup cell instead of a repeated literal.
train_classif(
    mlp,
    final_embeddings,
    train_classif_dataloader,
    val_classif_dataloader,
    criterion,
    device,
    optimizer,
    epochs=epochs,
    name_model='mlp_1.pt'
    )

100%|██████████| 747/747 [01:40<00:00,  7.44it/s, loss=0.108] 


Epoch 0 : Train mean loss 0.21287320079493874 : Val mean loss 0.11022816854529083


100%|██████████| 747/747 [00:23<00:00, 32.25it/s, loss=0.088]


Epoch 1 : Train mean loss 0.09547843435003735 : Val mean loss 0.08881878650281579


100%|██████████| 747/747 [00:23<00:00, 32.38it/s, loss=0.084]


Epoch 2 : Train mean loss 0.08298943502955647 : Val mean loss 0.08190084788948297


100%|██████████| 747/747 [00:22<00:00, 33.85it/s, loss=0.088]


Epoch 3 : Train mean loss 0.07775782405771564 : Val mean loss 0.0790490783052519


100%|██████████| 747/747 [00:20<00:00, 36.17it/s, loss=0.072]


Epoch 4 : Train mean loss 0.07426298217202126 : Val mean loss 0.07758342307060957


100%|██████████| 747/747 [00:22<00:00, 32.75it/s, loss=0.081]


Epoch 5 : Train mean loss 0.07218268763528292 : Val mean loss 0.07536243255017325


100%|██████████| 747/747 [00:22<00:00, 32.82it/s, loss=0.069]


Epoch 6 : Train mean loss 0.070121929388608 : Val mean loss 0.074438983167056


100%|██████████| 747/747 [00:19<00:00, 37.96it/s, loss=0.072]


Epoch 7 : Train mean loss 0.06834862002207731 : Val mean loss 0.07303975694812834


100%|██████████| 747/747 [00:19<00:00, 38.15it/s, loss=0.069]


Epoch 8 : Train mean loss 0.0672342618914892 : Val mean loss 0.07289433757541701


100%|██████████| 747/747 [00:20<00:00, 36.93it/s, loss=0.075]


Epoch 9 : Train mean loss 0.06605953419982509 : Val mean loss 0.07204198179533705


| Model names   |      Model type      |  Hidden features | input variables |  Train acc |
|----------|:-------------:|------:|--------|----------|
| mlp_1 |  MLP | 256 | gat_1 - doc2vec - avg_authors_feature |  |  


## Final training

In [27]:

# Final run: keep only 0.1% of the edges aside so that nearly all of them are
# used for training.
eid_train, eid_val = edge_train_val_split(G_dir, val_size=0.001)
# Prefer the GPU but fall back to the CPU so the cell also runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
negative_sampler = dgl.dataloading.negative_sampler.Uniform(3)
sampler = dgl.dataloading.MultiLayerNeighborSampler([0, 0]) # We need no message flows

# Options shared by the training and validation edge loaders.
loader_options = dict(
    negative_sampler=negative_sampler,
    device=device,
    batch_size=1024,
    shuffle=True,       # Whether to shuffle the edges for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0,      # Number of sampler processes
)
# Positional arguments: the graph, the edge ids to iterate over, the neighbor sampler.
train_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir, eid_train, sampler, **loader_options
)
val_classif_dataloader = dgl.dataloading.EdgeDataLoader(
    G_dir, eid_val, sampler, **loader_options
)

In [28]:
# Fresh classifier for the final run (same hyper-parameters as the validation run).
# Prefer the GPU but fall back to the CPU so the cell also runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
# NOTE(review): n_input is twice the per-node embedding size, presumably
# because the classifier is fed both endpoints of a candidate edge - confirm
# in graph_models.MLP / train_classif.
mlp = MLP(n_hidden=2*128, n_input=2*input_size).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)
# The loss works on raw logits; a sigmoid is only applied at prediction time.
criterion = nn.BCEWithLogitsLoss()

In [29]:
# Train the final classifier on (almost) all edges. The epoch count comes from
# the `epochs` variable set in the model-setup cell instead of a repeated literal.
# NOTE(review): name_model is the same 'mlp_1.pt' used by the validation run;
# if train_classif writes a checkpoint under that name, this run replaces the
# earlier file - confirm that is intended.
train_classif(
    mlp,
    final_embeddings,
    train_classif_dataloader,
    val_classif_dataloader,
    criterion,
    device,
    optimizer,
    epochs=epochs,
    name_model='mlp_1.pt'
    )

100%|██████████| 1066/1066 [00:29<00:00, 35.56it/s, loss=0.093]


Epoch 0 : Train mean loss 0.1808507614690412 : Val mean loss 0.08969736471772194


100%|██████████| 1066/1066 [00:29<00:00, 36.17it/s, loss=0.072]


Epoch 1 : Train mean loss 0.08726413006322917 : Val mean loss 0.08220713213086128


100%|██████████| 1066/1066 [00:32<00:00, 32.90it/s, loss=0.073]


Epoch 2 : Train mean loss 0.07828535139211235 : Val mean loss 0.069878239184618


100%|██████████| 1066/1066 [00:30<00:00, 35.47it/s, loss=0.081]


Epoch 3 : Train mean loss 0.07404372810366901 : Val mean loss 0.0670417845249176


100%|██████████| 1066/1066 [00:30<00:00, 35.05it/s, loss=0.076]


Epoch 4 : Train mean loss 0.07145227479624368 : Val mean loss 0.06138957291841507


100%|██████████| 1066/1066 [00:31<00:00, 34.07it/s, loss=0.061]


Epoch 5 : Train mean loss 0.0691420832652368 : Val mean loss 0.08166980370879173


100%|██████████| 1066/1066 [00:29<00:00, 36.57it/s, loss=0.069]


Epoch 6 : Train mean loss 0.06763134123390543 : Val mean loss 0.07511237636208534


100%|██████████| 1066/1066 [00:27<00:00, 38.67it/s, loss=0.061]


Epoch 7 : Train mean loss 0.06638705389011533 : Val mean loss 0.05774259753525257


100%|██████████| 1066/1066 [00:28<00:00, 37.01it/s, loss=0.074]


Epoch 8 : Train mean loss 0.06539436417507745 : Val mean loss 0.07967415824532509


100%|██████████| 1066/1066 [00:32<00:00, 32.64it/s, loss=0.077]

Epoch 9 : Train mean loss 0.06454025934088632 : Val mean loss 0.06590183265507221





In [30]:
from preprocessing import retrieve_embeddings

In [33]:
#mlp = MLP(n_hidden=128, n_input=2*114).to(device)
#mlp.load_state_dict(torch.load('clf_gat_model_1_plus_doc.pt'))

# Score the test pairs with the trained classifier and write the submission file.
X_test = retrieve_embeddings(G_dir, final_embeddings)
# Inference only: make sure train-time layers (dropout, batch norm) are in
# evaluation mode and skip autograd bookkeeping.
mlp.eval()
with torch.no_grad():
    y_tens = torch.sigmoid(mlp(X_test.to(device))).cpu().numpy()
# Keep the first output column as the predicted probability.
y_pred = y_tens[:, 0]

# newline='' is what the csv module requires of files it writes to; without it
# platforms that translate line endings emit an extra blank line per row.
with open("submission_mlp_1.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id', 'predicted'])
    # One row per test pair: (row index, predicted probability).
    csv_out.writerows(enumerate(y_pred))