In [1]:
from utils import random_walk, generate_walks, sparse_mx_to_torch_sparse_tensor 
from utils import text_to_list, intersection, read_train_val_graph, save_subgraph_in_file
from utils import apply_word2vec_on_features, create_and_normalize_adjacency, train_model, add_authors_to_pairs

In [2]:
import numpy as np
from random import random
from random import randint
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Edge list of the graph used for link prediction.
path = '../input_data/edgelist.txt'
# Load the complete graph and split it with val_ratio=0.1; the run log below reports
# 1,091,955 edges in the complete graph and 982,107 in the training graph.
G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx = read_train_val_graph(val_ratio=0.1, path=path)
# Random walks are generated on the TRAINING graph only (num_walks=10, walk_length=15).
# NOTE(review): presumably num_walks is per node — confirm in utils.generate_walks.
walks = generate_walks(G=G_train, num_walks=10, walk_length=15)
# Word2Vec over the walks; the run log reports a (138499, 64) numpy feature array.
walks_wv = apply_word2vec_on_features(features=walks, nodes=nodes, vector_size=64)
# Normalized adjacency matrix of the training graph, plus `indices`, which the run log
# describes as the (2, nnz) positions of the non-zero entries of that matrix.
adj, indices = create_and_normalize_adjacency(G_train)

Number of nodes: 138499 number of edges: 1091955 in the Complete the set
Number of nodes: 138499 number of edges: 982107 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
Start generating walks....
Random walks generated in in 57s!
Start applying Word2Vec...
Word2vec model trained on features in 2 min!
(138499, 64) features numpy array created in 2 min!


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2102713) with the positions of non zeros in adj matrix


In [4]:
# Node features are the random-walk Word2Vec embeddings only. The commented line is a
# disabled variant that also concatenated author embeddings (`authors_wv` is not
# defined in any visible cell, so it cannot simply be re-enabled).
#features_np = np.concatenate([walks_wv, authors_wv], axis=1)
features_np = walks_wv

In [5]:
# Build the paper -> authors table in one chain:
#   * column 0 becomes `paper_id`, column 2 becomes `authors`
#   * the `authors` column is parsed with `text_to_list`
#   * only those two columns are kept
#   * rows whose paper_id exceeds the largest node id in G are dropped
authors = (
    pd.read_csv('../input_data/authors.txt', sep='|', header=None)
    .rename(columns={0: "paper_id", 2: "authors"})
    .assign(authors=lambda frame: frame['authors'].apply(text_to_list))
    .loc[:, ["paper_id", "authors"]]
    .loc[lambda frame: frame['paper_id'] <= max(G.nodes())]
)

In [6]:
# Create class labels: the first half (one label per entry of `indices`) is 1, the
# second half stays 0 and is matched against the random pairs built in train_model.
n_positive = indices.shape[1]
y = np.zeros(2 * n_positive)
y[:n_positive] = 1

# Transform the numpy matrices/vectors to torch tensors on the selected device.
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
# Guarded so the cell can be re-run after `adj` has already been converted.
if not isinstance(adj, torch.Tensor):
    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
# `as_tensor` accepts both the original numpy array and an already-converted tensor,
# so re-running this cell no longer depends on `indices` still being a numpy array.
indices = torch.as_tensor(indices, dtype=torch.long, device=device)

In [7]:
class GNN(nn.Module):
    """Two rounds of neighbourhood aggregation followed by an MLP that classifies node pairs.

    forward(x_in, adj, pairs) returns log-probabilities of shape
    (number_of_pairs, n_class).

    `pairs` is indexed along its first axis: rows 0 and 1 hold the node indices
    of each pair and row 3 is a per-pair multiplier applied to the embedding
    difference (presumably the author signal added by `add_authors_to_pairs`
    — verify against utils). Row 2 is not read.
    """

    def __init__(self, n_feat, n_hidden, n_class, sub_class, dropout):
        super().__init__()
        # Layers are created in a fixed order so that seeded initialisation and
        # the keys of saved state_dicts are stable.
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)  # declared but not used by forward()
        self.double_fc3 = nn.Linear(3 * n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, sub_class)
        self.fc5 = nn.Linear(sub_class, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def _aggregate(self, layer, node_state, adj):
        """Linear transform, multiply by `adj`, then ReLU and dropout."""
        return self.dropout(self.relu(torch.mm(adj, layer(node_state))))

    def forward(self, x_in, adj, pairs):
        # Node embeddings after two aggregation rounds.
        node_emb = self._aggregate(self.fc1, x_in, adj)
        node_emb = self._aggregate(self.fc2, node_emb, adj)

        # Pair representation: [weighted difference | first node | second node].
        first_emb = node_emb[pairs[0]]
        second_emb = node_emb[pairs[1]]
        weighted_diff = pairs[3][:, None] * (first_emb - second_emb)
        pair_repr = torch.cat((weighted_diff, first_emb, second_emb), dim=1)

        # Classification head.
        hidden = self.dropout(self.relu(self.double_fc3(pair_repr)))
        hidden = self.dropout(self.relu(self.fc4(hidden)))
        return F.log_softmax(self.fc5(hidden), dim=1)

In [8]:
# Training and architecture hyper-parameters.
epochs = 1000
n_hidden = 128
dropout_rate = 0.2
sub_class = 8   # width of the penultimate layer
n_class = 2     # edge / no edge
n_features = features.size(1)

# Build the model on the selected device. The optimizer created here is only
# consumed by the train_model variant that takes an `optimizer` argument; the
# variant that takes `learning_rate` builds its own Adam instance.
model = GNN(
    n_feat=n_features,
    n_hidden=n_hidden,
    n_class=n_class,
    sub_class=sub_class,
    dropout=dropout_rate,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [14]:
#optimizer = optim.Adam(model.parameters(), lr=0.01)

In [19]:
# def value_greater_than_list(list1, val):
#     if val >
# #     for x in list1: 
# #         # compare val with all the values of the list
# #         if val < x:
# #             return False
#     return True

def early_stopping(list_loss_val, loss_val, window=10):
    """Tell whether training should stop.

    Returns True (after printing the mean and the current value) when exactly
    `window` trailing losses are available in `list_loss_val` and `loss_val`
    is strictly greater than their mean; returns False otherwise.
    """
    recent = list(list_loss_val)[-window:]
    if len(recent) != window:
        # Not enough history yet to fill the window.
        return False

    recent_mean = sum(recent) / len(recent)
    if loss_val > recent_mean:
        print('mean: {:.5f} val: {:.5f}'.format(recent_mean, loss_val))
        return True
    return False
    

    
def train_model(model, learning_rate, features, authors, adj, indices, y, val_indices, y_val, epochs):
    """Train `model` in place with Adam and return it.

    Parameters
    ----------
    model : nn.Module called as ``model(features, adj, pairs)``; its output is fed to
        ``F.nll_loss``.
    learning_rate : learning rate of the Adam optimizer created here.
    features, adj : node features and adjacency, passed through to the model.
    authors : forwarded to ``add_authors_to_pairs`` for every set of pairs.
    indices : positive pairs; an equally shaped block of random pairs is appended.
    y : training targets for the concatenated positive + random pairs.
    val_indices, y_val : validation pairs and their labels (any array-like or tensor).
    epochs : maximum number of epochs.

    Side effects: prints a progress line every 5 epochs and writes a checkpoint to
    ``../outputs/models/`` every 20 epochs. Training stops early when
    ``early_stopping`` fires; the current validation loss is appended to the history
    before the check, so it is part of the 10-value window it is compared against.
    """
    start_time = time()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    val_indices = add_authors_to_pairs(val_indices, authors)  # author info for the validation pairs
    indices = add_authors_to_pairs(indices, authors)          # author info for the positive pairs
    # Random (negative) pairs are drawn ONCE here and reused for every epoch.
    rand_indices = torch.randint(0, features.size(0), (indices.size(0), indices.size(1)), device=adj.device)
    rand_indices = add_authors_to_pairs(rand_indices, authors)
    pairs = torch.cat((indices, rand_indices), dim=1)  # positives first, random pairs second (matches `y`)

    # Loop-invariant conversions, done once instead of every epoch. `as_tensor` also
    # accepts an input that is already a tensor on the target device.
    y_val = torch.as_tensor(y_val, dtype=torch.long, device=features.device)
    y_np = y.cpu().numpy()
    y_val_np = y_val.cpu().numpy()

    today = datetime.today().strftime('%Y-%m-%d')
    list_loss_val = []
    for epoch in range(epochs):
        t = time()

        # --- training step ---
        model.train()
        optimizer.zero_grad()
        output = model(features, adj, pairs)
        loss_train = F.nll_loss(output, y)
        # Accuracy is only computed for the progress message.
        acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_np)
        loss_train.backward()
        optimizer.step()

        # --- validation step: no gradients are needed ---
        model.eval()
        with torch.no_grad():
            output = model(features, adj, val_indices)
            loss_val = F.nll_loss(output, y_val)
        list_loss_val.append(loss_val.item())
        acc_val = accuracy_score(torch.argmax(output, dim=1).cpu().numpy(), y_val_np)

        if epoch % 5 == 0:
            # float(...) works for both Python floats and numpy scalars, unlike `.item()`.
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(float(acc_train)),
                  'acc_val: {:.4f}'.format(float(acc_val)),
                  'time: {} s'.format(int(round(time()) - round(t))),
                  'total_time: {} min'.format(round((time() - start_time)/60)))
        if epoch % 20 == 0:
            model_path = "../outputs/models/{}-model-{}epochs.pt".format(today, epoch)
            torch.save(model.state_dict(), model_path)

        if early_stopping(list_loss_val, loss_val.item(), window=10):
            break

    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model

In [22]:
help(train_model)

Help on function train_model in module __main__:

train_model(model, learning_rate, features, authors, adj, indices, y, val_indices, y_val, epochs)



In [25]:
trained_model = train_model(model, 0.001, features, authors, adj, indices, y, val_indices, y_val, epochs)


Epoch: 001 loss_train: 0.6136 loss_val: 0.5655 acc_train: 0.6929 acc_val: 0.7684 time: 48 s total_time: 2 min
Epoch: 006 loss_train: 0.6042 loss_val: 0.5659 acc_train: 0.7058 acc_val: 0.7769 time: 43 s total_time: 6 min
Epoch: 011 loss_train: 0.6006 loss_val: 0.5576 acc_train: 0.7100 acc_val: 0.7800 time: 47 s total_time: 10 min
Epoch: 016 loss_train: 0.5955 loss_val: 0.5485 acc_train: 0.7176 acc_val: 0.7826 time: 46 s total_time: 14 min
Epoch: 021 loss_train: 0.5928 loss_val: 0.5437 acc_train: 0.7206 acc_val: 0.7861 time: 47 s total_time: 18 min
Epoch: 026 loss_train: 0.5902 loss_val: 0.5405 acc_train: 0.7217 acc_val: 0.7895 time: 45 s total_time: 22 min
Epoch: 031 loss_train: 0.5874 loss_val: 0.5357 acc_train: 0.7239 acc_val: 0.7918 time: 45 s total_time: 25 min
Epoch: 036 loss_train: 0.5837 loss_val: 0.5306 acc_train: 0.7276 acc_val: 0.7931 time: 46 s total_time: 29 min
Epoch: 041 loss_train: 0.5811 loss_val: 0.5269 acc_train: 0.7295 acc_val: 0.7952 time: 43 s total_time: 33 min
Epo

Epoch: 371 loss_train: 0.4107 loss_val: 0.3749 acc_train: 0.8298 acc_val: 0.8430 time: 45 s total_time: 286 min
Epoch: 376 loss_train: 0.4095 loss_val: 0.3742 acc_train: 0.8306 acc_val: 0.8434 time: 57 s total_time: 290 min


KeyboardInterrupt: 

In [12]:
l = [1, 3, 5, 6, 9]

l[-10:]

[1, 3, 5, 6, 9]

In [13]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [16]:
from datetime import datetime
import pandas as pd



# Read test data. Each sample is a pair of node ids, mapped here to node indices.
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))  # one row per pair member
pairs = torch.LongTensor(node_pairs).to(device)
pairs = add_authors_to_pairs(pairs, authors)
with torch.no_grad():  # inference only
    output = model(features, adj, pairs)
# The model returns log-probabilities; exp() turns them into probabilities.
y_pred = torch.exp(output).cpu().numpy()

# Probability of class 1 for every test pair.
y_pred_true = y_pred[:, 1]

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)  # random suffix for the file name

# `columns` must be an ordered collection: a set is rejected by recent pandas versions.
pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}epochs-{}.csv".format(today, epochs, random_nb), header=True, index=True, index_label='id'
)

Essayer de remplacer les tags des auteurs par des 1 (ones) pour voir si l'auteur améliore vraiment le résultat.

In [None]:
def train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs, authors=None):
    """Train `model` in place with the supplied `optimizer` and return it.

    NOTE: this definition shadows the earlier ``train_model`` of this notebook, which
    takes ``learning_rate`` and ``authors`` instead of ``optimizer``.

    Parameters
    ----------
    model : nn.Module called as ``model(features, adj, pairs)``; its output is fed to
        ``F.nll_loss``.
    optimizer : optimizer already bound to ``model.parameters()``.
    features, adj : node features and adjacency, passed through to the model.
    indices : positive pairs; an equally shaped block of random pairs is appended.
    y : training targets for the concatenated positive + random pairs.
    val_indices, y_val : validation pairs and their labels (any array-like or tensor).
    epochs : number of epochs.
    authors : optional; when given it is forwarded to ``add_authors_to_pairs`` as the
        other cells of this notebook do. When omitted the helper is called with the
        pairs only, exactly as before.

    Side effects: prints a progress line every 5 epochs and writes a checkpoint to
    ``../outputs/models/`` every 50 epochs.
    """
    def _with_authors(node_pairs):
        # Keep the historical one-argument call when no author table is supplied.
        if authors is None:
            return add_authors_to_pairs(node_pairs)
        return add_authors_to_pairs(node_pairs, authors)

    start_time = time()
    val_indices = _with_authors(val_indices)
    indices = _with_authors(indices)
    # Random (negative) pairs are drawn once and reused for every epoch.
    rand_indices = torch.randint(0, features.size(0), (indices.size(0), indices.size(1)), device=adj.device)
    rand_indices = _with_authors(rand_indices)
    # Positives first, random pairs second (matches `y`); loop-invariant, so built once.
    pairs = torch.cat((indices, rand_indices), dim=1)

    # Loop-invariant conversions, done once instead of every epoch.
    y_val = torch.as_tensor(y_val, dtype=torch.long, device=features.device)
    y_np = y.cpu().numpy()
    y_val_np = y_val.cpu().numpy()

    today = datetime.today().strftime('%Y-%m-%d')
    for epoch in range(epochs):
        t = time()

        # --- training step ---
        # train() must be re-enabled every epoch because the validation step below
        # switches the model to eval mode (which disables dropout).
        model.train()
        optimizer.zero_grad()
        output = model(features, adj, pairs)
        loss_train = F.nll_loss(output, y)
        # Accuracy is only computed for the progress message.
        acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_np)
        loss_train.backward()
        optimizer.step()

        # --- validation step: no gradients are needed ---
        model.eval()
        with torch.no_grad():
            output = model(features, adj, val_indices)
            loss_val = F.nll_loss(output, y_val)
        acc_val = accuracy_score(torch.argmax(output, dim=1).cpu().numpy(), y_val_np)

        if epoch % 5 == 0:
            # float(...) works for both Python floats and numpy scalars, unlike `.item()`.
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(float(acc_train)),
                  'acc_val: {:.4f}'.format(float(acc_val)),
                  'time: {:.4f} s'.format(round(time() - t)),
                  'total_time: {} min'.format(round((time() - start_time)/60)))
        # Checkpoint on the current epoch counter (the total `epochs` was tested before,
        # which made the condition constant for the whole run).
        if epoch % 50 == 0:
            model_path = "../outputs/models/{}-model-{}epochs.pt".format(today, epoch)
            torch.save(model.state_dict(), model_path)


    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model

epochs = 100
trained_model = train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs)


In [None]:
break
#y_val = torch.FloatTensor(y_val).to(device)
trained_model = train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs)
model_nb = randint(0, 1000)
today = datetime.today().strftime('%Y-%m-%d')
model_path = "../submissions_files/{}-model-{}epochs-{}.pt".format(today, epochs, model_nb)
torch.save(trained_model.state_dict(), model_path)
print('Model saved in', model_path)

In [None]:
# One hot representation using Spark

# Deliberately NOT aliased to `F`: this notebook already binds `F` to
# torch.nn.functional, and the cells that follow still call `F.nll_loss`.
import pyspark.sql.functions as sf


def multi_label_binarizer(df, labels_col='labels', output_col='new_labels'):
    """
    Function that takes as input:
    - `df`, pyspark.sql.dataframe 
    - `labels_col`, string that indicates an array column containing labels
    - `output_col`, string that indicates the name of the new labels column
    
    and returns `df` with one 0/1 column per distinct label plus `output_col`,
    an array column holding those indicators in sorted label order.

    The distinct labels are collected to the driver, and each label is used as
    a column name.
    """

    # Distinct labels found in `labels_col` (not a hard-coded 'labels' column), sorted.
    labels_set = (
        df
        .withColumn('exploded', sf.explode(labels_col))
        .agg(sf.collect_set('exploded'))
        .collect()[0][0]
    )
    labels_set = sorted(labels_set)

    # One indicator column per label: 1 when the row's array contains it, else 0.
    for label in labels_set:
        df = df.withColumn(label, sf.when(sf.array_contains(labels_col, label), 1).otherwise(0))

    # Pack the indicator columns into a single array column.
    df = df.withColumn(output_col, sf.array(*labels_set))

    return df

In [None]:
# Validation loss / accuracy of the current `model`.
model.eval()
# The model reads pairs[3], so the pairs must first go through add_authors_to_pairs
# (train_model does the same with its own local copy of `val_indices`).
val_pairs = add_authors_to_pairs(val_indices, authors)
# as_tensor keeps the cell re-runnable once y_val is already a tensor on `device`.
y_val = torch.as_tensor(y_val, dtype=torch.long, device=device)
with torch.no_grad():  # evaluation only
    output = model(features, adj, val_pairs)
loss_val = F.nll_loss(output, y_val)
acc_val = accuracy_score(torch.argmax(output, dim=1).cpu().numpy(), y_val.cpu().numpy())
print(loss_val.item())

In [None]:
# Evaluating the model
trained_model.eval()
eval_pairs = np.array(np.transpose(val_edges))  # one row per pair member
eval_pairs = torch.LongTensor(eval_pairs).to(device)
# The model reads pairs[3]; a plain two-row pair tensor would fail there.
eval_pairs = add_authors_to_pairs(eval_pairs, authors)
with torch.no_grad():  # evaluation only
    eval_output = trained_model(features, adj, eval_pairs)
# The model returns log-probabilities; exp() turns them into probabilities.
y_pred = torch.exp(eval_output).cpu().numpy()

# Probability of class 1 for every validation pair.
y_val_pred_true = y_pred[:, 1]

# Other cells may have rebound y_val to a (possibly GPU) tensor; sklearn needs host data.
y_val_true = y_val.detach().cpu().numpy() if isinstance(y_val, torch.Tensor) else np.asarray(y_val)
print('Log loss:', log_loss(y_val_true, y_val_pred_true))

#y_val = torch.tensor(y_val).to(device)
#y_val_pred_true = torch.tensor(y_val_pred_true).to(device)
#print(y_val, y_val_pred_true)

In [None]:
from datetime import datetime
import pandas as pd



# Read test data. Each sample is a pair of node ids, mapped here to node indices.
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))  # one row per pair member
pairs = torch.LongTensor(node_pairs).to(device)
# The model reads pairs[3]; a plain two-row pair tensor would fail there.
pairs = add_authors_to_pairs(pairs, authors)
with torch.no_grad():  # inference only
    output = model(features, adj, pairs)
# The model returns log-probabilities; exp() turns them into probabilities.
y_pred = torch.exp(output).cpu().numpy()

# Probability of class 1 for every test pair.
y_pred_true = y_pred[:, 1]

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)  # random suffix for the file name

# NOTE(review): `model_nb` is only assigned in the model-saving cell that sits behind a
# `break` guard; that cell must have been run for this file name to resolve.
# `columns` must be an ordered collection: a set is rejected by recent pandas versions.
pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

In [None]:
break

In [None]:
y_val = y_val.detach().cpu().numpy()
y_val

In [None]:
df = pd.DataFrame(y_pred)
df['y_val'] = list(y_val)
df.to_csv('../submissions_files/comparison_file.csv', sep=';')

In [None]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_val, y_pred[:, 1]), mean_absolute_error(y_val, y_pred[:, 0])

In [None]:
log_loss(y_val, y_pred[:, 1]), log_loss(y_val, y_pred[:, 0])

In [None]:
test_features = np.array([[5, 3, 0], [2, 1, 0], [4, 0, 1], [0, 3, 2]])
test_features = torch.LongTensor(test_features).to(device)
test_pairs = [[0, 0, 1, 2, 3, 2, 0, 1, 2, 3], [1, 2, 0, 0, 2, 3, 0, 1, 2, 3]]
test_pairs = torch.LongTensor(test_pairs).to(device)
print(test_features.shape, test_pairs.shape)
test_features[test_pairs[0]]

In [None]:
y_pred.shape, np.shape(y_val), np.shape(y_pred_true)

In [None]:
#print(output.shape)
print(features.shape, adj.shape, eval_pairs.shape)

In [None]:
z2[pairs[1,:]]

In [None]:
break

In [None]:
break

In [None]:
# Create class labels
y = np.zeros(4*G_train.number_of_edges())
y[:2*G_train.number_of_edges()] = 1 # Concatenated ones for edges indices and later in the model we add zeros for random indices.
train_edges[0]

In [None]:
# Build dense design matrices for the logistic-regression baseline: each row is the
# concatenation of the feature vectors of the two nodes of a pair.
t = time()
# NOTE(review): `m` and `n` are not defined in any visible cell — presumably m is the
# number of training edges and n the number of nodes; confirm before running.
X_train = np.zeros((4*m, 2*features_np.shape[1]))

for i, edge in enumerate(train_edges):
    # Rows [0, m): the training edge itself.
    X_train[i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    # Rows [m, 2m): the same edge again, in the same node order.
    # NOTE(review): possibly the reversed pair was intended here — confirm.
    X_train[m+i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    # Rows [2m, 4m): pairs of node indices drawn with randint(0, n-1).
    X_train[2*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    X_train[3*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    
# Validation rows are built the same way from val_edges.
X_val = np.zeros((len(val_edges), 2*features_np.shape[1]))
for i, edge in enumerate(val_edges):
    X_val[i] = np.concatenate((features_np[val_edges[i][0]], features_np[val_edges[i][1]]), axis=0)
    
print('X_train and X_val created in {} s!'.format(round(time()-t)))

In [29]:
import torchtext
print (torchtext.__version__)

ModuleNotFoundError: No module named 'torchtext'

In [None]:
t = time()
# Use logistic regression to predict if two nodes are linked by an edge
clf = LogisticRegression()
clf.fit(X_train, y)
y_pred = clf.predict_proba(X_val)
y_pred = y_pred[:,1]
print('Logistic regression performed in {} s!'.format(round(time()-t)))

In [None]:
print('Log loss:', log_loss(y_val, y_pred))

In [None]:
from datetime import datetime

# Read test data. Each sample is a pair of node ids, mapped here to node indices.
test_edges = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        test_edges.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# One row per test pair: concatenated feature vectors of its two nodes.
X_test = np.zeros((len(test_edges), 2*features_np.shape[1]))
for i, (first_node, second_node) in enumerate(test_edges):
    X_test[i] = np.concatenate((features_np[first_node], features_np[second_node]), axis=0)

t = time()
# Use logistic regression to predict if two nodes are linked by an edge
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]  # probability of class 1
print('Logistic regression performed in {} s!'.format(round(time()-t)))


today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 100000)  # random suffix for the file name

# `columns` must be an ordered collection: a set is rejected by recent pandas versions.
pd.DataFrame(y_pred, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)
    
    
# # Testing
# node_pairs = np.array(np.transpose(node_pairs))
# pairs = torch.LongTensor(node_pairs).to(device)
# output = model(features, adj, pairs)
# y_pred = torch.exp(output)
# y_pred = y_pred.detach().cpu().numpy()

# y_pred_true = list()
# for element in y_pred:
#     y_pred_true.append(element[1])
    

    
# today = datetime.today().strftime('%Y-%m-%d')
# random_nb = randint(0, 100000)

# pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
# "../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
# )