In [29]:
import csv
import networkx as nx
import numpy as np
from random import randint
from random import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from random import choice
from gensim.models import Word2Vec
import keras

In [3]:
# Create a graph
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
edges = list(G.edges())

print('Number of nodes:', n)
print('Number of edges:', m)

# We need to create a seperate validation subset 
# with 0.1 edges from the graph

val_edges = list()
G_train = G

for edge in edges:
    if random() < 0.1:
        val_edges.append(edge)

# We remove the val edges from the graph G
for edge in val_edges:
    G_train.remove_edge(edge[0], edge[1])

n = G_train.number_of_nodes()
m = G_train.number_of_edges()
train_edges = list(G_train.edges())
    
print('Number of nodes of training set:', n)
print('Number of edges of training set:', m)

y_val = [1]*len(val_edges)

n_val_edges = len(val_edges)

# Create random pairs of nodes
for i in range(n_val_edges):
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    (n1, n2) = (min(n1, n2), max(n1, n2))
    val_edges.append((n1, n2))
    
# Remove edges that exist in both train and val

for edge in list(set(val_edges) & set(train_edges)):
    val_edges.remove(edge)
    
n_val_edges = len(val_edges) - len(y_val) #because we removed from val_edges edges that exist in both
y_val.extend([0]*n_val_edges)

Number of nodes: 138499
Number of edges: 1091955
Number of nodes of training set: 138499
Number of edges of training set: 982599


In [4]:
# Define a function that generates a random walk for a given graph node and walk length

def random_walk(G, node, walk_length):
    # Starts from vertex "node" and performs a random walk of length "walk length". Returns a list of the visited vertices
    walk = [node]
  
    # your code here
    for i in range(walk_length-1) :
        neibor_nodes = list(G.neighbors(walk[-1]))
        if len(neibor_nodes) > 0 :
            #print('neibor_nodes : ', neibor_nodes)
            next_node = choice(neibor_nodes)
            walk.append(next_node)
    
    walk = [str(node) for node in walk] # in case the nodes are in string format, 
    #we don't need to cast into string, but if the nodes are in numeric or integer, 
    #we need this line to cast into string
    return walk


# Define a second function that generates num_walks random walks with a given length for all the nodes of a graph

def generate_walks(G, num_walks, walk_length):
    # Runs "num_walks" random walks from each node, and returns a list of all random walk
    walks = list()
  
    # your code here
    for i in range(num_walks) :
        for node in G.nodes() :
            #print(node)
            walk = random_walk(G, node, walk_length)
            walks.append(walk)
        #print('walks : ', walks)
    return walks


In [63]:
# Generate walks in a variable called walks which is a list of walk(s) which is a sequence of nodes
walks = generate_walks(G,num_walks=30,walk_length=20)

In [17]:
# model.wv[0]
len(walks)

1384990

In [54]:
# Use word2vec to reduce the dimensionality and get a non sparse representation
vector_size = 50
model = Word2Vec(vector_size=vector_size, window=5, min_count=0, sg=1, workers=8)
model.build_vocab(walks)
model.train(walks, total_examples=model.corpus_count, epochs=5)

(411509850, 411509850)

In [55]:
# Create the train matrix
# Now we start with only 2 features for each edge
# which are the sum of degrees of the two nodes and absolute difference of 

X_train = np.zeros((2*m, 2 + 2*vector_size))
y_train = np.zeros(2*m)

for i, edge in enumerate(train_edges):
    X_train[2*i, 0] = G.degree(edge[0]) + G.degree(edge[1])
    X_train[2*i, 1] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[2*i, 2:2+vector_size] = model.wv[str(edge[0])]
    X_train[2*i, 2+vector_size:2+2*vector_size] = model.wv[str(edge[1])]    
    y_train[2*i] = 1
    
    # a randomly generated pair of nodes
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    X_train[2*i+1,0] = G.degree(n1) + G.degree(n2)
    X_train[2*i+1,1] = abs(G.degree(n1) - G.degree(n2))
    X_train[2*i+1, 2:2+vector_size] = model.wv[str(G.degree(min(n1, n2)))]
    X_train[2*i+1, 2+vector_size:2+2*vector_size] = model.wv[str(G.degree(max(n1, n2)))]  
    y_train[2*i+1] = 0

    
#Create the validation (test) matrix. Use the same 2 (or more) features as above (for train matrix) 
#for all edges in new complete val set

X_val = np.array(np.zeros((len(val_edges), 2 + 2*vector_size)))

for i,edge in enumerate(val_edges):
    X_val[i,0] = G.degree(edge[0]) + G.degree(edge[1])
    X_val[i,1] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_val[i, 2:2+vector_size] = model.wv[str(edge[0])]
    X_val[i, 2+vector_size:2+2*vector_size] = model.wv[str(edge[1])] 

In [79]:
([1]*len(y_train)-y_train)[1]

1.0

In [82]:
y_train_stack = np.column_stack((y_train, ([1]*len(y_train)-y_train)))
y_val_stack = np.column_stack((y_val, ([1]*len(np.array(y_val))-np.array(y_val))))

In [103]:
y_val[150000]

0

In [84]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dense, Input
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import time

tf.config.run_functions_eagerly(True)

def multi_layers (input_dim, output_dim=3):
    model = keras.Sequential()
    model.add(keras.Input(shape=(input_dim,)))
    model.add(keras.layers.Dense(50, activation='relu'))
    model.add(keras.layers.Dense(40, activation='relu'))
    model.add(BatchNormalization())
    
    model.add(Dense(units = 30, activation = "relu"))
    model.add(Dense(units = 30, activation = "relu"))
    model.add(Dense(units = 30, activation = "relu"))
    model.add(BatchNormalization())
    
    model.add(Dense(units = 20, activation = "relu"))
    model.add(Dense(units = 20, activation = "relu"))
    model.add(Dense(units = 20, activation = "relu"))
    model.add(BatchNormalization())    
    
    model.add(Dense(units = 10, activation = "relu"))
    model.add(Dense(units = 10, activation = "relu"))
    model.add(Dense(units = 10, activation = "relu"))
    model.add(BatchNormalization()) 
    
    model.add(Dense(units = 5, activation = "relu"))
    model.add(Dense(units = 5, activation = "relu"))
    model.add(Dense(units = 5, activation = "relu"))
    model.add(BatchNormalization())     
    
    model.add(Dense(units = output_dim, activation = "softmax"))
    return model 

multi_layers_model = multi_layers(102, 2) 
# multi_layers_model.summary()

multi_layers_model.compile(
    optimizer = "adam", loss = "binary_crossentropy", metrics = ["binary_crossentropy"]
)

model_checkpoint = ModelCheckpoint(
    "multi_layers_model.h5", monitor='binary_crossentropy', save_best_only=True
)

adam = keras.optimizers.Adam(learning_rate=0.1)

early_stop = EarlyStopping(monitor='binary_crossentropy', patience=5, verbose=1)


multi_layers_model.fit(
    x = X_train, 
    y = y_train_stack, 
    epochs = 25,
    batch_size = 1000,
    validation_data = (X_val, y_val_stack),
    callbacks = [model_checkpoint, early_stop], 
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f96f8ef5d90>

In [129]:
y_pred = multi_layers_model.predict(X_val)
y_pred[0]





array([1.], dtype=float32)

In [128]:
y_val_stack[0]

array([1, 0])

In [89]:
loss = log_loss(y_val, y_pred)
print(loss)

12.43883337796012


list

In [123]:
multi_layers_model = multi_layers(102, 1) 
# multi_layers_model.summary()

multi_layers_model.compile(
    optimizer = "adam", loss = "binary_crossentropy", metrics = ["binary_crossentropy"]
)

model_checkpoint = ModelCheckpoint(
    "multi_layers_model.h5", monitor='binary_crossentropy', save_best_only=True
)

adam = keras.optimizers.Adam(learning_rate=0.1)

early_stop = EarlyStopping(monitor='binary_crossentropy', patience=5, verbose=1)


multi_layers_model.fit(
    x = X_train, 
    y = y_train, 
    epochs = 25,
    batch_size = 1000,
    validation_data = (X_val, np.array(y_val)),
    callbacks = [model_checkpoint, early_stop], 
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f97762b9730>

In [126]:
loss = log_loss(y_val, y_pred)
print(y_val[400], '\n', y_pred[400])

1 
 [1.000000e+00 4.980952e-12]


### Logistic regression and random forest to test the features

In [23]:
# Use logistic regression to predict if two nodes are linked by an edge
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_clf = clf.predict_proba(X_val)
y_pred_clf = y_pred_clf[:,1]

loss = log_loss(y_val, y_pred_clf)
print(loss)

0.8816955490116937


In [24]:
# Random forest classifier

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict_proba(X_val)

rf_y_pred = rf_y_pred[:,1]

loss = log_loss(y_val, rf_y_pred)
print(loss)

12.3793166433572
