In [1]:
import ujson as json
import node2vec
import networkx as nx
from gensim.models import Word2Vec
import logging
import random
import numpy as np
from sklearn.metrics import roc_auc_score
import pandas

In [2]:
def get_G_from_edges(edges):
    edge_dict = dict()
    # calculate the count for all the edges
    for edge in edges:
        edge_key = str(edge[0]) + '_' + str(edge[1])
        if edge_key not in edge_dict:
            edge_dict[edge_key] = 1
        else:
            edge_dict[edge_key] += 1
    tmp_G = nx.DiGraph()
    for edge_key in edge_dict:
        weight = edge_dict[edge_key]
        # add edges to the graph
        tmp_G.add_edge(edge_key.split('_')[0], edge_key.split('_')[1])
        # add weights for all the edges
        tmp_G[edge_key.split('_')[0]][edge_key.split('_')[1]]['weight'] = weight
    return tmp_G


In [3]:
def get_neighbourhood_score(local_model, node1, node2):
    try:
        vector1 = local_model.wv.syn0[local_model.wv.index2word.index(node1)]
        vector2 = local_model.wv.syn0[local_model.wv.index2word.index(node2)]
        return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    except:
        return random.random()

In [4]:
def get_AUC(model, true_edges, false_edges):
    true_list = list()
    prediction_list = list()
    for edge in true_edges:
        tmp_score = get_neighbourhood_score(model, str(edge[0]), str(edge[1]))
        true_list.append(1)
        prediction_list.append(tmp_score)

    for edge in false_edges:
        tmp_score = get_neighbourhood_score(model, str(edge[0]), str(edge[1]))
        true_list.append(0)
        prediction_list.append(tmp_score)
    y_true = np.array(true_list)
    y_scores = np.array(prediction_list)
    return roc_auc_score(y_true, y_scores)

In [57]:
directed = True
p = 1
q = 1
num_walks = 6
walk_lengths = 9
dimensions = 230
window_size = 10
num_workers = 4
iterations = 3

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Start to load the raw network

train_edges = list()
raw_train_data = pandas.read_csv('../data/train.csv')
for i, record in raw_train_data.iterrows():
    train_edges.append((str(record['head']), str(record['tail'])))

print('finish loading the train data.')

# Start to load the valid/test data

valid_positive_edges = list()
valid_negative_edges = list()
raw_valid_data = pandas.read_csv('../data/valid.csv')
for i, record in raw_valid_data.iterrows():
    if record['label']:
        valid_positive_edges.append((str(record['head']), str(record['tail'])))
    else:
        valid_negative_edges.append((str(record['head']), str(record['tail'])))

print('finish loading the valid/test data.')

train_edges = list(set(train_edges))


train_nodes = list()
for e in train_edges:
    train_nodes.append(e[0])
    train_nodes.append(e[1])
train_nodes = list(set(train_nodes))

auc = list()
for walk_length in walk_lengths:
    # Create a node2vec object with training edges
    G = node2vec.Graph(get_G_from_edges(train_edges), directed, p, q)
    # Calculate the probability for the random walk process
    G.preprocess_transition_probs()
    # Conduct the random walk process
    walks = G.simulate_walks(num_walks, walk_length)
    # Train the node embeddings with gensim word2vec package
    model = Word2Vec(walks, size=dimension, window=window_size, min_count=0, sg=1, workers=num_workers, iter=iterations)
    # Save the resulted embeddings (you can use any format you like)
    resulted_embeddings = dict()
    for i, w in enumerate(model.wv.index2word):
        resulted_embeddings[w] = model.wv.syn0[i]
    # Test the performance of resulted embeddings with a link prediction task.
    tmp_AUC_score = get_AUC(model, valid_positive_edges, valid_negative_edges)

    print('tmp_accuracy:', tmp_AUC_score)
    auc.append(tmp_AUC_score)

    print('end')

finish loading the train data.
finish loading the valid/test data.
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6


2019-04-03 17:05:15,658 : INFO : collecting all words and their counts
2019-04-03 17:05:15,658 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-03 17:05:15,684 : INFO : PROGRESS: at sentence #10000, processed 63715 words, keeping 5354 word types
2019-04-03 17:05:15,711 : INFO : PROGRESS: at sentence #20000, processed 127297 words, keeping 5354 word types
2019-04-03 17:05:15,733 : INFO : PROGRESS: at sentence #30000, processed 191044 words, keeping 5354 word types
2019-04-03 17:05:15,739 : INFO : collected 5354 word types from a corpus of 204585 raw words and 32124 sentences
2019-04-03 17:05:15,740 : INFO : Loading a fresh vocabulary
2019-04-03 17:05:15,754 : INFO : effective_min_count=0 retains 5354 unique words (100% of original 5354, drops 0)
2019-04-03 17:05:15,755 : INFO : effective_min_count=0 leaves 204585 word corpus (100% of original 204585, drops 0)
2019-04-03 17:05:15,779 : INFO : deleting the raw counts dictionary of 5354 items
2019-04-03 17

tmp_accuracy: 0.8174194029719808
end
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6


2019-04-03 17:05:25,743 : INFO : collecting all words and their counts
2019-04-03 17:05:25,743 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-03 17:05:25,771 : INFO : PROGRESS: at sentence #10000, processed 72234 words, keeping 5354 word types
2019-04-03 17:05:25,792 : INFO : PROGRESS: at sentence #20000, processed 144503 words, keeping 5354 word types
2019-04-03 17:05:25,813 : INFO : PROGRESS: at sentence #30000, processed 216709 words, keeping 5354 word types
2019-04-03 17:05:25,821 : INFO : collected 5354 word types from a corpus of 232150 raw words and 32124 sentences
2019-04-03 17:05:25,822 : INFO : Loading a fresh vocabulary
2019-04-03 17:05:25,835 : INFO : effective_min_count=0 retains 5354 unique words (100% of original 5354, drops 0)
2019-04-03 17:05:25,836 : INFO : effective_min_count=0 leaves 232150 word corpus (100% of original 232150, drops 0)
2019-04-03 17:05:25,856 : INFO : deleting the raw counts dictionary of 5354 items
2019-04-03 17

tmp_accuracy: 0.8593363155810735
end
Walk iteration:
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6


2019-04-03 17:05:37,659 : INFO : collecting all words and their counts
2019-04-03 17:05:37,659 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-03 17:05:37,692 : INFO : PROGRESS: at sentence #10000, processed 80989 words, keeping 5354 word types
2019-04-03 17:05:37,716 : INFO : PROGRESS: at sentence #20000, processed 161797 words, keeping 5354 word types
2019-04-03 17:05:37,744 : INFO : PROGRESS: at sentence #30000, processed 242616 words, keeping 5354 word types
2019-04-03 17:05:37,752 : INFO : collected 5354 word types from a corpus of 259872 raw words and 32124 sentences
2019-04-03 17:05:37,753 : INFO : Loading a fresh vocabulary
2019-04-03 17:05:37,773 : INFO : effective_min_count=0 retains 5354 unique words (100% of original 5354, drops 0)
2019-04-03 17:05:37,774 : INFO : effective_min_count=0 leaves 259872 word corpus (100% of original 259872, drops 0)
2019-04-03 17:05:37,806 : INFO : deleting the raw counts dictionary of 5354 items
2019-04-03 17

tmp_accuracy: 0.8597466517168468
end


In [58]:
import operator
max(enumerate(auc),key=operator.itemgetter(1))

(2, 0.8597466517168468)

In [59]:
auc

[0.8174194029719808, 0.8593363155810735, 0.8597466517168468]