In [72]:
import numpy as np
import pandas as pd
import networkx as nx
import math

from random import randint
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from itertools import islice

In [73]:
# Read the two edge lists from disk; both paths are relative to the
# notebook's working directory.
train = pd.read_csv('data/social_network.csv')
test = pd.read_csv('data/suspicious_edges.csv')

# Rename the columns positionally. `i` and `j` are the two endpoints of an
# edge; `id` is the row identifier that is written out with the final scores.
# The assignment fails if `train` does not have exactly two columns or `test`
# does not have exactly three.
train.columns, test.columns = [u'i', u'j'], [u'id', u'i', u'j']

In [74]:
# Drop self-loops (rows whose two endpoints are the same node) from both edge
# lists. Each result is an explicit copy, so it is an independent DataFrame:
# later cells can add columns to it without writing into a slice of
# `train` / `test`, which is what triggers pandas' SettingWithCopyWarning.
train_without_loops = train[train.i != train.j].copy()
test_without_loops = test[test.i != test.j].copy()

# Self-loop rows of the test set are kept separately so they can still be
# given a score and included in the final output.
test_loops = test[test.i == test.j].copy()

In [75]:
# Edge lists as plain (i, j) tuples, in row order.
# `itertuples` replaces the original `iterrows`: `iterrows` builds a Series
# for every row, which is slow and coerces the whole row to one common dtype.
# Selecting ['i', 'j'] explicitly also removes the dependence on the frames
# having exactly two (train) or three (test) columns.
edges = list(train_without_loops[['i', 'j']].itertuples(index=False, name=None))
edges_test = list(test_without_loops[['i', 'j']].itertuples(index=False, name=None))

# Distinct node ids that appear as an endpoint in each edge list
# (order of first appearance, not sorted).
nodes = pd.Series(train_without_loops[['i', 'j']].values.ravel()).unique()
nodes_test = pd.Series(test_without_loops[['i', 'j']].values.ravel()).unique()

In [76]:
# Undirected graph seeded with the loop-free training edges.
G = nx.Graph(edges)
# Endpoints that only occur in the test edges are added as isolated nodes,
# so every test pair refers to a node that exists in G.
G.add_nodes_from(nodes_test)

In [77]:
def rand_edges(G, number):
    """Lazily yield `number` distinct random ordered node pairs from `G`.

    Both endpoints are drawn uniformly from the nodes of `G`. A draw is
    rejected when the two endpoints coincide or when the same ordered pair
    has already been produced, so `(u, v)` and `(v, u)` count as different
    pairs. Pairs are NOT checked against the existing edges of `G`.

    Parameters
    ----------
    G : graph-like
        Only `G.nodes()` is used; it must return an iterable of hashable
        nodes.
    number : int
        How many pairs to yield. Zero or a negative value yields nothing.

    Yields
    ------
    tuple
        `(node1, node2)` with `node1 != node2`.

    Raises
    ------
    ValueError
        If `number` is larger than the count of distinct ordered pairs
        (`n * (n - 1)` for `n` nodes). Without this check the rejection
        loop below could never terminate. As this is a generator, the
        error surfaces when the first item is requested.
    """
    nodes = list(G.nodes())
    max_pairs = len(nodes) * (len(nodes) - 1)
    if number > max_pairs:
        raise ValueError(
            "cannot draw %d distinct pairs from %d nodes (at most %d)"
            % (number, len(nodes), max_pairs)
        )

    seen = set()
    # random.randint is inclusive on BOTH ends, hence len - 1.
    last = len(nodes) - 1
    # Sampling an index into the node list hits an existing node every time,
    # unlike sampling from the [min id, max id] integer range.
    while len(seen) < number:
        node1 = nodes[randint(0, last)]
        node2 = nodes[randint(0, last)]
        if node1 == node2 or (node1, node2) in seen:
            continue
        seen.add((node1, node2))
        yield (node1, node2)

In [78]:
def cross_val(edges, predictor, test_size=0.15, random_state=5):
    """Estimate a link predictor's ROC AUC on a single hold-out split.

    A `test_size` fraction of `edges` is held out as positive examples
    (label 1). Negative examples (label 0) are taken from
    `rand_non_edges(G, ...)`. The predictor is given the remaining edges and
    has to score the combined list of held-out and negative pairs.

    Reads the module-level graph `G` (for the node set and for negative
    sampling) and the module-level function `rand_non_edges`.
    NOTE(review): `rand_non_edges` is not defined in the visible part of the
    notebook; presumably it yields node pairs that are not edges of `G` -
    confirm it exists on a fresh kernel.

    Parameters
    ----------
    edges : list of (node, node) tuples
        Known edges. Must be a list: the held-out part is concatenated with
        the negative samples using `+`.
    predictor : callable
        Called as `predictor(nodes, train_edges, test_pairs)`; must return
        one score per test pair, in order.
    test_size : float, optional
        Fraction of `edges` to hold out. The same value sizes the negative
        sample, so both stay in step. Defaults to 0.15.
    random_state : int, optional
        Seed forwarded to `train_test_split`. Defaults to 5.

    Returns
    -------
    float
        ROC AUC of the predictor's scores against the 1/0 labels.
    """
    # The labels of the training part are all ones and never used.
    X_train, X1_test, _, Y1_test = train_test_split(
        edges,
        np.ones(len(edges)),
        test_size=test_size,
        random_state=random_state,
    )

    # Negative sample; at most num_test + 1 pairs are taken from the generator.
    num_test = int(len(edges) * test_size)
    X2_test = list(islice(rand_non_edges(G, num_test), num_test + 1))
    Y2_test = np.zeros(len(X2_test))

    X_test = X1_test + X2_test
    Y_test = np.hstack([Y1_test, Y2_test])

    Y_pred = predictor(G.nodes(), X_train, X_test)
    return roc_auc_score(Y_test, Y_pred)

In [79]:
# Globally silences pandas' SettingWithCopyWarning. Nothing below needs it any
# more, because columns are only added to explicit copies.
# NOTE(review): drop this once no other cell assigns into a sliced frame.
pd.options.mode.chained_assignment = None

# Adamic-Adar score for every loop-free test pair. `edges_test` was built
# row by row from `test_without_loops`, so the scores come back in the same
# order as that frame's rows. (The cell previously referenced `edges_new`,
# a name that no visible cell defines.)
preds_adamic_adar = nx.adamic_adar_index(G, edges_test)
probs_adamic_adar = [p for _, _, p in preds_adamic_adar]

# Attach the scores to a copy, so `test_without_loops` itself stays unchanged
# and this cell gives the same result when it is re-run.
edges_adamic_adar = test_without_loops.copy()
edges_adamic_adar['prob'] = probs_adamic_adar

# Self-loop rows were never scored; they get a fixed score of 0.
test_loops = test_loops.copy()
test_loops['prob'] = 0

# Recombine both parts and restore the original row order of `test`.
res = pd.concat([edges_adamic_adar, test_loops])[['id', 'prob']].sort_index()

res.to_csv("data/result.csv", index=False)

In [80]:
def predict_with_adamic_adar(nodes, train, test):
    """Score candidate node pairs with the Adamic-Adar index.

    A fresh undirected graph is built from `train` (edges) plus `nodes`
    (so that nodes without any training edge still exist), and
    `nx.adamic_adar_index` is evaluated on every pair in `test`.

    Parameters
    ----------
    nodes : iterable
        Nodes to add to the graph in addition to the endpoints of `train`.
    train : iterable of (node, node)
        Edges the graph is built from.
    test : iterable of (node, node)
        Pairs to score.

    Returns
    -------
    numpy.ndarray
        One float score per pair in `test`, in the same order. When the
        largest score is positive the scores are divided by it, so the
        maximum becomes 1. When every score is 0, or `test` is empty, the
        scores are returned as they are (all zeros / an empty array).
    """
    G = nx.Graph()
    G.add_edges_from(train)
    G.add_nodes_from(nodes)

    # dtype=float is required: if every score comes back as the integer 0,
    # numpy would otherwise build an integer array and the in-place division
    # below would raise.
    probs = np.array(
        [score for _, _, score in nx.adamic_adar_index(G, test)],
        dtype=float,
    )

    # Only normalise when there is something to normalise by; this avoids
    # `.max()` failing on an empty array and 0/0 producing NaN.
    peak = probs.max() if probs.size else 0.0
    if peak > 0:
        probs /= peak
    return probs

In [81]:
# Hold-out ROC AUC of the Adamic-Adar predictor on the training edges.
print(cross_val(edges, predict_with_adamic_adar))

0.7998669451591693
