In [1]:
import pandas as pd
import numpy as np
import json
# Load the training table from the working directory. The graph-building cell
# below reads it positionally through itertuples(): the first two data columns
# are used as the two endpoint ids of each row, and the third and fourth
# columns as a per-endpoint value stored in `users`.
df = pd.read_csv('./train.csv')

In [2]:
# NOTE(review): presumably the total number of users in the data set; this
# constant is not referenced by any of the cells visible here -- confirm it is
# still needed.
N_USERS = 1216082

In [3]:
%%time

# One pass over the training rows fills two lookup tables:
#   users: endpoint id -> value taken from that endpoint's own column
#          (fourth tuple field for the first endpoint, fifth for the second)
#   edges: endpoint id -> set of ids it shares a row with, stored in both
#          directions so the adjacency is symmetric
users, edges = {}, {}
for row in df.itertuples():
    # row[0] is the DataFrame index; the data columns start at position 1.
    src, dst = row[1], row[2]
    users[src] = row[3]
    users[dst] = row[4]
    edges.setdefault(src, set()).add(dst)
    edges.setdefault(dst, set()).add(src)

CPU times: user 26.9 s, sys: 1.02 s, total: 28 s
Wall time: 29.1 s


In [4]:
import networkx as nx

In [5]:
def create_subgraph(v1, v2):
    """Build the graph induced on v1, v2 and all of their direct neighbours.

    Parameters
    ----------
    v1, v2 : keys of the module-level ``edges`` adjacency mapping.

    Returns
    -------
    networkx.Graph
        Contains v1, v2 and every member of ``edges[v1]`` and ``edges[v2]``
        as nodes, plus every adjacency from ``edges`` whose two endpoints
        both belong to that node set.

    Raises
    ------
    KeyError
        If v1 or v2 has no entry in ``edges``.
    """
    # The union creates a fresh set, so the adjacency sets stored in `edges`
    # are not modified by the two adds below.
    nodes = edges[v1] | edges[v2]
    nodes.add(v1)
    nodes.add(v2)

    G = nx.Graph()
    G.add_nodes_from(nodes)
    # Every undirected edge is produced once from each endpoint. nx.Graph
    # stores a single copy of an undirected edge, so re-adding it is a no-op
    # and there is no need to scan a list of already-collected edges (that
    # scan was quadratic in the number of edges of the subgraph).
    G.add_edges_from(
        (i, n)
        for n in nodes
        for i in edges[n]
        if i in nodes
    )
    return G

In [6]:
def _mean_link_score(scorer, graph, node_pairs, denominator):
    """Sum the scores `scorer` yields for `node_pairs` and divide by `denominator`."""
    total = sum(score for _, _, score in scorer(graph, node_pairs))
    # float() forces true division: preferential_attachment yields integer
    # scores, so an int/int division would floor the result on Python 2.
    return total / float(denominator)


def extract_features(v1, v2):
    """Compute link-prediction features for the candidate pair (v1, v2).

    All features are computed on the subgraph returned by
    ``create_subgraph(v1, v2)`` and are averaged over the neighbours of v1.

    Parameters
    ----------
    v1, v2 : keys of the module-level ``edges`` adjacency mapping.

    Returns
    -------
    dict
        ``'jaccard'``, ``'adamic_adar'``, ``'pref_attachment'``: the
        corresponding networkx link score summed over the pairs ``(i, v2)``
        for every neighbour ``i`` of v1 other than v2, divided by the number
        of neighbours of v1.
        ``'page_rank'``: PageRank personalised entirely on v2, summed over
        every neighbour of v1 and divided by the number of neighbours of v1.

    Raises
    ------
    KeyError
        If v1 or v2 has no entry in ``edges``.
    ZeroDivisionError
        If ``edges[v1]`` is empty.
    """
    features = {}
    # The divisor counts every neighbour of v1, including v2 itself when v2 is
    # a neighbour, even though that pair is left out of node_pairs below.
    neighbours_cnt = len(edges[v1])
    subgraph = create_subgraph(v1, v2)

    node_pairs = [(i, v2) for i in edges[v1] if i != v2]

    features['jaccard'] = _mean_link_score(
        nx.jaccard_coefficient, subgraph, node_pairs, neighbours_cnt)
    features['adamic_adar'] = _mean_link_score(
        nx.adamic_adar_index, subgraph, node_pairs, neighbours_cnt)
    features['pref_attachment'] = _mean_link_score(
        nx.preferential_attachment, subgraph, node_pairs, neighbours_cnt)

    # Personalisation vector with weight 1 on v2 and 0 on every other node.
    p = dict.fromkeys(subgraph, 0)
    p[v2] = 1
    page_rank_v2 = nx.pagerank_scipy(subgraph, personalization=p)
    rank_total = sum(page_rank_v2[n] for n in edges[v1])
    features['page_rank'] = rank_total / float(neighbours_cnt)

    return features
    
    

In [None]:
%%time

# Build one feature record per candidate pair (v, v2), where v is a user whose
# `users` value is 0 and v2 is reachable from v in two hops (a neighbour of one
# of v's neighbours). 'class' is 1 when v2 is already a direct neighbour of v.
labels_all = list()

data_to_store = list()

# Hash-based mirror of labels_all, used only for the "already processed"
# check. Testing membership against the list itself is a linear scan per
# candidate, which makes the whole loop quadratic in the number of pairs.
seen_pairs = set()

# .items() works on both Python 2 and Python 3; .iteritems() is Python 2 only.
for v, nodes in edges.items():
    if users[v] != 0:
        continue
    for i in nodes:
        # NOTE(review): v is itself a neighbour of i, so v2 == v occurs here
        # and a (v, v) self-pair is emitted -- confirm whether that is intended.
        for v2 in edges[i]:
            # A pair is processed once regardless of orientation.
            if (v, v2) in seen_pairs or (v2, v) in seen_pairs:
                continue
            features = extract_features(v, v2)
            data_to_store.append({'features': features, 'class': int(v2 in nodes), 'labels': [v, v2]})
            labels_all.append((v, v2))
            seen_pairs.add((v, v2))

In [None]:
# Persist the collected feature records as a single JSON array.
with open('features.json', 'w') as f:
    # The second argument must be the open file handle; the previous code
    # passed the undefined name `data`, which raised NameError.
    json.dump(data_to_store, f)