In [1]:
from __future__ import division, print_function
import sys
import os
lib_path = '/home/fgeigl/navigability_of_networks'
sys.path.append(lib_path)
import network_matrix_tools
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import datetime
from scipy.sparse.csgraph import connected_components
from collections import Counter
import operator

In [2]:
def load_sparse_csr(filename):
    """Load a CSR sparse matrix stored as an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape`` (the three CSR component arrays plus the matrix dimensions).

    Parameters
    ----------
    filename : str or file-like
        Path (or open file) of the ``.npz`` archive, passed to ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix rebuilt from the stored components.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    """
    # np.load on an .npz returns a lazily-read NpzFile that holds an open file
    # handle; the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Hand csr_matrix a plain tuple of Python ints rather than an ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix((data, indices, indptr), shape=shape)

In [3]:
# Load the two clickstream matrices from disk: A from the adjacency file and
# B from the transition-bias file (both named "largest_component").
A, B = map(load_sparse_csr, (
    '/opt/datasets/wiki_clickstream/adjacency_clickstream_network_largest_component.npz',
    '/opt/datasets/wiki_clickstream/clickstream_network_transition_bias_largest_component.npz',
))

In [4]:
# Sanity check: report the container type and dimensions of both matrices.
for tag, matrix in (('A:', A), ('B:', B)):
    print(tag, type(matrix), matrix.shape)

A: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423)
B: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423)


In [5]:
# Label the strongly connected components of the combined matrix A + B and
# restrict both matrices to the nodes of the largest component.
_, labels = connected_components(A + B, directed=True, connection='strong', return_labels=True)
label_counts = Counter(labels)
# most_common(1) yields [(label, count)] for the most frequent label. Unlike
# dict.iteritems() it is available on both Python 2 and Python 3.
largest_label, num_nodes = label_counts.most_common(1)[0]
print('largest component contains', num_nodes, 'nodes', '(', num_nodes/A.shape[0], ')')
if num_nodes != A.shape[0]:
    # Boolean mask over nodes; applied to rows first, then to columns.
    label_filt = labels == largest_label
    A = A[label_filt, :][:, label_filt]
    B = B[label_filt, :][:, label_filt]
# Last printed value is nnz / n^2. The square uses Python's arbitrary-precision
# integers; np.power on a Python int produces a fixed-width NumPy integer that
# can overflow for n this large where the default integer is 32-bit.
print('A:', type(A), A.shape, A.nnz/(A.shape[0] ** 2))
print('B:', type(B), B.shape, B.nnz/(B.shape[0] ** 2))

largest component contains 2140423 nodes ( 1.0 )
A: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423) 3.59513059841e-05
B: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423) 2.66174133129e-06


In [None]:
# Parameter list pasted for reference (presumably the signature of
# network_matrix_tools.calc_entropy_and_stat_dist -- TODO confirm):
#   adjacency_matrix, bias=None, print_prefix='', eps=1e-10, orig_ma_mi_r=None,
#   method='PR', damping=0.85, smooth_bias=True, calc_entropy_rate=True, verbose=True
df = pd.DataFrame()
# Explicit dtype: an empty Series without one has a version-dependent default
# dtype (and triggers a DeprecationWarning on some pandas releases).
times = pd.Series(dtype='timedelta64[ns]')
start = datetime.datetime.now()
# Only the second return value is kept; it becomes the baseline column 'A_sd'.
_, df['A_sd'] = network_matrix_tools.calc_entropy_and_stat_dist(A, method='EV', smooth_bias=False, calc_entropy_rate=False)
# Measure the elapsed time once so the stored and printed values agree.
elapsed = datetime.datetime.now() - start
times.loc['A_sd'] = elapsed
print(elapsed)

 P values near zero: # 0
largest eigenvec sparse asymmetric
0:08:31.904331


In [None]:
# Repeat the computation on (beta * A) + B for a range of beta values, storing
# each result in df under 'beta_<value>' and its wall-clock duration in times.
for beta in (1., 0.5, 0.1, 0.05, 0.01):
    col_name = 'beta_{}'.format(beta)
    if col_name in df.columns:
        # Column already present from an earlier run of this cell; skip it.
        continue
    print('calc beta:', beta)
    start = datetime.datetime.now()
    # Only the second return value is kept as the new column.
    _, df[col_name] = network_matrix_tools.calc_entropy_and_stat_dist(
        (beta * A) + B,
        method='EV',
        smooth_bias=False,
        calc_entropy_rate=False,
    )
    times.loc[col_name] = datetime.datetime.now() - start
    print('\t', datetime.datetime.now() - start, '\n')

calc beta: 1.0
 P values near zero: # 0
largest eigenvec sparse asymmetric
	 1:12:17.515948
calc beta: 0.5
 P values near zero: # 0
largest eigenvec sparse asymmetric
	 1:07:16.795241
calc beta: 0.1


In [None]:
# Print the pairwise correlation matrix of all df columns once per
# correlation method, each preceded by a '<method>:' heading.
for corr_method in ('pearson', 'spearman', 'kendall'):
    print(corr_method + ':')
    print(df.corr(method=corr_method))