In [1]:
from __future__ import division, print_function
import sys
import os
%pylab notebook
lib_path = '/home/fgeigl/navigability_of_networks'
sys.path.append(lib_path)
lib_path = '/home/fgeigl/navigability_of_networks/tools'
sys.path.append(lib_path)
import network_matrix_tools
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import datetime
from scipy.sparse.csgraph import connected_components
from collections import Counter
import operator

Populating the interactive namespace from numpy and matplotlib


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
def load_sparse_csr(filename):
    """Rebuild a scipy CSR matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape`` (the raw CSR components of the matrix).

    Parameters
    ----------
    filename : str or file-like
        Path of (or handle to) the ``.npz`` archive, passed on to ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix described by the stored components.
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open;
    # the context manager closes it once all components have been read.
    with np.load(filename) as loader:
        # Normalise the stored shape array to a plain tuple of Python ints.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=shape)

In [3]:
# Locations of the two stored CSR matrices (going by the file names: the
# clickstream adjacency matrix and its transition bias, largest component only).
adjacency_path = '/opt/datasets/wiki_clickstream/adjacency_clickstream_network_largest_component.npz'
bias_path = '/opt/datasets/wiki_clickstream/clickstream_network_transition_bias_largest_component.npz'

A = load_sparse_csr(adjacency_path)
B = load_sparse_csr(bias_path)

In [4]:
# Sanity check: report the container type and dimensions of both matrices.
for tag, matrix in (('A:', A), ('B:', B)):
    print(tag, type(matrix), matrix.shape)

A: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423)
B: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423)


In [5]:
# Label every node with the strongly connected component it belongs to in the
# combined graph A + B.
_, labels = connected_components(A + B, directed=True, connection='strong', return_labels=True)

# Component labels are small non-negative integers, so a vectorised bincount
# yields the size of every component without looping over ~2M labels in Python.
label_counts = np.bincount(labels)
largest_label = int(label_counts.argmax())
num_nodes = int(label_counts[largest_label])
print('largest component contains', num_nodes, 'nodes', '(', num_nodes/A.shape[0], ')')

# Only slice when something would actually be dropped.
# NOTE: this rebinds A and B, so the cell is not idempotent once nodes are removed.
if num_nodes != A.shape[0]:
    label_filt = labels == largest_label
    A = A[label_filt, :][:, label_filt]
    B = B[label_filt, :][:, label_filt]

# Density = stored entries / n^2. The square is taken in floating point because
# n^2 (~4.6e12 here) overflows where numpy's default integer is 32 bit.
print('A:', type(A), A.shape, A.nnz / float(A.shape[0]) ** 2)
print('B:', type(B), B.shape, B.nnz / float(B.shape[0]) ** 2)

largest component contains 2140423 nodes ( 1.0 )
A: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423) 3.59513059841e-05
B: <class 'scipy.sparse.csr.csr_matrix'> (2140423, 2140423) 2.66174133129e-06


In [None]:
# For reference, this looks like the parameter list of
# network_matrix_tools.calc_entropy_and_stat_dist (TODO confirm against the module):
# adjacency_matrix, bias=None, print_prefix='', eps=1e-10, orig_ma_mi_r=None, method='PR', damping=0.85, smooth_bias=True, calc_entropy_rate=True, verbose=True):
df_pickle_name = 'click_stream_results.df'
time_pickle_name = 'click_stream_times.df'

# Resume from previously pickled results if both caches can be read.
# NOTE(review): read_pickle executes arbitrary code from the file - only load trusted pickles.
try:
    df = pd.read_pickle(df_pickle_name)
    times = pd.read_pickle(time_pickle_name)
    print('loaded pickles')
except Exception as load_error:
    # A missing/unreadable cache just means "start fresh"; catching Exception
    # (instead of a bare except) lets KeyboardInterrupt / SystemExit propagate.
    print('load failed - create new dfs')
    print('\treason:', repr(load_error))
    df = pd.DataFrame()
    times = pd.Series()

# Same caching rule as the beta sweep: skip the computation when the column
# is already present in the loaded results.
if 'A_sd' not in df.columns:
    start = datetime.datetime.now()
    _, df['A_sd'] = network_matrix_tools.calc_entropy_and_stat_dist(A, method='EV', smooth_bias=False, calc_entropy_rate=False)
    # Measure once so the stored and the printed duration agree.
    elapsed = datetime.datetime.now() - start
    times.loc['A_sd'] = elapsed
    print(elapsed)
    # Persist immediately so an interrupted later cell does not lose this result.
    df.to_pickle(df_pickle_name)
    times.to_pickle(time_pickle_name)
else:
    print('A_sd already cached')
    print('\ttook:', times.get('A_sd', 'unknown'))

 P values near zero: # 0
largest eigenvecsparseasymmetric
0:08:35.256508


In [None]:
# Scaling factors applied to A before the transposed B is added.
beta_values = [1., 0.75, 0.5, 0.25, 0.1, 0.05, 0.01, 0.005]

for beta in beta_values:
    col_name = 'beta_' + str(beta)
    if col_name in df.columns:
        # Column is already present in the results frame: only report the stored timing.
        print('calc beta:', beta, 'already cached')
        print('\ttook:', times.loc[col_name])
    else:
        print('calc beta:', beta)
        print('\t', datetime.datetime.now())
        start = datetime.datetime.now()
        # The combined matrix is built inline so no extra reference to it outlives the call.
        _, df[col_name] = network_matrix_tools.calc_entropy_and_stat_dist(
            (beta * A) + B.T,
            method='EV',
            smooth_bias=False,
            calc_entropy_rate=False)
        times.loc[col_name] = datetime.datetime.now() - start
        print('\ttook', datetime.datetime.now() - start, '\n')
        # Checkpoint after every beta so finished columns survive an interruption.
        df.to_pickle(df_pickle_name)
        times.to_pickle(time_pickle_name)
    # Push progress lines out right away rather than leaving them buffered.
    sys.stdout.flush()

calc beta: 1.0
	 2016-01-16 09:13:18.229290
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 1:20:12.782027 

calc beta: 0.75
	 2016-01-16 10:33:31.522634
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 1:32:56.103905 

calc beta: 0.5
	 2016-01-16 12:06:28.373984
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 2:23:53.085252 

calc beta: 0.25
	 2016-01-16 14:30:22.353274
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 5:48:26.676605 

calc beta: 0.1
	 2016-01-16 20:18:50.186320
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 13:47:57.012454 

calc beta: 0.05
	 2016-01-17 10:06:48.676885
 P values near zero: # 0
largest eigenvecsparseasymmetric
	took 1 day, 5:28:18.941205 

calc beta: 0.01
	 2016-01-18 15:35:09.178675


In [None]:
# Print the column-by-column correlation matrix once per correlation measure.
for corr_method in ('pearson', 'spearman', 'kendall'):
    print(corr_method + ':')
    print(df.corr(method=corr_method))

In [None]:
def sort_key(name):
    """Return the numeric sort key encoded in a column name.

    The key is the text after the last underscore parsed as a float
    (e.g. ``'beta_0.5'`` -> ``0.5``). Names whose suffix is not a number
    (e.g. ``'A_sd'``) get the fallback key ``100.`` and therefore sort after
    every numeric suffix below 100.

    Parameters
    ----------
    name : str
        Column name to derive the key from.

    Returns
    -------
    float
        Parsed suffix, or ``100.`` if the suffix is not numeric.
    """
    val = name.rsplit('_', 1)[-1]
    try:
        return float(val)
    except ValueError:
        # float() raises ValueError for non-numeric text; nothing else is expected here.
        return 100.

# Order the columns by their sort_key value, then write the reordered results
# and the timing series back to their pickle files.
sorted_cols = list(df.columns)
sorted_cols.sort(key=sort_key)
df = df[sorted_cols]

df.to_pickle(df_pickle_name)
times.to_pickle(time_pickle_name)

In [None]:
# Plot the Pearson correlation of the first column against every other column.
pearson_corr = df.corr(method='pearson')
first_row = pearson_corr.iloc[0]
other_columns = df.columns[1:]
first_row[other_columns].plot()
plt.show()

In [None]:
# Preview the first five rows of the results frame.
df.head(n=5)

In [None]:
# Show the rows whose index labels are 1, 2 and 3.
row_labels = [1, 2, 3]
df.loc[row_labels]