In [None]:
from __future__ import division, print_function
import sys
import os
%pylab notebook
lib_path = '/home/fgeigl/navigability_of_networks'
sys.path.append(lib_path)
lib_path = '/home/fgeigl/navigability_of_networks/tools'
sys.path.append(lib_path)
import network_matrix_tools
import numpy as np
from scipy.sparse import csr_matrix, diags, eye
import pandas as pd
import datetime
from scipy.sparse.csgraph import connected_components
from collections import Counter
import operator
from sklearn.preprocessing import normalize
import numba
from joblib import Parallel, delayed
from math import sqrt
from scipy.sparse.linalg import spsolve

In [None]:
def load_sparse_csr(filename):
    """Load a CSR matrix that was stored as a NumPy ``.npz`` archive.

    Parameters
    ----------
    filename : str or file-like
        Archive containing the arrays ``data``, ``indices``, ``indptr`` and
        ``shape`` (the four components of a CSR matrix).

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix rebuilt from the stored components.
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open; the
    # context manager closes it once the arrays have been materialised.
    with np.load(filename) as loader:
        # A plain tuple of ints is accepted by every scipy version, whereas the
        # stored value is an integer ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=shape)

In [None]:
# Read both matrices from disk. Judging by the file names, A is the adjacency
# matrix and B the transition bias of the clickstream network (TODO confirm).
A, B = (
    load_sparse_csr(_path)
    for _path in (
        '/opt/datasets/wiki_clickstream/adjacency_clickstream_network_largest_component.npz',
        '/opt/datasets/wiki_clickstream/clickstream_network_transition_bias_largest_component.npz',
    )
)

In [None]:
# Sanity check: report container type and dimensions of both matrices.
for _tag, _mat in (('A:', A), ('B:', B)):
    print(_tag, type(_mat), _mat.shape)

In [None]:
# Label the strongly connected components of the combined graph A + B.
_, labels = connected_components(A + B, directed=True, connection='strong', return_labels=True)
label_counts = Counter(labels)
# Pick the component label that covers the most nodes.
largest_label, num_nodes = max(label_counts.items(), key=lambda item: item[1])
print('largest component contains', num_nodes, 'nodes', '(', num_nodes/A.shape[0], ')')
if num_nodes != A.shape[0]:
    # Keep only rows and columns of nodes inside the largest component.
    label_filt = labels == largest_label
    A, B = (mat[label_filt, :][:, label_filt] for mat in (A, B))
# Report type, shape and density (stored entries / n^2) of both matrices.
for _tag, _mat in (('A:', A), ('B:', B)):
    print(_tag, type(_mat), _mat.shape, _mat.nnz/(np.power(_mat.shape[0],2)))

In [None]:
def _normalize_to_unit_sum(vec):
    """Scale ``vec`` so that its entries sum to one.

    Returns the scaled vector together with the original (signed) sum.
    Dividing by the signed sum rather than by a norm also cancels the sign
    flip an inverse-iteration step introduces when ``eigenvalue - shift`` is
    negative, so successive iterates stay comparable.
    """
    total = vec.sum()
    return vec / total, total


def _format_remaining(seconds):
    """Render an ETA given in seconds, or ``'n/a'`` when it cannot be shown."""
    # timedelta cannot hold NaN/inf and overflows for absurdly large spans.
    if not np.isfinite(seconds) or seconds < 0 or seconds > 86400 * 999999:
        return 'n/a'
    return datetime.timedelta(seconds=int(seconds))


def stat_dist_power_iter(M, max_iter = 1e5, print_step=10, early_stopping = 500, precision=10, init_vec = None, n_jobs=40, eigval=1.):
    """Estimate the stationary distribution of a weighted network by inverse iteration.

    ``M`` is column-normalised into ``P``; then ``(P - eigval * I) y = x`` is
    solved repeatedly, rescaling ``y`` to unit sum after every solve, until the
    largest absolute change between two successive vectors drops below
    ``10 ** -precision``. Progress is written to stdout on a single,
    continuously overwritten line.

    Parameters
    ----------
    M : scipy sparse matrix
        Square weight matrix. Every column needs a non-zero sum, otherwise the
        column normalisation divides by zero.
    max_iter : int or float
        Upper bound on the number of iterations (cast to ``int``).
    print_step : int
        Number of most recent '-' (improved) / '+' (not improved) markers shown
        in the progress line.
    early_stopping : int
        Stop once this many consecutive iterations failed to beat the best
        change seen so far and the summed changes of the later half of the
        window exceed those of the earlier half.
    precision : int
        Number of decimals; defines both the convergence tolerance and the
        number format of the progress output.
    init_vec : array-like, optional
        Start vector. Defaults to the uniform vector.
    n_jobs : int
        Unused; kept so existing callers keep working.
    eigval : float
        Shift subtracted on the diagonal before solving.

    Returns
    -------
    numpy.ndarray
        Unit-sum vector of length ``M.shape[0]``: the final iterate on
        convergence, otherwise the iterate with the smallest change.

    Raises
    ------
    ValueError
        If a solve produces non-finite values.
    RuntimeError
        Propagated from ``splu`` when the shifted matrix is exactly singular.

    Notes
    -----
    NOTE(review): for a column-stochastic ``P`` the default ``eigval=1.`` makes
    ``P - eigval * I`` singular in exact arithmetic, so the run relies on
    rounding noise. A shift slightly off 1 (e.g. ``1 - 1e-8``) is numerically
    safer - confirm which behaviour is wanted before changing the default.
    """
    # Function-scope import so this cell does not depend on edits to the
    # notebook's import cell.
    from scipy.sparse.linalg import splu

    print('\tnormalize...', end='')
    sys.stdout.flush()
    max_iter = int(max_iter)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    M = M.astype(np.float64)
    col_sums = np.array(M.sum(axis=0), dtype=np.float64).flatten()
    # Right-multiplying by diag(1 / column sums) rescales every column to sum one.
    P = M.dot(diags(1. / col_sums))
    if init_vec is None:
        pi_vec = np.ones(P.shape[0], dtype=np.float64) / P.shape[0]
    else:
        pi_vec = np.asarray(init_vec, dtype=np.float64)
    print('done. ', type(pi_vec), pi_vec.dtype, type(P), P.dtype)
    sys.stdout.flush()

    precision = int(round(precision))
    atol = np.power(1e1, -precision)
    number_fmt = "%." + str(precision) + 'f'
    diff = list()
    print_row = list()
    comp_times = list()
    best_pi = None
    best_iter = 0
    best_eigval = float('nan')
    best_diff = 100
    no_improve = 0
    i = 0  # keeps the final report valid when max_iter < 1

    print('\tstart power iterations. max. iterations:', max_iter)
    sys.stdout.flush()
    # The shifted matrix never changes, so factorise it once and reuse the
    # factors for every solve (splu expects CSC input).
    shifted = (P - (eigval * eye(P.shape[0]))).tocsc()
    solve = splu(shifted).solve
    pi_vec, _ = _normalize_to_unit_sum(pi_vec)
    for i in range(1, max_iter + 1):
        iter_start = datetime.datetime.now()
        last_vec = pi_vec
        pi_vec, norm = _normalize_to_unit_sum(solve(last_vec))
        current_diff = np.absolute(last_vec - pi_vec).max()
        if not np.isfinite(current_diff):
            raise ValueError('inverse iteration produced non-finite values; '
                             '(P - eigval * I) is probably singular, try a slightly different eigval')
        # For a unit-sum input close to the eigenvector, the solution sums to
        # 1 / (lambda - eigval); invert that to estimate the eigenvalue lambda.
        eig_estimate = eigval + 1. / norm
        # Time only this iteration, so the average is seconds per iteration.
        comp_times.append((datetime.datetime.now() - iter_start).total_seconds())
        comp_times = comp_times[-10:]
        avg_iter_time = sum(comp_times)/len(comp_times)
        if current_diff < atol:
            print('\nneeded', i, 'iterations')
            print('last diff:', (" " + number_fmt) % current_diff)
            print('\nlargest eigval:', (" " + number_fmt) % eig_estimate)
            assert len(pi_vec) == P.shape[0]
            return pi_vec
        improvement = current_diff - best_diff
        if improvement < 0:
            # Remember the best iterate right away, so it is available even if
            # every single iteration improves until max_iter is reached.
            best_iter = i
            best_pi = pi_vec
            best_eigval = eig_estimate
            best_diff = current_diff
            no_improve = 0
            print_row.append('-')
            # Linear extrapolation: iterations until the change reaches zero.
            eta_seconds = (current_diff / -improvement) * avg_iter_time
        else:
            no_improve += 1
            print_row.append('+')
            eta_seconds = float('inf')
        diff.append(current_diff)
        # Positive trend: the recent half of the window changed more than the older half.
        trend = sum(diff[-int(early_stopping/2):]) - sum(diff[-early_stopping:-int(early_stopping/2)])
        trend_print = ('v' if trend < 0 else '^') if len(diff) > early_stopping else ' '
        print_row = print_row[-print_step:]
        print('\r', '[' + (''.join(print_row)).rjust(print_step) + ']',
              str(i).rjust(len(str(max_iter)),'0'),
              number_fmt % current_diff,
              trend_print,
              (" " + number_fmt) % improvement,
              ("%.3f" % avg_iter_time), 'sec/it',
              'min. remain:', _format_remaining(eta_seconds), end='')
        sys.stdout.flush()
        if no_improve >= early_stopping and trend > 0:
            print('\nearly stopping triggert.')
            break
    if best_pi is None:
        # No iteration ran (max_iter < 1): fall back to the normalised start vector.
        best_pi = pi_vec
    print('\ndid not converge within', i, 'iterations.')
    print('\t', 'return best pi. iteration:', best_iter)
    print('\tlargest eigval:', "%.15f" % best_eigval)
    assert len(best_pi) == P.shape[0]
    return best_pi

In [None]:
# Pickle files caching the results (one vector per column of `df`) and the
# wall-clock time every column took to compute (`times`).
df_fname = 'click_stream_results_poweriter_inv_it.df'
time_df_fname = 'click_stream_times_poweriter_inv_it.df'
try:
    df = pd.read_pickle(df_fname)
    times = pd.read_pickle(time_df_fname)
    print('loaded stored data:', df.columns)
except Exception as err:
    # Best effort: any failure to read the cache (e.g. missing file) starts
    # from scratch. `Exception` instead of a bare `except` so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    print('init new data', '(' + repr(err) + ')')
    df = pd.DataFrame()
    # Explicit dtype: creating an empty Series without one is deprecated.
    times = pd.Series(dtype=object)

# NOTE(review): stat_dist_power_iter casts its input back to double precision,
# so this widening only affects matrix arithmetic done outside of it - confirm
# that it is still needed.
A = A.astype(np.longdouble)
B = B.astype(np.longdouble)

if 'A_sd' not in df.columns:
    start = datetime.datetime.now()
    # Earlier implementation, kept for reference:
    #_, df['A_sd'] = network_matrix_tools.calc_entropy_and_stat_dist(A, method='EV', smooth_bias=False, calc_entropy_rate=False)
    # Assign the result directly; a NaN placeholder column would be replaced
    # (dtype included) by this assignment anyway.
    df['A_sd'] = stat_dist_power_iter(A)
    elapsed = datetime.datetime.now() - start
    times.loc['A_sd'] = elapsed
    print(elapsed)
# Start vector for the first run of the beta sweep.
init_vec = df['A_sd'].values

In [None]:
# Sweep over the weight `beta` of A in (beta * A) + B.T. Every run is started
# from the vector of the previous column and cached on disk once finished.
for beta in [1., 0.75, 0.5, 0.25, 0.1, 0.05, 0.01, 0.005, 0.001]:
    col_name = 'beta_' + str(beta)
    if col_name not in df.columns:
        print('calc beta:', beta)
        start = datetime.datetime.now()
        print('\t', start)
        # Earlier implementation, kept for reference:
        # _, df[col_name] = network_matrix_tools.calc_entropy_and_stat_dist((beta * A) + B.T, method='EV', smooth_bias=False, calc_entropy_rate=False)
        # Assign the result directly; a NaN placeholder column would be
        # replaced (dtype included) by this assignment anyway.
        df[col_name] = stat_dist_power_iter((beta * A) + B.T, init_vec = init_vec)
        print(df[col_name].dtype)
        elapsed = datetime.datetime.now() - start
        times.loc[col_name] = elapsed
        print('\ttook', elapsed, '\n')
        # Persist after every column so an interrupted sweep can resume.
        df.to_pickle(df_fname)
        times.to_pickle(time_df_fname)
    else:
        print('calc beta:', beta, 'already cached')
        # .get() tolerates a timing pickle that lacks this column.
        print('\ttook:', times.get(col_name, 'n/a'))
    # Warm start for the next beta.
    init_vec = df[col_name].values

In [None]:
# For each correlation measure, print the first row of the column-by-column
# correlation matrix. 'kendall' is also supported by DataFrame.corr but is
# left out here.
for _method in ('pearson', 'spearman'):
    print('%s:' % _method)
    print(df.corr(method=_method).iloc[0])

In [None]:
def sort_key(name, default=100.):
    """Return the numeric sort key encoded at the end of a column name.

    The text after the last underscore is parsed as a float, e.g.
    ``'beta_0.25'`` -> ``0.25``.

    Parameters
    ----------
    name : str
        Column name.
    default : float
        Key returned when the suffix is not a number.

    Returns
    -------
    float
        The parsed suffix, or ``default``.
    """
    suffix = name.rsplit('_', 1)[-1]
    try:
        return float(suffix)
    except ValueError:
        # Only a failed number parse is expected here; anything else (e.g.
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return default

# Order the result columns by the value returned by sort_key.
sorted_cols = list(df.columns)
sorted_cols.sort(key=sort_key)
df = df[sorted_cols]
# Write results and timings back to their pickle files.
for _frame, _target in ((df, df_fname), (times, time_df_fname)):
    _frame.to_pickle(_target)
exit()