In [None]:
import sys
# !{sys.executable} -m pip install tldextract

<div style='text-align:center;'>
<h1>HopRank</h1>
<h2>Model Fitting</h2>
</div>

In [1]:
# Notebook metadata (authorship, license, version). None of these names is read by the
# code in the visible cells.
__author__ = "Lisette Espin-Noboa"
__copyright__ = "Copyright 2018, HopRank"
__credits__ = ["Florian Lemmerich", "Markus Strohmaier", "Simon Walk", "Mark Musen"]
__license__ = "GPL"
__version__ = "1.0.3"
__maintainer__ = "Lisette Espin-Noboa"
__email__ = "Lisette.Espin@gesis.org"
__status__ = "Developing"

########################################################################################
# Warnings
########################################################################################
import warnings
# Ignore any warning whose message starts with one of these two texts (the `message`
# filter is matched against the beginning of the warning message). The filters are
# installed before the numeric imports of the next cell, presumably because those
# imports are what emit them — TODO confirm.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

<h2>Dependencies</h2>

In [2]:
########################################################################################
# System
########################################################################################
import os
import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
from collections import Counter
from joblib import Parallel, delayed
import matplotlib as mpl
import matplotlib.pyplot as plt 
from collections import defaultdict
from scipy.sparse import csr_matrix

########################################################################################
# Local Dependencies
########################################################################################
%reload_ext autoreload
%autoreload 2
from org.gesis.libs.bioportal.ontology import Ontology
from org.gesis.libs.bioportal.transition import Transition
from org.gesis.libs.bioportal.clickstream import NAVITYPES
from org.gesis.libs.bioportal.clickstream import load_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_sessions
from org.gesis.libs.bioportal.clickstream import clean_clickstream
from org.gesis.libs.bioportal.submission import get_submissions
from org.gesis.libs.utils import printf
from org.gesis.libs.utils import weighted_to_undirected
from org.gesis.libs.utils import to_symmetric
from org.gesis.libs.models.randomwalk import RandomWalk
from org.gesis.libs.models.markovchain import MarkovChain
from org.gesis.libs.models.hoprank import HopRank
from org.gesis.libs.models.preferential import PreferentialAttachment
from org.gesis.libs.models.gravitational import Gravitational

########################################################################################
# Plot's style
########################################################################################
# Global seaborn settings: "paper" plotting context with fonts scaled by 1.5,
# and the "whitegrid" style.
sns.set_context(context="paper", font_scale=1.5)
sns.set_style("whitegrid")

<h2>Constants</h2>

In [3]:
# Shared configuration for the cells below.
# Every path is an absolute location on the original analysis host; point them at your
# own copies before re-running the notebook elsewhere.
# The short tags SM / ON / CS / TR / SU are not referenced in the visible cells.

SM = 'submission'
# JSON file handed to get_submissions() in the model-selection cell.
SUBMISSIONS_FN = '/bigdata/lespin/datasets/bioportal/submissions.json'

ON = 'ontology'
# ONTOLOGY_ROOT is passed to Ontology(root_folder=...); ONTOLOGY_ADJACENCY_OUTPUT and
# ONTOLOGY_DISTANCE_OUTPUT are the `path` arguments of load_adjacency() and
# create_distance_matrix(). The remaining ONTOLOGY_* paths are not used in the visible cells.
ONTOLOGY_ROOT = '/bigdata/lespin/datasets/bioportal/ontologies'
ONTOLOGY_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/ontologies/graph'
ONTOLOGY_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/ontologies/matrix'
ONTOLOGY_NODES_OUTPUT = '/bigdata/lespin/bioportal/ontologies/nodes'
ONTOLOGY_HOPS_OUTPUT = '/bigdata/lespin/bioportal/ontologies/hops'
ONTOLOGY_DISTANCE_OUTPUT = '/bigdata/lespin/bioportal/ontologies/distances'

CS = 'clickstream'
# File-name template with <YEAR> and <POSTFIX> placeholders; presumably filled in by the
# clickstream loader — not used in the visible cells, TODO confirm.
CS_FN = 'BP_webpage_requests_<YEAR><POSTFIX>.csv.bz2'
CLICKSTREAM_ROOT = '/bigdata/lespin/datasets/bioportal/clickstream/'
CLICKSTREAM_DF = '/bigdata/lespin/bioportal/clickstreams'

TR = 'transitions'
# Only the two LCC_* paths below are used in the visible cells:
#   LCC_TRANS_ADJACENCY_OUTPUT -> Transition.load_adjacency(path=...)
#   LCC_TRANS_MODEL_OUTPUT     -> per (ontology, navitype) CSV cache of fitted models
TRANS_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/transitions/graph'
TRANS_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/transitions/matrix'
TRANS_NODES_OUTPUT = '/bigdata/lespin/bioportal/transitions/nodes'
LCC_TRANS_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/graph'
LCC_TRANS_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/matrix'
LCC_TRANS_NODES_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/nodes'
LCC_TRANS_MODEL_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/modeling'

SU = 'summary'
LOG_OUTPUT = '/bigdata/lespin/bioportal/logs'
# Destination folder of the per-ontology BIC plots (PDF).
PLOTS_OUTPUT = '/bigdata/lespin/bioportal/plots'
# Folder holding lcc_summary_overlaps_<YEAR>.csv, the input of the model-selection cell.
SUMMARY_OUTPUT = '/bigdata/lespin/bioportal/summary'
# Kept as a string: it is formatted into file names and used as the key in
# submissions_dict[onto_name][YEAR].
YEAR = '2015'

# The thresholds below are not referenced in the visible cells; their meaning is only
# suggested by their names — TODO confirm against the preprocessing notebooks.
LAG_BREAK_IN_MINUTES = 60
MIN_SESSION_LENGTH = 2
MIN_REQ = 1000
MIN_TRANSITIONS = 1000
MAXKHOPS = 30

# Passed as `lcc=` to the ontology distance-matrix and adjacency-matrix calls.
LCC = True

<div style='text-align:center;'>
<h1>Toy-Example</h1>
</div>

In [4]:
# Toy example on 3 nodes. M is passed where the real run passes the ontology adjacency
# matrix and T where it passes the transition (click-count) matrix.
M = csr_matrix(np.array([[0,1,1],[1,0,1],[0,1,0]]))
T = csr_matrix(np.array([[0,0,10],[0,0,100],[1,60,0]]))

# (progress label, key in `results`, model class, extra constructor arguments after M, T).
# alpha=None is the "empirical" random walker: the recorded output logs an
# "Empirical alpha (damping factor)" line for that run.
# Gravitational and HopRank are left out of the toy example: they additionally need
# k-hop distances (and betas), which are not defined in this cell.
toy_models = [
    ('Random Walker 0.85',      'RandomWalker_0.85',      RandomWalk,             (0.85,)),
    ('Random Walker 0.0',       'RandomWalker_0.0',       RandomWalk,             (0.,)),
    ('Random Walker 1.0',       'RandomWalker_1.0',       RandomWalk,             (1.0,)),
    ('Random Walker Empirical', 'RandomWalker_Em',        RandomWalk,             (None,)),
    ('MarkovChain',             'MarkovChain',            MarkovChain,            ()),
    ('PreferentialAttachment',  'PreferentialAttachment', PreferentialAttachment, ()),
]

# BIC per model. Each model is built right before it is fitted, so any log line emitted
# by a model shows up directly under its own progress message.
results = {}
for label, key, model_cls, extra_args in toy_models:
    print('{}...'.format(label))
    m = model_cls(M, T, *extra_args)
    m.compute_loglikelihood()
    m.AIC()
    m.BIC()
    results[key] = m.bic

results

Random Walker 0.85...
Random Walker 0.0...
Random Walker 1.0...
Random Walker Empirical...
2019-02-08 01:46:41	Empirical alpha (damping factor): 0.99
MarkovChain...
PreferentialAttachment...


{'MarkovChain': 25.630254631999875,
 'PreferentialAttachment': 336.2545347462267,
 'RandomWalker_0.0': 375.72540272449356,
 'RandomWalker_0.85': 182.41163091449621,
 'RandomWalker_1.0': 200.7413060102577,
 'RandomWalker_Em': 170.5788450855118}

<div style='text-align:center;'>
<h1>Functions</h1>
</div>

In [5]:
def __fit__(name, m, group):
    """Fit an already constructed model and return its scores as one result row.

    Prints ``'<name>...'`` as a progress message, then calls
    ``m.compute_loglikelihood()``, ``m.AIC()`` and ``m.BIC()`` in that order and
    reads the resulting attributes off the model.

    Parameters
    ----------
    name : str
        Label stored under the ``'model'`` key.
    m : object
        Model exposing ``compute_loglikelihood``, ``AIC``, ``BIC`` and the attributes
        ``nparams``, ``nobservations``, ``loglikelihood``, ``aic`` and ``bic``.
    group : pandas.DataFrame
        Rows of a single (ontology, navitype) combination; the first distinct value of
        its ``ontology`` and ``navitype`` columns identifies the row.

    Returns
    -------
    dict
        Keys: ontology, navitype, model, nparams, nobservations, loglikelihood, aic, bic.
    """
    print('{}...'.format(name))

    for step in (m.compute_loglikelihood, m.AIC, m.BIC):
        step()

    row = {'ontology': group['ontology'].unique()[0],
           'navitype': group['navitype'].unique()[0],
           'model': name}
    # The remaining entries are copied verbatim from the fitted model.
    for field in ('nparams', 'nobservations', 'loglikelihood', 'aic', 'bic'):
        row[field] = getattr(m, field)
    return row
    
def fit_random_walker(alpha, M, T, group):
    """Fit a RandomWalk model built from (M, T, alpha) and return its result row.

    The label uses ``alpha`` as stored on the model after construction, not the
    argument: for ``alpha=None`` the recorded runs show labels such as
    ``'RandomWalker 0.36'``.
    """
    walker = RandomWalk(M, T, alpha)
    label = 'RandomWalker {}'.format(walker.alpha)
    return __fit__(label, walker, group)

def fit_markov_chain(M, T, group):
    """Fit a MarkovChain model built from (M, T); the row is labelled 'MarkovChain'."""
    return __fit__('MarkovChain', MarkovChain(M, T), group)

def fit_preferential_attachment(M, T, group):
    """Fit a PreferentialAttachment model built from (M, T).

    The returned row is labelled 'PreferentialAttachment'.
    """
    return __fit__('PreferentialAttachment', PreferentialAttachment(M, T), group)

def fit_gravitational(M, T, khops, group):
    """Fit a Gravitational model built from (M, T, khops); the row is labelled 'Gravitational'.

    In the model-selection cell ``khops`` is the value returned by
    ``Ontology.create_distance_matrix``.
    """
    return __fit__('Gravitational', Gravitational(M, T, khops), group)

def fit_hoprank(M, T, khops, betas, group):
    """Fit a HopRank model and return its result row, labelled 'HopRank'.

    ``khops`` and ``betas`` are forwarded to the constructor as keyword arguments.
    In the model-selection cell ``betas`` is a one-column ('beta') DataFrame indexed
    by hop distance.
    """
    hoprank = HopRank(M, T, khops=khops, betas=betas)
    return __fit__('HopRank', hoprank, group)

def plot_bic_values(df, fn=None):
    """Draw the BIC of every model as bars, one panel per navigation type.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``model``, ``bic`` and ``navitype``. The frame is not
        modified: the abbreviated model labels are written to a private copy, so
        groupby slices can be passed in safely.
    fn : str, optional
        Output path handed to ``savefig``. Nothing is written when it is ``None``.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    abbreviations = (('RandomWalker', 'RW'),
                     ('MarkovChain', 'MC'),
                     ('PreferentialAttachment', 'PA'),
                     ('Gravitational', 'Gr'))

    def _abbreviate(label):
        # Shorten the long model names so the rotated tick labels stay compact.
        for long_name, short_name in abbreviations:
            label = label.replace(long_name, short_name)
        return label

    # Work on a copy: assigning to the caller's frame would change its data and
    # triggers SettingWithCopyWarning when `df` is a groupby slice.
    data = df.copy()
    data['model'] = data['model'].apply(_abbreviate)

    fg = sns.catplot(data=data,
                kind='bar',
                x='model',
                y='bic',
                col='navitype',
                height=3,
                aspect=1)

    # `fg.ax` is only available on a single-panel grid; with several navitypes the
    # grid has one panel per column, so style every panel through `fg.axes`.
    for ax in fg.axes.flat:
        ax.set_yscale('log')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    fg.set_ylabels('BIC')
    _ = fg.set_titles(row_template = '{row_name}', col_template = '{col_name}')

    fig = fg.axes.flat[0].figure
    fig.subplots_adjust(hspace=0.1, wspace=0.1)
    fig.tight_layout()

    if fn is not None:
        fig.savefig(fn)
        printf('{} saved!'.format(fn))

    plt.show()
    # Close rather than clear: this function is called once per ontology and a
    # cleared figure would still be kept alive by pyplot.
    plt.close(fig)

<div style='text-align:center;'>
<h1>Model Selection</h1>
</div>

In [None]:
# Model selection: for every ontology and navigation type, fit all candidate models on
# the LCC matrices and collect one row per model in `results_model`.
# Each (ontology, navitype) result is cached as a CSV in LCC_TRANS_MODEL_OUTPUT and
# reloaded on later runs instead of being recomputed.
print('loading submissions...')
submissions_dict = get_submissions(SUBMISSIONS_FN)

print('loading k-hop overlaps...')
fn = os.path.join(SUMMARY_OUTPUT,'lcc_summary_overlaps_{}.{}'.format(YEAR,'csv'))
df_overlap = pd.read_csv(fn, index_col=None)

columns = ['ontology','navitype','model','nparams','nobservations','loglikelihood','aic','bic']

# One frame per (ontology, navitype), concatenated once at the end
# (DataFrame.append is gone in pandas >= 2.0).
results_frames = []

try:
    for onto_name, df in df_overlap.groupby('ontology'):
        printf("=== {} ===".format(onto_name))
        submission = submissions_dict[onto_name][YEAR]
        onto = Ontology(name=onto_name, year=YEAR, submission_id=submission['submissionId'], root_folder=ONTOLOGY_ROOT)
        # Use the LCC switch consistently (it was hard-coded to True here only).
        onto.load_adjacency(path=ONTOLOGY_ADJACENCY_OUTPUT, lcc=LCC)
        khops = onto.create_distance_matrix(path=ONTOLOGY_DISTANCE_OUTPUT, hopspath=None, lcc=LCC)

        tmp = df.rename(columns={'%transitions':'beta'})
        for navitype, group in tmp.groupby('navitype'):

            fn_preliminar = os.path.join(LCC_TRANS_MODEL_OUTPUT,'lcc_model_{}_{}_{}.csv'.format(YEAR,onto_name,navitype))
            if os.path.exists(fn_preliminar):
                # Cache hit: nothing is computed, so there is nothing to clean up.
                results_frames.append(pd.read_csv(fn_preliminar, index_col=None))
                printf('{} loaded!'.format(navitype))
                continue

            printf('{} computing...'.format(navitype))
            # Betas: share of transitions per hop distance up to the diameter, plus a
            # zero entry for hop 0; everything is shifted by 0.001 and rescaled so the
            # column sums to 100.
            betas = group.query("khop <= diameter")[['khop','beta']].set_index('khop')
            betas = pd.concat([betas, pd.DataFrame({'beta':[0]},index=[0])]) + 0.001
            betas = (betas*100 / betas.sum()).sort_index()
            print(betas)

            cs = Transition(onto_name, navitype, YEAR)
            cs.load_adjacency(path=LCC_TRANS_ADJACENCY_OUTPUT)

            # Candidate models in the order they are fitted and written to the CSV.
            # Each one receives (ontology matrix, transition matrix).
            fitters = [
                lambda M, T: fit_random_walker(0.0, M, T, group),
                lambda M, T: fit_random_walker(0.85, M, T, group),
                lambda M, T: fit_random_walker(1.0, M, T, group),
                lambda M, T: fit_random_walker(None, M, T, group),
                lambda M, T: fit_markov_chain(M, T, group),
                lambda M, T: fit_preferential_attachment(M, T, group),
                lambda M, T: fit_gravitational(M, T, khops, group),
                lambda M, T: fit_hoprank(M, T, khops, betas, group),
            ]
            # The matrices are requested again for every model, exactly as before.
            records = [fit(onto.get_adjacency_matrix(lcc=LCC), cs.get_adjacency_matrix())
                       for fit in fitters]

            preliminar_model = pd.DataFrame(records, columns=columns)
            preliminar_model.to_csv(fn_preliminar, index=None)
            results_frames.append(preliminar_model)

            # Release the per-navitype objects. This lives in the compute path only:
            # on a cache hit these names are unbound (or already deleted) and `del`
            # would raise NameError.
            del preliminar_model
            del cs

        del onto
finally:
    # Materialise whatever has been gathered, even if the loop was interrupted.
    if results_frames:
        results_model = pd.concat(results_frames, ignore_index=True)
    else:
        results_model = pd.DataFrame(columns=columns)
    

loading submissions...
2019-02-08 02:09:45	/bigdata/lespin/datasets/bioportal/submissions.json loaded!
2019-02-08 02:09:45	- 642 ontologies
2019-02-08 02:09:45	- 13 years
loading k-hop overlaps...
2019-02-08 02:09:45	=== CHMO ===
2019-02-08 02:09:45	/bigdata/lespin/bioportal/ontologies/matrix/LCC_CHMO_2015.npz loaded!
2019-02-08 02:09:46	/bigdata/lespin/bioportal/ontologies/distances/LCC_CHMO_2015_HOPs.npz loaded!
2019-02-08 02:09:46	DC
         beta
0    0.001000
1   58.127166
2   23.251466
3   13.951280
4    4.651093
5    0.001000
6    0.001000
7    0.001000
8    0.001000
9    0.001000
10   0.001000
11   0.001000
12   0.001000
13   0.001000
14   0.001000
15   0.001000
16   0.001000
17   0.001000
18   0.001000
19   0.001000
20   0.001000
21   0.001000
22   0.001000
2019-02-08 02:09:46	/bigdata/lespin/bioportal/transitions_in_lcc/matrix/CHMO_2015_DC.npz loaded!
RandomWalker 0.0...
RandomWalker 0.85...
RandomWalker 1.0...
2019-02-08 02:10:08	Empirical alpha (damping factor): 0.44
Random

In [9]:
# Preview the first rows of the combined model-selection table.
results_model.head()

Unnamed: 0,ontology,navitype,model,nparams,nobservations,loglikelihood,aic,bic
0,CHMO,ALL,RandomWalker 0.0,0,1423,-11375.881766,22751.76,22751.76
1,CHMO,ALL,RandomWalker 0.85,0,1423,-9708.524048,19417.05,19417.05
2,CHMO,ALL,RandomWalker 1.0,0,1423,-27163.362287,54326.72,54326.72
3,CHMO,ALL,RandomWalker 0.36,1,1423,-8937.015635,17876.03,17881.29
4,CHMO,ALL,MarkovChain,8779368,1423,-798.076174,17560330.0,63744400.0


In [None]:
# One BIC figure per ontology, written as a PDF into PLOTS_OUTPUT.
bic_plot_template = 'lcc_summary_model_selection_bic_{}_{}.pdf'
for onto_name, df in results_model.groupby('ontology'):
    fn = os.path.join(PLOTS_OUTPUT, bic_plot_template.format(onto_name, YEAR))
    plot_bic_values(df, fn)