In [1]:
import sys
# !{sys.executable} -m pip install tldextract

<div style='text-align:center;'>
<h1>BioPortal</h1>
<h2>k-hop matrices</h2>
</div>

In [2]:
__author__ = "Lisette Espin-Noboa"
__copyright__ = "Copyright 2018, HopRank"
__credits__ = ["Florian Lemmerich", "Markus Strohmaier", "Simon Walk", "Mark Musen"]
__license__ = "GPL"
__version__ = "1.0.3"
__maintainer__ = "Lisette Espin-Noboa"
__email__ = "Lisette.Espin@gesis.org"
__status__ = "Developing"

########################################################################################
# Warnings
########################################################################################
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

<h2>Dependencies</h2>

In [3]:
########################################################################################
# System
########################################################################################
import os
import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
from collections import Counter
from joblib import Parallel, delayed
import matplotlib as mpl
import matplotlib.pyplot as plt 
from collections import defaultdict

########################################################################################
# Local Dependencies
########################################################################################
%reload_ext autoreload
%autoreload 2
from org.gesis.libs.bioportal.ontology import Ontology
from org.gesis.libs.bioportal.transition import Transition
from org.gesis.libs.bioportal.clickstream import NAVITYPES
from org.gesis.libs.bioportal.clickstream import load_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_sessions
from org.gesis.libs.bioportal.clickstream import clean_clickstream
from org.gesis.libs.bioportal.submission import get_submissions
from org.gesis.libs.utils import log
from org.gesis.libs.utils import weighted_to_undirected
from org.gesis.libs.utils import to_symmetric

########################################################################################
# Plot's style
########################################################################################
# "paper" plotting context with fonts scaled by 1.5, on the "whitegrid" style.
sns.set_context("paper", font_scale=1.5)
sns.set_style(style="whitegrid")

<h2>Constants</h2>

In [4]:
# ---------------------------------------------------------------------------
# Directory roots.
# Every path below is derived from BASE_ROOT, so relocating the data tree
# only requires editing this one value. The resulting strings are identical
# to the previously hard-coded absolute paths.
# ---------------------------------------------------------------------------
BASE_ROOT = '/bigdata/lespin'
DATASETS_ROOT = os.path.join(BASE_ROOT, 'datasets', 'bioportal')
RESULTS_ROOT = os.path.join(BASE_ROOT, 'bioportal')

# --- Submissions -----------------------------------------------------------
SM = 'submission'
SUBMISSIONS_FN = os.path.join(DATASETS_ROOT, 'submissions.json')

# --- Ontologies ------------------------------------------------------------
ON = 'ontology'
ONTOLOGY_ROOT = os.path.join(DATASETS_ROOT, 'ontologies')
ONTOLOGY_GRAPH_OUTPUT = os.path.join(RESULTS_ROOT, 'ontologies', 'graph')
ONTOLOGY_ADJACENCY_OUTPUT = os.path.join(RESULTS_ROOT, 'ontologies', 'matrix')
ONTOLOGY_NODES_OUTPUT = os.path.join(RESULTS_ROOT, 'ontologies', 'nodes')
ONTOLOGY_HOPS_OUTPUT = os.path.join(RESULTS_ROOT, 'ontologies', 'hops')

# --- Clickstream -----------------------------------------------------------
CS = 'clickstream'
# File-name template; <YEAR> and <POSTFIX> are placeholders, presumably
# substituted by the clickstream loader -- TODO confirm.
CS_FN = 'BP_webpage_requests_<YEAR><POSTFIX>.csv.bz2'
# The empty last component keeps the trailing path separator of the
# original value.
CLICKSTREAM_ROOT = os.path.join(DATASETS_ROOT, 'clickstream', '')
CLICKSTREAM_DF = os.path.join(RESULTS_ROOT, 'clickstreams')

# --- Transitions -----------------------------------------------------------
TR = 'transitions'
TRANS_GRAPH_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions', 'graph')
TRANS_ADJACENCY_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions', 'matrix')
TRANS_NODES_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions', 'nodes')
LCC_TRANS_GRAPH_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions_in_lcc', 'graph')
LCC_TRANS_ADJACENCY_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions_in_lcc', 'matrix')
LCC_TRANS_NODES_OUTPUT = os.path.join(RESULTS_ROOT, 'transitions_in_lcc', 'nodes')

# --- Summary, logs and plots -----------------------------------------------
SU = 'summary'
LOG_OUTPUT = os.path.join(RESULTS_ROOT, 'logs')
PLOTS_OUTPUT = os.path.join(RESULTS_ROOT, 'plots')
SUMMARY_OUTPUT = os.path.join(RESULTS_ROOT, 'summary')
YEAR = '2015'

# --- Thresholds ------------------------------------------------------------
# The first three are not used in this notebook; their exact semantics live
# in the clickstream preprocessing code -- verify there before changing.
LAG_BREAK_IN_MINUTES = 60
MIN_SESSION_LENGTH = 2
MIN_REQ = 1000
# Lower bound applied to the "T'" column when selecting ontologies.
MIN_TRANSITIONS = 1000
# Passed as `maxk` when creating the k-hop matrices.
MAXKHOPS = 32

<div style='text-align:center;'>
<h1>k-hop Neighborhoods</h1>
</div>

In [None]:
# Submission metadata; indexed below as submissions_dict[ontology][year].
print('loading submissions...')
submissions_dict = get_submissions(SUBMISSIONS_FN)

# Per-ontology summary table. Only rows whose "T'" value reaches
# MIN_TRANSITIONS are processed; `tmp` is an independent copy of those rows.
print('loading valid ontologies...')
fn = os.path.join(SUMMARY_OUTPUT, 'summary_ontologies_transitions_{}.{}'.format(YEAR, 'csv'))
df_onto = pd.read_csv(fn, index_col=False)
tmp = df_onto[df_onto["T'"] >= MIN_TRANSITIONS].copy()

# Forwarded to both load_adjacency and create_hops_matrices.
lcc = True
print('LCC: {}'.format(lcc))

print('generating k-hop matrices for {} ontologies...'.format(tmp['ontology'].nunique()))

# Visit the ontologies by increasing "E'". `index` is the row label in
# df_onto, so the result can be written straight back into the summary table.
for index, onto_name in tmp.sort_values("E'")['ontology'].items():

    # NOTE(review): these two ontologies are excluded without a stated
    # reason -- presumably they are too large to process; confirm.
    if onto_name in ('LOINC', 'SNOMEDCT'):
        continue

    submission = submissions_dict[onto_name][YEAR]
    onto = Ontology(name=onto_name,
                    year=YEAR,
                    submission_id=submission['submissionId'],
                    root_folder=ONTOLOGY_ROOT)
    onto.load_adjacency(path=ONTOLOGY_ADJACENCY_OUTPUT, lcc=lcc)

    # The value returned by create_hops_matrices is stored in column 'd'.
    k = onto.create_hops_matrices(path=ONTOLOGY_HOPS_OUTPUT, maxk=MAXKHOPS, lcc=lcc)
    df_onto.loc[index, 'd'] = k
    print('last k: {}'.format(k))

    # Drop the reference to the ontology before moving on to the next one.
    del onto

loading submissions...
2019-02-13 14:46:39	/bigdata/lespin/datasets/bioportal/submissions.json loaded!
2019-02-13 14:46:39	- 642 ontologies
2019-02-13 14:46:39	- 13 years
loading valid ontologies...
LCC: True
generating k-hop matrices for 12 ontologies...
2019-02-13 14:46:39	/bigdata/lespin/bioportal/ontologies/matrix/LCC_WHO-ART_2015.npz loaded!
2019-02-13 14:46:39	>0...
2019-02-13 14:46:39	eliminating 0s...
2019-02-13 14:46:39	setting diagonal to zero...
2019-02-13 14:46:39	eliminating 0s...
2019-02-13 14:46:39	to csr int...
2019-02-13 14:46:39	done 1-hop!
2019-02-13 14:46:39	WHO-ART-2015-6: 1-hop --> shape:(1725, 1725), sum:5742!
2019-02-13 14:46:39	WHO-ART-2015-6: 1-hop saving...
2019-02-13 14:46:39	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_1HOP.npz saved!
2019-02-13 14:46:39	WHO-ART-2015-6: 1-hop done!
2019-02-13 14:46:39	
2019-02-13 14:46:39	accumulating previous hops...
2019-02-13 14:46:39	multiplying: product.dot(m)...
2019-02-13 14:46:39	substracting previous 



2019-02-13 14:46:40	substracting previous hops from 3hop...
2019-02-13 14:46:40	>0...
2019-02-13 14:46:40	eliminating 0s...
2019-02-13 14:46:40	setting diagonal to zero...
2019-02-13 14:46:40	eliminating 0s...
2019-02-13 14:46:40	to csr int...
2019-02-13 14:46:40	done 3-hop!
2019-02-13 14:46:40	WHO-ART-2015-6: 3-hop --> shape:(1725, 1725), sum:546190!
2019-02-13 14:46:40	WHO-ART-2015-6: 3-hop saving...
2019-02-13 14:46:40	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_3HOP.npz saved!
2019-02-13 14:46:40	WHO-ART-2015-6: 3-hop done!
2019-02-13 14:46:40	
2019-02-13 14:46:40	accumulating previous hops...
2019-02-13 14:46:40	multiplying: product.dot(m)...
2019-02-13 14:46:40	substracting previous hops from 4hop...
2019-02-13 14:46:40	>0...
2019-02-13 14:46:40	eliminating 0s...
2019-02-13 14:46:40	setting diagonal to zero...
2019-02-13 14:46:40	eliminating 0s...
2019-02-13 14:46:40	to csr int...
2019-02-13 14:46:40	done 4-hop!
2019-02-13 14:46:40	WHO-ART-2015-6: 4-hop --> shape:(

2019-02-13 14:46:45	multiplying: product.dot(m)...
2019-02-13 14:46:46	substracting previous hops from 11hop...
2019-02-13 14:46:46	>0...
2019-02-13 14:46:46	eliminating 0s...
2019-02-13 14:46:46	setting diagonal to zero...
2019-02-13 14:46:46	eliminating 0s...
2019-02-13 14:46:46	to csr int...
2019-02-13 14:46:46	done 11-hop!
2019-02-13 14:46:46	CHMO-2015-18: 11-hop --> shape:(2964, 2964), sum:835370!
2019-02-13 14:46:46	CHMO-2015-18: 11-hop saving...
2019-02-13 14:46:46	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_11HOP.npz saved!
2019-02-13 14:46:46	CHMO-2015-18: 11-hop done!
2019-02-13 14:46:46	
2019-02-13 14:46:46	accumulating previous hops...
2019-02-13 14:46:46	multiplying: product.dot(m)...
2019-02-13 14:46:47	substracting previous hops from 12hop...
2019-02-13 14:46:47	>0...
2019-02-13 14:46:47	eliminating 0s...
2019-02-13 14:46:47	setting diagonal to zero...
2019-02-13 14:46:47	eliminating 0s...
2019-02-13 14:46:47	to csr int...
2019-02-13 14:46:47	done 12-hop!
201

2019-02-13 14:47:03	>0...
2019-02-13 14:47:03	eliminating 0s...
2019-02-13 14:47:03	setting diagonal to zero...
2019-02-13 14:47:03	eliminating 0s...
2019-02-13 14:47:03	to csr int...
2019-02-13 14:47:03	done 1-hop!
2019-02-13 14:47:03	HL7-2015-6: 1-hop --> shape:(9146, 9146), sum:20914!
2019-02-13 14:47:03	HL7-2015-6: 1-hop saving...
2019-02-13 14:47:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_HL7_2015_1HOP.npz saved!
2019-02-13 14:47:03	HL7-2015-6: 1-hop done!
2019-02-13 14:47:03	
2019-02-13 14:47:03	accumulating previous hops...
2019-02-13 14:47:03	multiplying: product.dot(m)...
2019-02-13 14:47:03	substracting previous hops from 2hop...
2019-02-13 14:47:03	>0...
2019-02-13 14:47:03	eliminating 0s...
2019-02-13 14:47:03	setting diagonal to zero...
2019-02-13 14:47:03	eliminating 0s...
2019-02-13 14:47:03	to csr int...
2019-02-13 14:47:03	done 2-hop!
2019-02-13 14:47:03	HL7-2015-6: 2-hop --> shape:(9146, 9146), sum:736144!
2019-02-13 14:47:03	HL7-2015-6: 2-hop saving...
2019-02-

2019-02-13 14:49:08	multiplying: product.dot(m)...
2019-02-13 14:49:17	substracting previous hops from 14hop...
2019-02-13 14:49:19	>0...
2019-02-13 14:49:19	eliminating 0s...
2019-02-13 14:49:19	setting diagonal to zero...
2019-02-13 14:49:20	eliminating 0s...
2019-02-13 14:49:20	to csr int...
2019-02-13 14:49:20	done 14-hop!
2019-02-13 14:49:20	HL7-2015-6: 14-hop --> shape:(9146, 9146), sum:41552!
2019-02-13 14:49:20	HL7-2015-6: 14-hop saving...
2019-02-13 14:49:20	/bigdata/lespin/bioportal/ontologies/hops/LCC_HL7_2015_14HOP.npz saved!
2019-02-13 14:49:20	HL7-2015-6: 14-hop done!
2019-02-13 14:49:20	
2019-02-13 14:49:20	accumulating previous hops...
2019-02-13 14:49:21	multiplying: product.dot(m)...
2019-02-13 14:49:31	substracting previous hops from 15hop...
2019-02-13 14:49:33	>0...
2019-02-13 14:49:34	eliminating 0s...
2019-02-13 14:49:34	setting diagonal to zero...
2019-02-13 14:49:34	eliminating 0s...
2019-02-13 14:49:34	to csr int...
2019-02-13 14:49:34	done 15-hop!
2019-02-13 

2019-02-13 14:50:56	substracting previous hops from 7hop...
2019-02-13 14:50:56	>0...
2019-02-13 14:50:57	eliminating 0s...
2019-02-13 14:50:57	setting diagonal to zero...
2019-02-13 14:50:58	eliminating 0s...
2019-02-13 14:50:58	to csr int...
2019-02-13 14:50:58	done 7-hop!
2019-02-13 14:50:58	ICD10-2015-6: 7-hop --> shape:(11132, 11132), sum:24800186!
2019-02-13 14:50:58	ICD10-2015-6: 7-hop saving...
2019-02-13 14:51:08	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_7HOP.npz saved!
2019-02-13 14:51:08	ICD10-2015-6: 7-hop done!
2019-02-13 14:51:08	
2019-02-13 14:51:08	accumulating previous hops...
2019-02-13 14:51:08	multiplying: product.dot(m)...
2019-02-13 14:51:14	substracting previous hops from 8hop...
2019-02-13 14:51:15	>0...
2019-02-13 14:51:16	eliminating 0s...
2019-02-13 14:51:16	setting diagonal to zero...
2019-02-13 14:51:18	eliminating 0s...
2019-02-13 14:51:18	to csr int...
2019-02-13 14:51:19	done 8-hop!
2019-02-13 14:51:20	ICD10-2015-6: 8-hop --> shape:(11132,

2019-02-13 14:52:42	multiplying: product.dot(m)...
2019-02-13 14:52:48	substracting previous hops from 9hop...
2019-02-13 14:52:49	>0...
2019-02-13 14:52:50	eliminating 0s...
2019-02-13 14:52:50	setting diagonal to zero...
2019-02-13 14:52:52	eliminating 0s...
2019-02-13 14:52:52	to csr int...
2019-02-13 14:52:53	done 9-hop!
2019-02-13 14:52:53	CPT-2015-6: 9-hop --> shape:(13092, 13092), sum:46357044!
2019-02-13 14:52:53	CPT-2015-6: 9-hop saving...
2019-02-13 14:53:04	/bigdata/lespin/bioportal/ontologies/hops/LCC_CPT_2015_9HOP.npz saved!
2019-02-13 14:53:04	CPT-2015-6: 9-hop done!
2019-02-13 14:53:04	
2019-02-13 14:53:04	accumulating previous hops...
2019-02-13 14:53:06	multiplying: product.dot(m)...
2019-02-13 14:53:14	substracting previous hops from 10hop...
2019-02-13 14:53:15	>0...
2019-02-13 14:53:17	eliminating 0s...
2019-02-13 14:53:17	setting diagonal to zero...
2019-02-13 14:53:18	eliminating 0s...
2019-02-13 14:53:18	to csr int...
2019-02-13 14:53:19	done 10-hop!
2019-02-13 1

2019-02-13 14:55:23	substracting previous hops from 6hop...
2019-02-13 14:55:23	>0...
2019-02-13 14:55:23	eliminating 0s...
2019-02-13 14:55:23	setting diagonal to zero...
2019-02-13 14:55:25	eliminating 0s...
2019-02-13 14:55:25	to csr int...
2019-02-13 14:55:25	done 6-hop!
2019-02-13 14:55:25	MESH-2015-10: 6-hop --> shape:(16947, 16947), sum:13919928!
2019-02-13 14:55:25	MESH-2015-10: 6-hop saving...
2019-02-13 14:55:32	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_6HOP.npz saved!
2019-02-13 14:55:32	MESH-2015-10: 6-hop done!
2019-02-13 14:55:32	
2019-02-13 14:55:32	accumulating previous hops...
2019-02-13 14:55:32	multiplying: product.dot(m)...
2019-02-13 14:55:39	substracting previous hops from 7hop...
2019-02-13 14:55:39	>0...
2019-02-13 14:55:40	eliminating 0s...
2019-02-13 14:55:40	setting diagonal to zero...
2019-02-13 14:55:41	eliminating 0s...
2019-02-13 14:55:41	to csr int...
2019-02-13 14:55:41	done 7-hop!
2019-02-13 14:55:42	MESH-2015-10: 7-hop --> shape:(16947, 

2019-02-13 15:06:35	eliminating 0s...
2019-02-13 15:06:35	to csr int...
2019-02-13 15:06:35	done 18-hop!
2019-02-13 15:06:35	MESH-2015-10: 18-hop --> shape:(16947, 16947), sum:1589188!
2019-02-13 15:06:35	MESH-2015-10: 18-hop saving...
2019-02-13 15:06:36	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_18HOP.npz saved!
2019-02-13 15:06:36	MESH-2015-10: 18-hop done!
2019-02-13 15:06:36	
2019-02-13 15:06:36	accumulating previous hops...
2019-02-13 15:06:39	multiplying: product.dot(m)...
2019-02-13 15:07:25	substracting previous hops from 19hop...
2019-02-13 15:07:29	>0...
2019-02-13 15:07:31	eliminating 0s...
2019-02-13 15:07:31	setting diagonal to zero...
2019-02-13 15:07:31	eliminating 0s...
2019-02-13 15:07:31	to csr int...
2019-02-13 15:07:31	done 19-hop!
2019-02-13 15:07:31	MESH-2015-10: 19-hop --> shape:(16947, 16947), sum:800854!
2019-02-13 15:07:31	MESH-2015-10: 19-hop saving...
2019-02-13 15:07:31	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_19HOP.npz saved!
2

2019-02-13 15:18:09	multiplying: product.dot(m)...
2019-02-13 15:18:55	substracting previous hops from 31hop...
2019-02-13 15:18:59	>0...
2019-02-13 15:19:01	eliminating 0s...
2019-02-13 15:19:01	setting diagonal to zero...
2019-02-13 15:19:02	eliminating 0s...
2019-02-13 15:19:02	to csr int...
2019-02-13 15:19:02	done 31-hop!
2019-02-13 15:19:02	MESH-2015-10: 31-hop --> shape:(16947, 16947), sum:18!
2019-02-13 15:19:02	MESH-2015-10: 31-hop saving...
2019-02-13 15:19:02	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_31HOP.npz saved!
2019-02-13 15:19:02	MESH-2015-10: 31-hop done!
2019-02-13 15:19:02	
2019-02-13 15:19:02	accumulating previous hops...
2019-02-13 15:19:05	multiplying: product.dot(m)...
2019-02-13 15:19:49	substracting previous hops from 32hop...
2019-02-13 15:19:54	>0...
2019-02-13 15:19:56	eliminating 0s...
2019-02-13 15:19:56	setting diagonal to zero...
2019-02-13 15:19:57	eliminating 0s...
2019-02-13 15:19:57	to csr int...
2019-02-13 15:19:57	done 32-hop!
2019-

2019-02-13 15:25:37	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_11HOP.npz saved!
2019-02-13 15:25:37	ICD9CM-2015-8: 11-hop done!
2019-02-13 15:25:37	
2019-02-13 15:25:37	accumulating previous hops...
2019-02-13 15:25:44	multiplying: product.dot(m)...
2019-02-13 15:26:07	substracting previous hops from 12hop...
2019-02-13 15:26:14	>0...
2019-02-13 15:26:16	eliminating 0s...
2019-02-13 15:26:16	setting diagonal to zero...
2019-02-13 15:26:18	eliminating 0s...
2019-02-13 15:26:18	to csr int...
2019-02-13 15:26:18	done 12-hop!
2019-02-13 15:26:18	ICD9CM-2015-8: 12-hop --> shape:(22407, 22407), sum:8172342!
2019-02-13 15:26:18	ICD9CM-2015-8: 12-hop saving...
2019-02-13 15:26:19	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_12HOP.npz saved!
2019-02-13 15:26:19	ICD9CM-2015-8: 12-hop done!
2019-02-13 15:26:19	
2019-02-13 15:26:19	accumulating previous hops...
2019-02-13 15:26:25	multiplying: product.dot(m)...
2019-02-13 15:26:49	substracting previous hops from 13hop..

In [None]:
# Rows that the k-hop loop did not fill have NaN in 'd'; set them to 0 and
# store the column as integers.
# The builtin `int` replaces `np.int`, which was only an alias of it and has
# been removed from NumPy (deprecated in 1.20, removed in 1.24).
df_onto['d'] = df_onto['d'].fillna(0).astype(int)
df_onto

In [None]:
# Rename 'd' to "d'" and fix the column order of the summary table in one
# non-mutating chain. If the selection raises (e.g. a column is missing),
# df_onto is left exactly as it was instead of being half-renamed.
df_onto = df_onto.rename(columns={"d": "d'"})[
    ['#', 'ontology', 'N', 'E', 'k', 'cc', 'T', "N'", "E'", "k'", "d'", "T'"]
]
df_onto.head()

In [None]:
# Both exports share one base name and differ only in the extension.
summary_fn_template = 'lcc_summary_ontologies_transitions_{}.{}'

# CSV export without the row index (`index` is a boolean flag, so pass False
# rather than None).
fn = os.path.join(SUMMARY_OUTPUT, summary_fn_template.format(YEAR, 'csv'))
df_onto.to_csv(fn, index=False)
print('{} saved!'.format(fn))

# LaTeX export: no row index, floats rendered with two decimals.
# `bold_rows` is a boolean that only styles the row labels; it was previously
# given a list and has no effect when index=False, so it is omitted.
fn = os.path.join(SUMMARY_OUTPUT, summary_fn_template.format(YEAR, 'latex'))
txt = df_onto.to_latex(index=False, float_format=lambda x: '%.2f' % x)
with open(fn, 'w') as f:
    f.write(txt)
    f.write('\n')
print('{} saved!'.format(fn))