In [None]:
import sys
# Optional one-off setup: uncomment the line below to install `tldextract` with the
# interpreter that runs this kernel.
# NOTE(review): `tldextract` is not imported in any of the visible cells — confirm it is still required.
# !{sys.executable} -m pip install tldextract

<div style='text-align:center;'>
<h1>BioPortal</h1>
<h2>k-hop matrices</h2>
</div>

In [1]:
# Notebook metadata: authorship, credits, license, version and development status.
__author__ = "Lisette Espin-Noboa"
__copyright__ = "Copyright 2018, HopRank"
__credits__ = ["Florian Lemmerich", "Markus Strohmaier", "Simon Walk", "Mark Musen"]
__license__ = "GPL"
__version__ = "1.0.3"
__maintainer__ = "Lisette Espin-Noboa"
__email__ = "Lisette.Espin@gesis.org"
__status__ = "Developing"

########################################################################################
# Warnings
########################################################################################
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

<h2>Dependencies</h2>

In [2]:
########################################################################################
# System
########################################################################################
import os
import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
from collections import Counter
from joblib import Parallel, delayed
import matplotlib as mpl
import matplotlib.pyplot as plt 
from collections import defaultdict

########################################################################################
# Local Dependencies
########################################################################################
%reload_ext autoreload
%autoreload 2
from org.gesis.libs.bioportal.ontology import Ontology
from org.gesis.libs.bioportal.transition import Transition
from org.gesis.libs.bioportal.clickstream import NAVITYPES
from org.gesis.libs.bioportal.clickstream import load_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_sessions
from org.gesis.libs.bioportal.clickstream import clean_clickstream
from org.gesis.libs.bioportal.submission import get_submissions
from org.gesis.libs.utils import log
from org.gesis.libs.utils import weighted_to_undirected
from org.gesis.libs.utils import to_symmetric

########################################################################################
# Plot's style
########################################################################################
# "paper" plotting context with fonts scaled by 1.5, rendered with the "whitegrid" style.
sns.set_context("paper", font_scale=1.5)
sns.set_style("whitegrid")

<h2>Constants</h2>

In [3]:
# ---------------------------------------------------------------------------------------
# Locations
# ---------------------------------------------------------------------------------------
# Every path below hangs off a single base directory. Set the HOPRANK_BASE_DIR
# environment variable to run the notebook on another machine; when it is unset (or
# empty) the default reproduces the original absolute paths exactly.
BASE_DIR = (os.environ.get('HOPRANK_BASE_DIR') or '/bigdata/lespin').rstrip('/')
DATASETS_DIR = BASE_DIR + '/datasets/bioportal'   # input datasets
RESULTS_DIR = BASE_DIR + '/bioportal'             # derived files (matrices, hops, summaries, ...)

SM = 'submission'
SUBMISSIONS_FN = DATASETS_DIR + '/submissions.json'

ON = 'ontology'
ONTOLOGY_ROOT = DATASETS_DIR + '/ontologies'
ONTOLOGY_GRAPH_OUTPUT = RESULTS_DIR + '/ontologies/graph'
ONTOLOGY_ADJACENCY_OUTPUT = RESULTS_DIR + '/ontologies/matrix'
ONTOLOGY_NODES_OUTPUT = RESULTS_DIR + '/ontologies/nodes'
ONTOLOGY_HOPS_OUTPUT = RESULTS_DIR + '/ontologies/hops'

CS = 'clickstream'
# NOTE(review): <YEAR> and <POSTFIX> look like placeholders filled in by the
# clickstream loader — confirm against org.gesis.libs.bioportal.clickstream.
CS_FN = 'BP_webpage_requests_<YEAR><POSTFIX>.csv.bz2'
CLICKSTREAM_ROOT = DATASETS_DIR + '/clickstream/'   # trailing slash kept on purpose
CLICKSTREAM_DF = RESULTS_DIR + '/clickstreams'

TR = 'transitions'
TRANS_GRAPH_OUTPUT = RESULTS_DIR + '/transitions/graph'
TRANS_ADJACENCY_OUTPUT = RESULTS_DIR + '/transitions/matrix'
TRANS_NODES_OUTPUT = RESULTS_DIR + '/transitions/nodes'
LCC_TRANS_GRAPH_OUTPUT = RESULTS_DIR + '/transitions_in_lcc/graph'
LCC_TRANS_ADJACENCY_OUTPUT = RESULTS_DIR + '/transitions_in_lcc/matrix'
LCC_TRANS_NODES_OUTPUT = RESULTS_DIR + '/transitions_in_lcc/nodes'

SU = 'summary'
LOG_OUTPUT = RESULTS_DIR + '/logs'
PLOTS_OUTPUT = RESULTS_DIR + '/plots'
SUMMARY_OUTPUT = RESULTS_DIR + '/summary'
YEAR = '2015'   # kept as a string: it is used as a dictionary key and in file names

# ---------------------------------------------------------------------------------------
# Thresholds
# ---------------------------------------------------------------------------------------
# NOTE(review): the next three values are not used in the visible cells; they are
# presumably consumed by the clickstream/session preprocessing — confirm.
LAG_BREAK_IN_MINUTES = 60
MIN_SESSION_LENGTH = 2
MIN_REQ = 1000
MIN_TRANSITIONS = 1000   # ontologies need T' >= this value to get k-hop matrices
MAXKHOPS = 32            # passed as `maxk` to Ontology.create_hops_matrices

<div style='text-align:center;'>
<h1>k-hop Neighborhoods</h1>
</div>

In [4]:
print('loading submissions...')
submissions_dict = get_submissions(SUBMISSIONS_FN)

print('loading valid ontologies...')
fn = os.path.join(SUMMARY_OUTPUT, 'summary_ontologies_transitions_{}.{}'.format(YEAR,'csv'))
df_onto = pd.read_csv(fn, index_col=False)
# Work on a copy restricted to ontologies whose T' column reaches MIN_TRANSITIONS.
tmp = df_onto[df_onto["T'"] >= MIN_TRANSITIONS].copy()

# The same flag is handed to both load_adjacency and create_hops_matrices below.
lcc = True
print('LCC: {}'.format(lcc))

print('generating k-hop matrices for {} ontologies...'.format(tmp.ontology.nunique()))
excluded_ontologies = ('SNOMEDCT',)

# Visit the ontologies in increasing order of E'.
for row_idx, onto_row in tmp.sort_values("E'").iterrows():
    onto_name = onto_row['ontology']

    if onto_name not in excluded_ontologies:
        submission = submissions_dict[onto_name][YEAR]
        onto = Ontology(name=onto_name,
                        year=YEAR,
                        submission_id=submission['submissionId'],
                        root_folder=ONTOLOGY_ROOT)
        onto.load_adjacency(path=ONTOLOGY_ADJACENCY_OUTPUT, lcc=lcc)

        # The value returned by create_hops_matrices is stored in column 'd' of the
        # full summary table; rows that are never visited keep NaN there.
        last_k = onto.create_hops_matrices(path=ONTOLOGY_HOPS_OUTPUT, maxk=MAXKHOPS, lcc=lcc)
        df_onto.loc[row_idx, 'd'] = last_k
        print('last k: {}'.format(last_k))

        # Drop the reference to the ontology object before the next iteration.
        del onto

loading submissions...
2019-02-15 18:09:56	/bigdata/lespin/datasets/bioportal/submissions.json loaded!
2019-02-15 18:09:56	- 642 ontologies
2019-02-15 18:09:56	- 13 years
loading valid ontologies...
LCC: True
generating k-hop matrices for 12 ontologies...
2019-02-15 18:09:56	/bigdata/lespin/bioportal/ontologies/matrix/LCC_WHO-ART_2015.npz loaded!
2019-02-15 18:09:56	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_1HOP.npz loaded!
2019-02-15 18:09:56	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_2HOP.npz loaded!
2019-02-15 18:09:56	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_3HOP.npz loaded!
2019-02-15 18:09:56	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_4HOP.npz loaded!
2019-02-15 18:09:56	[Errno 2] No such file or directory: '/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_5HOP.npz'
2019-02-15 18:09:56	ERROR: /bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_5HOP.npz NOT loaded!
2019-02-15 18:09:56	multiplying: 



2019-02-15 18:09:57	eliminating 0s...
2019-02-15 18:09:57	to csr int...
2019-02-15 18:09:57	done 5-hop!
2019-02-15 18:09:57	WHO-ART-2015-6: 5-hop has reached zero!
last k: 4
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/matrix/LCC_CHMO_2015.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_1HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_2HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_3HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_4HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_5HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_6HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_7HOP.npz loaded!
2019-02-15 18:09:57	/bigdata/lespin/bioportal/ontologies/hops/LCC_CHMO_2015_8HOP.npz 

2019-02-15 18:10:02	substracting 19hop from 20hop...
2019-02-15 18:10:02	/bigdata/lespin/bioportal/ontologies/hops/LCC_HL7_2015_19HOP.npz loaded!
2019-02-15 18:10:02	HL7-2015-6: 20-hop has reached zero!
last k: 19
2019-02-15 18:10:02	/bigdata/lespin/bioportal/ontologies/matrix/LCC_ICD10_2015.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_1HOP.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_2HOP.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_3HOP.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_4HOP.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_5HOP.npz loaded!
2019-02-15 18:10:03	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_6HOP.npz loaded!
2019-02-15 18:10:04	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD10_2015_7HOP.npz loaded!
2019-02-15 18:10:06	/bigdata/lespin/b

2019-02-15 18:11:06	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_7HOP.npz loaded!
2019-02-15 18:11:07	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_8HOP.npz loaded!
2019-02-15 18:11:09	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_9HOP.npz loaded!
2019-02-15 18:11:11	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_10HOP.npz loaded!
2019-02-15 18:11:12	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_11HOP.npz loaded!
2019-02-15 18:11:14	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_12HOP.npz loaded!
2019-02-15 18:11:14	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_13HOP.npz loaded!
2019-02-15 18:11:15	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_14HOP.npz loaded!
2019-02-15 18:11:15	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_15HOP.npz loaded!
2019-02-15 18:11:16	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_16HOP.npz loaded!
2019-02-15 18:11:16	/bigdata/lespin/bioportal/ontologies/hops/L

2019-02-15 18:12:25	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_8HOP.npz loaded!
2019-02-15 18:12:29	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_9HOP.npz loaded!
2019-02-15 18:12:33	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_10HOP.npz loaded!
2019-02-15 18:12:35	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_11HOP.npz loaded!
2019-02-15 18:12:36	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_12HOP.npz loaded!
2019-02-15 18:12:36	[Errno 2] No such file or directory: '/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_13HOP.npz'
2019-02-15 18:12:36	ERROR: /bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_13HOP.npz NOT loaded!
2019-02-15 18:12:36	multiplying: product.dot(m)...
2019-02-15 18:12:36	tocsr and int8...
2019-02-15 18:12:36	substracting 12hop from 13hop...
2019-02-15 18:12:36	/bigdata/lespin/bioportal/ontologies/hops/LCC_ICD9CM_2015_12HOP.npz loaded!
2019-02-15 18:12:36	substracting 11hop from 13hop.

2019-02-15 18:16:39	substracting 16hop from 25hop...
2019-02-15 18:16:39	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_16HOP.npz loaded!
2019-02-15 18:16:39	substracting 15hop from 25hop...
2019-02-15 18:16:40	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_15HOP.npz loaded!
2019-02-15 18:16:40	substracting 14hop from 25hop...
2019-02-15 18:16:41	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_14HOP.npz loaded!
2019-02-15 18:16:42	substracting 13hop from 25hop...
2019-02-15 18:16:44	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_13HOP.npz loaded!
2019-02-15 18:16:46	substracting 12hop from 25hop...
2019-02-15 18:16:50	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_12HOP.npz loaded!
2019-02-15 18:16:55	substracting 11hop from 25hop...
2019-02-15 18:16:59	/bigdata/lespin/bioportal/ontologies/hops/LCC_NDFRT_2015_11HOP.npz loaded!
2019-02-15 18:17:07	substracting 10hop from 25hop...
2019-02-15 18:17:12	/bigdata/lespin/bioportal/ontologies/ho

2019-02-15 19:12:05	>0...
2019-02-15 19:16:57	eliminating 0s...
2019-02-15 19:16:57	setting diagonal to zero...
2019-02-15 19:17:03	eliminating 0s...
2019-02-15 19:17:03	to csr int...
2019-02-15 19:17:03	done 14-hop!
2019-02-15 19:17:03	LOINC-2015-9: 14-hop has reached zero!
last k: 13


In [5]:
# Rows without a computed value in 'd' hold NaN; store 0 for them and cast the column
# to integers. The builtin `int` is used because the `np.int` alias was removed from
# NumPy (1.24) and raises AttributeError there; on older versions both are the same type.
df_onto['d'] = df_onto['d'].fillna(0).astype(int)
df_onto

Unnamed: 0,#,ontology,N,E,k,cc,T,N',E',k',T',d
0,1,SNOMEDCT,315684,467027,2.958826,60,110859,315205,466607,2.960657,110441,0
1,2,CPT,13219,13235,2.002421,3,44815,13092,13110,2.00275,44651,15
2,3,MEDDRA,66506,31863,0.958199,43493,71580,22889,31738,2.77321,42746,8
3,4,NDFRT,35019,34504,1.970587,522,24662,32074,32080,2.000374,22452,24
4,5,LOINC,174513,152683,1.749818,73518,7905,100871,152558,3.024814,6349,13
5,6,ICD9CM,22534,22531,1.999734,3,4485,22407,22406,1.999911,4434,12
6,7,WHO-ART,1852,2997,3.236501,3,2826,1725,2872,3.329855,2811,4
7,8,MESH,165166,24182,0.292821,145652,3842,16947,21596,2.548652,2623,31
8,9,ICD10,12446,11256,1.808774,1190,2328,11132,11131,1.99982,2288,10
9,10,CHMO,2966,3071,2.070802,3,1423,2964,3071,2.0722,1423,22


In [6]:
# Rename column d to d' and keep only the summary columns, with d' placed before T'.
summary_columns = ['#','ontology','N','E','k','cc','T', "N'","E'","k'","d'","T'"]
df_onto = df_onto.rename(columns={"d":"d'"})[summary_columns]
df_onto.head()

Unnamed: 0,#,ontology,N,E,k,cc,T,N',E',k',d',T'
0,1,SNOMEDCT,315684,467027,2.958826,60,110859,315205,466607,2.960657,0,110441
1,2,CPT,13219,13235,2.002421,3,44815,13092,13110,2.00275,15,44651
2,3,MEDDRA,66506,31863,0.958199,43493,71580,22889,31738,2.77321,8,42746
3,4,NDFRT,35019,34504,1.970587,522,24662,32074,32080,2.000374,24,22452
4,5,LOINC,174513,152683,1.749818,73518,7905,100871,152558,3.024814,13,6349


In [7]:
# Persist the summary table twice under SUMMARY_OUTPUT: as CSV and as a LaTeX table.
fn = os.path.join(SUMMARY_OUTPUT,'lcc_summary_ontologies_transitions_{}.{}'.format(YEAR,'csv'))
# `index` is a boolean flag: False leaves the row labels out of the file.
df_onto.to_csv(fn, index=False)
print('{} saved!'.format(fn))

fn = os.path.join(SUMMARY_OUTPUT,'lcc_summary_ontologies_transitions_{}.{}'.format(YEAR,'latex'))
# Floats are rendered with two decimals. `bold_rows` is a boolean that only styles row
# labels, and those are not written because index=False, so it is not passed.
txt = df_onto.to_latex(index=False, float_format=lambda x: '%.2f' % x)
with open(fn, 'w') as f:
    f.write(txt)
    f.write('\n')
print('{} saved!'.format(fn))

/bigdata/lespin/bioportal/summary/lcc_summary_ontologies_transitions_2015.csv saved!
/bigdata/lespin/bioportal/summary/lcc_summary_ontologies_transitions_2015.latex saved!
