In [1]:
import sys
# !{sys.executable} -m pip install tldextract

<div style='text-align:center;'>
<h1>BioPortal</h1>
<h2>k-hop matrices</h2>
</div>

In [2]:
__author__ = "Lisette Espin-Noboa"
__copyright__ = "Copyright 2018, HopRank"
__credits__ = ["Florian Lemmerich", "Markus Strohmaier", "Simon Walk", "Mark Musen"]
__license__ = "GPL"
__version__ = "1.0.3"
__maintainer__ = "Lisette Espin-Noboa"
__email__ = "Lisette.Espin@gesis.org"
__status__ = "Developing"

########################################################################################
# Warnings
########################################################################################
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

<h2>Dependencies</h2>

In [3]:
########################################################################################
# System
########################################################################################
import os
import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
from collections import Counter
from joblib import Parallel, delayed
import matplotlib as mpl
import matplotlib.pyplot as plt 
from collections import defaultdict

########################################################################################
# Local Dependencies
########################################################################################
%reload_ext autoreload
%autoreload 2
from org.gesis.libs.bioportal.ontology import Ontology
from org.gesis.libs.bioportal.transition import Transition
from org.gesis.libs.bioportal.clickstream import NAVITYPES
from org.gesis.libs.bioportal.clickstream import load_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_clickstream
from org.gesis.libs.bioportal.clickstream import preprocess_sessions
from org.gesis.libs.bioportal.clickstream import clean_clickstream
from org.gesis.libs.bioportal.submission import get_submissions
from org.gesis.libs.utils import log
from org.gesis.libs.utils import weighted_to_undirected
from org.gesis.libs.utils import to_symmetric

########################################################################################
# Plot's style
########################################################################################
sns.set_context(context="paper", font_scale=1.5)
sns.set_style("whitegrid")

<h2>Constants</h2>

In [4]:
SM = 'submission'
SUBMISSIONS_FN = '/bigdata/lespin/datasets/bioportal/submissions.json'

ON = 'ontology'
ONTOLOGY_ROOT = '/bigdata/lespin/datasets/bioportal/ontologies'
ONTOLOGY_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/ontologies/graph'
ONTOLOGY_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/ontologies/matrix'
ONTOLOGY_NODES_OUTPUT = '/bigdata/lespin/bioportal/ontologies/nodes'
ONTOLOGY_HOPS_OUTPUT = '/bigdata/lespin/bioportal/ontologies/hops'

CS = 'clickstream'
CS_FN = 'BP_webpage_requests_<YEAR><POSTFIX>.csv.bz2'
CLICKSTREAM_ROOT = '/bigdata/lespin/datasets/bioportal/clickstream/'
CLICKSTREAM_DF = '/bigdata/lespin/bioportal/clickstreams'

TR = 'transitions'
TRANS_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/transitions/graph'
TRANS_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/transitions/matrix'
TRANS_NODES_OUTPUT = '/bigdata/lespin/bioportal/transitions/nodes'
LCC_TRANS_GRAPH_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/graph'
LCC_TRANS_ADJACENCY_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/matrix'
LCC_TRANS_NODES_OUTPUT = '/bigdata/lespin/bioportal/transitions_in_lcc/nodes'

SU = 'summary'
LOG_OUTPUT = '/bigdata/lespin/bioportal/logs'
PLOTS_OUTPUT = '/bigdata/lespin/bioportal/plots'
SUMMARY_OUTPUT = '/bigdata/lespin/bioportal/summary'
YEAR = '2015'

LAG_BREAK_IN_MINUTES = 60
MIN_SESSION_LENGTH = 2
MIN_REQ = 1000
MIN_TRANSITIONS = 1000
MAXKHOPS = 32

<div style='text-align:center;'>
<h1>k-hop Neighborhoods</h1>
</div>

In [5]:
print('loading submissions...')
submissions_dict = get_submissions(SUBMISSIONS_FN)

print('loading valid ontologies...')
fn = os.path.join(SUMMARY_OUTPUT,'summary_ontologies_transitions_{}.{}'.format(YEAR,'csv'))
df_onto = pd.read_csv(fn, index_col=False)
tmp = df_onto.loc[df_onto["T'"] >= MIN_TRANSITIONS].copy()

lcc = True
print('LCC: {}'.format(lcc))

print('generating k-hop matrices for {} ontologies...'.format(tmp.ontology.nunique()))
for index,row in tmp.sort_values("E'", ascending=True).iterrows():
    onto_name = row['ontology']
    
    if onto_name in ['LOINC','SNOMEDCT']:
        continue
        
    submission = submissions_dict[onto_name][YEAR]
    onto = Ontology(name=onto_name, year=YEAR, submission_id=submission['submissionId'], root_folder=ONTOLOGY_ROOT)
    onto.load_adjacency(path=ONTOLOGY_ADJACENCY_OUTPUT, lcc=lcc)
    k = onto.create_hops_matrices(path=ONTOLOGY_HOPS_OUTPUT, maxk=MAXKHOPS, lcc=lcc)   
    df_onto.loc[index,'d'] = k

loading submissions...
2019-02-07 22:08:19	/bigdata/lespin/datasets/bioportal/submissions.json loaded!
2019-02-07 22:08:19	- 642 ontologies
2019-02-07 22:08:19	- 13 years
loading valid ontologies...
LCC: True
generating k-hop matrices for 12 ontologies...
2019-02-07 22:08:19	/bigdata/lespin/bioportal/ontologies/matrix/LCC_WHO-ART_2015.npz loaded!
2019-02-07 22:08:19	=== WHO-ART-2015: 1HOP already exists (pass)
2019-02-07 22:08:19	=== WHO-ART-2015: 2HOP already exists (pass)
2019-02-07 22:08:19	=== WHO-ART-2015: 3HOP already exists (pass)
2019-02-07 22:08:19	=== WHO-ART-2015: 4HOP already exists (pass)
2019-02-07 22:08:19	=== WHO-ART-2015: 5HOP ===
2019-02-07 22:08:19	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_4HOP.npz loaded!
2019-02-07 22:08:20	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_4HOP.npz loaded!
2019-02-07 22:08:21	/bigdata/lespin/bioportal/ontologies/hops/LCC_WHO-ART_2015_3HOP.npz loaded!
2019-02-07 22:08:21	B: the matrix has already reached ze

2019-02-07 22:08:28	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_31HOP.npz loaded!
2019-02-07 22:08:29	/bigdata/lespin/bioportal/ontologies/hops/LCC_MESH_2015_30HOP.npz loaded!
2019-02-07 22:08:29	B: the matrix has already reached zero (break). Up to 31HOP
2019-02-07 22:08:29	=== LCC-MESH-2015: done for 31 HOPs! ===
2019-02-07 22:08:29	/bigdata/lespin/bioportal/ontologies/matrix/LCC_ICD9CM_2015.npz loaded!
2019-02-07 22:08:29	=== ICD9CM-2015: 1HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 2HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 3HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 4HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 5HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 6HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 7HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 8HOP already exists (pass)
2019-02-07 22:08:29	=== ICD9CM-2015: 9HOP already exists (pass)
201

In [6]:
df_onto.d = df_onto.d.fillna(0).astype(np.int)
df_onto.head()

Unnamed: 0,#,ontology,N,E,k,cc,T,N',E',k',T',d
0,1,SNOMEDCT,315684,467027,2.958826,60,110859,315205,466607,2.960657,110441,0
1,2,CPT,13219,13235,2.002421,3,44815,13092,13110,2.00275,44651,15
2,3,MEDDRA,66506,31863,0.958199,43493,71580,22889,31738,2.77321,42746,8
3,4,NDFRT,35019,34504,1.970587,522,24662,32074,32080,2.000374,22452,24
4,5,LOINC,174513,152683,1.749818,73518,7905,100871,152558,3.024814,6349,0


In [7]:
df_onto.rename(columns={"d":"d'"},inplace=True)
df_onto = df_onto[['#','ontology','N','E','k','cc','T', "N'","E'","k'","d'","T'"]]
df_onto.head()

Unnamed: 0,#,ontology,N,E,k,cc,T,N',E',k',d',T'
0,1,SNOMEDCT,315684,467027,2.958826,60,110859,315205,466607,2.960657,0,110441
1,2,CPT,13219,13235,2.002421,3,44815,13092,13110,2.00275,15,44651
2,3,MEDDRA,66506,31863,0.958199,43493,71580,22889,31738,2.77321,8,42746
3,4,NDFRT,35019,34504,1.970587,522,24662,32074,32080,2.000374,24,22452
4,5,LOINC,174513,152683,1.749818,73518,7905,100871,152558,3.024814,0,6349


In [8]:
fn = os.path.join(SUMMARY_OUTPUT,'lcc_summary_ontologies_transitions_{}.{}'.format(YEAR,'csv'))
df_onto.to_csv(fn, index=None)
print('{} saved!'.format(fn))

fn = os.path.join(SUMMARY_OUTPUT,'lcc_summary_ontologies_transitions_{}.{}'.format(YEAR,'latex'))
txt = df_onto.to_latex(index=False, float_format=lambda x: '%.2f' % x, bold_rows=[0])
with open(fn, 'w') as f:
    f.write(txt)
    f.write('\n')
print('{} saved!'.format(fn))

/bigdata/lespin/bioportal/summary/lcc_summary_ontologies_transitions_2015.csv saved!
/bigdata/lespin/bioportal/summary/lcc_summary_ontologies_transitions_2015.latex saved!
