# KFIR

# SETUP: DEPENDENCIES AND CREDENTIALS

## Working Directory

What is the current working directory?:

In [11]:
import os
os.getcwd()

'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR\\notebooks'

Add parent directory to path if necessary:

In [12]:
import sys, os, re

working_directory = os.getcwd()
if re.search('\\\\notebooks$', working_directory):
    one_directory_up = re.sub('\\\\notebooks$', '', working_directory)
    sys.path.append(one_directory_up)
    
sys.path

['',
 'C:\\ProgramData\\Anaconda3\\python36.zip',
 'C:\\ProgramData\\Anaconda3\\DLLs',
 'C:\\ProgramData\\Anaconda3\\lib',
 'C:\\ProgramData\\Anaconda3',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.1-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Clokman\\.ipython',
 'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR',
 'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR']

## Initialize General Packages

In [13]:
import pandas
import numpy

## Initialize Plotly

Check current version:

In [14]:
from plotly import __version__ as plotly_version
plotly_version

'2.5.1'

### Online Plotly

Read plotly credentials from file:

In [15]:
from preprocessor.Text_File import Text_File

plotly_file = Text_File('..//private//plotly_credentials')
plotly_file = plotly_file.return_content()
plotly_credentials = plotly_file.splitlines()

plotly_username = plotly_credentials[0]
plotly_key = plotly_credentials[1]

Set parameters for online usage:

In [16]:
import plotly.plotly as plotly_online
iplot_online = plotly_online.iplot

import plotly.graph_objs as graph_objects
from plotly.tools import set_credentials_file

set_credentials_file(username=plotly_username, api_key=plotly_key)  # put your own plotly username and api key here 

### Offline Plotly

Setup for offline usage:

In [17]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## Initialize Retriever: Gastrodon_Query

Import Gastrodon_Query (for running SPARQL queries in Jupyter):

In [18]:
from retriever.sparql_tools import Gastrodon_Query

Initialize eculture query:

In [19]:
eculture_query = Gastrodon_Query()

Define prefixes:

In [113]:
eculture_query.set_prefixes("""
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix dbo: <http://dbpedia.org/ontology/> .
    
    @prefix wos: <http://wos.risis.eu/vocabulary/> .
    @prefix wosres: <http://wos.risis.eu/resource/> .
    @prefix kfir: <http://clokman.com/kfir/ontology#> .
    @prefix ldr: <https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#> .

    @prefix wosGraph: <http://clokman.com/wos> .
    @prefix kfirGraph: <http://clokman.com/kfir> .
    @prefix testGraph: <http://clokman.com/test> .
""")

eculture_query._get_prefixes()

{rdflib.term.URIRef('http://clokman.com/kfir'): 'kfirGraph',
 rdflib.term.URIRef('http://clokman.com/kfir/ontology#'): 'kfir',
 rdflib.term.URIRef('http://clokman.com/test'): 'testGraph',
 rdflib.term.URIRef('http://clokman.com/wos'): 'wosGraph',
 rdflib.term.URIRef('http://dbpedia.org/ontology/'): 'dbo',
 rdflib.term.URIRef('http://wos.risis.eu/resource/'): 'wosres',
 rdflib.term.URIRef('http://wos.risis.eu/vocabulary/'): 'wos',
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#'): 'rdf',
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#'): 'rdfs',
 rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#'): 'xsd',
 rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'): 'skos',
 rdflib.term.URIRef('http://www.w3.org/XML/1998/namespace'): 'xml',
 rdflib.term.URIRef('https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#'): 'ldr'}

Read endpoint from file:

In [114]:
from preprocessor.Text_File import Text_File

# Get endpoint address from file
eculture_endpoint_url_file = Text_File('..//private//eculture_virtuoso_endpoint_address')
eculture_endpoint_url = eculture_endpoint_url_file.return_content()

Define endpoint:

In [115]:
eculture_query.set_endpoint(eculture_endpoint_url)

<retriever.sparql_tools.Gastrodon_Query at 0x2ecbde617b8>

# QUERIES

## Database Statistics

Get counts for common fields:

In [116]:
wos_mappings = {'wos:TI':  'title', # wos: is defined in prefixes variable
                'wos:AF':  'author',
                'wos:SN':  'issn',
                'wos:DOI': 'doi',
                'wos:EM':  'email',
                'wos:DE':  'keywords_author',
                'wos:ID':  'keywords_plus',
                'wos:SC':  'subject_category',
                'wos:WC':  'web_of_science_category',
                'wos:PY':  'publication_year',
                'wos:CR':  'has_cited',
                'wos:NR':  'has_cited_count',
                'wos:Z9':  'cited_by_count_universal',
                'wos:TC':  'cited_by_count_local',
                'wos:SO':  'source_publication',
                'wos:PU':  'publisher',
                'wos:C1':  'author_address',
                'ldr:annotations': 'annotation'
                }

wos_field_counts = {}

print('Counting...')
for each_wos_field_name_abbreviation, each_field_name in wos_mappings.items():
    each_count = eculture_query.send_count_query(each_field_name + "s","""
    SELECT (COUNT(DISTINCT ?%s) as ?%ss) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article %s ?%s .
        }
    }
    """ % (each_field_name, each_field_name, each_wos_field_name_abbreviation, each_field_name))
    wos_field_counts[each_field_name] = each_count
    print (each_wos_field_name_abbreviation, '/', each_field_name, ': ', each_count)
print('Counting finished.')

Counting...
wos:TI / title :  135985
wos:AF / author :  3485320
wos:SN / issn :  9627
wos:DOI / doi :  123505
wos:EM / email :  51997
wos:DE / keywords_author :  125552
wos:ID / keywords_plus :  156689
wos:SC / subject_category :  151
wos:WC / web_of_science_category :  2323
wos:PY / publication_year :  35
wos:CR / has_cited :  2854040
wos:NR / has_cited_count :  351
wos:Z9 / cited_by_count_universal :  880
wos:TC / cited_by_count_local :  852
wos:SO / source_publication :  9708
wos:PU / publisher :  2354
wos:C1 / author_address :  118156
ldr:annotations / annotation :  2158243
Counting finished.


Get number of articles:

In [117]:
article_count = eculture_query.send_count_query('articles', """
    SELECT (COUNT(DISTINCT ?article) as ?articles) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
        }
    }
""")

article_count

136125

Add article_count to wos_field_counts:

In [118]:
wos_field_counts['article'] = article_count
wos_field_counts

{'annotation': 2158243,
 'article': 136125,
 'author': 3485320,
 'author_address': 118156,
 'cited_by_count_local': 852,
 'cited_by_count_universal': 880,
 'doi': 123505,
 'email': 51997,
 'has_cited': 2854040,
 'has_cited_count': 351,
 'issn': 9627,
 'keywords_author': 125552,
 'keywords_plus': 156689,
 'publication_year': 35,
 'publisher': 2354,
 'source_publication': 9708,
 'subject_category': 151,
 'title': 135985,
 'web_of_science_category': 2323}

Put results in a dataframe and sort them:

In [119]:
wos_field_counts_dataframe = pandas.Series(wos_field_counts)
wos_sorted_counts_dataframe = wos_field_counts_dataframe.sort_values(inplace=True, ascending=False)
wos_field_counts_dataframe

author                      3485320
has_cited                   2854040
annotation                  2158243
keywords_plus                156689
article                      136125
title                        135985
keywords_author              125552
doi                          123505
author_address               118156
email                         51997
source_publication             9708
issn                           9627
publisher                      2354
web_of_science_category        2323
cited_by_count_universal        880
cited_by_count_local            852
has_cited_count                 351
subject_category                151
publication_year                 35
dtype: int64

Plot results:

In [120]:
wos_field_counts_labels = list(wos_field_counts_dataframe.keys())
wos_field_counts_values = list(wos_field_counts_dataframe)

data = [graph_objects.Bar(x=wos_field_counts_labels,
                          y=wos_field_counts_values)]

iplot_online(data)

## Exploring and Mapping the Database

A function to retrieve all attributes related to a target property (e.g., author --> author label, author alternative label)

In [121]:
def retrieve_all_sub_attributes (target_property_of_articles):    

    result = eculture_query.send_select_query("""
        SELECT DISTINCT ?p
        WHERE{
            GRAPH wosGraph: {
                ?article a wos:Article .
                ?article %s ?target_object .
                ?target_object ?p ?o .
            }
        }
    """ % target_property_of_articles)
    
    return result

### Titles

Titles have no other properties attached to them:

In [122]:
retrieve_all_sub_attributes('wos:TI')

Unnamed: 0,p


Display all titles in a table:

In [144]:
eculture_query.send_select_query("""

    SELECT (?article AS ?wosArticleUri) ?title
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:TI ?title .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,title
0,wosres:WOS_000060208200006,Hollywood Berlin (The popularity of Nazi entertainment films in Germany under Hitler)
1,wosres:WOS_000070935900005,A randomized trial of anticoagulants versus aspirin after cerebral ischemia of presumed arterial...
2,wosres:WOS_000070948900005,The strength of numbers: Enumerating communities in India's princely states
3,wosres:WOS_000070961600011,Some patients with intracranial aneurysms have a reduced type III type I collagen ratio - A case...
4,wosres:WOS_000070961600033,Improving interobserver variation in reporting gadolinium-enhanced MRI lesions in multiple scler...
5,wosres:WOS_000070969600003,A physically active lifestyle - public health's best buy?
6,wosres:WOS_000070970500011,The effect of reciprocal treatments with ozone and ultraviolet-B radiation on photosynthesis and...
7,wosres:WOS_000070998100010,Compliance in administration of prescribed analgesics
8,wosres:WOS_000070998900007,Reconstruction of optical pathlength distributions from images obtained by a wide-field differen...
9,wosres:WOS_000071006900008,Does metformin increase the serum total homocysteine level in non-insulin-dependent diabetes mel...


### Authors

In [124]:
retrieve_all_sub_attributes('wos:AF')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,skos:altLabel


Display all attributes related to authors in a table:

In [149]:
eculture_query.send_select_query("""

    SELECT (?article AS ?isAuthorOf_wosArticleUri) (?author AS ?wosAuthorCompoundUri) ?authorName ?authorAltName
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:AF ?author .
            ?author rdfs:label ?authorName .
            ?author skos:altLabel ?authorAltName .
        }
    }
    LIMIT 10

""")

Unnamed: 0,isAuthorOf_wosArticleUri,wosAuthorCompoundUri,authorName,authorAltName
0,wosres:WOS_000060208200006,wosres:WOS_000060208200006_Elsaesser_T,"Elsaesser, T","Elsaesser, T"
1,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Aten_JA,"Aten, JA","Aten, JA"
2,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Munster_EB,"Van Munster, EB","Van Munster, EB"
3,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Vliet_LJ,"Van Vliet, LJ","Van Vliet, LJ"
4,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Beer_K,"de Beer, K","de Beer, K"
5,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Voogt_P,"de Voogt, P","de Voogt, P"
6,wosres:WOS_000071084600009,wosres:WOS_000071084600009_van_der_Wielen_F,"van der Wielen, F","van der Wielen, F"
7,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Faas_BHW,"Faas, BHW","Faas, BHW"
8,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Ligthart_PC,"Ligthart, PC","Ligthart, PC"
9,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Lomas-Francis_C,"Lomas-Francis, C","Lomas-Francis, C"


### Keywords (by Authors)

Keywords has no other properties attached to them:

In [154]:
retrieve_all_sub_attributes('wos:DE')

Unnamed: 0,p


In [204]:
articles_and_keywords_dataframe = eculture_query.send_select_query("""

    SELECT (?article AS ?wosArticleUri) ?keywords
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article;
                     wos:DE ?keywords .
        }
    }
    LIMIT 10

""")

articles_and_keywords_dataframe

Unnamed: 0,wosArticleUri,keywords
0,wosres:WOS_000070970500011,Elymus athericus; growth; photosynthesis; ozone; UV-B radiation
1,wosres:WOS_000070998100010,"pain, postoperative; analgesics, prescribing"
2,wosres:WOS_000070998900007,DIC; Nomarski; interference; microscopy; CCD; image processing;
3,wosres:WOS_000070998900007,analysis; reconstruction; optical pathlength; phase; transparent; living
4,wosres:WOS_000071006900008,atherosclerosis; homocysteine; metformin; vitamin B-12
5,wosres:WOS_000071013000007,policy; household economics
6,wosres:WOS_000071013000007,sub-Saharan Africa; Swaziland; labor migration; food security; labor
7,wosres:WOS_000071021600006,nitric oxide radical; NO scavenging; thiol; S-nitrosothiol
8,wosres:WOS_000071021600006,(electrochemical); NO sensing
9,wosres:WOS_000071040300005,lumbar spine; vertebra; trabecular bone; Wolff's Law; intervertebral


In [224]:
articles_and_keywords_dictionary = articles_and_keywords_dataframe.to_dict('split')

for each_entry in articles_and_keywords_dictionary['data']:
    
    each_article_id = each_entry[0]
    each_keywords_string = each_entry[1]
    
    each_keywords_list = each_keywords_string.split('; ')
    print(each_article_id, each_keywords_list)

# TODO: Each keyword will be inserted as a new articleId-keyword pair to the wos triple store

wosres:WOS_000070970500011 ['Elymus athericus', 'growth', 'photosynthesis', 'ozone', 'UV-B radiation']
wosres:WOS_000070998100010 ['pain, postoperative', 'analgesics, prescribing']
wosres:WOS_000070998900007 ['DIC', 'Nomarski', 'interference', 'microscopy', 'CCD', 'image processing;']
wosres:WOS_000070998900007 ['analysis', 'reconstruction', 'optical pathlength', 'phase', 'transparent', 'living']
wosres:WOS_000071006900008 ['atherosclerosis', 'homocysteine', 'metformin', 'vitamin B-12']
wosres:WOS_000071013000007 ['policy', 'household economics']
wosres:WOS_000071013000007 ['sub-Saharan Africa', 'Swaziland', 'labor migration', 'food security', 'labor']
wosres:WOS_000071021600006 ['nitric oxide radical', 'NO scavenging', 'thiol', 'S-nitrosothiol']
wosres:WOS_000071021600006 ['(electrochemical)', 'NO sensing']
wosres:WOS_000071040300005 ['lumbar spine', 'vertebra', 'trabecular bone', "Wolff's Law", 'intervertebral']


In [87]:
eculture_query.send_update_query

<bound method Gastrodon_Query.send_update_query of <retriever.sparql_tools.Gastrodon_Query object at 0x000002ECBDE617B8>>