# KFIR

# SETUP: DEPENDENCIES AND CREDENTIALS

## Working Directory

What is the current working directory?:

In [2]:
import os
os.getcwd()

'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR\\notebooks'

Add parent directory to path if necessary:

In [3]:
import sys, os, re

# Make the project root importable when the notebook is started from the
# `notebooks` sub-directory. os.path is used instead of a backslash-only
# regular expression so the check also works with POSIX path separators.
working_directory = os.getcwd()
if os.path.basename(working_directory) == 'notebooks':
    one_directory_up = os.path.dirname(working_directory)
    # Avoid piling up duplicate entries when this cell is re-run.
    if one_directory_up not in sys.path:
        sys.path.append(one_directory_up)

sys.path

['',
 'C:\\ProgramData\\Anaconda3\\python36.zip',
 'C:\\ProgramData\\Anaconda3\\DLLs',
 'C:\\ProgramData\\Anaconda3\\lib',
 'C:\\ProgramData\\Anaconda3',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.1-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Clokman\\.ipython',
 'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR']

## Initialize General Packages

In [4]:
import pandas
import numpy

## Initialize Plotly

Check current version:

In [5]:
from plotly import __version__ as plotly_version
plotly_version

'2.5.1'

### Online Plotly

Read plotly credentials from file:

In [6]:
from preprocessor.Text_File import Text_File

# Read the credentials file; `plotly_file` ends up holding its text content.
plotly_file = Text_File('..//private//plotly_credentials').return_content()

# The first line of the file is used as the username, the second as the API key.
plotly_credentials = plotly_file.splitlines()
plotly_username, plotly_key = plotly_credentials[0], plotly_credentials[1]

Set parameters for online usage:

In [7]:
import plotly.plotly as plotly_online
iplot_online = plotly_online.iplot

import plotly.graph_objs as graph_objects
from plotly.tools import set_credentials_file

set_credentials_file(username=plotly_username, api_key=plotly_key)  # put your own plotly username and api key here 

### Offline Plotly

Setup for offline usage:

In [8]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## Initialize Retriever: Gastrodon_Query

Import Gastrodon_Query (for running SPARQL queries in Jupyter):

In [9]:
from retriever.sparql_tools import Gastrodon_Query

Initialize eculture query:

In [10]:
eculture_query = Gastrodon_Query()

Define prefixes:

In [11]:
eculture_query.set_prefixes("""
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix dbo: <http://dbpedia.org/ontology/> .
    
    @prefix wos: <http://wos.risis.eu/vocabulary/> .
    @prefix wosres: <http://wos.risis.eu/resource/> .
    @prefix kfir: <http://clokman.com/kfir/ontology#> .
    @prefix ldr: <https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#> .

    @prefix wosGraph: <http://clokman.com/wos> .
    @prefix kfirGraph: <http://clokman.com/kfir> .
    @prefix testGraph: <http://clokman.com/test> .
""")

eculture_query._get_prefixes()

{rdflib.term.URIRef('http://clokman.com/kfir'): 'kfirGraph',
 rdflib.term.URIRef('http://clokman.com/kfir/ontology#'): 'kfir',
 rdflib.term.URIRef('http://clokman.com/test'): 'testGraph',
 rdflib.term.URIRef('http://clokman.com/wos'): 'wosGraph',
 rdflib.term.URIRef('http://dbpedia.org/ontology/'): 'dbo',
 rdflib.term.URIRef('http://wos.risis.eu/resource/'): 'wosres',
 rdflib.term.URIRef('http://wos.risis.eu/vocabulary/'): 'wos',
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#'): 'rdf',
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#'): 'rdfs',
 rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#'): 'xsd',
 rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'): 'skos',
 rdflib.term.URIRef('http://www.w3.org/XML/1998/namespace'): 'xml',
 rdflib.term.URIRef('https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#'): 'ldr'}

Read endpoint from file:

In [12]:
from preprocessor.Text_File import Text_File

# Get endpoint address from file
eculture_endpoint_url_file = Text_File('..//private//eculture_virtuoso_endpoint_address')
eculture_endpoint_url = eculture_endpoint_url_file.return_content()

Define endpoint:

In [13]:
eculture_query.set_endpoint(eculture_endpoint_url)

<retriever.sparql_tools.Gastrodon_Query at 0x19c95777cf8>

# QUERIES

## Database Statistics

Get counts for common fields:

In [30]:
# Predicate (prefixed name) -> readable field name. The `wos:` and `ldr:`
# prefixes are the ones registered on the query object via set_prefixes().
wos_mappings = {
    'wos:TI':  'title',
    'wos:AF':  'author',
    'wos:SN':  'issn',
    'wos:DOI': 'doi',
    'wos:EM':  'email',
    'wos:DE':  'keywords_author',
    'wos:ID':  'keywords_plus',
    'wos:SC':  'subject_category',
    'wos:WC':  'web_of_science_category',
    'wos:PY':  'publication_year',
    'wos:CR':  'has_cited',
    'wos:NR':  'has_cited_count',
    'wos:Z9':  'cited_by_count_universal',
    'wos:TC':  'cited_by_count_local',
    'wos:SO':  'source_publication',
    'wos:PU':  'publisher',
    'wos:C1':  'author_address',
    'ldr:annotations': 'annotation',
}

# Filled below: readable field name -> count returned by the endpoint.
wos_field_counts = {}

print('Counting...')
for each_wos_field_name_abbreviation, each_field_name in wos_mappings.items():
    # The count is bound to the pluralised field name (e.g. ?titles), which is
    # also the variable name handed to send_count_query().
    count_variable = each_field_name + "s"
    count_query = """
    SELECT (COUNT(DISTINCT ?%s) as ?%ss) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article %s ?%s .
        }
    }
    """ % (each_field_name, each_field_name, each_wos_field_name_abbreviation, each_field_name)

    each_count = eculture_query.send_count_query(count_variable, count_query)
    wos_field_counts[each_field_name] = each_count
    print(each_wos_field_name_abbreviation, '/', each_field_name, ': ', each_count)
print('Counting finished.')

Counting...
wos:TI / title :  135985
wos:AF / author :  3485320
wos:SN / issn :  9627
wos:DOI / doi :  123505
wos:EM / email :  51997
wos:DE / keywords_author :  125552
wos:ID / keywords_plus :  156689
wos:SC / subject_category :  151
wos:WC / web_of_science_category :  2323
wos:PY / publication_year :  35
wos:CR / has_cited :  2854040
wos:NR / has_cited_count :  351
wos:Z9 / cited_by_count_universal :  880
wos:TC / cited_by_count_local :  852
wos:SO / source_publication :  9708
wos:PU / publisher :  2354
wos:C1 / author_address :  118156
ldr:annotations / annotation :  2158243
Counting finished.


Get number of articles:

In [31]:
article_count = eculture_query.send_count_query('articles', """
    SELECT (COUNT(DISTINCT ?article) as ?articles) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
        }
    }
""")

article_count

136125

Add article_count to wos_field_counts:

In [32]:
wos_field_counts['article'] = article_count
wos_field_counts

{'annotation': 2158243,
 'article': 136125,
 'author': 3485320,
 'author_address': 118156,
 'cited_by_count_local': 852,
 'cited_by_count_universal': 880,
 'doi': 123505,
 'email': 51997,
 'has_cited': 2854040,
 'has_cited_count': 351,
 'issn': 9627,
 'keywords_author': 125552,
 'keywords_plus': 156689,
 'publication_year': 35,
 'publisher': 2354,
 'source_publication': 9708,
 'subject_category': 151,
 'title': 135985,
 'web_of_science_category': 2323}

Put the results in a pandas Series and sort them in descending order:

In [33]:
# Build a Series of counts indexed by field name, largest count first.
# sort_values() is called without inplace=True: the in-place form returns
# None, which previously left `wos_sorted_counts_dataframe` bound to None.
wos_sorted_counts_dataframe = pandas.Series(wos_field_counts).sort_values(ascending=False)

# The sorted Series is also kept under the original name, which the plotting
# cell reads.
wos_field_counts_dataframe = wos_sorted_counts_dataframe
wos_field_counts_dataframe

author                      3485320
has_cited                   2854040
annotation                  2158243
keywords_plus                156689
article                      136125
title                        135985
keywords_author              125552
doi                          123505
author_address               118156
email                         51997
source_publication             9708
issn                           9627
publisher                      2354
web_of_science_category        2323
cited_by_count_universal        880
cited_by_count_local            852
has_cited_count                 351
subject_category                151
publication_year                 35
dtype: int64

Plot results:

In [34]:
# Bar labels are the keys of the counts Series (field names); bar heights are
# its values, in the Series' current order.
wos_field_counts_labels = list(wos_field_counts_dataframe.keys())
wos_field_counts_values = list(wos_field_counts_dataframe)

# A single bar trace with one bar per field.
data = [graph_objects.Bar(x=wos_field_counts_labels,
                          y=wos_field_counts_values)]

# iplot_online is plotly.plotly.iplot (bound in the online-Plotly setup cell).
iplot_online(data, filename='wos_demographics')

## Mapping and Transforming the Database

A function to retrieve all attributes related to a target property (e.g., author --> author label, author alternative label)

In [14]:
def retrieve_all_sub_attributes(target_property_of_articles):
    """List the predicates attached to the objects of an article property.

    For every ``wos:Article`` in ``wosGraph:``, the objects reached through
    ``target_property_of_articles`` are matched, and the distinct predicates
    (``?p``) leaving those objects are selected.

    Args:
        target_property_of_articles: Predicate exactly as it should appear in
            the query text (e.g. ``'wos:AF'``); it is substituted verbatim.

    Returns:
        Whatever ``eculture_query.send_select_query`` returns for the query.
    """
    sub_attribute_query = """
        SELECT DISTINCT ?p
        WHERE{
            GRAPH wosGraph: {
                ?article a wos:Article .
                ?article %s ?target_object .
                ?target_object ?p ?o .
            }
        }
    """ % target_property_of_articles

    return eculture_query.send_select_query(sub_attribute_query)

### Titles

Titles have no other properties attached to them:

In [36]:
retrieve_all_sub_attributes('wos:TI')

Unnamed: 0,p


Display titles in a table:

In [37]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?title
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:TI ?title .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,title
0,wosres:WOS_000060208200006,Hollywood Berlin (The popularity of Nazi entertainment films in Germany under Hitler)
1,wosres:WOS_000070935900005,A randomized trial of anticoagulants versus aspirin after cerebral ischemia of presumed arterial...
2,wosres:WOS_000070948900005,The strength of numbers: Enumerating communities in India's princely states
3,wosres:WOS_000070961600011,Some patients with intracranial aneurysms have a reduced type III type I collagen ratio - A case...
4,wosres:WOS_000070961600033,Improving interobserver variation in reporting gadolinium-enhanced MRI lesions in multiple scler...
5,wosres:WOS_000070969600003,A physically active lifestyle - public health's best buy?
6,wosres:WOS_000070970500011,The effect of reciprocal treatments with ozone and ultraviolet-B radiation on photosynthesis and...
7,wosres:WOS_000070998100010,Compliance in administration of prescribed analgesics
8,wosres:WOS_000070998900007,Reconstruction of optical pathlength distributions from images obtained by a wide-field differen...
9,wosres:WOS_000071006900008,Does metformin increase the serum total homocysteine level in non-insulin-dependent diabetes mel...


Count the number of **article-title pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [39]:
no_of_article_title_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithTitle) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article .
                                       ?article wos:TI ?title .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithTitle')
no_of_article_title_pairs

136125

### Authors

In [30]:
retrieve_all_sub_attributes('wos:AF')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,skos:altLabel


Display all attributes related to authors in a table:

In [40]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?author AS ?wosAuthorCompoundUri) ?authorName ?authorAltName
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:AF ?author .
            ?author rdfs:label ?authorName .
            ?author skos:altLabel ?authorAltName .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,wosAuthorCompoundUri,authorName,authorAltName
0,wosres:WOS_000060208200006,wosres:WOS_000060208200006_Elsaesser_T,"Elsaesser, T","Elsaesser, T"
1,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Aten_JA,"Aten, JA","Aten, JA"
2,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Munster_EB,"Van Munster, EB","Van Munster, EB"
3,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Vliet_LJ,"Van Vliet, LJ","Van Vliet, LJ"
4,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Beer_K,"de Beer, K","de Beer, K"
5,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Voogt_P,"de Voogt, P","de Voogt, P"
6,wosres:WOS_000071084600009,wosres:WOS_000071084600009_van_der_Wielen_F,"van der Wielen, F","van der Wielen, F"
7,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Faas_BHW,"Faas, BHW","Faas, BHW"
8,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Ligthart_PC,"Ligthart, PC","Ligthart, PC"
9,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Lomas-Francis_C,"Lomas-Francis, C","Lomas-Francis, C"


Count the number of **article-author pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [41]:
no_of_article_author_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?author) AS ?authors) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article .
                                       ?article wos:AF ?author .
                                       ?author rdfs:label ?authorName .
                                       ?author skos:altLabel ?authorAltName .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='authors')
no_of_article_author_pairs

3485373

Even though the 'maximum lines' and related 'query time' parameters are set to very liberal values, the query still returns at most about 1 million lines (possibly due to a 100 MB file size limit). Therefore, the query is entered in the Virtuoso GUI 4 times, with differing OFFSET values (i.e., LIMIT = 1000000 and OFFSET = 0, 1000000, 2000000, 3000000).

### Years

'Years' has no attributes stemming from it:

In [141]:
retrieve_all_sub_attributes('wos:PY')

Unnamed: 0,p


Display all attributes related to years in a table:

In [145]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?publicationYear
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:PY ?publicationYear
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,publicationYear
0,wosres:WOS_000060208200006,1997
1,wosres:WOS_000070935900005,1997
2,wosres:WOS_000070948900005,1997
3,wosres:WOS_000070961600011,1997
4,wosres:WOS_000070961600033,1997
5,wosres:WOS_000070969600003,1997
6,wosres:WOS_000070970500011,1997
7,wosres:WOS_000070998100010,1997
8,wosres:WOS_000070998900007,1997
9,wosres:WOS_000071006900008,1997


Count the number of **article-year pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [146]:
no_of_article_year_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithYears) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article ;
                                                wos:PY ?publicationYear
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithYears')
no_of_article_year_pairs

136125

### DOI

In [14]:
retrieve_all_sub_attributes('wos:DOI')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,wos:DOI


Display all attributes related to DOI in a table:

In [42]:
eculture_query.send_select_query("""

    SELECT DISTINCT *
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:DOI ?doi .
                     
            ?doi rdfs:label ?doiLabel ;
                          wos:DOI ?doiDoi ;
                          a       ?doiType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,article,doi,doiLabel,doiDoi,doiType
0,wosres:WOS_000070998900007,http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x,"Van Munster EB, 1997, J MICROSC-OXFORD, V188, P149, DOI 10.1046/j.1365-2818.1997.2570815.x",http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x,wos:CitedPublication
1,wosres:WOS_000071636500010,http://dx.doi.org/10.1002%2F(SICI)1096-9896(199801)184%3A1%3C53%3A%3AAID-PATH6%3E3.0.CO%3B2-7,"De Jong JS, 1998, J PATHOL, V184, P53, DOI 10.1002/(SICI)1096-9896(199801)184:1<53::AID-PATH6>3....",http://dx.doi.org/10.1002%2F(SICI)1096-9896(199801)184%3A1%3C53%3A%3AAID-PATH6%3E3.0.CO%3B2-7,wos:CitedPublication
2,wosres:WOS_000072954400012,http://dx.doi.org/10.1002%2Fana.410430413,"van den Berg JSP, 1998, ANN NEUROL, V43, P494, DOI 10.1002/ana.410430413",http://dx.doi.org/10.1002%2Fana.410430413,wos:CitedPublication
3,wosres:WOS_000071179700013,http://dx.doi.org/10.1006%2Fexer.1997.0396,"Broekhuyse RM, 1997, EXP EYE RES, V65, P841, DOI 10.1006/exer.1997.0396",http://dx.doi.org/10.1006%2Fexer.1997.0396,wos:CitedPublication
4,wosres:WOS_000071912700002,http://dx.doi.org/10.1006%2Fgcen.1997.7001,"de Lange RPJ, 1998, GEN COMP ENDOCR, V109, P166, DOI 10.1006/gcen.1997.7001",http://dx.doi.org/10.1006%2Fgcen.1997.7001,wos:CitedPublication
5,wosres:WOS_000074708500005,http://dx.doi.org/10.1006%2Fjdeq.1998.3428,"Gohberg I, 1998, J DIFFER EQUATIONS, V146, P375, DOI 10.1006/jdeq.1998.3428",http://dx.doi.org/10.1006%2Fjdeq.1998.3428,wos:CitedPublication
6,wosres:WOS_000073621000010,http://dx.doi.org/10.1007%2Fs002210050380,"Toussaint HM, 1998, EXP BRAIN RES, V120, P85, DOI 10.1007/s002210050380",http://dx.doi.org/10.1007%2Fs002210050380,wos:CitedPublication
7,wosres:WOS_000073621900004,http://dx.doi.org/10.1007%2Fs002510050382,"Bouma G, 1998, IMMUNOGENETICS, V47, P451, DOI 10.1007/s002510050382",http://dx.doi.org/10.1007%2Fs002510050382,wos:CitedPublication
8,wosres:WOS_000071074300003,http://dx.doi.org/10.1007%2Fs004320050111,"van der Wilt CL, 1997, J CANCER RES CLIN, V123, P595, DOI 10.1007/s004320050111",http://dx.doi.org/10.1007%2Fs004320050111,wos:CitedPublication
9,wosres:WOS_000074171100014,http://dx.doi.org/10.1007%2Fs004390050745,"Collee JM, 1998, HUM GENET, V102, P587, DOI 10.1007/s004390050745",http://dx.doi.org/10.1007%2Fs004390050745,wos:CitedPublication


Display only the attributes of interest:

In [55]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?doi
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:DOI ?doi .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,doi
0,wosres:WOS_000070948900005,http://dx.doi.org/10.1080%2F00856409708723290
1,wosres:WOS_000070961600011,http://dx.doi.org/10.1212%2FWNL.49.6.1546
2,wosres:WOS_000070961600033,http://dx.doi.org/10.1212%2FWNL.49.6.1682
3,wosres:WOS_000070969600003,http://dx.doi.org/10.1136%2Fbjsm.31.4.264
4,wosres:WOS_000070970500011,http://dx.doi.org/10.1016%2FS0269-7491(97)00085-7
5,wosres:WOS_000070998100010,http://dx.doi.org/10.1111%2Fj.1365-2044.1997.243-az0378.x
6,wosres:WOS_000070998900007,http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x
7,wosres:WOS_000071006900008,http://dx.doi.org/10.1046%2Fj.1365-2796.1997.00231.x
8,wosres:WOS_000071013000007,http://dx.doi.org/10.1016%2FS0305-750X(97)00070-3
9,wosres:WOS_000071021600006,http://dx.doi.org/10.1023%2FA%3A1008601327920


Count the number of **article-doi pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [75]:
no_of_article_doi_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithDois) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article ;
                                                wos:DOI ?doi .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithDois')
no_of_article_doi_pairs

123531

### 'Has Cited'

In [57]:
retrieve_all_sub_attributes('wos:CR')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,wos:DOI


Display all attributes related to has_cited in a table:

In [58]:
eculture_query.send_select_query("""

    SELECT DISTINCT *
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:CR ?citedArticle .
            ?citedArticle rdfs:label ?citedArticleTitle ;
                          wos:DOI ?citedArticleDoi ;
                          rdf:type ?citedArticleType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,article,citedArticle,citedArticleTitle,citedArticleDoi,citedArticleType
0,wosres:WOS_000403097000001,http://dx.doi.org/%5B10.1002%2F14651858.CD000088.pub3,"Pharoah F, 2010, COCHRANE DB SYST REV, DOI [10.1002/14651858.CD000088.pub3, 10.1002/14651858.CD0...",http://dx.doi.org/%5B10.1002%2F14651858.CD000088.pub3,wos:CitedPublication
1,wosres:WOS_000305753900001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
2,wosres:WOS_000319349400001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
3,wosres:WOS_000308466000009,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
4,wosres:WOS_000231565800007,http://dx.doi.org/%5B10.1016%2FS0091-6749(03)01942-0,"Pastorello EA, 2003, J ALLERGY CLIN IMMUN, V112, P775, DOI [10.1016/S0091-6749(03)01942-0, 10.10...",http://dx.doi.org/%5B10.1016%2FS0091-6749(03)01942-0,wos:CitedPublication
5,wosres:WOS_000324765100020,http://dx.doi.org/%5B10.1016%2FS0140-6736(08)61698-0,"Garnett GP, 2009, LANCET, V373, P9, DOI [10.1016/S0140-6736(08)61698-0, 10.1016/S01406736(08)616...",http://dx.doi.org/%5B10.1016%2FS0140-6736(08)61698-0,wos:CitedPublication
6,wosres:WOS_000332113800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(11)61105-7,"Kim Y, 2011, LANCET, V378, P317, DOI [10.1016/S0140-6736(11)61105-7, 10.1016/S0140-6736(11)61169-0]",http://dx.doi.org/%5B10.1016%2FS0140-6736(11)61105-7,wos:CitedPublication
7,wosres:WOS_000383255900176,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,"Garcia-Basteiro AL, 2014, LANCET, V383, P215, DOI [10.1016/S0140-6736(13)62223-0, 10.1016/S0140-...",http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,wos:CitedPublication
8,wosres:WOS_000376720800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,"Garcia-Basteiro AL, 2014, LANCET, V383, P215, DOI [10.1016/S0140-6736(13)62223-0, 10.1016/S0140-...",http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,wos:CitedPublication
9,wosres:WOS_000279537600003,http://dx.doi.org/%5B10.1016%2FS0301-4215(03)00134-4,"Kamp LM, 2004, ENERG POLICY, V32, P1625, DOI [10.1016/S0301-4215(03)00134-4, 10.1016/80301-4215(...",http://dx.doi.org/%5B10.1016%2FS0301-4215(03)00134-4,wos:CitedPublication


Display only the attributes of interest:

In [59]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?citedArticleDoi AS ?hasCitedArticle_withDoi)
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:CR ?citedArticle .
            ?citedArticle rdfs:label ?citedArticleTitle ;
                          wos:DOI ?citedArticleDoi ;
                          rdf:type ?citedArticleType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,hasCitedArticle_withDoi
0,wosres:WOS_000403097000001,http://dx.doi.org/%5B10.1002%2F14651858.CD000088.pub3
1,wosres:WOS_000305753900001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x
2,wosres:WOS_000319349400001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x
3,wosres:WOS_000308466000009,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x
4,wosres:WOS_000231565800007,http://dx.doi.org/%5B10.1016%2FS0091-6749(03)01942-0
5,wosres:WOS_000324765100020,http://dx.doi.org/%5B10.1016%2FS0140-6736(08)61698-0
6,wosres:WOS_000332113800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(11)61105-7
7,wosres:WOS_000383255900176,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0
8,wosres:WOS_000376720800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0
9,wosres:WOS_000279537600003,http://dx.doi.org/%5B10.1016%2FS0301-4215(03)00134-4


Count the number of **article-has_cited pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [74]:
no_of_article_cited_article_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithCitedArticleDois) 
                               WHERE {
                                   GRAPH wosGraph:{
                                        ?article a wos:Article .
                                        ?article wos:CR ?citedArticle .
                                        ?citedArticle rdfs:label ?citedArticleTitle ;
                                                      wos:DOI ?citedArticleDoi ;
                                                      rdf:type ?citedArticleType .

                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithCitedArticleDois')
no_of_article_cited_article_pairs

3637471

Even though the 'maximum lines' and related 'query time' parameters are set to very liberal values, the query still returns at most about 1 million lines (possibly due to a 100 MB file size limit). Therefore, the query is entered in the Virtuoso GUI 4 times, with differing OFFSET values (i.e., LIMIT = 1000000 and OFFSET = 0, 1000000, 2000000, 3000000).

### Journal

Journal has no other properties attached to it:

In [62]:
retrieve_all_sub_attributes('wos:SO')

Unnamed: 0,p


Display journals in a table:

In [15]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?source_publication AS ?journal)
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:SO ?source_publication .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,journal
0,wosres:WOS_000060208200006,SIGHT AND SOUND
1,wosres:WOS_000070935900005,ANNALS OF NEUROLOGY
2,wosres:WOS_000070948900005,SOUTH ASIA-JOURNAL OF SOUTH ASIAN STUDIES
3,wosres:WOS_000070961600011,NEUROLOGY
4,wosres:WOS_000070961600033,NEUROLOGY
5,wosres:WOS_000070969600003,BRITISH JOURNAL OF SPORTS MEDICINE
6,wosres:WOS_000070970500011,ENVIRONMENTAL POLLUTION
7,wosres:WOS_000070998100010,ANAESTHESIA
8,wosres:WOS_000070998900007,JOURNAL OF MICROSCOPY-OXFORD
9,wosres:WOS_000071006900008,JOURNAL OF INTERNAL MEDICINE


Journal names are all in capitals. All capital names are used only for relationships in Neo4j, which is where this dataset will be imported. Therefore, **convert all journal names to sentence case**:

In [42]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) 
           (    
                # Make journal names sentence case
                CONCAT(
                      SUBSTR(?source_publication, 1, 1),
                      LCASE(SUBSTR(?source_publication, 2))
                ) 
                AS ?journal
            )
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:SO ?source_publication .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,journal
0,wosres:WOS_000060208200006,Sight and sound
1,wosres:WOS_000070935900005,Annals of neurology
2,wosres:WOS_000070948900005,South asia-journal of south asian studies
3,wosres:WOS_000070961600011,Neurology
4,wosres:WOS_000070961600033,Neurology
5,wosres:WOS_000070969600003,British journal of sports medicine
6,wosres:WOS_000070970500011,Environmental pollution
7,wosres:WOS_000070998100010,Anaesthesia
8,wosres:WOS_000070998900007,Journal of microscopy-oxford
9,wosres:WOS_000071006900008,Journal of internal medicine


Count the number of **article-journal pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [72]:
no_of_article_journal_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithJournal) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article .
                                       ?article wos:SO ?source_publication .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithJournal')
no_of_article_journal_pairs

136125

### Subject Category

Subject Category has no other properties attached to it:

In [67]:
retrieve_all_sub_attributes('wos:SC')

Unnamed: 0,p


In [70]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?subjectCategory
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:SC ?subjectCategory .
        }
    }
    LIMIT 100
    
""")

Unnamed: 0,wosArticleUri,subjectCategory
0,wosres:WOS_000060208200006,"Film, Radio & Television"
1,wosres:WOS_000070935900005,Neurosciences & Neurology
2,wosres:WOS_000070948900005,Asian Studies
3,wosres:WOS_000070961600011,Neurosciences & Neurology
4,wosres:WOS_000070961600033,Neurosciences & Neurology
5,wosres:WOS_000070969600003,Sport Sciences
6,wosres:WOS_000070970500011,Environmental Sciences & Ecology
7,wosres:WOS_000070998100010,Anesthesiology
8,wosres:WOS_000070998900007,Microscopy
9,wosres:WOS_000071006900008,General & Internal Medicine


Count the number of **article-subject_category pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [71]:
no_of_article_category_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithSubCat) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article .
                                       ?article wos:SC ?subjectCategory .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithSubCat')
no_of_article_category_pairs

204928

### Annotations

Unlike the previous properties, annotations have other properties attached to them:

In [76]:
retrieve_all_sub_attributes('ldr:annotations')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,ldr:similarityScore
3,ldr:uri
4,ldr:annotationDetail
5,ldr:surfaceForm
6,ldr:offset
7,ldr:percentageOfSecondRank


Display annotations on a table:

In [78]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?annotation
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article ldr:annotations ?annotation_instance .
            ?annotation_instance rdfs:label ?annotation
        }
    }
    LIMIT 100
    
""")

Unnamed: 0,wosArticleUri,annotation
0,wosres:WOS_000071167500005,blood group
1,wosres:WOS_000071084600009,surfactants
2,wosres:WOS_000070998900007,image processing
3,wosres:WOS_000070998900007,lateral shift
4,wosres:WOS_000071167500005,DNA
5,wosres:WOS_000070998900007,microscope
6,wosres:WOS_000071167500005,Serologic
7,wosres:WOS_000070998900007,signal-to-noise ratio
8,wosres:WOS_000071167500005,phenotype
9,wosres:WOS_000071167500005,expression


Count the number of **article-annotation pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [81]:
# Count every article-annotation match in wosGraph (an article linked to an
# annotation instance that carries an rdfs:label).
# The result is stored under its own name so that the subject-category count
# held in no_of_article_category_pairs is not overwritten by this cell.
no_of_article_annotation_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithAnnotation) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article .
                                       ?article ldr:annotations ?annotation_instance .
                                       ?annotation_instance rdfs:label ?annotation
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithAnnotation')
no_of_article_annotation_pairs

2171532

Despite this count query's result, the actual number of returned lines is 1,322,491. Because queries return at most about 1 million lines (possibly due to a 100 MB file size limit), these results were obtained from the Virtuoso GUI using two queries with different OFFSET values (i.e., LIMIT = 1000000 with OFFSET = 0, and LIMIT = 1000000 with OFFSET = 1000000).

### Keywords Plus

Keywords Plus has no other properties attached to it:

In [82]:
retrieve_all_sub_attributes('wos:ID')

Unnamed: 0,p


In [18]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?keywordsPlus
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:ID ?keywordsPlus .
        }
    }
    LIMIT 100
    
""")

Unnamed: 0,wosArticleUri,keywordsPlus
0,wosres:WOS_000070935900005,DISEASE
1,wosres:WOS_000070935900005,THERAPY
2,wosres:WOS_000070935900005,ACUTE MYOCARDIAL-INFARCTION
3,wosres:WOS_000070935900005,STROKE PATIENTS
4,wosres:WOS_000070935900005,HANDICAP
5,wosres:WOS_000070935900005,INTEROBSERVER AGREEMENT
6,wosres:WOS_000070935900005,INTRACEREBRAL HEMORRHAGE
7,wosres:WOS_000070935900005,BLEEDING COMPLICATIONS
8,wosres:WOS_000070935900005,OPTIMAL INTENSITY
9,wosres:WOS_000070935900005,WARFARIN


'Keywords plus' strings are all in capitals. All capital names are used only for relationships in Neo4j, which is where this dataset will be imported. Therefore, **convert all 'keywords plus' strings to sentence case**:

In [17]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) 
           (    
                # Make keywords plus sentence case
                CONCAT(
                      SUBSTR(?keywords_plus, 1, 1),
                      LCASE(SUBSTR(?keywords_plus, 2))
                ) 
                AS ?keywordsPlus
            )
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:ID ?keywords_plus .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,keywordsPlus
0,wosres:WOS_000070935900005,Disease
1,wosres:WOS_000070935900005,Therapy
2,wosres:WOS_000070935900005,Acute myocardial-infarction
3,wosres:WOS_000070935900005,Stroke patients
4,wosres:WOS_000070935900005,Handicap
5,wosres:WOS_000070935900005,Interobserver agreement
6,wosres:WOS_000070935900005,Intracerebral hemorrhage
7,wosres:WOS_000070935900005,Bleeding complications
8,wosres:WOS_000070935900005,Optimal intensity
9,wosres:WOS_000070935900005,Warfarin


Count the number of **article-keywords_plus pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [19]:
no_of_article_keywords_plus_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithKwPlus) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article ;
                                                wos:ID ?keywordPlus .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articlesWithKwPlus')
no_of_article_keywords_plus_pairs

916115

### Article Address

'Article address' has no other properties attached to it:

In [134]:
retrieve_all_sub_attributes('wos:C1')

Unnamed: 0,p


In [21]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?articleAddress
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:C1 ?articleAddress .
        }
    }
    LIMIT 25
    
""")

Unnamed: 0,wosArticleUri,articleAddress
0,wosres:WOS_000060208200006,"Univ Amsterdam, NL-1012 WX Amsterdam, Netherlands."
1,wosres:WOS_000070935900005,"Ziekenhuis De Wever & Gregorius Heerlen, Heerlen, Netherlands. Acad Ziekenhuis, Utrecht, Netherl..."
2,wosres:WOS_000070948900005,"Vrije Univ Amsterdam, Amsterdam, Netherlands."
3,wosres:WOS_000070961600011,"Univ Amsterdam, Acad Med Ctr, Dept Neurol, NL-1100 DD Amsterdam, Netherlands. Univ Amsterdam, Ac..."
4,wosres:WOS_000070961600033,"Free Univ Amsterdam Hosp, Dept Diagnost Radiol, MR Ctr MS Res, NL-1007 MB Amsterdam, Netherlands..."
5,wosres:WOS_000070969600003,"Free Univ Amsterdam, Fac Med, Inst Res Extramural Med, NL-1081 BT Amsterdam, Netherlands. Free U..."
6,wosres:WOS_000070970500011,"Free Univ Amsterdam, Dept Ecol & Ecotoxicol, NL-1081 HV Amsterdam, Netherlands. DLO, Res Inst Ag..."
7,wosres:WOS_000070998100010,"Vrije Univ Amsterdam, Acad Hosp, Dept Anaesthesiol, NL-1007 MB Amsterdam, Netherlands. Leiden Un..."
8,wosres:WOS_000070998900007,"Univ Amsterdam, Acad Med Ctr, Dept Radiotherapy, NL-1105 DE Amsterdam, Netherlands. Delft Univ T..."
9,wosres:WOS_000071006900008,"Vrije Univ Amsterdam, Inst Res Extramural Med, NL-1081 BT Amsterdam, Netherlands. Vrije Univ Ams..."


The addresses start with the institution name, and end with country name. This pattern can be used in order to extract institutions and countries:

In [81]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?articleAddress 
    
    #"TODO: Institutions needs to be tokenized properly.
    # Simple instution names work OK, but when there are commas in 
    # institution names results such as '[Maccarone' appear."
    # Also, some institution names are in all caps.
    #(STRBEFORE(?articleAddress, ',') AS ?institution).
    
    
    # TODO: Regex only finds substring match but does not return it.
    # This needs to be solved before country can be extracted 
    #(REGEX(?articleAddress, ',[^,]*$', '$1') AS ?country)
    
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:C1 ?articleAddress .
        }
    }
    LIMIT 25
    
""")

Unnamed: 0,wosArticleUri,articleAddress
0,wosres:WOS_000060208200006,"Univ Amsterdam, NL-1012 WX Amsterdam, Netherlands."
1,wosres:WOS_000070935900005,"Ziekenhuis De Wever & Gregorius Heerlen, Heerlen, Netherlands. Acad Ziekenhuis, Utrecht, Netherl..."
2,wosres:WOS_000070948900005,"Vrije Univ Amsterdam, Amsterdam, Netherlands."
3,wosres:WOS_000070961600011,"Univ Amsterdam, Acad Med Ctr, Dept Neurol, NL-1100 DD Amsterdam, Netherlands. Univ Amsterdam, Ac..."
4,wosres:WOS_000070961600033,"Free Univ Amsterdam Hosp, Dept Diagnost Radiol, MR Ctr MS Res, NL-1007 MB Amsterdam, Netherlands..."
5,wosres:WOS_000070969600003,"Free Univ Amsterdam, Fac Med, Inst Res Extramural Med, NL-1081 BT Amsterdam, Netherlands. Free U..."
6,wosres:WOS_000070970500011,"Free Univ Amsterdam, Dept Ecol & Ecotoxicol, NL-1081 HV Amsterdam, Netherlands. DLO, Res Inst Ag..."
7,wosres:WOS_000070998100010,"Vrije Univ Amsterdam, Acad Hosp, Dept Anaesthesiol, NL-1007 MB Amsterdam, Netherlands. Leiden Un..."
8,wosres:WOS_000070998900007,"Univ Amsterdam, Acad Med Ctr, Dept Radiotherapy, NL-1105 DE Amsterdam, Netherlands. Delft Univ T..."
9,wosres:WOS_000071006900008,"Vrije Univ Amsterdam, Inst Res Extramural Med, NL-1081 BT Amsterdam, Netherlands. Vrije Univ Ams..."


Count the number of **article-address pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [136]:
no_of_article_address_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articleWithAddress) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article ;
                                                wos:C1 ?address .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articleWithAddress')
no_of_article_address_pairs

129116

### Article Email

'Article email' has no other properties attached to it:

In [137]:
retrieve_all_sub_attributes('wos:EM')

Unnamed: 0,p


In [139]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?articleEmail
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:EM ?articleEmail .
        }
    }
    LIMIT 25
    
""")

Unnamed: 0,wosArticleUri,articleEmail
0,wosres:WOS_000071047000001,tgugt@nki.nl
1,wosres:WOS_000071077500014,kits@bio.vu.nl
2,wosres:WOS_000071174900006,zandberg@stm.tudelft.nl
3,wosres:WOS_000071196300003,DN.den.Hartog@psy.vu.nl
4,wosres:WOS_000071362800008,vminnen@bio.vu.nl
5,wosres:WOS_000071402800007,a417hell@chem.uva.nl
6,wosres:WOS_000071469300016,erker@uni-muenster.de
7,wosres:WOS_000071503900020,ellen.hoogeveen@paradigm.nl
8,wosres:WOS_000071519500005,prc@psu.edu
9,wosres:WOS_000071549800012,virgis@nat.vu.nl


Count the number of **article-email pairs**:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [140]:
no_of_article_email_pairs = eculture_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articleWithEmail) 
                               WHERE {
                                   GRAPH wosGraph:{
                                       ?article a wos:Article ;
                                                wos:EM ?articleEmail .
                                   }
                               }
                            """,
                            query_variable_that_holds_count_results='articleWithEmail')
no_of_article_email_pairs

86842

### Author Keywords

'Author keywords' has no other properties attached to it:

In [90]:
retrieve_all_sub_attributes('wos:DE')

Unnamed: 0,p


In [96]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?authorKeywords
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:DE ?authorKeywords .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,Elymus athericus; growth; photosynthesis; ozone; UV-B radiation
1,wosres:WOS_000070998100010,"pain, postoperative; analgesics, prescribing"
2,wosres:WOS_000070998900007,DIC; Nomarski; interference; microscopy; CCD; image processing;
3,wosres:WOS_000070998900007,analysis; reconstruction; optical pathlength; phase; transparent; living
4,wosres:WOS_000071006900008,atherosclerosis; homocysteine; metformin; vitamin B-12
5,wosres:WOS_000071013000007,policy; household economics
6,wosres:WOS_000071013000007,sub-Saharan Africa; Swaziland; labor migration; food security; labor
7,wosres:WOS_000071021600006,nitric oxide radical; NO scavenging; thiol; S-nitrosothiol
8,wosres:WOS_000071021600006,(electrochemical); NO sensing
9,wosres:WOS_000071040300005,lumbar spine; vertebra; trabecular bone; Wolff's Law; intervertebral


Because the keywords are in a semicolon-separated list, they need to be tokenized. Furthermore, keywords contain items such as "Wolff's Law" and "(electrochemical)". They need to be cleaned of special characters. These operations will be performed using the functionality developed for tokenizing, cleaning, and updating strings in the WoS triple store.

#### Use the Keywords Pipeline on the Entire WoS Graph to Tokenize and Clean Author Keywords

This separates and cleans all keywords in WoS Graph, and updates the graph with these processed keywords using the following structure: <br> *?article_id--kfir:hasAuthorKeyword-->?cleaned_keyword*

Convert eculture_query (Gastrodon_Query class) to wos_query (WebOfScienceQuery class) to access tokenization and cleaning methods aimed for WOS database:

In [98]:
from retriever.sparql_tools import WebOfScienceQuery
from preprocessor.dataframe_tools import Data_Frame

In [99]:
wos_query = WebOfScienceQuery(eculture_query)

(0) **CLEAN** KFIR Graph:

Delete all triples in the kfirGraph (separately from wosGraph, kfirGraph is where all the modified files are written):

In [100]:
# WARNING: Only to be used when resetting the graph. At other times, 
# the graph is likely to be populated.

#wos_query.send_update_query("""
#    CLEAR GRAPH kfirGraph:
#""")

Confirm deletions:

In [25]:
wos_query.send_select_query("""
    SELECT * {
        GRAPH kfirGraph: {
            ?s ?p ?o .
        }
    }
""")

Unnamed: 0,s,p,o


In [None]:
# WARNING: LONG PROCESSING TIME
#wos_query.tokenize_purify_and_update_string_literals(target_property_uri="wos:DE",
#                                                     uri_of_graph_to_write_the_output = 'kfirGraph:',
#                                                     new_property_uri="kfir:hasAuthorKeyword",
#                                                     batch_size=10,
#                                                     show_progress=True)

[------------------------------------------------------------] 0% ...Processing strings and updating specified graph
[=-----------------------------------------------------------] 0% ...Processing strings and updating specified graph
[=-----------------------------------------------------------] 1% ...Processing strings and updating specified graph


Check KFIR graph:

(Tokenized & cleaned author keywords are connected to article ids with a new property)

In [22]:
wos_query.send_select_query("""
    SELECT * {
        GRAPH kfirGraph: {
            ?s ?p ?o .
        }
    }
    LIMIT 10
    """
)       

Unnamed: 0,s,p,o
0,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,photosynthesis
1,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,ozone
2,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,UV-B radiation
3,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,growth
4,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,Elymus athericus
5,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,"pain, postoperative"
6,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,"analgesics, prescribing"
7,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,CCD
8,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,image processing
9,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,microscopy


Select articles and the preprocessed author keywords:

In [31]:
wos_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?authorKeyword
    WHERE {
        GRAPH kfirGraph: {
            ?article kfir:hasAuthorKeyword ?authorKeyword .
        }
    }
    LIMIT 100
    
""")

Unnamed: 0,wosArticleUri,authorKeyword
0,wosres:WOS_000178582800024,2
1,wosres:WOS_000220544100007,2
2,wosres:WOS_000231781300002,2
3,wosres:WOS_000346226400009,2
4,wosres:WOS_000368811500031,2
5,wosres:WOS_000263014800031,2
6,wosres:WOS_000289642300006,2
7,wosres:WOS_000298741400005,2
8,wosres:WOS_000326824300021,2
9,wosres:WOS_000333260900019,2


Count the number of **article-author_keyword pairs** after preprocessing:

(This number also corresponds to the number of lines that will be in the returned results if no LIMIT statement is used. The triple store (e.g., Virtuoso) must be configured to allow returning at least this many lines.)

In [101]:
no_of_article_au_keyword_pairs = wos_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithAuKeyword) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?article kfir:hasAuthorKeyword ?authorKeyword .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='articlesWithAuKeyword')
no_of_article_au_keyword_pairs

421723

Count the number of **unique articles** that have keywords:

In [27]:
# Count the distinct subjects of all triples stored in kfirGraph.
# NOTE(review): this is reported below as 'All articles'; that label presumably
# holds only if every subject in kfirGraph is an article URI -- confirm.
no_of_all_articles = wos_query.send_count_query(query=
                            """SELECT (COUNT(DISTINCT ?s) AS ?allArticles) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?s ?p ?o .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='allArticles')

# Count the distinct subjects that have at least one kfir:hasAuthorKeyword triple.
no_of_articles_with_keywords = wos_query.send_count_query(query=
                            """SELECT (COUNT(DISTINCT ?s) AS ?kwArticles) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?s kfir:hasAuthorKeyword ?o .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='kwArticles')
# Show both counts side by side for comparison.
print('All articles: ', no_of_all_articles)
print('Articles with keywords:', no_of_articles_with_keywords)

All articles:  122705
Articles with keywords: 77532


Count the number of **unique keywords**:

In [35]:
no_of_keywords = wos_query.send_count_query(query=
                        """SELECT (COUNT(DISTINCT ?o) AS ?keywords) 
                           WHERE {
                               GRAPH kfirGraph:{
                                   ?s kfir:hasAuthorKeyword ?o .
                                }
                        }""",
                         query_variable_that_holds_count_results='keywords')
no_of_keywords

175843

### Web of Science Category

Web of Science Category has no other properties attached to it:

In [102]:
retrieve_all_sub_attributes('wos:WC')

Unnamed: 0,p


In [38]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?webOfScienceCategory
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:WC ?webOfScienceCategory .
        }
    }
    LIMIT 25
    
""")

Unnamed: 0,wosArticleUri,webOfScienceCategory
0,wosres:WOS_000060208200006,"Film, Radio, Television"
1,wosres:WOS_000070935900005,Clinical Neurology; Neurosciences
2,wosres:WOS_000070948900005,Asian Studies
3,wosres:WOS_000070961600011,Clinical Neurology
4,wosres:WOS_000070961600033,Clinical Neurology
5,wosres:WOS_000070969600003,Sport Sciences
6,wosres:WOS_000070970500011,Environmental Sciences
7,wosres:WOS_000070998100010,Anesthesiology
8,wosres:WOS_000070998900007,Microscopy
9,wosres:WOS_000071006900008,"Medicine, General & Internal"


Web of Science Categories also need to be tokenized and cleaned (see 'Author keywords' section for further explanations).

#### Use the Keywords Pipeline on the Entire WoS Graph to Tokenize and Clean WOS Categories

Convert eculture_query (Gastrodon_Query class) to wos_query (WebOfScienceQuery class) to access tokenization and cleaning methods aimed for WOS database:

In [105]:
from retriever.sparql_tools import WebOfScienceQuery
from preprocessor.dataframe_tools import Data_Frame

In [106]:
wos_query = WebOfScienceQuery(eculture_query)

In [22]:
# WARNING: LONG PROCESSING TIME
#wos_query.tokenize_purify_and_update_string_literals(target_property_uri="wos:WC",
#                                                     uri_of_graph_to_write_the_output = 'kfirGraph:',
#                                                     new_property_uri="kfir:hasWosCategory",
#                                                     batch_size=10,
#                                                     show_progress=True)

[------------------------------------------------------------] 0% ...Processing strings and updating specified graph
[=-----------------------------------------------------------] 0% ...Processing strings and updating specified graph
[=-----------------------------------------------------------] 1% ...Processing strings and updating specified graph
[=-----------------------------------------------------------] 2% ...Processing strings and updating specified graph
[==----------------------------------------------------------] 2% ...Processing strings and updating specified graph
[==----------------------------------------------------------] 3% ...Processing strings and updating specified graph
[==----------------------------------------------------------] 4% ...Processing strings and updating specified graph
[===---------------------------------------------------------] 4% ...Processing strings and updating specified graph
[===---------------------------------------------------------] 5

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

The operation currently stops at 81 percent. This needs to be investigated.

Select articles and the preprocessed WOS categories:

In [107]:
wos_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?wosCategory
    WHERE {
        GRAPH kfirGraph: {
            ?article kfir:hasWosCategory ?wosCategory .
        }
    }
    LIMIT 100
    
""")

Unnamed: 0,wosArticleUri,wosCategory
0,wosres:WOS_000060208200006,"Film, Radio, Television"
1,wosres:WOS_000070935900005,Clinical Neurology
2,wosres:WOS_000070935900005,Neurosciences
3,wosres:WOS_000070948900005,Asian Studies
4,wosres:WOS_000070961600011,Clinical Neurology
5,wosres:WOS_000070961600033,Clinical Neurology
6,wosres:WOS_000070969600003,Sport Sciences
7,wosres:WOS_000070970500011,Environmental Sciences
8,wosres:WOS_000070998100010,Anesthesiology
9,wosres:WOS_000070998900007,Microscopy


Count the number of **article-wos_category pairs** after preprocessing:

(This number also corresponds to the number of lines that will in the returned results. The triple store (e.g., Virtuoso) must be configured to allow returning *at least* this many lines.)

In [109]:
no_of_article_wos_category_pairs = wos_query.send_count_query(query=
                            """SELECT (COUNT(?article) AS ?articlesWithWosCategory) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?article kfir:hasWosCategory ?wosCategory .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='articlesWithWosCategory')
no_of_article_wos_category_pairs

183925

Count the number of **unique articles** that have wos_categories:

In [38]:
# Count the distinct subjects of all triples stored in kfirGraph.
# NOTE(review): this is reported below as 'All articles'; that label presumably
# holds only if every subject in kfirGraph is an article URI -- confirm.
no_of_all_articles = wos_query.send_count_query(query=
                            """SELECT (COUNT(DISTINCT ?s) AS ?allArticles) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?s ?p ?o .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='allArticles')

# Count the distinct subjects that have at least one kfir:hasWosCategory triple.
no_of_articles_with_wos_categories = wos_query.send_count_query(query=
                            """SELECT (COUNT(DISTINCT ?s) AS ?articlesWithWosCat) 
                               WHERE {
                                   GRAPH kfirGraph:{
                                       ?s kfir:hasWosCategory ?o .
                                   }
                               }
                            """,
                         query_variable_that_holds_count_results='articlesWithWosCat')
# Show both counts side by side for comparison.
print('All articles: ', no_of_all_articles)
print('Articles with WOS category:', no_of_articles_with_wos_categories)

All articles:  122705
Articles with WOS category: 110122


Count the number of **unique wos_categories**:

In [37]:
no_of_categories = wos_query.send_count_query(query=
                        """SELECT (COUNT(DISTINCT ?o) AS ?categories) 
                           WHERE {
                               GRAPH kfirGraph:{
                                   ?s kfir:hasWosCategory ?o .
                                }
                        }""",
                         query_variable_that_holds_count_results='categories')
no_of_categories

422

# APPENDIX

## Step-by-step Processing Pipeline for Author Keywords 
**(Unpacked tokenize_purify_and_update_string_literals method)**

'Keywords' has no other properties attached to it:

In [47]:
retrieve_all_sub_attributes('wos:DE')

Unnamed: 0,p


### Building the Keyword Processing Pipeline and Testing it on Part of the WoS Graph

In [48]:
from retriever.sparql_tools import WebOfScienceQuery
from preprocessor.dataframe_tools import Data_Frame

In [49]:
wos_query = WebOfScienceQuery(eculture_query)

**Step 1:** Retrieve a test dataset (using LIMIT ...) containing article ids and associated keywords:

In [50]:
ids_vs_keywords = wos_query.retrieve_relationships_of_property('wos:DE',
                                                               desired_column_name_for_literal='authorKeywords',
                                                               limit=10)
ids_vs_keywords

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,Elymus athericus; growth; photosynthesis; ozone; UV-B radiation
1,wosres:WOS_000070998100010,"pain, postoperative; analgesics, prescribing"
2,wosres:WOS_000070998900007,DIC; Nomarski; interference; microscopy; CCD; image processing;
3,wosres:WOS_000070998900007,analysis; reconstruction; optical pathlength; phase; transparent; living
4,wosres:WOS_000071006900008,atherosclerosis; homocysteine; metformin; vitamin B-12
5,wosres:WOS_000071013000007,policy; household economics
6,wosres:WOS_000071013000007,sub-Saharan Africa; Swaziland; labor migration; food security; labor
7,wosres:WOS_000071021600006,nitric oxide radical; NO scavenging; thiol; S-nitrosothiol
8,wosres:WOS_000071021600006,(electrochemical); NO sensing
9,wosres:WOS_000071040300005,lumbar spine; vertebra; trabecular bone; Wolff's Law; intervertebral


**Step 2:** Because the keywords are in a semicolon-separated list, they need to be tokenized:

In [51]:
# Wrap the Step 1 query result in the project's Data_Frame helper.
# NOTE(review): ids_vs_keywords is rebound to the wrapper here, so re-running
# this cell without first re-running Step 1 would pass an already-wrapped
# object to Data_Frame -- confirm whether that is supported.
ids_vs_keywords = Data_Frame(ids_vs_keywords)  # this is not 'pandas.DataFrame' class
# Split the 'authorKeywords' strings on the '; ' delimiter, keeping each token
# associated with its 'wosArticleUri' (see the displayed result below).
ids_vs_keywords.tokenize_string_column(string_column_name='authorKeywords', 
                                       id_column_name='wosArticleUri', 
                                       delimiter_pattern_in_literal_cells='; ')
# Display the underlying dataframe after tokenization.
ids_vs_keywords.dataframe

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,Elymus athericus
1,wosres:WOS_000070970500011,growth
2,wosres:WOS_000070970500011,photosynthesis
3,wosres:WOS_000070970500011,ozone
4,wosres:WOS_000070970500011,UV-B radiation
5,wosres:WOS_000070998100010,"pain, postoperative"
6,wosres:WOS_000070998100010,"analgesics, prescribing"
7,wosres:WOS_000070998900007,DIC
8,wosres:WOS_000070998900007,Nomarski
9,wosres:WOS_000070998900007,interference


**Step 3:** Tokenized keywords contain items such as "Wolff's Law" and "(electrochemical)". They need to be cleaned from special characters:

In [52]:
ids_vs_keywords.purify_column(target_column_name='authorKeywords')
ids_vs_keywords.dataframe

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,Elymus athericus
1,wosres:WOS_000070970500011,growth
2,wosres:WOS_000070970500011,photosynthesis
3,wosres:WOS_000070970500011,ozone
4,wosres:WOS_000070970500011,UV-B radiation
5,wosres:WOS_000070998100010,"pain, postoperative"
6,wosres:WOS_000070998100010,"analgesics, prescribing"
7,wosres:WOS_000070998900007,DIC
8,wosres:WOS_000070998900007,Nomarski
9,wosres:WOS_000070998900007,interference


**Step 4:** Collapse the keywords onto article uris as lists:

In [53]:
# Collapse the per-keyword rows onto 'wosArticleUri', so that each article is
# paired with the list of its 'authorKeywords' (see the displayed result below).
ids_vs_keywords = ids_vs_keywords.collapse_dataframe_on_column(values_column_name='authorKeywords', identifier_column_name='wosArticleUri')
# Display the underlying dataframe after collapsing.
ids_vs_keywords.dataframe

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,"[Elymus athericus, growth, photosynthesis, ozone, UV-B radiation]"
1,wosres:WOS_000070998100010,"[pain, postoperative, analgesics, prescribing]"
2,wosres:WOS_000070998900007,"[DIC, Nomarski, interference, microscopy, CCD, image processing, analysis, reconstruction, optic..."
3,wosres:WOS_000071006900008,"[atherosclerosis, homocysteine, metformin, vitamin B-12]"
4,wosres:WOS_000071013000007,"[policy, household economics, sub-Saharan Africa, Swaziland, labor migration, food security, labor]"
5,wosres:WOS_000071021600006,"[nitric oxide radical, NO scavenging, thiol, S-nitrosothiol, electrochemical, NO sensing]"
6,wosres:WOS_000071040300005,"[lumbar spine, vertebra, trabecular bone, Wolffs Law, intervertebral]"


**Step 5**: Prepare strings to be later passed on to VALUES keyword in Gastrodon_Query:<br>
(This alleviates the need to send one SPARQL query per keyword, and instead, will be used to group keywords per article ID using the VALUES keyword)

In [54]:
from retriever.sparql_tools import Sparql_Parameter

In [55]:
# Extract the author keywords column (at this point, one list of keywords per article)
author_keywords_column = ids_vs_keywords.dataframe['authorKeywords']

# Convert the list in each cell of the author keywords column to a parameter string:
# the keywords of one article, each wrapped in double quotes and separated by spaces,
# ready to be placed inside a SPARQL VALUES block
parameterized_keywords = Sparql_Parameter.Values_Parameter_Series()
parameterized_keywords.import_and_convert_pandas_series(author_keywords_column)

# Replace the author keywords column of the dataframe with the parameterized version.
# NOTE: this overwrites the column in place, so the keyword lists are no longer
# available in the dataframe after this cell; re-running the cell on its own would
# feed the already-parameterized strings back into the converter.
ids_vs_keywords.dataframe['authorKeywords'] = parameterized_keywords.series
ids_vs_keywords.dataframe

Unnamed: 0,wosArticleUri,authorKeywords
0,wosres:WOS_000070970500011,"""Elymus athericus"" ""growth"" ""photosynthesis"" ""ozone"" ""UV-B radiation"""
1,wosres:WOS_000070998100010,"""pain, postoperative"" ""analgesics, prescribing"""
2,wosres:WOS_000070998900007,"""DIC"" ""Nomarski"" ""interference"" ""microscopy"" ""CCD"" ""image processing"" ""analysis"" ""reconstruction..."
3,wosres:WOS_000071006900008,"""atherosclerosis"" ""homocysteine"" ""metformin"" ""vitamin B-12"""
4,wosres:WOS_000071013000007,"""policy"" ""household economics"" ""sub-Saharan Africa"" ""Swaziland"" ""labor migration"" ""food security..."
5,wosres:WOS_000071021600006,"""nitric oxide radical"" ""NO scavenging"" ""thiol"" ""S-nitrosothiol"" ""electrochemical"" ""NO sensing"""
6,wosres:WOS_000071040300005,"""lumbar spine"" ""vertebra"" ""trabecular bone"" ""Wolffs Law"" ""intervertebral"""


Clear the test graph and confirm that it is empty:

In [36]:
# Remove all triples from the test graph, so that the update in the next step
# starts from an empty graph.
wos_query.send_update_query("""
    CLEAR GRAPH testGraph:
""")

# Select every triple in the test graph; an empty result table confirms that the
# graph has been cleared.
wos_query.send_select_query("""
    SELECT * {
        GRAPH testGraph: {
            ?s ?p ?o .
        }
    }
""")


Sending Accept header '*/*' because unexpected returned format 'json' in a 'CLEAR' SPARQL query form



Unnamed: 0,s,p,o


**Step 6:** Update the test graph using the parameterized keywords:

In [57]:
# Insert one kfir:hasAuthorKeyword triple per keyword for every article. A single
# update query is sent per article: the article's parameter string is placed inside
# a VALUES block, so ?keyword is bound once for each keyword in that string.
#
# The two columns are selected by name (rather than by position within the row), so
# the loop does not depend on the column order of the dataframe. itertuples() with
# index=False and name=None yields plain (article id, parameter string) tuples, which
# also avoids building a Series object for every row.
#
# NOTE(review): both values are interpolated into the query text as-is; this relies on
# the keywords having been purified and quoted in the earlier steps.
article_keyword_pairs = ids_vs_keywords.dataframe[['wosArticleUri', 'authorKeywords']]
for each_article_id, each_parameter_string in article_keyword_pairs.itertuples(index=False, name=None):
    wos_query.send_update_query("""
    
        INSERT {
            GRAPH testGraph: {
                %s kfir:hasAuthorKeyword ?keyword
            }
        }
        WHERE{
            VALUES ?keyword {%s}
        }
        
    """ % (each_article_id, each_parameter_string)
    )

Confirm the update:

In [58]:
# Select every triple in the test graph to verify the insert: the result should list
# one kfir:hasAuthorKeyword triple per (article, keyword) pair.
wos_query.send_select_query("""
    SELECT * {
        GRAPH testGraph: {
            ?s ?p ?o .
        }
    }
""")

Unnamed: 0,s,p,o
0,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,photosynthesis
1,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,ozone
2,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,UV-B radiation
3,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,growth
4,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,Elymus athericus
5,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,"pain, postoperative"
6,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,"analgesics, prescribing"
7,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,CCD
8,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,image processing
9,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,microscopy
