# KFIR

# SETUP: DEPENDENCIES AND CREDENTIALS

## Working Directory

What is the current working directory?:

In [202]:
import os
os.getcwd()

'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR\\notebooks'

Add parent directory to path if necessary:

In [203]:
import sys, os, re

working_directory = os.getcwd()
if re.search('\\\\notebooks$', working_directory):
    one_directory_up = re.sub('\\\\notebooks$', '', working_directory)
    sys.path.append(one_directory_up)
    
sys.path

['',
 'C:\\ProgramData\\Anaconda3\\python36.zip',
 'C:\\ProgramData\\Anaconda3\\DLLs',
 'C:\\ProgramData\\Anaconda3\\lib',
 'C:\\ProgramData\\Anaconda3',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.1-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Clokman\\.ipython',
 'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR',
 'C:\\Users\\Clokman\\Google Drive\\__Projects__\\Code\\KFIR']

## Initialize General Packages

In [204]:
import pandas
import numpy

## Initialize Plotly

Check current version:

In [205]:
from plotly import __version__ as plotly_version
plotly_version

'2.5.1'

### Online Plotly

Read plotly credentials from file:

In [206]:
from preprocessor.Text_File import Text_File

plotly_file = Text_File('..//private//plotly_credentials')
plotly_file = plotly_file.return_content()
plotly_credentials = plotly_file.splitlines()

plotly_username = plotly_credentials[0]
plotly_key = plotly_credentials[1]

Set parameters for online usage:

In [207]:
import plotly.plotly as plotly_online
iplot_online = plotly_online.iplot

import plotly.graph_objs as graph_objects
from plotly.tools import set_credentials_file

set_credentials_file(username=plotly_username, api_key=plotly_key)  # put your own plotly username and api key here 

### Offline Plotly

Setup for offline usage:

In [208]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## Initialize Retriever: Gastrodon_Query

Import Gastrodon_Query (for running SPARQL queries in Jupyter):

In [209]:
from retriever.sparql_tools import Gastrodon_Query

Initialize eculture query:

In [210]:
eculture_query = Gastrodon_Query()

Define prefixes:

In [211]:
eculture_query.set_prefixes("""
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix dbo: <http://dbpedia.org/ontology/> .
    
    @prefix wos: <http://wos.risis.eu/vocabulary/> .
    @prefix wosres: <http://wos.risis.eu/resource/> .
    @prefix kfir: <http://clokman.com/kfir/ontology#> .
    @prefix ldr: <https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#> .

    @prefix wosGraph: <http://clokman.com/wos> .
    @prefix kfirGraph: <http://clokman.com/kfir> .
    @prefix testGraph: <http://clokman.com/test> .
""")

eculture_query._get_prefixes()

{rdflib.term.URIRef('http://clokman.com/kfir'): 'kfirGraph',
 rdflib.term.URIRef('http://clokman.com/kfir/ontology#'): 'kfir',
 rdflib.term.URIRef('http://clokman.com/test'): 'testGraph',
 rdflib.term.URIRef('http://clokman.com/wos'): 'wosGraph',
 rdflib.term.URIRef('http://dbpedia.org/ontology/'): 'dbo',
 rdflib.term.URIRef('http://wos.risis.eu/resource/'): 'wosres',
 rdflib.term.URIRef('http://wos.risis.eu/vocabulary/'): 'wos',
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#'): 'rdf',
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#'): 'rdfs',
 rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#'): 'xsd',
 rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'): 'skos',
 rdflib.term.URIRef('http://www.w3.org/XML/1998/namespace'): 'xml',
 rdflib.term.URIRef('https://github.com/ali1k/ld-reactor/blob/master/vocabulary/index.ttl#'): 'ldr'}

Read endpoint from file:

In [212]:
from preprocessor.Text_File import Text_File

# Get endpoint address from file
eculture_endpoint_url_file = Text_File('..//private//eculture_virtuoso_endpoint_address')
eculture_endpoint_url = eculture_endpoint_url_file.return_content()

Define endpoint:

In [213]:
eculture_query.set_endpoint(eculture_endpoint_url)

<retriever.sparql_tools.Gastrodon_Query at 0x24aa9c51080>

# QUERIES

## Database Statistics

Get counts for common fields:

In [214]:
wos_mappings = {'wos:TI':  'title', # wos: is defined in prefixes variable
                'wos:AF':  'author',
                'wos:SN':  'issn',
                'wos:DOI': 'doi',
                'wos:EM':  'email',
                'wos:DE':  'keywords_author',
                'wos:ID':  'keywords_plus',
                'wos:SC':  'subject_category',
                'wos:WC':  'web_of_science_category',
                'wos:PY':  'publication_year',
                'wos:CR':  'has_cited',
                'wos:NR':  'has_cited_count',
                'wos:Z9':  'cited_by_count_universal',
                'wos:TC':  'cited_by_count_local',
                'wos:SO':  'source_publication',
                'wos:PU':  'publisher',
                'wos:C1':  'author_address',
                'ldr:annotations': 'annotation'
                }

wos_field_counts = {}

print('Counting...')
for each_wos_field_name_abbreviation, each_field_name in wos_mappings.items():
    each_count = eculture_query.send_count_query(each_field_name + "s","""
    SELECT (COUNT(DISTINCT ?%s) as ?%ss) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article %s ?%s .
        }
    }
    """ % (each_field_name, each_field_name, each_wos_field_name_abbreviation, each_field_name))
    wos_field_counts[each_field_name] = each_count
    print (each_wos_field_name_abbreviation, '/', each_field_name, ': ', each_count)
print('Counting finished.')

Counting...
wos:TI / title :  135985
wos:AF / author :  3485320
wos:SN / issn :  9627
wos:DOI / doi :  123505
wos:EM / email :  51997
wos:DE / keywords_author :  125552
wos:ID / keywords_plus :  156689
wos:SC / subject_category :  151
wos:WC / web_of_science_category :  2323
wos:PY / publication_year :  35
wos:CR / has_cited :  2854040
wos:NR / has_cited_count :  351
wos:Z9 / cited_by_count_universal :  880
wos:TC / cited_by_count_local :  852
wos:SO / source_publication :  9708
wos:PU / publisher :  2354
wos:C1 / author_address :  118156
ldr:annotations / annotation :  2158243
Counting finished.


Get number of articles:

In [215]:
article_count = eculture_query.send_count_query('articles', """
    SELECT (COUNT(DISTINCT ?article) as ?articles) 
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
        }
    }
""")

article_count

136125

Add article_count to wos_field_counts:

In [216]:
wos_field_counts['article'] = article_count
wos_field_counts

{'annotation': 2158243,
 'article': 136125,
 'author': 3485320,
 'author_address': 118156,
 'cited_by_count_local': 852,
 'cited_by_count_universal': 880,
 'doi': 123505,
 'email': 51997,
 'has_cited': 2854040,
 'has_cited_count': 351,
 'issn': 9627,
 'keywords_author': 125552,
 'keywords_plus': 156689,
 'publication_year': 35,
 'publisher': 2354,
 'source_publication': 9708,
 'subject_category': 151,
 'title': 135985,
 'web_of_science_category': 2323}

Put results in a dataframe and sort them:

In [217]:
wos_field_counts_dataframe = pandas.Series(wos_field_counts)
wos_sorted_counts_dataframe = wos_field_counts_dataframe.sort_values(inplace=True, ascending=False)
wos_field_counts_dataframe

author                      3485320
has_cited                   2854040
annotation                  2158243
keywords_plus                156689
article                      136125
title                        135985
keywords_author              125552
doi                          123505
author_address               118156
email                         51997
source_publication             9708
issn                           9627
publisher                      2354
web_of_science_category        2323
cited_by_count_universal        880
cited_by_count_local            852
has_cited_count                 351
subject_category                151
publication_year                 35
dtype: int64

Plot results:

In [221]:
wos_field_counts_labels = list(wos_field_counts_dataframe.keys())
wos_field_counts_values = list(wos_field_counts_dataframe)

data = [graph_objects.Bar(x=wos_field_counts_labels,
                          y=wos_field_counts_values)]

iplot_online(data)

## Mapping and Transforming the Database

A function to retrieve all attributes related to a target property (e.g., author --> author label, author alternative label)

In [219]:
def retrieve_all_sub_attributes (target_property_of_articles):    

    result = eculture_query.send_select_query("""
        SELECT DISTINCT ?p
        WHERE{
            GRAPH wosGraph: {
                ?article a wos:Article .
                ?article %s ?target_object .
                ?target_object ?p ?o .
            }
        }
    """ % target_property_of_articles)
    
    return result

### Titles

Titles have no other properties attached to them:

In [220]:
retrieve_all_sub_attributes('wos:TI')

Unnamed: 0,p


Display titles in a table:

In [159]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) ?title
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:TI ?title .
        }
    }
    LIMIT 10
    
""")

Unnamed: 0,wosArticleUri,title
0,wosres:WOS_000060208200006,Hollywood Berlin (The popularity of Nazi entertainment films in Germany under Hitler)
1,wosres:WOS_000070935900005,A randomized trial of anticoagulants versus aspirin after cerebral ischemia of presumed arterial...
2,wosres:WOS_000070948900005,The strength of numbers: Enumerating communities in India's princely states
3,wosres:WOS_000070961600011,Some patients with intracranial aneurysms have a reduced type III type I collagen ratio - A case...
4,wosres:WOS_000070961600033,Improving interobserver variation in reporting gadolinium-enhanced MRI lesions in multiple scler...
5,wosres:WOS_000070969600003,A physically active lifestyle - public health's best buy?
6,wosres:WOS_000070970500011,The effect of reciprocal treatments with ozone and ultraviolet-B radiation on photosynthesis and...
7,wosres:WOS_000070998100010,Compliance in administration of prescribed analgesics
8,wosres:WOS_000070998900007,Reconstruction of optical pathlength distributions from images obtained by a wide-field differen...
9,wosres:WOS_000071006900008,Does metformin increase the serum total homocysteine level in non-insulin-dependent diabetes mel...


### Authors

In [160]:
retrieve_all_sub_attributes('wos:AF')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,skos:altLabel


Display all attributes related to authors in a table:

In [201]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?author AS ?wosAuthorCompoundUri) ?authorName ?authorAltName
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:AF ?author .
            ?author rdfs:label ?authorName .
            ?author skos:altLabel ?authorAltName .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,wosAuthorCompoundUri,authorName,authorAltName
0,wosres:WOS_000060208200006,wosres:WOS_000060208200006_Elsaesser_T,"Elsaesser, T","Elsaesser, T"
1,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Aten_JA,"Aten, JA","Aten, JA"
2,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Munster_EB,"Van Munster, EB","Van Munster, EB"
3,wosres:WOS_000070998900007,wosres:WOS_000070998900007_Van_Vliet_LJ,"Van Vliet, LJ","Van Vliet, LJ"
4,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Beer_K,"de Beer, K","de Beer, K"
5,wosres:WOS_000071084600009,wosres:WOS_000071084600009_de_Voogt_P,"de Voogt, P","de Voogt, P"
6,wosres:WOS_000071084600009,wosres:WOS_000071084600009_van_der_Wielen_F,"van der Wielen, F","van der Wielen, F"
7,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Faas_BHW,"Faas, BHW","Faas, BHW"
8,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Ligthart_PC,"Ligthart, PC","Ligthart, PC"
9,wosres:WOS_000071167500005,wosres:WOS_000071167500005_Lomas-Francis_C,"Lomas-Francis, C","Lomas-Francis, C"


### Keywords (by Authors)

Keywords has no other properties attached to them:

In [162]:
retrieve_all_sub_attributes('wos:DE')

Unnamed: 0,p


#### Build the Keyword Processing Pipeline and Test it on Part of the WoS Graph

(1) Retrieve a test dataset (using LIMIT ...) containing article ids and associated keywords:

In [163]:
def retrieve_articles_and_keyword_lists_as_dataframe(limit=None):
    """
    Args:
        limit(int)
    Returns:
        pandas dataframe
    """
    if limit == None:
        limit_statement_in_query = ''
    else:
        limit_statement_in_query = 'LIMIT %d' % limit
    
    articles_vs_keywords_dataframe = eculture_query.send_select_query("""
        SELECT DISTINCT (?article AS ?wosArticleUri) ?keywords
        WHERE{
            GRAPH wosGraph: {
                ?article a wos:Article;
                         wos:DE ?keywords .
            }
        }
        %s
    """ % limit_statement_in_query)
    
    return articles_vs_keywords_dataframe

articles_vs_keywords_dataframe = retrieve_articles_and_keyword_lists_as_dataframe(10)
articles_vs_keywords_dataframe

Unnamed: 0,wosArticleUri,keywords
0,wosres:WOS_000070970500011,Elymus athericus; growth; photosynthesis; ozone; UV-B radiation
1,wosres:WOS_000070998100010,"pain, postoperative; analgesics, prescribing"
2,wosres:WOS_000070998900007,DIC; Nomarski; interference; microscopy; CCD; image processing;
3,wosres:WOS_000070998900007,analysis; reconstruction; optical pathlength; phase; transparent; living
4,wosres:WOS_000071006900008,atherosclerosis; homocysteine; metformin; vitamin B-12
5,wosres:WOS_000071013000007,policy; household economics
6,wosres:WOS_000071013000007,sub-Saharan Africa; Swaziland; labor migration; food security; labor
7,wosres:WOS_000071021600006,nitric oxide radical; NO scavenging; thiol; S-nitrosothiol
8,wosres:WOS_000071021600006,(electrochemical); NO sensing
9,wosres:WOS_000071040300005,lumbar spine; vertebra; trabecular bone; Wolff's Law; intervertebral


(2) Because keywords are in a semicolon-separated list, they need to be tokenized:

In [164]:
def convert_articles_vs_keywords_dataframe_to_dictionary(articles_vs_keywords_dataframe):
    """
    Args:
        articles_vs_keywords_dataframe(pandas.dataframe)
    Returns:
        dict
    """
    
    raw_articles_vs_keywords_dictionary = articles_vs_keywords_dataframe.to_dict('split')

    indexed_dictionary = {}    
    for each_entry in raw_articles_vs_keywords_dictionary['data']:

        each_article_id = each_entry[0]
        each_keywords_string = each_entry[1]

        each_keywords_list = each_keywords_string.split('; ')
        
        indexed_dictionary[each_article_id] = each_keywords_list
    return indexed_dictionary

id_vs_keyword_dictionary = convert_articles_vs_keywords_dataframe_to_dictionary(articles_vs_keywords_dataframe)

from pprint import pprint
pprint(id_vs_keyword_dictionary)

{'wosres:WOS_000070970500011': ['Elymus athericus',
                                'growth',
                                'photosynthesis',
                                'ozone',
                                'UV-B radiation'],
 'wosres:WOS_000070998100010': ['pain, postoperative',
                                'analgesics, prescribing'],
 'wosres:WOS_000070998900007': ['analysis',
                                'reconstruction',
                                'optical pathlength',
                                'phase',
                                'transparent',
                                'living'],
 'wosres:WOS_000071006900008': ['atherosclerosis',
                                'homocysteine',
                                'metformin',
                                'vitamin B-12'],
 'wosres:WOS_000071013000007': ['sub-Saharan Africa',
                                'Swaziland',
                                'labor migration',
                           

(3) Tokenized keywords contain items such as "Wolff's Law" and "(electrochemical)". They need to be cleaned from special characters:

In [166]:
# !pip install unidecode
from preprocessor.string_tools import String

def clean_id_vs_keyword_dictionary(id_vs_keyword_dictionary):
    """
    Args: 
        id_vs_keyword_dictionary(dict)
    Returns:
        dict
    """

    cleaned_id_vs_keyword_dictionary = {}
    each_cleaned_keywords_list = []

    for each_id, each_keywords_list in id_vs_keyword_dictionary.items():
        
        for each_keyword in each_keywords_list:
            each_cleaned_keyword = String(each_keyword)
            each_cleaned_keyword.purify(
                clean_from_non_ascii_characters=True, 
                remove_problematic_patterns=True,
                clean_newline_characters=True
            )
            each_cleaned_keyword.replace_patterns({'\(|\)|,':''})
            
            each_cleaned_keywords_list.append(each_cleaned_keyword)
        
        cleaned_id_vs_keyword_dictionary[each_id] = each_cleaned_keywords_list
        each_cleaned_keywords_list = []
        
    return cleaned_id_vs_keyword_dictionary

cleaned_id_vs_keyword_dictionary = clean_id_vs_keyword_dictionary(id_vs_keyword_dictionary)
pprint(cleaned_id_vs_keyword_dictionary)

{'wosres:WOS_000070970500011': ['Elymus athericus',
                                'growth',
                                'photosynthesis',
                                'ozone',
                                'UV-B radiation'],
 'wosres:WOS_000070998100010': ['pain postoperative', 'analgesics prescribing'],
 'wosres:WOS_000070998900007': ['analysis',
                                'reconstruction',
                                'optical pathlength',
                                'phase',
                                'transparent',
                                'living'],
 'wosres:WOS_000071006900008': ['atherosclerosis',
                                'homocysteine',
                                'metformin',
                                'vitamin B-12'],
 'wosres:WOS_000071013000007': ['sub-Saharan Africa',
                                'Swaziland',
                                'labor migration',
                                'food security',
            

(4) Prepare strings to be later passed on to VALUES keyword in Gastrodon_Query:<br>
(This alleviates the need to send one SPARQL query per keyword, and instead, will be used to group keywords per article ID using the VALUES keyword)

In [167]:
def convert_cleaned_id_vs_keyword_dictionary_to_values_strings_dictionary(cleaned_id_vs_keyword_dictionary):
    """
    Args:
        cleaned_id_vs_keyword_dictionary: must be cleaned, as strings that contain " or ' characters may cause problems 
            due to output being dynamically wrapped in one of the characters as well (i.e., to wrap them as a string to 
            pass as VALUES parameter)
    Returns:
        dict
    """
    parameterised_id_vs_keyword_dictionary = {}
    for each_article_id, each_keywords_list in cleaned_id_vs_keyword_dictionary.items():

        keywords_string_for_values_parameters = ''
        for each_keyword in each_keywords_list:
            keywords_string_for_values_parameters = keywords_string_for_values_parameters + ' "%s"' % each_keyword
            
        parameterised_id_vs_keyword_dictionary[each_article_id] = keywords_string_for_values_parameters

    return parameterised_id_vs_keyword_dictionary

parameterised_id_vs_keyword_dictionary = convert_cleaned_id_vs_keyword_dictionary_to_values_strings_dictionary(cleaned_id_vs_keyword_dictionary)
pprint(parameterised_id_vs_keyword_dictionary)

{'wosres:WOS_000070970500011': ' "Elymus athericus" "growth" "photosynthesis" '
                               '"ozone" "UV-B radiation"',
 'wosres:WOS_000070998100010': ' "pain postoperative" "analgesics prescribing"',
 'wosres:WOS_000070998900007': ' "analysis" "reconstruction" "optical '
                               'pathlength" "phase" "transparent" "living"',
 'wosres:WOS_000071006900008': ' "atherosclerosis" "homocysteine" "metformin" '
                               '"vitamin B-12"',
 'wosres:WOS_000071013000007': ' "sub-Saharan Africa" "Swaziland" "labor '
                               'migration" "food security" "labor"',
 'wosres:WOS_000071021600006': ' "electrochemical" "NO sensing"',
 'wosres:WOS_000071040300005': ' "lumbar spine" "vertebra" "trabecular bone" '
                               '"Wolffs Law" "intervertebral"'}


(5) Update the test graph using the parameterised keywords:

In [168]:
# TODO: Each keyword will be inserted as a new articleId-keyword pair to the wos triple store
def update_graph_with_parameterised_id_vs_keyword_dictionary(parameterised_id_vs_keyword_dictionary, graph_to_insert_into, show_progress=False):
    
    connecting_property = 'kfir:hasAuthorKeyword'  # arbitrary name, can use prefix if previously defined in Gastrodon_Query endpoint

    for each_article_id, each_values_parameter_string in parameterised_id_vs_keyword_dictionary.items():

        if show_progress:
            print (each_article_id, connecting_property, each_values_parameter_string)

        eculture_query.send_update_query(query="""
            INSERT {
                   GRAPH %s {
                       %s %s ?keyword .
                   }
               }
            WHERE{
                VALUES ?keyword {%s}
            }
            """ % (graph_to_insert_into, each_article_id, connecting_property, each_values_parameter_string)                            
        )
                                            
update_graph_with_parameterised_id_vs_keyword_dictionary(parameterised_id_vs_keyword_dictionary, 'testGraph:', show_progress=True)

wosres:WOS_000070970500011 kfir:hasAuthorKeyword  "Elymus athericus" "growth" "photosynthesis" "ozone" "UV-B radiation"
wosres:WOS_000070998100010 kfir:hasAuthorKeyword  "pain postoperative" "analgesics prescribing"
wosres:WOS_000070998900007 kfir:hasAuthorKeyword  "analysis" "reconstruction" "optical pathlength" "phase" "transparent" "living"
wosres:WOS_000071006900008 kfir:hasAuthorKeyword  "atherosclerosis" "homocysteine" "metformin" "vitamin B-12"
wosres:WOS_000071013000007 kfir:hasAuthorKeyword  "sub-Saharan Africa" "Swaziland" "labor migration" "food security" "labor"
wosres:WOS_000071021600006 kfir:hasAuthorKeyword  "electrochemical" "NO sensing"
wosres:WOS_000071040300005 kfir:hasAuthorKeyword  "lumbar spine" "vertebra" "trabecular bone" "Wolffs Law" "intervertebral"


In [169]:
eculture_query.send_select_query("""
    SELECT DISTINCT * 
    WHERE { 
        GRAPH testGraph: {
            ?s ?p ?o .
        } 
    }
""")

Unnamed: 0,s,p,o
0,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,photosynthesis
1,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,ozone
2,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,UV-B radiation
3,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,growth
4,wosres:WOS_000070970500011,kfir:hasAuthorKeyword,Elymus athericus
5,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,pain postoperative
6,wosres:WOS_000070998100010,kfir:hasAuthorKeyword,analgesics prescribing
7,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,analysis
8,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,living
9,wosres:WOS_000070998900007,kfir:hasAuthorKeyword,reconstruction


#### Use the Keywords Pipeline on the Entire WoS Graph

This separates and cleans all keywords in WoS Graph, and updates the graph with these processed keywords using the following structure: <br> *?article_id--kfir:hasAuthorKeyword-->?cleaned_keyword* structure.

(0) **CLEAN** KFIR Graph:

Delete all triples in the kfirGraph:

In [181]:
# Must be run on server (otherwise causes timeouts)
# CLEAR GRAPH kfirGraph:

Confirm deletions:

In [187]:
eculture_query.send_select_query(query="""
    SELECT DISTINCT * {
        GRAPH kfirGraph:{
            ?s ?p ?o .
        }
    }
 
""")

Unnamed: 0,s,p,o


(1) Retrieve all articles from wosGraph: and their corresponding keyword list:

In [184]:
articles_vs_keywords_dataframe = retrieve_articles_and_keyword_lists_as_dataframe()
print(len(articles_vs_keywords_dataframe))

10000


(2) Convert results (which are in pandas dataframe format) to a dictionary:

In [185]:
id_vs_keyword_dictionary = convert_articles_vs_keywords_dataframe_to_dictionary(articles_vs_keywords_dataframe)
print(len(articles_vs_keywords_dataframe))

10000


(3) Clean the dictionary:

In [186]:
cleaned_id_vs_keyword_dictionary = clean_id_vs_keyword_dictionary(id_vs_keyword_dictionary)
print(len(cleaned_id_vs_keyword_dictionary))

5934


(4) Parameterize the dictionary for usage in VALUES keyword:

In [188]:
parameterised_id_vs_keyword_dictionary = convert_cleaned_id_vs_keyword_dictionary_to_values_strings_dictionary(cleaned_id_vs_keyword_dictionary)
print(len(parameterised_id_vs_keyword_dictionary))

5934


(5) **UPDATE** kfirGraph with the tokenized and cleaned keywords: <br>
(Takes a few minutes)

In [191]:
# first run may return error, and it may take more than one time to fully update 
# (run this snipped multiple times even if it returns error. Each run will be faster due to already added items from previous runs.)
update_graph_with_parameterised_id_vs_keyword_dictionary(parameterised_id_vs_keyword_dictionary, 'kfirGraph:')

Confirm that the kfirGraph: is updated:

In [193]:
eculture_query.send_select_query("""
    SELECT DISTINCT (?article AS ?wosArticleUri) ?authorKeyword
    WHERE { 
        GRAPH kfirGraph: {
            ?article kfir:hasAuthorKeyword ?authorKeyword .
        } 
    }
""")

Unnamed: 0,wosArticleUri,authorKeyword
0,wosres:WOS_000168368500006,P
1,wosres:WOS_000080162700011,Africa
2,wosres:WOS_000165719300016,Africa
3,wosres:WOS_000169902800011,Africa
4,wosres:WOS_000174993500042,Africa
5,wosres:WOS_000176751200010,Africa
6,wosres:WOS_000166595300001,Albania
7,wosres:WOS_000167615300010,Albania
8,wosres:WOS_000088328200008,Alberta
9,wosres:WOS_000074619300006,Amazonas


### DOI

In [194]:
retrieve_all_sub_attributes('wos:DOI')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,wos:DOI


Display all attributes related to DOI in a table:

In [195]:
eculture_query.send_select_query("""

    SELECT DISTINCT *
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:DOI ?doi .
                     
            ?doi rdfs:label ?doiLabel ;
                          wos:DOI ?doiDoi ;
                          a       ?doiType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,article,doi,doiLabel,doiDoi,doiType
0,wosres:WOS_000070998900007,http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x,"Van Munster EB, 1997, J MICROSC-OXFORD, V188, P149, DOI 10.1046/j.1365-2818.1997.2570815.x",http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x,wos:CitedPublication
1,wosres:WOS_000071636500010,http://dx.doi.org/10.1002%2F(SICI)1096-9896(199801)184%3A1%3C53%3A%3AAID-PATH6%3E3.0.CO%3B2-7,"De Jong JS, 1998, J PATHOL, V184, P53, DOI 10.1002/(SICI)1096-9896(199801)184:1<53::AID-PATH6>3....",http://dx.doi.org/10.1002%2F(SICI)1096-9896(199801)184%3A1%3C53%3A%3AAID-PATH6%3E3.0.CO%3B2-7,wos:CitedPublication
2,wosres:WOS_000072954400012,http://dx.doi.org/10.1002%2Fana.410430413,"van den Berg JSP, 1998, ANN NEUROL, V43, P494, DOI 10.1002/ana.410430413",http://dx.doi.org/10.1002%2Fana.410430413,wos:CitedPublication
3,wosres:WOS_000071179700013,http://dx.doi.org/10.1006%2Fexer.1997.0396,"Broekhuyse RM, 1997, EXP EYE RES, V65, P841, DOI 10.1006/exer.1997.0396",http://dx.doi.org/10.1006%2Fexer.1997.0396,wos:CitedPublication
4,wosres:WOS_000071912700002,http://dx.doi.org/10.1006%2Fgcen.1997.7001,"de Lange RPJ, 1998, GEN COMP ENDOCR, V109, P166, DOI 10.1006/gcen.1997.7001",http://dx.doi.org/10.1006%2Fgcen.1997.7001,wos:CitedPublication
5,wosres:WOS_000074708500005,http://dx.doi.org/10.1006%2Fjdeq.1998.3428,"Gohberg I, 1998, J DIFFER EQUATIONS, V146, P375, DOI 10.1006/jdeq.1998.3428",http://dx.doi.org/10.1006%2Fjdeq.1998.3428,wos:CitedPublication
6,wosres:WOS_000073621000010,http://dx.doi.org/10.1007%2Fs002210050380,"Toussaint HM, 1998, EXP BRAIN RES, V120, P85, DOI 10.1007/s002210050380",http://dx.doi.org/10.1007%2Fs002210050380,wos:CitedPublication
7,wosres:WOS_000073621900004,http://dx.doi.org/10.1007%2Fs002510050382,"Bouma G, 1998, IMMUNOGENETICS, V47, P451, DOI 10.1007/s002510050382",http://dx.doi.org/10.1007%2Fs002510050382,wos:CitedPublication
8,wosres:WOS_000071074300003,http://dx.doi.org/10.1007%2Fs004320050111,"van der Wilt CL, 1997, J CANCER RES CLIN, V123, P595, DOI 10.1007/s004320050111",http://dx.doi.org/10.1007%2Fs004320050111,wos:CitedPublication
9,wosres:WOS_000074171100014,http://dx.doi.org/10.1007%2Fs004390050745,"Collee JM, 1998, HUM GENET, V102, P587, DOI 10.1007/s004390050745",http://dx.doi.org/10.1007%2Fs004390050745,wos:CitedPublication


Display only the attributes of interest:

In [197]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?doiDoi as ?doi)
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article ;
                     wos:DOI ?doi .
                     
            ?doi rdfs:label ?doiLabel ;
                          wos:DOI ?doiDoi ;
                          a       ?doiType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,doi
0,wosres:WOS_000070998900007,http://dx.doi.org/10.1046%2Fj.1365-2818.1997.2570815.x
1,wosres:WOS_000071636500010,http://dx.doi.org/10.1002%2F(SICI)1096-9896(199801)184%3A1%3C53%3A%3AAID-PATH6%3E3.0.CO%3B2-7
2,wosres:WOS_000072954400012,http://dx.doi.org/10.1002%2Fana.410430413
3,wosres:WOS_000071179700013,http://dx.doi.org/10.1006%2Fexer.1997.0396
4,wosres:WOS_000071912700002,http://dx.doi.org/10.1006%2Fgcen.1997.7001
5,wosres:WOS_000074708500005,http://dx.doi.org/10.1006%2Fjdeq.1998.3428
6,wosres:WOS_000073621000010,http://dx.doi.org/10.1007%2Fs002210050380
7,wosres:WOS_000073621900004,http://dx.doi.org/10.1007%2Fs002510050382
8,wosres:WOS_000071074300003,http://dx.doi.org/10.1007%2Fs004320050111
9,wosres:WOS_000074171100014,http://dx.doi.org/10.1007%2Fs004390050745


### 'Has Cited'

In [198]:
retrieve_all_sub_attributes('wos:CR')

Unnamed: 0,p
0,rdf:type
1,rdfs:label
2,wos:DOI


Display all attributes related to has_cite in a table:

In [199]:
eculture_query.send_select_query("""

    SELECT DISTINCT *
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:CR ?citedArticle .
            ?citedArticle rdfs:label ?citedArticleTitle ;
                          wos:DOI ?citedArticleDoi ;
                          rdf:type ?citedArticleType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,article,citedArticle,citedArticleTitle,citedArticleDoi,citedArticleType
0,wosres:WOS_000403097000001,http://dx.doi.org/%5B10.1002%2F14651858.CD000088.pub3,"Pharoah F, 2010, COCHRANE DB SYST REV, DOI [10.1002/14651858.CD000088.pub3, 10.1002/14651858.CD0...",http://dx.doi.org/%5B10.1002%2F14651858.CD000088.pub3,wos:CitedPublication
1,wosres:WOS_000305753900001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
2,wosres:WOS_000319349400001,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
3,wosres:WOS_000308466000009,http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,"Latta RG, 2007, GENETICA, V129, P167, DOI [10.1007/s10709-006-9012-x, 10.1007/s10706-006-9012-x]",http://dx.doi.org/%5B10.1007%2Fs10709-006-9012-x,wos:CitedPublication
4,wosres:WOS_000231565800007,http://dx.doi.org/%5B10.1016%2FS0091-6749(03)01942-0,"Pastorello EA, 2003, J ALLERGY CLIN IMMUN, V112, P775, DOI [10.1016/S0091-6749(03)01942-0, 10.10...",http://dx.doi.org/%5B10.1016%2FS0091-6749(03)01942-0,wos:CitedPublication
5,wosres:WOS_000324765100020,http://dx.doi.org/%5B10.1016%2FS0140-6736(08)61698-0,"Garnett GP, 2009, LANCET, V373, P9, DOI [10.1016/S0140-6736(08)61698-0, 10.1016/S01406736(08)616...",http://dx.doi.org/%5B10.1016%2FS0140-6736(08)61698-0,wos:CitedPublication
6,wosres:WOS_000332113800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(11)61105-7,"Kim Y, 2011, LANCET, V378, P317, DOI [10.1016/S0140-6736(11)61105-7, 10.1016/S0140-6736(11)61169-0]",http://dx.doi.org/%5B10.1016%2FS0140-6736(11)61105-7,wos:CitedPublication
7,wosres:WOS_000383255900176,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,"Garcia-Basteiro AL, 2014, LANCET, V383, P215, DOI [10.1016/S0140-6736(13)62223-0, 10.1016/S0140-...",http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,wos:CitedPublication
8,wosres:WOS_000376720800001,http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,"Garcia-Basteiro AL, 2014, LANCET, V383, P215, DOI [10.1016/S0140-6736(13)62223-0, 10.1016/S0140-...",http://dx.doi.org/%5B10.1016%2FS0140-6736(13)62223-0,wos:CitedPublication
9,wosres:WOS_000279537600003,http://dx.doi.org/%5B10.1016%2FS0301-4215(03)00134-4,"Kamp LM, 2004, ENERG POLICY, V32, P1625, DOI [10.1016/S0301-4215(03)00134-4, 10.1016/80301-4215(...",http://dx.doi.org/%5B10.1016%2FS0301-4215(03)00134-4,wos:CitedPublication


Display only the attributes of interest:

In [200]:
eculture_query.send_select_query("""

    SELECT DISTINCT (?article AS ?wosArticleUri) (?citedArticleDoi AS ?hasCitedArticle_withDoi)
    WHERE{
        GRAPH wosGraph: {
            ?article a wos:Article .
            ?article wos:CR ?citedArticle .
            ?citedArticle rdfs:label ?citedArticleTitle ;
                          wos:DOI ?citedArticleDoi ;
                          rdf:type ?citedArticleType .
        }
    }
    LIMIT 10

""")

Unnamed: 0,wosArticleUri,hasCitedArticle_withDoi
0,wosres:WOS_000070998900007,http://dx.doi.org/10.1038%2F169366b0
1,wosres:WOS_000070998900007,http://dx.doi.org/10.1111%2Fj.1365-2818.1992.tb04307.x
2,wosres:WOS_000071006900008,http://dx.doi.org/10.1002%2Fajh.2830340205
3,wosres:WOS_000071077500014,http://dx.doi.org/10.1002%2Fcne.903240409
4,wosres:WOS_000071092900005,http://dx.doi.org/10.1006%2Fabio.1994.1208
5,wosres:WOS_000071074300003,http://dx.doi.org/10.1007%2FBF00685670
6,wosres:WOS_000071053800004,http://dx.doi.org/10.1016%2F0003-4916(70)90025-4
7,wosres:WOS_000071006900008,http://dx.doi.org/10.1016%2F0021-9150(93)90258-V
8,wosres:WOS_000071071800034,http://dx.doi.org/10.1016%2F0039-6028(96)00070-2
9,wosres:WOS_000071077500024,http://dx.doi.org/10.1016%2F0165-0173(95)00011-9
