## Visualization of knowledge graphs

*    This is a Google Colab notebook. You need a Google account with Google Drive to store and load the main file with the relationships for inspection.
*    Upload the notebook from its location in GitHub and allow the code to access your Google Drive.
*    Please put your credentials in the cell "Connect to the Virtuoso database".
*    Put your own keywords in the cell with the title "Enter keywords below". Note that the search is case-sensitive.

In [1]:
# Mount Google Drive at /content/drive; a later cell writes the relationships
# spreadsheet to /content/drive/MyDrive/df.xlsx.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Installations

In [2]:
!pip install pyodbc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install pyvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install SPARQLWrapper
!pip install sparql_dataframe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!apt-get install virtuoso-opensource

Reading package lists... Done
Building dependency tree       
Reading state information... Done
virtuoso-opensource is already the newest version (6.1.6+repack-0ubuntu9).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


## Imports

In [6]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
import numpy as np
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

pd.set_option('display.max_rows', 500)

from IPython.display import Image

import random

import palettable
from palettable.colorbrewer.sequential import Blues_9
from palettable.colorbrewer.sequential import Greens_9
from palettable.colorbrewer.qualitative import Accent_8
from palettable.colorbrewer.qualitative import Paired_7

from pyvis import network as net
from IPython.core.display import display, HTML

## Connect to the Virtuoso database

In [7]:
# Credentials for the Virtuoso server.
# They are read from the VIRTUOSO_USER / VIRTUOSO_PASSWORD environment variables
# when those are set, so real secrets need not be saved inside the notebook.
# Otherwise replace the 'xxxxx' placeholders below (and do not share the file).
user = os.environ.get('VIRTUOSO_USER', 'xxxxx')
passw = os.environ.get('VIRTUOSO_PASSWORD', 'xxxxx')

In [8]:
def connect_db(DSN, DBA, UID, PWD):
    """Open an ODBC connection to the Virtuoso server and return it with a cursor.

    Parameters
    ----------
    DSN : str
        Kept for backward compatibility; it is not used. The connection string
        names the driver file and host directly instead of a configured DSN.
    DBA : str
        Database name placed in the ``DATABASE=`` field.
    UID : str
        User name placed in the ``UID=`` field.
    PWD : str
        Password placed in the ``PWD=`` field.

    Returns
    -------
    tuple
        ``(connection, cursor)`` as returned by ``pyodbc.connect`` and
        ``connection.cursor()``.
    """
    # Use the arguments the caller passed in, not the notebook-level
    # `user` / `passw` globals or a fixed database name.
    connection_string = (
        'DRIVER=/usr/lib/odbc/virtodbc.so;'
        'HOST=lod.csd.auth.gr:1111;'
        'UID=' + str(UID) + ';PWD=' + str(PWD) + ';DATABASE=' + str(DBA)
    )
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    return connection, cursor


def connect_virtuoso(DSN, UID, PWD):
    """Create a SPARQLWrapper client for the endpoint URL ``DSN``.

    The client is configured for HTTP digest authentication with the
    credentials ``UID`` / ``PWD`` and for GET requests, then returned.
    """
    client = SPARQLWrapper(DSN)

    # Authentication scheme first, then the credentials it will use.
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)

    # Default request method for this client.
    client.setMethod(GET)

    return client


# Connection to CDB (disabled -- uncomment to open the ODBC connection)
# connection, cursor = connect_db('Virtuoso All', 'ESTAT', user, passw)

# Connection to the KDB: `sparql` is the shared client used by the query functions below.
endpoint = "http://lod.csd.auth.gr:8890/sparql/"
sparql = connect_virtuoso(endpoint, user, passw)

## Extraction of relationships

In [9]:
def databaseTable(keywords):
  """Find tables/databases referenced by articles that mention ``keywords``.

  The query selects articles for which some property value of the article
  itself, or of one of its paragraphs, contains the keyword text, and returns
  each referenced resource typed ``estat:Table`` or ``estat:Database`` together
  with its title and URL.

  Parameters
  ----------
  keywords : str
      Text searched with SPARQL ``contains()``. It is converted with ``str()``.

  Returns
  -------
  pandas.DataFrame
      The JSON result bindings flattened by ``pd.json_normalize`` (columns such
      as ``'article.value'``, ``'refURL.value'``).

  Notes
  -----
  Uses the notebook-level ``sparql`` client and changes its query, request
  method (POST) and return format (JSON).
  """
  # Escape the characters that would terminate or corrupt a double-quoted
  # SPARQL string literal, so keywords containing quotes or backslashes
  # cannot break (or alter) the query.
  needle = (str(keywords)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r'))

  # NOTE(review): the literal below starts with a space before the keyword
  # text, so only matches preceded by a space are found -- confirm whether
  # that is intended.
  RelationsStatements = """
  DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  PREFIX estatdata: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX oecd: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/oecd/> 
  select ?article ?title ?articleURL ?r ?reftitle ?refURL where { 
      {?article rdf:type estat:Article .
      ?article ?p ?t .}
      UNION
      {?article rdf:type estat:Article .
      ?article estat:hasParagraph ?x.
      ?x ?p ?t .}
      filter contains(str(?t),\" """ + needle + """\") 
      ?article estat:title ?title .
      ?article estat:hasURL ?articleURL.
      ?article estat:hasReference ?r .
      {?r rdf:type estat:Table} 
      UNION
      {?r rdf:type estat:Database}
      ?r estat:hasURL ?refURL .
      ?r estat:title ?reftitle .
  } 
  """

  sparql.setQuery(RelationsStatements)
  sparql.setMethod(POST)
  sparql.setReturnFormat(JSON)
  bindings = sparql.query().convert()['results']['bindings']
  results = pd.json_normalize(bindings)

  return results

def articleTable(keywords):
  """Find resources referenced by articles that mention ``keywords``.

  The query selects articles for which some property value of the article
  itself, or of one of its paragraphs, contains the keyword text. For every
  reference ``?r`` of such an article it returns either the resource that
  ``?r`` points to through ``estat:hasURI`` (with that resource's title and
  URL), or ``?r`` itself when ``?r`` carries a title and URL directly.

  Parameters
  ----------
  keywords : str
      Text searched with SPARQL ``contains()``. It is converted with ``str()``.

  Returns
  -------
  pandas.DataFrame
      The JSON result bindings flattened by ``pd.json_normalize`` (columns such
      as ``'article.value'``, ``'euroURL.value'``). Because the query uses
      ``select *``, every variable bound in a row appears.

  Notes
  -----
  Uses the notebook-level ``sparql`` client and changes its query, request
  method (POST) and return format (JSON).
  """
  # Escape the characters that would terminate or corrupt a double-quoted
  # SPARQL string literal, so keywords containing quotes or backslashes
  # cannot break (or alter) the query.
  needle = (str(keywords)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r'))

  # NOTE(review): the literal below starts with a space before the keyword
  # text, so only matches preceded by a space are found -- confirm whether
  # that is intended.
  RelationsStatements = """
  DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  PREFIX estatdata: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX oecd: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/oecd/> 
  select * where { 
      {?article rdf:type estat:Article .
      ?article ?p ?t .}
      UNION
      {?article rdf:type estat:Article .
      ?article estat:hasParagraph ?x.
      ?x ?p ?t .}
      filter contains(str(?t),\" """ + needle + """\") 
      ?article estat:title ?title .
      ?article estat:hasURL ?articleURL.
      ?article estat:hasReference ?r .
      { ?r estat:hasURI ?euroArticle.
      ?euroArticle estat:title ?euroTitle .
        ?euroArticle estat:hasURL ?euroURL.}
     UNION
     {?r estat:hasURL ?euroURL.
     ?r estat:title ?euroTitle .
     BIND(?r as ?euroArticle)
    }   
  } 
  """

  sparql.setQuery(RelationsStatements)
  sparql.setMethod(POST)
  sparql.setReturnFormat(JSON)
  bindings = sparql.query().convert()['results']['bindings']
  results = pd.json_normalize(bindings)

  return results




### Production of the dataframe with the relationships

In [10]:
def produce_df(keywords):
    """Build the article -> related-resource table for ``keywords``.

    Runs ``databaseTable`` and ``articleTable`` for the same keywords, keeps
    only the value columns of interest from each result, renames them to a
    common schema and stacks the two results.

    Parameters
    ----------
    keywords : str
        Search text passed unchanged to both query functions.

    Returns
    -------
    pandas.DataFrame
        Columns ``articleURI``, ``articleTitle``, ``articleURL``,
        ``relatedURI``, ``relatedTitle``, ``relatedURL``. The row index is the
        concatenation of the two results' own indexes (not renumbered).
    """

    def _tidy(raw, value_columns):
        # Select the wanted columns by name instead of dropping a fixed list of
        # unwanted ones: a metadata column that is absent from a particular
        # result (or an entirely empty result) then no longer raises KeyError.
        # Columns missing from `raw` come back empty.
        return raw.reindex(columns=list(value_columns)).rename(columns=value_columns)

    results1 = _tidy(databaseTable(keywords), {
        'article.value': 'articleURI',
        'title.value': 'articleTitle',
        'articleURL.value': 'articleURL',
        'r.value': 'relatedURI',
        'reftitle.value': 'relatedTitle',
        'refURL.value': 'relatedURL',
    })

    results2 = _tidy(articleTable(keywords), {
        'article.value': 'articleURI',
        'title.value': 'articleTitle',
        'articleURL.value': 'articleURL',
        'euroArticle.value': 'relatedURI',
        'euroTitle.value': 'relatedTitle',
        'euroURL.value': 'relatedURL',
    })

    df = pd.concat([results1, results2])
    return df



### Enter keywords below
*    To re-run with different keywords, change them below and re-run from this cell ("Runtime > Run after").

In [11]:
# Search text handed to produce_df(), which embeds it in the contains() filter of both SPARQL queries.
keywords = 'morbidity statistics'

## Produce file for inspection

In [12]:

# Run both queries for the chosen keywords, then save the combined table to
# Google Drive so it can be inspected as a spreadsheet.
df = produce_df(keywords)
df.to_excel(excel_writer='/content/drive/MyDrive/df.xlsx')


In [13]:
## add column relatedType indicating the type of the related resource

## (URL fragment, type label) pairs. Order matters: np.select keeps the first
## condition that matches a row.
url_markers = [
    ('Glossary:', 'GL article'),
    ('statistics-explained/index.php', 'SE article'),
    ## ('data/database', 'Data'),
    ## ('/news/', 'News'),
    ('eurostat/product', 'Publication'),
    ('eur-lex.europa.eu', 'Legislation'),
]

## regex=False: the fragments are literal text (otherwise '.' would match any character).
## na=False: rows without a URL count as "no match" instead of producing NaN conditions.
conditions = [
    df['relatedURL'].str.contains(marker, regex=False, na=False)
    for marker, _ in url_markers
]
values = [label for _, label in url_markers]

df['relatedType'] = np.select(conditions, values, default='Other')

## keep only the recognised types and renumber the rows 0..n-1
df = df[df['relatedType'] != 'Other'].copy()
df.reset_index(drop=True, inplace=True)

##display(df)


In [14]:
## unique URIs in Articles and Related resources

valuesURI = df[['articleURI','relatedURI']].values
uniqueURI = np.unique(valuesURI)  ## flattens both columns and returns the sorted distinct URIs
print('uniqueURIs:')
print(uniqueURI)  ## a bare expression is only displayed when it is the last line of a cell

## one integer ID per distinct URI (its position in the sorted array)
df2 = pd.DataFrame(uniqueURI,columns=['uniqueURI'])
df2['ID'] = range(len(df2))
##df2

uniqueURIs:


In [15]:
## transfer the unique IDs to the main file

## Look up the ID of each row's article URI, then of its related URI, in df2.
df3 = (
    df.merge(df2, how='inner', left_on='articleURI', right_on='uniqueURI')
      .rename(columns={'ID': 'articleID'})
      .merge(df2, how='inner', left_on='relatedURI', right_on='uniqueURI')
      .rename(columns={'ID': 'relatedID'})
)

## keep only the columns used for the graph, IDs first
graph_columns = ['articleID','relatedID','relatedType',
                 'articleURI','articleURL','articleTitle',
                 'relatedURI','relatedURL','relatedTitle']
df3 = df3[graph_columns]
##df3

In [16]:
## related type groups and assignment to colors
## type_groups: sorted distinct related types; node_colors: one palette color per type, in the same order
type_groups = np.unique(df3['relatedType'])
print(type_groups)
node_colors = Paired_7.hex_colors[:type_groups.size]
node_colors

['GL article' 'Publication' 'SE article']


['#A6CEE3', '#1F78B4', '#B2DF8A']

In [17]:
## Create graph

g=net.Network(height='600px', width='75%',heading='')
g.barnes_hut()
g.repulsion(node_distance=100, spring_length=200)

## Color of each related type, looked up by name rather than by position.
color_by_type = dict(zip(type_groups, node_colors))

## Article (source) nodes use the 'SE article' color. If no related resource has
## that type, fall back to the next palette entry instead of failing on an empty lookup.
if 'SE article' in color_by_type:
    article_color = color_by_type['SE article']
else:
    article_color = Paired_7.hex_colors[len(node_colors) % len(Paired_7.hex_colors)]

## First the unique articles - source nodes, by articleID
uniqueArticles = df3.groupby(['articleID'])[['articleTitle']].agg(list).reset_index() ## aggregate into lists - titles are repeated
article_ids = uniqueArticles['articleID'].tolist()
article_labels = [titles[0] for titles in uniqueArticles['articleTitle']] ## first title of each article
g.add_nodes(article_ids, label=article_labels, color=[article_color]*len(article_ids))

## Then the related nodes, by type
relatedGroups = df3.groupby(['relatedType'])[['relatedID','relatedTitle']].agg(list).reset_index() ## again aggregate into lists
for related_type, related_ids, related_titles in zip(relatedGroups['relatedType'],
                                                     relatedGroups['relatedID'],
                                                     relatedGroups['relatedTitle']):
    g.add_nodes(related_ids, label=related_titles, color=[color_by_type[related_type]]*len(related_ids))

## The edges: one per row, colored and labelled by the type of the related resource
for article_id, related_id, related_type in zip(df3['articleID'], df3['relatedID'], df3['relatedType']):
    g.add_edge(int(article_id), int(related_id), color=color_by_type[related_type], label=related_type)
g.show('example.html')
display(HTML('example.html'))
