## Visualization of knowledge graphs

*    This is a Google Colab notebook. You need a Google account with Google Drive to store and load the main file with the relationships for inspection.
*    Upload the notebook from its location in GitHub and allow the code to access your Google Drive.
*    Please put your credentials in the cell "Connect to the Virtuoso database".
*    Put your own keywords in the cell with the title "Enter keywords below".

In [129]:
# Mount Google Drive under /content/drive; the relationships file is later
# written to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Installations

In [130]:
!pip install pyodbc



In [131]:
!pip install pyvis



In [132]:
!pip install SPARQLWrapper
!pip install sparql_dataframe



In [133]:
!apt-get install virtuoso-opensource

Reading package lists... Done
Building dependency tree       
Reading state information... Done
virtuoso-opensource is already the newest version (6.1.6+repack-0ubuntu9).
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.


## Imports

In [134]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
import numpy as np
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

pd.set_option('display.max_rows', 500)

from IPython.display import Image

import random

import palettable
from palettable.colorbrewer.sequential import Blues_9
from palettable.colorbrewer.sequential import Greens_9
from palettable.colorbrewer.qualitative import Accent_8
from palettable.colorbrewer.qualitative import Paired_7

from pyvis import network as net
from IPython.core.display import display, HTML

## Connect to the Virtuoso database

In [135]:
def connect_db(DSN, DBA, UID, PWD,
               driver='/usr/lib/odbc/virtodbc.so',
               host='lod.csd.auth.gr:1111'):
    """Open an ODBC connection to the Virtuoso server and return it with a cursor.

    The connection string is assembled from the arguments, so no user name
    or password needs to be written into this function.

    Parameters
    ----------
    DSN : str
        Kept for backward compatibility only; it is not used. The connection
        is made directly through ``driver`` and ``host``.
    DBA : str
        Database name, sent as ``DATABASE``.
    UID : str
        User name, sent as ``UID``.
    PWD : str
        Password, sent as ``PWD``.
    driver : str, optional
        Path of the Virtuoso ODBC driver library.
    host : str, optional
        Server address in ``hostname:port`` form.

    Returns
    -------
    tuple
        ``(connection, cursor)`` where ``cursor`` was created from
        ``connection``.
    """
    connection_string = ';'.join([
        'DRIVER=' + str(driver),
        'HOST=' + str(host),
        'UID=' + str(UID),
        'PWD=' + str(PWD),
        'DATABASE=' + str(DBA),
    ])
    connection = pyodbc.connect(connection_string)
    cursor = connection.cursor()

    return connection, cursor


def connect_virtuoso(DSN, UID, PWD):
    """Create a SPARQL client for the endpoint ``DSN``.

    The client is set up with HTTP Digest authentication using ``UID`` and
    ``PWD`` and is preset to the GET request method.

    Returns the configured ``SPARQLWrapper`` instance.
    """
    client = SPARQLWrapper(DSN)
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)
    client.setMethod(GET)
    return client


# Credentials are read from the environment (VIRTUOSO_USER / VIRTUOSO_PASSWORD)
# and requested interactively when a variable is not set, so that no password
# is stored in the notebook.
# NOTE(review): a password that was committed here earlier should be treated
# as exposed and rotated.
from getpass import getpass

user = os.environ.get("VIRTUOSO_USER") or input("Virtuoso user: ")
login = os.environ.get("VIRTUOSO_PASSWORD") or getpass("Virtuoso password: ")

# Connection to CDB
connection, cursor = connect_db('Virtuoso All',
                                'ESTAT',
                                user,
                                login)

# Connection to the KDB
endpoint = "http://lod.csd.auth.gr:8890/sparql/"
sparql = connect_virtuoso(endpoint,
                          user,
                          login)

## Extraction of relationships

In [136]:
def _sparql_string_literal(text):
    """Return ``text`` as a double-quoted SPARQL string literal.

    Backslashes, double quotes and line breaks are escaped so the value
    cannot end the literal early and change the meaning of the query.
    """
    escaped = (
        str(text)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    return '"' + escaped + '"'


def _run_select(query):
    """Run ``query`` on the module-level ``sparql`` client and flatten the bindings.

    Returns a DataFrame built with ``pd.json_normalize`` from the
    ``results.bindings`` list of the JSON response.
    """
    sparql.setQuery(query)
    sparql.method = "POST"
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']
    return pd.json_normalize(bindings)


def databaseTable(keywords):
    """Find tables and databases referenced by articles that mention ``keywords``.

    An article matches when the string form of one of its own property
    values, or of a property value of one of its paragraphs, contains the
    search text. For every reference of a matching article that is typed
    ``estat:Table`` or ``estat:Database`` the query selects ``?article``,
    ``?title``, ``?articleURL``, ``?r``, ``?reftitle`` and ``?refURL``.

    Parameters
    ----------
    keywords : str
        Text to search for. It is escaped before being placed in the query.

    Returns
    -------
    pandas.DataFrame
        One row per result binding, as flattened by ``pd.json_normalize``.
    """
    # The search text keeps the single leading space that the query has
    # always placed before the keywords.
    # NOTE(review): confirm that requiring a preceding space is intended.
    search_literal = _sparql_string_literal(' ' + str(keywords))

    RelationsStatements = """
  DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  PREFIX estatdata: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX oecd: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/oecd/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  select ?article ?title ?articleURL ?r ?reftitle ?refURL where {
      {?article rdf:type estat:Article .
      ?article ?p ?t .}
      UNION
      {?article rdf:type estat:Article .
      ?article estat:hasParagraph ?x.
      ?x ?p ?t .}
      filter contains(str(?t),""" + search_literal + """)
      ?article estat:title ?title .
      ?article estat:hasURL ?articleURL.
      ?article estat:hasReference ?r .
      {?r rdf:type estat:Table}
      UNION
      {?r rdf:type estat:Database}
      ?r estat:hasURL ?refURL .
      ?r estat:title ?reftitle .
  }
  """

    return _run_select(RelationsStatements)


def articleTable(keywords):
    """Find articles linked from the references of articles that mention ``keywords``.

    Articles are matched exactly as in ``databaseTable``. For every reference
    of a matching article that has an ``estat:hasURI`` target, the query
    selects ``?article``, ``?title``, ``?articleURL`` together with the
    target's ``?euroArticle``, ``?euroTitle`` and ``?euroURL``.

    Parameters
    ----------
    keywords : str
        Text to search for. It is escaped before being placed in the query.

    Returns
    -------
    pandas.DataFrame
        One row per result binding, as flattened by ``pd.json_normalize``.
    """
    # Same leading space as in databaseTable, kept from the original query.
    # NOTE(review): confirm that requiring a preceding space is intended.
    search_literal = _sparql_string_literal(' ' + str(keywords))

    RelationsStatements = """
  DEFINE input:inference <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  PREFIX estatdata: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/>
  PREFIX oecd: <https://ec.europa.eu/eurostat/NLP4StatRef/knowledge/oecd/>
  PREFIX estat: <https://ec.europa.eu/eurostat/NLP4StatRef/ontology/>
  select ?article ?title ?articleURL ?euroArticle ?euroTitle ?euroURL where {
      {?article rdf:type estat:Article .
      ?article ?p ?t .}
      UNION
      {?article rdf:type estat:Article .
      ?article estat:hasParagraph ?x.
      ?x ?p ?t .}
      filter contains(str(?t),""" + search_literal + """)
      ?article estat:title ?title .
      ?article estat:hasURL ?articleURL.
      ?article estat:hasReference ?r .
      ?r estat:hasURI ?euroArticle.
      ?euroArticle estat:title ?euroTitle .
      ?euroArticle estat:hasURL ?euroURL.
  }
  """

    return _run_select(RelationsStatements)




### Production of the dataframe with the relationships

In [137]:

# Columns of the relationship table, in the order they are returned.
_RELATION_COLUMNS = ['articleURI', 'articleURL', 'articletitle',
                     'relatedURI', 'relatedURL', 'relatedTitle']

# SPARQL variables that feed each of the columns above, per query.
_DATABASE_VARIABLES = ['article', 'articleURL', 'title', 'r', 'refURL', 'reftitle']
_ARTICLE_VARIABLES = ['article', 'articleURL', 'title',
                      'euroArticle', 'euroURL', 'euroTitle']


def _relation_rows(results, variables):
    """Return one tuple per row of ``results`` with the values of ``variables``, in order."""
    if results.empty:
        # A query without matches yields a frame with no columns at all.
        return []
    value_columns = [name + '.value' for name in variables]
    return list(results[value_columns].itertuples(index=False, name=None))


def produce_df(keywords):
    """Build the table of article -> related resource relationships for ``keywords``.

    Runs ``databaseTable`` and ``articleTable`` and stacks their results,
    database/table references first. Values are read from the
    ``<variable>.value`` columns by name rather than by position, so the
    result does not depend on the column layout of the flattened bindings.

    Parameters
    ----------
    keywords : str
        Search text passed on to both queries.

    Returns
    -------
    pandas.DataFrame
        Columns ``articleURI``, ``articleURL``, ``articletitle``,
        ``relatedURI``, ``relatedURL`` and ``relatedTitle``. The columns are
        present even when neither query returns a row.

    Raises
    ------
    KeyError
        If a non-empty result lacks one of the expected ``.value`` columns.
    """
    results1 = databaseTable(keywords)
    results2 = articleTable(keywords)

    rows = (_relation_rows(results1, _DATABASE_VARIABLES)
            + _relation_rows(results2, _ARTICLE_VARIABLES))

    # Create DataFrame
    df = pd.DataFrame(rows, columns=_RELATION_COLUMNS)
    print(len(rows))
    return df

### Enter keywords below
*    To re-run with different keywords, change them below and re-run from this cell ("Runtime > Run after").

In [138]:
# Search text passed to produce_df() in the next cell.
keywords = 'migrant integration'

## Produce file for inspection

In [139]:

# Query the knowledge base for the keywords and collect the relationships.
df = produce_df(keywords)
# Save a copy on the mounted Google Drive for inspection.
df.to_excel('/content/drive/MyDrive/df.xlsx')
display(df)

209


Unnamed: 0,articleURI,articleURL,articletitle,relatedURI,relatedURL,relatedTitle
0,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Employment rates by sex age and citizenship % ...
1,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Unemployment rates by sex age and citizenship ...
2,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/databrowser/view...,Figure 1 Development of the share of self empl...
3,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Activity rates by sex age and country of birth...
4,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Employment rates by sex age and country of bir...
5,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Long term unemployment 12 months or more as a ...
6,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Activity rates by sex age educational attainme...
7,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Employment rates by sex age educational attain...
8,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Unemployment rates by sex age and country of b...
9,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://appsso.eurostat.ec.europa.eu/nui/show.d...,Employment rates by sex age educational attain...


In [140]:
## Add column relatedType indicating the type of the related resource.
## The URL fragments are tested in the order listed and the first match wins.
## The former 'Data' and 'News' categories are disabled, so such links, like
## anything else that matches no fragment, are labelled 'Other' and dropped.
RELATED_TYPE_BY_URL_FRAGMENT = [
    ('Glossary:', 'GL article'),
    ('statistics-explained/index.php', 'SE article'),
    ('eurostat/product', 'Publication'),
    ('eur-lex.europa.eu', 'Legislation'),
]

## regex=False: the fragments are literal text, so '.' must not act as a wildcard.
## na=False: a missing URL counts as "no match" instead of producing NaN.
conditions = [
    df['relatedURL'].str.contains(fragment, regex=False, na=False)
    for fragment, _ in RELATED_TYPE_BY_URL_FRAGMENT
]
values = [label for _, label in RELATED_TYPE_BY_URL_FRAGMENT]

df = df.assign(relatedType=np.select(conditions, values, default='Other'))
df = (
    df[df['relatedType'] != 'Other']
    .reset_index(drop=True)
    .rename(columns={'articletitle': 'articleTitle'})
)
df


Unnamed: 0,articleURI,articleURL,articleTitle,relatedURI,relatedURL,relatedTitle,relatedType
0,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://eur-lex.europa.eu/legal-content/EN/ALL...,Regulation EC No 862 2007 of the European Parl...,Legislation
1,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Accessing European statistics,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://ec.europa.eu/eurostat/product?code=KS-G...,Manual for Air Emissions Accounts 2015 edition,Publication
2,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Accessing European statistics,https://ec.europa.eu/eurostat/NLP4StatRef/know...,http://ec.europa.eu/eurostat/product?code=KS-G...,Manual for Air Emissions Accounts 2015 edition,Publication
3,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://eur-lex.europa.eu/legal-content/EN/ALL...,Regulation EC No 862 2007 of the European Parl...,Legislation
4,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics introduced,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Statistics for European policies and high prio...,SE article
5,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,People outside the labour force,SE article
6,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,People outside the labour force,SE article
7,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...,SE article
8,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...,SE article
9,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Unemployment statistics and beyond,SE article


In [141]:
## Unique URIs over articles and related resources, each numbered with an ID

valuesURI = df[['articleURI', 'relatedURI']].to_numpy()
uniqueURI = np.unique(valuesURI)  ## both columns pooled; np.unique returns them sorted
print('uniqueURIs:')

df2 = pd.DataFrame(uniqueURI, columns=['uniqueURI']).assign(ID=range(len(uniqueURI)))
df2

uniqueURIs:


Unnamed: 0,uniqueURI,ID
0,https://ec.europa.eu/eurostat/NLP4StatRef/know...,0
1,https://ec.europa.eu/eurostat/NLP4StatRef/know...,1
2,https://ec.europa.eu/eurostat/NLP4StatRef/know...,2
3,https://ec.europa.eu/eurostat/NLP4StatRef/know...,3
4,https://ec.europa.eu/eurostat/NLP4StatRef/know...,4
5,https://ec.europa.eu/eurostat/NLP4StatRef/know...,5
6,https://ec.europa.eu/eurostat/NLP4StatRef/know...,6
7,https://ec.europa.eu/eurostat/NLP4StatRef/know...,7
8,https://ec.europa.eu/eurostat/NLP4StatRef/know...,8
9,https://ec.europa.eu/eurostat/NLP4StatRef/know...,9


In [142]:
## Attach the numeric IDs from df2 to the main table:
## first for the article URI, then for the related URI.

df3 = (
    df.merge(df2, how='inner', left_on='articleURI', right_on='uniqueURI')
      .rename(columns={'ID': 'articleID'})
      .merge(df2, how='inner', left_on='relatedURI', right_on='uniqueURI')
      .rename(columns={'ID': 'relatedID'})
)

## Keep only the ID, type and descriptive columns, IDs first
df3 = df3[['articleID','relatedID','relatedType','articleURI','articleURL','articleTitle','relatedURI','relatedURL','relatedTitle']]
df3

Unnamed: 0,articleID,relatedID,relatedType,articleURI,articleURL,articleTitle,relatedURI,relatedURL,relatedTitle
0,21,35,Legislation,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://eur-lex.europa.eu/legal-content/EN/ALL...,Regulation EC No 862 2007 of the European Parl...
1,21,35,Legislation,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://eur-lex.europa.eu/legal-content/EN/ALL...,Regulation EC No 862 2007 of the European Parl...
2,21,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
3,21,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics active citizenship,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
4,4,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
5,4,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
6,4,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
7,4,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics labour market i...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
8,14,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics regional labour...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...
9,14,3,SE article,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,Migrant integration statistics regional labour...,https://ec.europa.eu/eurostat/NLP4StatRef/know...,https://ec.europa.eu/eurostat/statistics-expla...,First and second generation immigrants statist...


In [143]:
## Related type groups and assignment to colors.
## type_groups holds the distinct relatedType values of df3; node_colors takes
## the first len(type_groups) colors of the Paired_7 palette, so the color at
## position i is used for the type at position i.
type_groups = np.unique(df3['relatedType'])
print(type_groups)
node_colors = Paired_7.hex_colors[:len(type_groups)]
node_colors

['GL article' 'Legislation' 'Publication' 'SE article']


['#A6CEE3', '#1F78B4', '#B2DF8A', '#33A02C']

In [144]:
## Create graph

## Color for the source articles when none of the related resources is an
## 'SE article' (the type whose color the source articles normally share).
DEFAULT_ARTICLE_COLOR = '#999999'

g = net.Network(height='600px', width='75%', heading='')
g.barnes_hut()
g.repulsion(node_distance=100, spring_length=200)

## One color per related type, looked up by name instead of by position
color_by_type = dict(zip(type_groups, node_colors))

## First the unique articles - source nodes, by articleID.
## Titles repeat on every row of an article, so the first one is used.
article_color = color_by_type.get('SE article', DEFAULT_ARTICLE_COLOR)
article_titles = df3.groupby('articleID')['articleTitle'].first()
g.add_nodes(
    article_titles.index.tolist(),
    label=article_titles.tolist(),
    color=[article_color] * len(article_titles),
)

## Then the related nodes, by type
for related_type, group in df3.groupby('relatedType'):
    g.add_nodes(
        group['relatedID'].tolist(),
        label=group['relatedTitle'].tolist(),
        color=[color_by_type[related_type]] * len(group),
    )

## The edges, colored and labelled with the type of the related resource
for row in df3.itertuples(index=False):
    g.add_edge(
        int(row.articleID),
        int(row.relatedID),
        color=color_by_type[row.relatedType],
        label=row.relatedType,
    )

g.show('example.html')
## Load the saved page explicitly by file name
display(HTML(filename='example.html'))
