# Population of the Knowledge DB - V2 #

### Load libraries and connect to Virtuoso

In [1]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

In [2]:
def connect_db(DSN, DBA, UID, PWD):
    """Open an ODBC connection and hand it back together with a cursor.

    Args:
        DSN: value placed in the ``DSN=`` field of the connection string.
        DBA: value placed in the ``DBA=`` field.
        UID: value placed in the ``UID=`` field (user name).
        PWD: value placed in the ``PWD=`` field (password).

    Returns:
        A ``(connection, cursor)`` tuple: the object returned by
        ``pyodbc.connect`` and the result of calling ``cursor()`` on it.
    """
    template = 'DSN={};DBA={};UID={};PWD={}'
    db_connection = pyodbc.connect(template.format(DSN, DBA, UID, PWD))

    return db_connection, db_connection.cursor()


def connect_virtuoso(DSN, UID, PWD):
    """Build a SPARQLWrapper client for the endpoint ``DSN``.

    The client is configured with HTTP Digest authentication using
    ``UID`` / ``PWD`` and with GET as its request method.

    Returns:
        The configured ``SPARQLWrapper`` instance.
    """
    client = SPARQLWrapper(DSN)

    # Authentication scheme first, then the credentials it will use.
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)

    # GET is only the initial method; callers may overwrite it on the client.
    client.setMethod(GET)

    return client


Replace the values below with your own credentials:

In [3]:
# Credentials come from the environment so that no secret is stored in the notebook.
# Set KDB_PASSWORD (and optionally KDB_USER, which defaults to "dba") before
# starting the kernel.
# NOTE(review): a real password was previously committed in this cell; it should
# be treated as compromised and rotated.
user = os.environ.get("KDB_USER", "dba")
login = os.environ.get("KDB_PASSWORD", "")
if not login:
    logging.warning("KDB_PASSWORD is not set; connecting with an empty password.")

In [4]:
# CDB (relational side): ODBC data source 'Virtuoso All', database 'ESTAT'.
connection, cursor = connect_db('Virtuoso All', 'ESTAT', user, login)

# KDB (knowledge side): SPARQL endpoint used by every insert/select cell below.
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(endpoint, user, login)


### Define content selection functions

In [5]:
def select_query(columns, table, conditions=None): 
    """Build a ``SELECT`` statement as a plain string.

    Args:
        columns: text placed after ``SELECT`` (e.g. ``'*'``).
        table: text placed after ``FROM``.
        conditions: text placed after ``WHERE``. Any falsy value
            (``None``, ``''``) produces a query without a ``WHERE`` clause.

    Returns:
        The query text. The arguments are inserted verbatim with
        ``str.format``; nothing is quoted or escaped.
    """
    # Templates are spelled with explicit escapes so their whitespace is unambiguous.
    unfiltered = "\n\n        SELECT {}\n        FROM {}\n        "
    filtered = (
        "\n            SELECT {}\n            FROM {}\n            WHERE {}\n\n        "
    )

    if not conditions:
        return unfiltered.format(columns, table)

    return filtered.format(columns, table, conditions)

#### Get all table names from the CDB

In [6]:
# List every table registered in the ESTAT catalogue; no WHERE clause is needed.
tables_sql = select_query('*', 'ESTAT.information_schema.tables')
ESTAT_V1_tables_names = pd.read_sql(tables_sql, connection)

ESTAT_V1_tables_names['TABLE_NAME']

0                  dat_article
1        dat_article_paragraph
2      dat_article_shared_link
3             dat_further_info
4                 dat_glossary
5                dat_link_info
6                  dat_link_tm
7         dat_paragraph_figure
8             dat_redirections
9         dat_related_concepts
10                 dat_sources
11        dat_statistical_data
12              dat_tm_results
13        dat_tm_results_words
14        mod_article_division
15    mod_resource_information
16           mod_resource_type
17            mod_topic_models
18                    tm_terms
19                   tm_topics
Name: TABLE_NAME, dtype: object

##### Load the Link info table 

In [7]:
# Link table joined with its two lookup tables. Both lookups expose a column
# named `uri`, so the columns are consumed by position further down
# (id, title, url, resource-information uri, resource-type uri).
link_info_sql = (
    'SELECT ESTAT.V1.dat_link_info.id, title, url , '
    'ESTAT.V1.mod_resource_information.uri, ESTAT.V1.mod_resource_type.uri '
    'FROM ESTAT.V1.dat_link_info '
    'INNER JOIN ESTAT.V1.mod_resource_information '
    'ON ESTAT.V1.dat_link_info.resource_information_id = ESTAT.V1.mod_resource_information.id '
    'INNER JOIN ESTAT.V1.mod_resource_type '
    'ON ESTAT.V1.dat_link_info.resource_type_id = ESTAT.V1.mod_resource_type.id'
)
ESTAT_V1_dat_link_info = pd.read_sql(link_info_sql, connection)

print(ESTAT_V1_dat_link_info.shape)
ESTAT_V1_dat_link_info.head()

(10553, 5)


Unnamed: 0,id,title,url,uri,uri__1
0,43,African Union,http://www.au.int/,,Other
1,44,African Union in a nutshell,http://www.au.int/en/about/nutshell,,Other
2,53,ACP Secretariat website,http://www.acp.int/content/secretariat-acp,,Other
3,89,Agri benchmark Cash Crop Report 2015,http://www.agribenchmark.org/fileadmin/Dateiab...,,Other
4,90,agri benchmark,http://www.agribenchmark.org,,Other


##### Load Glossary related tables

In [8]:
def _read_full_table(table_name):
    """Load every row and column of ``table_name`` from the CDB connection."""
    return pd.read_sql(select_query('*', table_name, ''), connection)


# Glossary table and the tables attached to it, each loaded in full.
ESTAT_V1_dat_glossary = _read_full_table('ESTAT.V1.dat_glossary')
ESTAT_V1_dat_further_info = _read_full_table("ESTAT.V1.dat_further_info")
ESTAT_V1_dat_related_concepts = _read_full_table("ESTAT.V1.dat_related_concepts")
ESTAT_V1_dat_sources = _read_full_table("ESTAT.V1.dat_sources")
ESTAT_V1_dat_statistical_data = _read_full_table("ESTAT.V1.dat_statistical_data")

# Redirections are joined with the link table so each row carries the link title.
redirections_sql = (
    'SELECT link_id, glossary_id, title '
    'FROM ESTAT.V1.dat_redirections '
    'INNER JOIN ESTAT.V1.dat_link_info ON ESTAT.V1.dat_link_info.id = link_id'
)
ESTAT_V1_dat_redirections = pd.read_sql(redirections_sql, connection)



In [9]:
# Preview the first rows of the glossary table to check its column layout.
ESTAT_V1_dat_glossary.head()

Unnamed: 0,id,definition,redirection,original_title,homepage,last_update
0,1,An accident at work in the framework ...,0,,0,2021-03-10 11:13:00
1,5,A fatal accident at work refers to an...,0,,0,2019-03-06 12:08:00
2,6,A non-fatal accident at work is...,0,,0,2018-09-12 15:42:00
3,8,Aggregate demand is the total amount of ...,0,,0,2017-07-28 11:15:00
4,9,The goods and services account shows ...,0,,0,2017-08-03 16:48:00


##### Load Statistical Articles related tables

In [10]:
# Statistical articles proper: rows of dat_article with background_article = 0.
article_sql = select_query('*', 'ESTAT.V1.dat_article', 'background_article = 0')
ESTAT_V1_dat_article = pd.read_sql(article_sql, connection)

# The three shared-link queries differ only in their resource_type_id filter,
# so the common part is written once and the filter is appended.
# NOTE(review): the legal / editorial / statistic-data meaning of the id lists
# is taken from the variable names — confirm against mod_resource_type.
article_links_sql = (
    'SELECT ESTAT.V1.dat_article_shared_link.* '
    'FROM ESTAT.V1.dat_article_shared_link '
    'INNER JOIN ESTAT.V1.dat_link_info ON ESTAT.V1.dat_link_info.id = link_id '
    'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
    'WHERE background_article=0 AND resource_type_id '
)
ESTAT_V1_dat_article_shared_link_legal = pd.read_sql(
    article_links_sql + '= 2', connection)
ESTAT_V1_dat_article_shared_link_editorial = pd.read_sql(
    article_links_sql + 'IN (1, 6, 7, 8, 9, 10)', connection)
ESTAT_V1_dat_article_shared_link_statistic_data = pd.read_sql(
    article_links_sql + 'IN (11, 12, 13, 14)', connection)

# Core data: shared links with article_division_id = 1, together with the link
# url and the article's data_sources text.
article_core_data_sql = (
    'SELECT article_id, link_id, url, data_sources '
    'FROM ESTAT.V1.dat_article_shared_link '
    'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
    'INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id '
    'WHERE background_article = 0 AND article_division_id=1'
)
ESTAT_V1_dat_article_core_data = pd.read_sql(article_core_data_sql, connection)

In [11]:
# Preview the core-data rows (article id, link id, url, data_sources text).
ESTAT_V1_dat_article_core_data.head()

Unnamed: 0,article_id,link_id,url,data_sources
0,2905,2907,https://ec.europa.eu/eurostat/statistics-expla...,All figures in this article are based on seas...
1,2914,2917,https://ec.europa.eu/eurostat/statistics-expla...,An in-patient is a patient who is formally adm...
2,7,2943,https://ec.europa.eu/eurostat/statistics-expla...,"In December 2008, the European Parliament an..."
3,2946,2953,https://ec.europa.eu/eurostat/statistics-expla...,"In December 2008, the European Parliament an..."
4,2959,2961,https://ec.europa.eu/eurostat/statistics-expla...,EU data is taken from Eurostat's COMEXT da...


##### Load Statistical Background Articles related tables

In [12]:
# Background articles: rows of dat_article with background_article = 1.
background_article_sql = select_query('*', 'ESTAT.V1.dat_article', 'background_article = 1')
ESTAT_V1_dat_background_article = pd.read_sql(background_article_sql, connection)

# The three shared-link queries differ only in their resource_type_id filter,
# so the common part is written once and the filter is appended.
# NOTE(review): the legal / editorial / statistic-data meaning of the id lists
# is taken from the variable names — confirm against mod_resource_type.
background_links_sql = (
    'SELECT ESTAT.V1.dat_article_shared_link.* '
    'FROM ESTAT.V1.dat_article_shared_link '
    'INNER JOIN ESTAT.V1.dat_link_info ON ESTAT.V1.dat_link_info.id = link_id '
    'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
    'WHERE background_article=1 AND resource_type_id '
)
ESTAT_V1_dat_background_article_shared_link_legal = pd.read_sql(
    background_links_sql + '= 2', connection)
ESTAT_V1_dat_background_article_shared_link_editorial = pd.read_sql(
    background_links_sql + 'IN (1, 6, 7, 8, 9, 10)', connection)
ESTAT_V1_dat_background_article_shared_link_statistic_data = pd.read_sql(
    background_links_sql + 'IN (11, 12, 13, 14)', connection)

# Core data: shared links with article_division_id = 1, together with the link
# url and the article's data_sources text.
background_core_data_sql = (
    'SELECT article_id, link_id, url, data_sources '
    'FROM ESTAT.V1.dat_article_shared_link '
    'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
    'INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id '
    'WHERE background_article = 1 AND article_division_id=1'
)
ESTAT_V1_dat_background_article_core_data = pd.read_sql(background_core_data_sql, connection)

In [13]:
# Preview the background-article rows to check the column layout.
ESTAT_V1_dat_background_article.head()

Unnamed: 0,id,context,data_sources,last_update,background_article,homepage
0,25,,,2020-11-18 18:09:00,1,0
1,67,Eurostat publishes data on education and trai...,,2021-03-24 19:19:00,1,0
2,102,,,2020-10-01 10:42:00,1,0
3,127,Most changes to landscapes are not visible o...,,2019-03-06 11:22:00,1,0
4,131,The SAPM was developed to provide data: Policy...,Survey organisation The legal basis for the ...,2020-04-03 11:35:00,1,0


### Populate the KDB

##### 1 - Insertion of URLs into the KDB

Add links and the basic metadata

In [14]:
# TODO: check that everything is added with the correct relation
def insert_link_info(DB, method):  # works for eurostat links, foreign links
    """Insert the basic metadata of every link in ``DB`` into the KDB graph.

    Columns are read by position: 0 = id, 1 = title, 2 = url,
    3 = resource information, 4 = resource type. For each row, five triples
    with subject ``estat:<id>`` are inserted (``rdf:about``, ``dct:source``,
    ``dct:title``, ``estat:resourceInformation``, ``dct:type``). One request
    per row is sent through the module-level ``sparql`` client.

    Every value is written as an escaped SPARQL string literal, so apostrophes
    and backslashes are preserved instead of being removed and cannot break
    the statement.

    Args:
        DB: DataFrame with at least the five columns described above.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """

    def _sparql_literal(value):
        # Single-quoted SPARQL literal: backslash, quote, LF and CR must be
        # escaped. Backslash goes first so later escapes are not doubled.
        text = str(value)
        for raw, escaped in (("\\", "\\\\"), ("'", "\\'"), ("\n", "\\n"), ("\r", "\\r")):
            text = text.replace(raw, escaped)
        return "'" + text + "'"

    # Plain tuples give true positional access, which also copes with the two
    # identically named `uri` columns of the link-info query.
    for row in DB.itertuples(index=False, name=None):
        id_ = row[0]
        title_ = row[1].strip()
        url_ = row[2]
        resource_info_ = row[3]
        resource_type_ = row[4]

        subject = "estat:" + str(id_)

        query_insert_P = """

        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
        PREFIX dct: <http://purl.org/dc/terms/>

        INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

           """ + subject + """ rdf:about """ + _sparql_literal(id_) + """.
           """ + subject + """ dct:source """ + _sparql_literal(url_) + """. 
           """ + subject + """ dct:title """ + _sparql_literal(title_) + """. 
           """ + subject + """ estat:resourceInformation """ + _sparql_literal(resource_info_) + """. 
           """ + subject + """ dct:type """ + _sparql_literal(resource_type_) + """. 

        } }
        ;
        """

        sparql.setQuery(query_insert_P)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        # The response body is read but its content is not used.
        sparql.query().response.read()


In [15]:
# Send the metadata of every loaded link to the KDB, using POST for the update requests.
insert_link_info(ESTAT_V1_dat_link_info, "POST")

###  2 - Insertion of Glossary elements into the KDB. 

For these elements, specific relations based on their metadata are added first; then relations to the other links contained in the tables associated with the concept elements are added. Some information has already been added through the link info table, so it should not be added again.

In [16]:
def insert_glossary_elements(DB, classe, link_info_table, method): 
    """Insert glossary concepts, or relations attached to them, into the KDB graph.

    The behaviour depends on the number of columns of ``DB``:

    * more than 3 columns — ``DB`` is the concept table. Columns are read by
      position (0 = id, 1 = definition, 5 = last update) and the title is
      looked up in ``link_info_table`` (columns ``id`` and ``title``). Four
      triples are inserted per row: ``skos:Concept``, ``skos:definition``,
      ``dct:modified`` and ``skos:prefLabel``.
    * exactly 3 columns — ``DB`` is a relation table. Column 1 is the concept
      id and column 2 the related value; one triple
      ``estat:<concept id> <classe> '<value>'`` is inserted per row.
    * fewer than 3 columns — nothing is inserted.

    Values are written as escaped SPARQL string literals, so apostrophes,
    backslashes and line breaks are preserved and cannot break the statement.
    One request per row is sent through the module-level ``sparql`` client.

    Args:
        DB: DataFrame to insert (see above for the expected layout).
        classe: predicate used in the 3-column case (e.g. ``'skos:related'``);
            unused for the concept table.
        link_info_table: DataFrame with ``id`` and ``title`` columns; only
            used for the concept table.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).

    Raises:
        ValueError: from ``Series.item()`` when a concept id does not match
            exactly one row of ``link_info_table``.
    """

    def _sparql_literal(value):
        # Single-quoted SPARQL literal: backslash, quote, LF and CR must be
        # escaped. Backslash goes first so later escapes are not doubled.
        text = str(value)
        for raw, escaped in (("\\", "\\\\"), ("'", "\\'"), ("\n", "\\n"), ("\r", "\\r")):
            text = text.replace(raw, escaped)
        return "'" + text + "'"

    def _send(query):
        # One request per row; the response body is read but not used.
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        sparql.query().response.read()

    threshold = len(DB.columns)
    if threshold > 3:  # concept table

        # Plain tuples give true positional access and keep each column's own type.
        for row in DB.itertuples(index=False, name=None):

            id_ = row[0]
            def_ = row[1].strip()
            #redirection_ = row[2] #add a condition on altlabel on that ?
            #original_title_ = row[3]
            last_update_ = row[5]

            # The concept title is stored in the link-info table under the same id.
            matching_links = link_info_table[link_info_table['id'] == id_]
            title_ = matching_links.title.item().strip()

            subject = "estat:" + str(id_)
            _send("""

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                PREFIX dct: <http://purl.org/dc/terms/>
                
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   """ + subject + """ skos:Concept """ + _sparql_literal(id_) + """.
                   """ + subject + """ skos:definition """ + _sparql_literal(def_) + """. 
                   """ + subject + """ dct:modified """ + _sparql_literal(last_update_) + """. 
                   """ + subject + """ skos:prefLabel """ + _sparql_literal(title_) + """.                    
                   
        } }
        ;
                """)

    elif threshold == 3:  # links, related concept, statistical data,source and altLabel

        for row in DB.itertuples(index=False, name=None):

            concept_id_ = row[1]
            # An id for most relation tables, but free text (the link title)
            # for the redirection table, hence the escaping.
            link_id_ = row[2]

            _send("""

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   estat:""" + str(concept_id_) + ' ' + classe + ' ' + _sparql_literal(link_id_) + """. 

                } }
                ;
            """)
    


In [17]:
# Concept table: no predicate is needed (second argument), but the link-info
# table is passed so that each concept's title can be looked up.
insert_glossary_elements(ESTAT_V1_dat_glossary, '',ESTAT_V1_dat_link_info, "POST")

Add relations from the tables associated with the concept elements:

In [18]:
# Populate
insert_glossary_elements(ESTAT_V1_dat_related_concepts, 'skos:related', '', 'POST')
insert_glossary_elements(ESTAT_V1_dat_statistical_data, 'estat:statisticalInformation', '', 'POST')
insert_glossary_elements(ESTAT_V1_dat_sources, 'estat:sourceInformation','', 'POST')
insert_glossary_elements(ESTAT_V1_dat_redirections, 'skos:altLabel', '','POST')

For the ESTAT_V1_dat_further_info table, the relation depends on the content type, which must be retrieved from the ESTAT_V1_dat_link_info table.

In [None]:
#
# TODO: define a function based on insert_glossary_elements(ESTAT_V1_dat_further_info,
#                                                         'estat:furtherInformation', "POST")

If the Glossary homepage is contained in the same table, it should be added separately as it has different relations. 

###  3 - Insertion of Statistics Explained elements into the KDB. 

An article contains two objects: the article itself and a Core Data object (an Excel file with the data used and their metadata).

Example: https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Exchange_rates_and_interest_rates
The Core Data object is the Excel file plus everything that comes before the Context section.

In [19]:
def insert_dat_article(DB, classe, method): 
    """Insert one ``estat:StatisticalArticle`` entry per row of ``DB`` into the KDB graph.

    For each row two triples with subject ``estat:<id>`` are inserted:
    ``estat:StatisticalArticle '<id>'`` and ``dct:modified '<last_update>'``.
    One request per row is sent through the module-level ``sparql`` client;
    an empty ``DB`` sends nothing.

    Args:
        DB: DataFrame with ``id`` and ``last_update`` columns.
        classe: unused; kept so existing calls keep working.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """
    # Columns are read by name: in dat_article the fifth column (index 4) is
    # `background_article`, not the modification date.
    for id_, last_update_ in zip(DB['id'], DB['last_update']):

        subject = "estat:" + str(id_)

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX dct: <http://purl.org/dc/terms/>

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               """ + subject + """ estat:StatisticalArticle '""" + str(id_) + """'.
               """ + subject + """ dct:modified '""" + str(last_update_) + """'. 

    } }
    ;
            """

        # The request is sent inside the loop so that every row is inserted.
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        sparql.query().response.read()

In [20]:
# Insert the statistical articles (dat_article rows with background_article = 0).
insert_dat_article(ESTAT_V1_dat_article, '', "POST")

In [21]:
def insert_dat_article_shared_link(DB, classe, link_info_table, method): 
    """Insert one ``estat:<subject> <classe> '<value>'`` triple per row of ``DB``.

    Columns are read by position: column 1 supplies the subject id and
    column 2 the literal value.
    NOTE(review): for ``dat_article_shared_link.*`` these are presumably
    ``article_id`` and ``link_id`` — confirm against the table schema.
    One request per row is sent through the module-level ``sparql`` client.

    Args:
        DB: DataFrame with at least three columns.
        classe: predicate to use (e.g. ``'estat:relatedLegalInformation'``).
        link_info_table: unused; kept so existing calls keep working.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """
    # Plain tuples give true positional access and keep each column's own
    # type, so integer ids are not turned into floats when another column
    # holds missing values.
    for row in DB.itertuples(index=False, name=None):

        id_ = row[1]
        link_id_ = row[2]

        # Construct the query 
        query = """

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   estat:""" + str(id_) + ' ' + classe + """ '""" + str(link_id_) + """'. 

                } }
                ;
            """
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        # The response body is read but its content is not used.
        sparql.query().response.read()


In [22]:
# Each shared-link subset of the statistical articles gets its own predicate.
article_link_relations = (
    (ESTAT_V1_dat_article_shared_link_editorial, 'estat:relatedEditorialContent'),
    (ESTAT_V1_dat_article_shared_link_legal, 'estat:relatedLegalInformation'),
    (ESTAT_V1_dat_article_shared_link_statistic_data, 'estat:relatedStatisticData'),
)
for shared_links, predicate in article_link_relations:
    insert_dat_article_shared_link(shared_links, predicate, '', 'POST')

In [23]:
def insert_dat_article_core_data(DB, classe, link_info_table, method): 
    """Insert the core-data object of each article into the KDB graph.

    Columns are read by position: 0 = article id, 1 = link id, 2 = url,
    3 = description text. For each row the following triples are inserted:

    * ``estat:<article id> estat:StatisticsExplainedData '<link id>'``
    * ``estat:<link id> dct:source '<url>'``
    * ``estat:<link id> dct:description '<description>'`` — only when the
      description is not missing (``None`` / NaN).

    ``dct:`` is bound to the Dublin Core terms namespace. Text values are
    written as escaped SPARQL string literals, so apostrophes and backslashes
    are preserved. One request per row is sent through the module-level
    ``sparql`` client.

    Args:
        DB: DataFrame with at least the four columns described above.
        classe: unused; kept so existing calls keep working.
        link_info_table: unused; kept so existing calls keep working.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """

    def _sparql_literal(value):
        # Single-quoted SPARQL literal: backslash, quote, LF and CR must be
        # escaped. Backslash goes first so later escapes are not doubled.
        text = str(value)
        for raw, escaped in (("\\", "\\\\"), ("'", "\\'"), ("\n", "\\n"), ("\r", "\\r")):
            text = text.replace(raw, escaped)
        return "'" + text + "'"

    for row in DB.itertuples(index=False, name=None):

        id_ = row[0]
        link_id_ = row[1]
        url_ = row[2]
        description_ = row[3]

        data_subject = "estat:" + str(link_id_)
        statements = [
            "estat:" + str(id_) + " estat:StatisticsExplainedData '" + str(link_id_) + "'.",
            data_subject + " dct:source " + _sparql_literal(url_) + ".",
        ]
        # data_sources can be NULL in the CDB; skip the description triple
        # instead of failing on a missing value.
        if not pd.isna(description_):
            statements.append(
                data_subject + " dct:description "
                + _sparql_literal(description_.strip()) + "."
            )

        # Construct the query 
        query = """

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX dct: <http://purl.org/dc/terms/>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   """ + "\n                   ".join(statements) + """

                } }
                ;
            """
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        # The response body is read but its content is not used.
        sparql.query().response.read()

In [24]:
# Insert the core-data objects of the statistical articles; the two empty
# arguments are placeholders the function does not use.
insert_dat_article_core_data(ESTAT_V1_dat_article_core_data, '','', 'POST')

###  4 - Insertion of Background Articles elements into the KDB. 

In [25]:
def insert_dat_background_article(DB, classe, method): 
    """Insert one ``estat:BackgroundArticle`` entry per row of ``DB`` into the KDB graph.

    For each row two triples with subject ``estat:<id>`` are inserted:
    ``estat:BackgroundArticle '<id>'`` and ``dct:modified '<last_update>'``.
    One request per row is sent through the module-level ``sparql`` client;
    an empty ``DB`` sends nothing.

    Args:
        DB: DataFrame with ``id`` and ``last_update`` columns.
        classe: unused; kept so existing calls keep working.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """
    # Columns are read by name: in dat_article the fifth column (index 4) is
    # `background_article`, not the modification date.
    for id_, last_update_ in zip(DB['id'], DB['last_update']):

        subject = "estat:" + str(id_)

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX dct: <http://purl.org/dc/terms/>

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               """ + subject + """ estat:BackgroundArticle '""" + str(id_) + """'.
               """ + subject + """ dct:modified '""" + str(last_update_) + """'. 

    } }
    ;
            """

        # The request is sent inside the loop so that every row is inserted.
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        sparql.query().response.read()

In [26]:
# Insert the background articles (dat_article rows with background_article = 1).
insert_dat_background_article(ESTAT_V1_dat_background_article, '', "POST")

In [27]:
# Background articles reuse the shared-link inserter, one predicate per subset.
background_link_relations = (
    (ESTAT_V1_dat_background_article_shared_link_editorial, 'estat:relatedEditorialContent'),
    (ESTAT_V1_dat_background_article_shared_link_legal, 'estat:relatedLegalInformation'),
    (ESTAT_V1_dat_background_article_shared_link_statistic_data, 'estat:relatedStatisticData'),
)
for shared_links, predicate in background_link_relations:
    insert_dat_article_shared_link(shared_links, predicate, '', 'POST')

In [28]:
# Insert the core-data objects of the background articles; the two empty
# arguments are placeholders the function does not use.
insert_dat_article_core_data(ESTAT_V1_dat_background_article_core_data, '','', 'POST')

### See added statements 

In [31]:

# Spot check: list everything stored for one subject (estat:1487).
# In this query ?s binds the predicate and ?o the object of each triple.
SelectStatements = """
PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
PREFIX dct: <http://purl.org/dc/terms/>
SELECT * FROM <https://nlp4statref/knowledge/ontology/>
WHERE { estat:1487 ?s ?o

}
"""

# Alternative through sparql_dataframe, kept for reference:
#statements_df = sparql_dataframe.get(endpoint, SelectStatements)
#print(statements_df.shape)
#print(statements_df.tail())

sparql.setQuery(SelectStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)

# Flatten the JSON bindings into one column per variable/field (e.g. 'o.value').
bindings = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(bindings)
print(results['o.value'].values[0])

1487


In [32]:
# Re-run the query configured in the previous cell and display all of its rows.
results = pd.json_normalize(sparql.query().convert()['results']['bindings'])
results

Unnamed: 0,s.type,s.value,o.type,o.value
0,uri,http://www.w3.org/1999/02/22-rdf-syntax-ns#about,literal,1487
1,uri,http://purl.org/dc/terms/source,literal,https://ec.europa.eu/eurostat/statistics-expla...
2,uri,http://purl.org/dc/terms/title,literal,Acquisition of citizenship statistics
3,uri,http://purl.org/dc/terms/type,literal,Other
4,uri,https://nlp4statref/knowledge/ontology/resourc...,literal,https://nlp4statref/knowledge/resource/authori...
