# Population of the Knowledge DB - V2 #

### Load libraries and connect to Virtuoso

In [1]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

In [2]:
def connect_db(DSN, DBA, UID, PWD):

    connection = pyodbc.connect('DSN={};DBA={};UID={};PWD={}'.format(DSN, 
                                                                     DBA,
                                                                     UID,
                                                                     PWD))
    cursor = connection.cursor()

    return connection, cursor


def connect_virtuoso(DSN, UID, PWD):

    sparql = SPARQLWrapper(DSN)
    sparql.setHTTPAuth(DIGEST)
    sparql.setCredentials(UID, PWD)
    sparql.setMethod(GET)

    return sparql


Replace the values with your own logins :

In [None]:
user = "XXX"
login = "XXXX"

In [3]:
# Connection to CDB 
connection, cursor = connect_db('Virtuoso All', 
                                'ESTAT', 
                                user, 
                                login)


# Connection to the KDB 
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(endpoint, 
                          user, 
                          login)


### Define content selection functions

In [4]:
def select_query(columns, table, conditions=None): 

    if conditions:

        query = """
            SELECT {}
            FROM {}
            WHERE {}

        """.format(columns, table, conditions)

    else:

        query = """

        SELECT {}
        FROM {}
        """.format(columns, table)

    return query

#### Get all tables name from the CDB 

In [5]:
ESTAT_V1_tables_names = pd.read_sql(select_query('*',
                                          'ESTAT.information_schema.tables', 
                                          ''), 
                             connection)
ESTAT_V1_tables_names['TABLE_NAME']

0                  dat_article
1        dat_article_paragraph
2      dat_article_shared_link
3             dat_further_info
4                 dat_glossary
5                dat_link_info
6                  dat_link_tm
7         dat_paragraph_figure
8             dat_redirections
9         dat_related_concepts
10                 dat_sources
11        dat_statistical_data
12              dat_tm_results
13        dat_tm_results_words
14        mod_article_division
15    mod_resource_information
16           mod_resource_type
17            mod_topic_models
18                 tm_articles
19                    tm_terms
20                   tm_topics
21                   tm_values
Name: TABLE_NAME, dtype: object

##### Load the Link info table 

In [6]:
ESTAT_V1_dat_link_info = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_link_info', 
                                          ''), 
                             connection)

print(ESTAT_V1_dat_link_info.shape)
ESTAT_V1_dat_link_info.head()

(7027, 5)


Unnamed: 0,id,title,url,resource_information_id,resource_type_id
0,1,Absences from work - quarterly statist...,https://ec.europa.eu/eurostat/statistics-expla...,1,
1,2,"Figure 1: Absences from work in the EU, Q1 20...",https://ec.europa.eu/eurostat/databrowser/view...,1,
2,3,"Figure 4: Absences from work, Q4 2019 - Q4 20...",https://ec.europa.eu/eurostat/databrowser/view...,1,
3,4,All articles on the labour market,/eurostat/statistics-explained/index.php?title...,1,
4,5,EU labour force survey,/eurostat/statistics-explained/index.php?title...,1,


##### Load Glossary related tables

In [7]:
ESTAT_V1_dat_glossary = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_glossary', 
                                          ''), 
                             connection)
ESTAT_V1_dat_further_info = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_further_info", 
                                          ''), 
                             connection)
ESTAT_V1_dat_related_concepts = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_related_concepts", 
                                          ''), 
                             connection)
ESTAT_V1_dat_sources = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_sources", 
                                          ''), 
                             connection)
ESTAT_V1_dat_statistical_data = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_statistical_data", 
                                          ''), 
                             connection)


In [8]:
ESTAT_V1_dat_glossary.head()

Unnamed: 0,id,definition,redirection,original_title,homepage,last_update
0,5006,"An access point , in postal statistic...",0,,0,2018-05-17 13:17:00
1,5007,A letter box is a facility provided f...,0,,0,2018-05-17 13:39:00
2,5008,A place at which only stamps can be boug...,0,,0,2018-05-17 13:49:00
3,5009,"A Post office , Post agency or Po...",0,,0,2018-05-17 13:50:00
4,5010,A post office box (P.O. box) is a fac...,0,,0,2018-05-17 13:52:00


##### Load Statistical Articles related tables

In [10]:
ESTAT_V1_dat_article = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article', 
                                          ''), 
                             connection)
ESTAT_V1_dat_article_paragraph = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article_paragraph', 
                                          ''), 
                             connection)

ESTAT_V1_dat_paragraph_figure = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_paragraph_figure', 
                                          ''), 
                             connection)
ESTAT_V1_dat_article_shared_link = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article_shared_link', 
                                          ''), 
                             connection)
ESTAT_V1_dat_article_paragraph = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article_paragraph', 
                                          ''), 
                             connection)
ESTAT_V1_mod_article_division = pd.read_sql(select_query('*',
                                          'ESTAT.V1.mod_article_division', 
                                          ''), 
                             connection)

In [11]:
ESTAT_V1_dat_article.head()

Unnamed: 0,id,context,data_sources,last_update,background_article,homepage
0,1,The COVID-19 pandemic hit Europe in January an...,All figures in this article are based on seas...,2021-04-14 16:09:00,0,0
1,10,The importance of action to prevent accidents ...,An in-patient is a patient who is formally adm...,2020-09-16 14:36:00,0,0
2,39,"A safe, healthy working environment is a cruci...","In December 2008, the European Parliament an...",2020-11-26 16:06:00,0,0
3,50,"A safe, healthy working environment is a cruci...","In December 2008, the European Parliament an...",2020-12-07 17:31:00,0,0
4,56,Trade is an important indicator of Europeâs ...,EU data is taken from Eurostat's COMEXT da...,2021-04-29 10:45:00,0,0


### Populate the KDB

##### 1 - Insertion of URLs into the KDB

Add links and the basic metadata

In [11]:
#a checker pour voir si tout est bien ajouté avec la bonne relation 
def insert_link_info(DB, method):  # works for eurostat links, foreign links

    for i, row in DB.iterrows():
        id_ = row[0]
        title_ = row[1]
        title_ = title_.replace("'","").strip()
        url_ = row[2]
        resource_info_id_ = row[3]
        resource_type_id_ = row[4]
        
        query_insert_P = """

        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
        PREFIX dct: <http://purl.org/dc/terms/>

        INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

           estat:""" + str(id_) + """ rdf:about '""" + str(id_) + """'.
           estat:""" + str(id_) + """ dct:source '""" + str(url_) + """'. 
           estat:""" + str(id_) + """ dct:title '""" + str(title_) + """'. 
           estat:""" + str(id_) + """ estat:resourceInformation '""" + str(resource_info_id_) + """'. 
           estat:""" + str(id_) + """ dct:type '""" + str(resource_type_id_) + """'. 

        } }
        ;
        """

        sparql.setQuery(query_insert_P)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()


In [15]:
insert_link_info(ESTAT_V1_dat_link_info, "POST")

###  2 - Insertion of Glossary elements into the KDB. 

For those elements, specific relations based on their metadata are added, then we add relations with others links contained in the tables associated to the concepts elements. Some info are already added through the link info table, so they should not be added anew.

In [14]:
def insert_glossary_elements(DB, classe, link_info_table, method): 

    threshold = len(DB.columns)
    if threshold > 3 :  # concept table

        for i, row in DB.iterrows():

            id_ = row[0]
            def_ = row[1]
            def_ = def_.replace("'","").strip()
            redirection_ = row[2] #add a condition on altlabel on that ?
            original_title_ = row[3]
            last_update_ = row[5]
            
            # Get info from the eurostat_links
            title_ = link_info_table[link_info_table['id'] == id_].title.item()
            title_ = title_.replace("'","").strip()
            # Construct the query 
            query = """

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                PREFIX dct: <http://purl.org/dc/terms/>
                
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   estat:""" + str(id_) + """ skos:Concept '""" + str(id_) + """'.
                   estat:""" + str(id_) + """ skos:definition '""" + str(def_) + """'. 
                   estat:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'. 
                   estat:""" + str(id_) + """ skos:altLabel '""" + str(original_title_) + """'. 
                   estat:""" + str(id_) + """ skos:prefLabel '""" + str(title_) + """'.                    
                   
        } }
        ;
                """

            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()
            

    elif threshold == 3:  # links, related concept, statistical data and source

        for i, row in DB.iterrows():

            id_ = row[0]
            concept_id_ = row[1]
            link_id_ = row[2]

            # Construct the query 
            query = """

                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                   estat:""" + str(concept_id_) + ' ' + classe + """ '""" + str(link_id_) + """'. 

                } }
                ;
            """
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()
    


In [15]:
insert_glossary_elements(ESTAT_V1_dat_glossary, '',ESTAT_V1_dat_link_info, "POST")

QueryBadFormed: QueryBadFormed: a bad request has been sent to the endpoint, probably the sparql query is bad formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 0: Bad escape sequence in a short single-quoted string at ''An activity is classified as knowledge intensive if employed tertiary educated persons (according to     ISCED    97 levels 5+6, according to ISCED 2011 levels 5 to 8) represent more than 33 % of the total employment in that activity. The definition is built based on the average number of employed persons aged 15-64 at aggregated EU-27 level according to     NACE    at 2-digit, using EU     Labour Force Survey    data     [1]     The KIA employment indicator was developed to offer an average, harmonised across all the sectors, to compare economies in regard to their knowledge intensity.    The total employment ratio is calculated on the population aged 15-64 according to the following formula:             [math]\\frac{Number\\'\n\nSPARQL query:\ndefine sql:big-data-const 0 \n\n                PREFIX estat: <https://nlp4statref/knowledge/ontology/> \n                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n                PREFIX dct: <http://purl.org/dc/terms/>\n                \n                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { \n\n                   estat:6453 skos:Concept '6453'.\n                   estat:6453 skos:definition 'An activity is classified as knowledge intensive if employed tertiary educated persons (according to     ISCED    97 levels 5+6, according to ISCED 2011 levels 5 to 8) represent more than 33 % of the total employment in that activity. The definition is built based on the average number of employed persons aged 15-64 at aggregated EU-27 level according to     NACE    at 2-digit, using EU     Labour Force Survey    data     [1]     The KIA employment indicator was developed to offer an average, harmonised across all the sectors, to compare economies in regard to their knowledge intensity.    The total employment ratio is calculated on the population aged 15-64 according to the following formula:             [math]\\frac{Number\\ of\\ persons\\ employed\\ in\\ sectors\\ identified\\ as\\ knowledge\\ intensive}{Total\\ number\\ of\\ persons\\ employed}[/math]             There are two aggregates in use based on this classification: total Knowledge Intensive Activities (KIA) and Knowledge Intensive Activities \xc3\xa2\xc2\x80\xc2\x93 Business Industries (KIABI).'. \n                   estat:6453 dct:modified '2020-02-03 17:07:00'. \n                   estat:6453 skos:altLabel 'None'. \n                   estat:6453 skos:prefLabel 'Knowledge Intensive Activity (KIA)'.                    \n                   \n        } }\n        ;\n                "

Add relations from the Concepts elements associated tables : 

In [None]:
# Populate
insert_glossary_elements(ESTAT_V1_dat_related_concepts, 'skos:related', 'POST')
insert_glossary_elements(ESTAT_V1_dat_statistical_data, 'estat:statisticalInformation', 'POST')
insert_glossary_elements(ESTAT_V1_dat_sources, 'estat:sourceInformation', 'POST')

Pour la table ESTAT_V1_dat_further_info, la relation dépend du type de contenu, qui doit être récupéré dans la table ESTAT_V1_dat_link_info

In [None]:
#
#FONCTION A DEFINIR BASEE SUR insert_glossary_elements(ESTAT_V1_dat_further_info,
#                                                     'estat:furtherInformation', "POST")

If the Glossary homepage is contained in the same table, it should be added separately as it has different relations. 

###  3 - Insertion of Statistical Explained elements into the KDB. 

Deux objets dans un article : larticle lui meme et un Core Data (un excel avec les données utilisées et leurs métadonnées)

Exemple : https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Exchange_rates_and_interest_rates
Excel + ce qu'il y a avant COntext

In [41]:
ESTAT_V1_dat_article_paragraph.head()

Unnamed: 0,id,article_id,title,content,abstract,alert
0,1,881,Structural profile,The accommodation and food services sector re...,0,0
1,2,881,Sectoral analysis,According to most structural business indicat...,0,0
2,3,881,Country overview,"In absolute terms, Germany recorded the highe...",0,0
3,4,881,Size class analysis,The enterprise size structure of the EU-27...,0,0
4,5,881,Regions,The largest regional employment in the accomm...,0,0


In [None]:
def insert_dat_article(DB, classe, method): 

    for i, row in DB.iterrows():

        id_ = row[0]
        context_ = row[1]
        data_sources_ = row[2]
        last_update_ = str(row[4])
        background_article_ = row[5]
        homepage_ = row[6]

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               estat:""" + str(id_) + """ estat:StatisticsExplainedData '""" + str(id_) + """'.
               estat:""" + str(id_) + """ XXX:XX '""" + str(context_) + """'. 
               estat:""" + str(id_) + """ dct:source '""" + str(data_sources_) + """'. 
               estat:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'. 
               estat:""" + str(id_) + """ XX:XXX '""" + str(background_article_) + """'. 
               estat:""" + str(id_) + """ XXX:XXX '""" + str(homepage_) + """'. 

    } }
    ;
            """
    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    results = sparql.query().response.read()

In [None]:
insert_dat_article(ESTAT_V1_dat_article, '', "POST")

In [None]:
def insert_dat_article_paragraph(DB, classe, method): 

    for i, row in DB.iterrows():

        id_ = row[0]
        article_id_ = row[1]
        title_ = row[2]
        content_ = row[3]
        abstract_ = row[4]
        alert_ = row[5]

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               estat:""" + str(id_) + """ dct:isPartOf '""" + str(article_id_) + """'.
               estat:""" + str(id_) + """ dct:title '""" + str(title_) + """'. 
               estat:""" + str(id_) + """ XXX:XX '""" + str(content_) + """'. 
               estat:""" + str(id_) + """ XXX:XX '""" + str(abstract_) + """'. 
               estat:""" + str(id_) + """ XX:XXX '""" + str(alert_) + """'. 

    } }
    ;
            """
    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    results = sparql.query().response.read()


In [None]:
def insert_dat_article_shared_link(DB, classe, link_info_table, method): 

    for i, row in DB.iterrows():

        id_ = row[0]
        paragraph_id_ = row[1]
        link_id_ = row[2]
        article_division_id_ = row[3]
        
        # Get info from the eurostat_links
        url_ = link_info_table[link_info_table['id'] == link_id_].url.item()

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               estat:""" + str(id_) + """ dct:isPartOf '""" + str(paragraph_id_) + """'.
               estat:""" + str(id_) + """ dct:title '""" + str(title_) + """'. 
               estat:""" + str(id_) + """ XXX:XX '""" + str(content_) + """'. 
               estat:""" + str(id_) + """ XXX:XX '""" + str(abstract_) + """'. 
               estat:""" + str(id_) + """ XX:XXX '""" + str(alert_) + """'. 

    } }
    ;
            """
    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    results = sparql.query().response.read()


In [None]:
def insert_dat_paragraph_figure(DB, classe, link_info_table, method): 

    for i, row in DB.iterrows():

        id_ = row[0]
        paragraph_id_ = row[1]
        link_id_ = row[2]

        
        # Get info from the eurostat_links
        url_ = eurostat_links[eurostat_links['id'] == link_id_].url.item()

        # Construct the query 
        query = """

            PREFIX estat: <https://nlp4statref/knowledge/ontology/> 

            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

               estat:""" + str(paragraph_id_) + """ estat:linkToResource '""" + str(url_) + """'. 

    } }
    ;
            """
    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    results = sparql.query().response.read()

### See added statements 

In [39]:

SelectStatements = """
PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
PREFIX dct: <http://purl.org/dc/terms/>
SELECT * FROM <https://nlp4statref/knowledge/ontology/>
WHERE { estat:15 ?s ?o

}
"""

#statements_df = sparql_dataframe.get(endpoint, SelectStatements)
#print(statements_df.shape)
#print(statements_df.tail())

sparql.setQuery(SelectStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
print(results['o.value'].values[0])

15


In [40]:
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
results

Unnamed: 0,s.type,s.value,o.type,o.value
0,uri,http://www.w3.org/1999/02/22-rdf-syntax-ns#about,literal,15
1,uri,http://purl.org/dc/terms/source,literal,https://ec.europa.eu/eurostat/statistics-expla...
2,uri,http://purl.org/dc/terms/title,literal,Place at which only stamps can be bought
3,uri,http://purl.org/dc/terms/type,literal,
4,uri,https://nlp4statref/knowledge/ontology/resourc...,literal,1
