# Population of the Knowledge DB - V7 #

#### Summary :
        - Load libraries and connect to Virtuoso
        - Load elements from the Content DB 
        - Knowledge base Population 
            + SE Glossary Elements and related material
            + SE Statistical Articles (SA)
            + SE Core Data
            + SE SA related elements
            + SE Background Articles
            + Eurostat Glossary
            + Code Lists
        


### Load libraries and connect to Virtuoso

In [389]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe
from itertools import chain

In [390]:
def connect_db(DSN, DBA, UID, PWD):

    connection = pyodbc.connect('DSN={};DBA={};UID={};PWD={}'.format(DSN, 
                                                                     DBA,
                                                                     UID,
                                                                     PWD))
    cursor = connection.cursor()

    return connection, cursor


def connect_virtuoso(DSN, UID, PWD):

    sparql = SPARQLWrapper(DSN)
    sparql.setHTTPAuth(DIGEST)
    sparql.setCredentials(UID, PWD)
    sparql.setMethod(GET)

    return sparql


Replace the values with your own logins :

In [92]:
user = ""
login = ""

In [392]:
# Connection to CDB 
connection, cursor = connect_db('Virtuoso All', 
                                'ESTAT', 
                                user, 
                                login)


# Connection to the KDB 
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(endpoint, 
                          user, 
                          login)


### Define content selection functions

In [94]:
def select_query(columns, table, conditions=None): 

    if conditions:

        query = """
            SELECT {}
            FROM {}
            WHERE {}

        """.format(columns, table, conditions)

    else:

        query = """

        SELECT {}
        FROM {}
        """.format(columns, table)

    return query

#### Get all tables name from the CDB 

In [95]:
ESTAT_V1_tables_names = pd.read_sql(select_query('*',
                                          'ESTAT.information_schema.tables', 
                                          ''), 
                             connection)
ESTAT_V1_tables_names['TABLE_NAME']

0                     dat_article
1           dat_article_paragraph
2         dat_article_shared_link
3                dat_code_dataset
4                   dat_code_dico
5                  dat_collection
6         dat_collection_resource
7                     dat_dataset
8              dat_estat_glossary
9     dat_estatg_measurement_unit
10           dat_estatg_stat_unit
11               dat_further_info
12                   dat_glossary
13                  dat_link_info
14           dat_paragraph_figure
15               dat_redirections
16           dat_related_concepts
17                   dat_resource
18          dat_resource_altlabel
19                    dat_sources
20           dat_statistical_data
21           mod_article_division
22               mod_concept_type
23                mod_dictionnary
24                   mod_infotype
25               mod_lexical_type
26           mod_measurement_unit
27             mod_ramon_category
28       mod_resource_information
29            

##### Load the ressource info table 

In [96]:
ESTAT_V1_dat_resource = pd.read_sql(select_query('*',
                                          ' ESTAT.V1.dat_resource', 
                                          ''), 
                             connection)
print(ESTAT_V1_dat_resource.shape)

(55, 14)


In [97]:
ESTAT_V1_dat_resource

Unnamed: 0,id,label_en,label_fr,label_de,uri,date_created,date_modified,status_id,date_deprecated,definition,editorial_note,change_note,scope_note,infotype_id
0,0,Other,Autre,,,NaT,,,,,,,,
1,1,Eurostat,Eurostat,,https://nlp4statref/knowledge/resource/authori...,2021-06-01,,1.0,,Eurostat resource.,,18/05/2021 - Creation,,1.0
2,2,European Agency for Safety and Health at Work,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
3,3,European Asylum support office,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
4,4,European Central Bank,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
5,5,European Centre for the Development of vocatio...,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
6,6,European Council and council of the European U...,Conseil europÃ©en et Conseil de l'Union europÃ...,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, Acronyms, Abbreviations, Syntact...",29/06/2021 - Creation,,1.0
7,7,European Court of Auditors,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
8,8,European Environment Agency,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
9,9,European Environment Information and Observati...,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0


##### Load the Link info table 

In [98]:
ESTAT_V1_dat_link_info = pd.read_sql(select_query('*',
                                          ' ESTAT.V1.dat_link_info', 
                                          ''), 
                             connection)
print(ESTAT_V1_dat_link_info.shape)


(10556, 5)


In [99]:
ESTAT_V1_dat_link_info.head()

Unnamed: 0,id,title,url,resource_information_id,resource_type_id
0,1,Accident at work,https://ec.europa.eu/eurostat/statistics-expla...,1,39
1,2,"Accidents at work (ESAW, 2008 onwards)",http://ec.europa.eu/eurostat/cache/metadata/en...,1,46
2,3,Commission Regulation (EU) No 349/2011,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,16,37
3,4,European Statistics on Accidents at Work (E...,http://ec.europa.eu/eurostat/product?code=KS-R...,1,0
4,5,Fatal accident at work,https://ec.europa.eu/eurostat/statistics-expla...,1,39


In [None]:
#Get the count of each ressource type in the Content:
temp = ESTAT_V1_dat_link_info.merge(ESTAT_V1_dat_resource, left_on='resource_type_id', right_on='id')
resource_type_count = temp.label_en.value_counts()
resource_type_count

##### Load Glossary related tables

In [286]:
ESTAT_V1_dat_glossary = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_glossary', 
                                          ''), 
                             connection)

ESTAT_V1_dat_related_concepts = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_related_concepts", 
                                          ''), 
                                            connection)      

ESTAT_V1_dat_sources = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_sources", 
                                          ''), 
                             connection)
ESTAT_V1_dat_redirections = pd.read_sql(select_query('*',
                                          "ESTAT.V1.dat_redirections", 
                                          ''), 
                             connection)

ESTAT_V1_dat_further_info = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_further_info', 
                                          ''), 
                             connection)
ESTAT_V1_dat_statistical_data = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_statistical_data', 
                                          ''), 
                             connection)

##### Load Statistical Articles related tables

In [333]:
ESTAT_V1_dat_article_shared_link = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article_shared_link', 
                                          ''), 
                             connection)


In [105]:
ESTAT_V1_dat_article_paragraph = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article_paragraph', 
                                          ''), 
                             connection)

In [107]:
ESTAT_V1_dat_article = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article', 
                                          'background_article = 0'), 
                             connection)

ESTAT_V1_dat_article_core_data = pd.read_sql('SELECT article_id, link_id, url, data_sources FROM ESTAT.V1.dat_article_shared_link INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id WHERE background_article = 0 AND article_division_id=1',connection)

##### Load Statistical Background Articles related tables

In [108]:
ESTAT_V1_dat_background_article = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_article', 
                                          'background_article = 1'), 
                             connection)

ESTAT_V1_dat_background_article_core_data = pd.read_sql('SELECT article_id, link_id, url, data_sources FROM ESTAT.V1.dat_article_shared_link INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id WHERE background_article = 1 AND article_division_id=1',connection)


##### Load Eurostat Glossary related tables 

In [109]:
ESTAT_V1_dat_estat_glossary = pd.read_sql(select_query('*',
                                          'ESTAT.V1.dat_estat_glossary', 
                                          ''), 
                             connection)

##### Load Code List related tables

In [111]:
ESTAT_V1_dat_code_dicos = pd.read_sql(select_query('*',
                                  'ESTAT.V1.dat_code_dico', 
                                  ''), 
                                    connection)


### Populate the KDB

### 1 - Define functions used to create links between elements of the Content DB

In [353]:
#Allows to find a prefix based on the link_id : 
def find_element_prefix_from_id(id_, SEG_table, SBA_table, SA_table, SED_table):
    prefix = ""
    if (id_ in SEG_table.id.values): prefix = " SEG:"           
    if (id_ in SBA_table.id.values): prefix = " SBA:"
    if (id_ in SA_table.id.values): prefix = " SA:"
    if (id_ in SED_table.article_id.values): prefix = " SED:"
    return(prefix)

In [354]:
# Dictionnary used to get the right prefix for each element : 
resource_type = ['https://nlp4statref/knowledge/resource/authority/resource-type#statistic-reference-metadata',
                "https://nlp4statref/knowledge/resource/authority/resource-type#miscellaneous",
                "https://nlp4statref/knowledge/resource/authority/resource-type#statistic-database",
                "https://nlp4statref/knowledge/resource/authority/resource-type#statistical-data-report",
                "https://nlp4statref/knowledge/resource/authority/resource-type#publication",
                "https://nlp4statref/knowledge/resource/authority/resource-type#european-union-law",
                 "https://nlp4statref/knowledge/resource/authority/resource-type#news",
                 "https://nlp4statref/knowledge/resource/authority/resource-type#infography",
                 "https://nlp4statref/knowledge/resource/authority/resource-type#legal-context",
                 "https://nlp4statref/knowledge/resource/authority/resource-type#statistic-table",
                 "https://nlp4statref/knowledge/resource/authority/resource-type#glossary-concept",
                ]

about_element = ["<https://nlp4statref/knowledge/ontology/StatisticReferenceMetadata#>",
                "<https://nlp4statref/knowledge/ontology/Miscellaneous#>",
                "<https://nlp4statref/knowledge/ontology/StatisticDataset#>",
                "<https://nlp4statref/knowledge/resource/statistical-data-report#>",
                "<https://nlp4statref/knowledge/ontology/Publication#>",
                "<https://nlp4statref/knowledge/ontology/EuropeanUnionLaw#>",
                "<https://nlp4statref/knowledge/ontology/News#>",
                "<https://nlp4statref/knowledge/ontology/Infography#>",
                "<https://nlp4statref/knowledge/ontology/LegalContext#>",
                "<https://nlp4statref/knowledge/ontology/StatisticData#>",
                "<https://nlp4statref/knowledge/resource/statistics-explained-glossary#>"]
prefix_list = [" SRM:"," MISC:"," SDS:"," SDR:", " PUB:", " EUL:", " NEWS:", " INFOG:", " LEGC:", " ST:", " SEG:"]
resource_types_prefix_table = {'resource_type':resource_type,
                             'ontology_element':about_element,
                             'prefix':prefix_list}
resource_types_prefix_table = pd.DataFrame(resource_types_prefix_table)

In [355]:
#Allows to find a prefix based on the resource type  : 
def find_element_prefix_from_type(resource_type, resource_types_prefix_table):
    onto_element = resource_types_prefix_table.loc[resource_types_prefix_table.resource_type == resource_type]["ontology_element"].item()
    prefix = resource_types_prefix_table.loc[resource_types_prefix_table.resource_type == resource_type]["prefix"].item()
    return(onto_element, prefix)

###  2 - Insertion of Glossary elements into the KDB. 

In [356]:
## Find the homepage : 
SEGlossary_homepage_id = ESTAT_V1_dat_glossary.loc[ESTAT_V1_dat_glossary['homepage']==1]['id'].item()

In [357]:
def decode_string(string):
  string = string.encode("ascii", "ignore")
  string_decode = string.decode()
  return string_decode
ESTAT_V1_dat_glossary.definition = ESTAT_V1_dat_glossary.definition.apply(decode_string)


In [358]:
#Version updatée par rapport a la nouvelle table dat_resource
def insert_glossary_elements(DB, link_info_table, resource_table, redirections_table, method, homepage_id): 

    for i, row in DB.iterrows():
        #print(i)
        id_ = row["id"]
        definition_ = row["definition"]
        definition_ = definition_.replace("'","").strip()
        definition_ = definition_.replace("\\","").strip()
        last_update_ = str(row["last_update"])
        homepage_ = row["homepage"]
        #redirection_ = row["redirection"]
         # Get info from the eurostat_links
        title_ = link_info_table[link_info_table['id'] == id_].title.item()
        title_ = title_.replace("'","").strip()
        url_ = link_info_table[link_info_table['id'] == id_].url.item()
        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_type_id'].item()
        resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_information_id'].item() 
        #get the label from resourcetype and info  id
        resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
        if resourcetype_id_ in resource_table['id']:
            resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
            resourcetype_query_part = "SEG:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
        else : resourcetype_query_part = ""
        
        
        #get original title for redirected pages 
        if id_ in redirections_table.glossary_id.values :
            OT_query_part = []
            for link_id_ in redirections_table[redirections_table['glossary_id'] == id_].link_id :
                original_title_ = link_info_table[link_info_table['id'] == link_id_].title.item()
                subpart = "SEG:" + str(id_) + """ skos:altLabel '""" + str(original_title_) + "'."
                OT_query_part.append(subpart)
            OT_query_part = '\n'.join(OT_query_part)
            #print(redirections_table[redirections_table['glossary_id'] == id_].link_id)
            #redirection_id_ = redirections_table[redirections_table['glossary_id'] == id_].link_id.item()
            #original_title_ = link_info_table[link_info_table['id'] == redirection_id_].title.item()
            #OT_query_part = "SEG:" + str(id_) + """ skos:altLabel '""" + str(original_title_) + "'."
        else :
            OT_query_part = ""
            
        #Distinguish the homepage from the rest 
        if id_ != homepage_id :        
            concept_query_part = "SEG:" + str(id_) + """ skos:Concept SEG:""" +  str(id_) + "."
            title_query_part = "SEG:" + str(id_) + """ skos:prefLabel '""" +  str(title_) + "'."
            def_query_part = "SEG:" + str(id_) + """ skos:definition '""" +  str(definition_) + "'."
            #extraction_query_part = ""
            topconcept_query_part = ""
            publisher_query_part = ""
            inscheme_query_part = """SEG:""" + str(id_) +  """ skos:inScheme 'https://nlp4statref/knowledge/resource/vocabulary/glossary/statistics-explained-glossary'."""

            #exactmatch_query_part_1 = """SEG:""" + str(id_) +  """skos:exactMatch estat:EurostatConcept."""
            #exactmatch_query_part_2 = """SEG:""" + str(id_) +  """skos:exactMatch estat:NamedEntities."""

        else :
            concept_query_part = "SEG:" + str(id_) + """ skos:ConceptScheme  'https://nlp4statref/knowledge/resource/vocabulary/glossary/statistics-explained-glossary'."""
            title_query_part = "SEG:" + str(id_) + """ dct:title '""" +  str(title_) + "'."
            def_query_part = "SEG:" + str(id_) + """ dct:description '""" +  str(definition_) + "'."
            #extraction_query_part = scraping date ? 
            topconcept_query_part = []
            for concept_id_ in DB.id:
                if concept_id_ != homepage_id : 
                    subpart = "SEG:" + str(homepage_id) + """ skos:hasTopConcept SEG:""" +  str(concept_id_) + "."
                    topconcept_query_part.append(subpart)
            topconcept_query_part = '\n'.join(topconcept_query_part)
            publisher_query_part = """SEG:""" + str(id_) +  """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'."""
            inscheme_query_part = ""
            #exactmatch_query_part_1 = ""
            #exactmatch_query_part_2 = ""
                  
    
        # Construct the query 
        query = """

            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              """ + concept_query_part + """ 
              """ + inscheme_query_part + """              
              """ + topconcept_query_part + """
              """ + title_query_part + """
              """ + def_query_part + """
              """ + publisher_query_part + """
              """ + OT_query_part + """
              SEG:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
              SEG:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
              """ + resourcetype_query_part + """
              SEG:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.  
             
              
              
        } }
        ;
            """
        #Take out special characters : 
        query.encode('latin-1')
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [359]:
###run the function 
insert_glossary_elements(DB = ESTAT_V1_dat_glossary,
                         link_info_table = ESTAT_V1_dat_link_info,
                         resource_table = ESTAT_V1_dat_resource,
                         redirections_table = ESTAT_V1_dat_redirections,
                         method = 'POST', homepage_id = SEGlossary_homepage_id) 



            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              SEG:1 skos:Concept SEG:1. 
              SEG:1 skos:inScheme 'https://nlp4statref/knowledge/resource/vocabulary/glossary/statistics-explained-glossary'.              
              
              SEG:1 skos:prefLabel 'Accident at work'.
              SEG:1 skos:definition 'An     accident at work    in the framework of the administrative data collection on European Statistics on Accidents at Work (ESAW) is a discrete occurrence during the course of work which leads to physical or mental harm. The phrase in the course of work means whilst engaged in an occupational activity or during the time spent at work. This

In [360]:
def insert_SE_glossary_concepts_relations(related_concepts_table, link_info_table, method): 
    for i, row in related_concepts_table.iterrows():
        concept_id_ = row["glossary_id"]
        link_id_ = row["link_id"] 

        related_concept_query_part = "SEG:" + str(concept_id_) + """ skos:related SEG:""" +  str(link_id_) + "."
        # Construct the query 
        query = """
            PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              """ + related_concept_query_part + """
        } }
        ;
            """
        #Take out special characters : 
        query.encode('latin-1')
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [361]:
insert_SE_glossary_concepts_relations(ESTAT_V1_dat_related_concepts, ESTAT_V1_dat_link_info, 'POST')


            PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              SEG:1 skos:related SEG:5.
        } }
        ;
            


In [362]:
#V7.1
def insert_glossary_elements_relations(DB, link_info_table, resource_table, resource_types_prefix_table, SEG_table, SBA_table, SA_table, SED_table, method):      
    
    for i, row in DB.iterrows():
        concept_id_ = row["glossary_id"]
        link_id_ = row["link_id"] #concept et link sont de la meme sequence unique. 
  
        #Get elements from the eurostat_links
        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == link_id_]['resource_type_id'].item()
        resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == link_id_]['resource_information_id'].item() 
        title_ = link_info_table.loc[link_info_table['id'] == link_id_]['title'].item()
        title_ = title_.replace("'","").strip()
        url_ = link_info_table.loc[link_info_table['id'] == link_id_]['url'].item()
        
        #get the label from resourcetype and info  id
        resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item() 
        resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
        
        relation_ = ""
        #print(resourcetype_id_)
        
        #Init query parts : 
        prefix_part = ""
        title_query_part = ""
        url_query_part = ""
        resourcetype_query_part = ""
        resourceinfo_query_part = ""
        
        ##Get the type of link : 
        if (resourcetype_id_ in [37]): relation_ = "estat:relatedLegallnformation"
        if (resourcetype_id_ in [0,41,42,43,44,45]): relation_ = "estat:relatedEditorialContent"
        if (resourcetype_id_ in [46,47,48,49]): relation_ = "estat:relatedStatisticData"
        if (relation_ != "") :
            #Get the prefix   
            s_prefix = find_element_prefix_from_id(concept_id_, 
                                           SEG_table = SEG_table,  
                                           SBA_table=SBA_table,
                                           SA_table=SA_table, 
                                           SED_table=SED_table)
            o_prefix = find_element_prefix_from_id(link_id_, 
                               SEG_table = SEG_table,  
                               SBA_table=SBA_table,
                               SA_table=SA_table, 
                               SED_table=SED_table)
            if (o_prefix == "") :
                if (resourcetype_ != None):
                    ontology_element, o_prefix = find_element_prefix_from_type(resourcetype_, 
                                                                               resource_types_prefix_table = resource_types_prefix_table)
                    prefix_part = "PREFIX"+ o_prefix + " " + ontology_element 
                    resourcetype_query_part = o_prefix + str(link_id_) + """ dct:type '""" + str(resourcetype_) + """'."""
                    resourceinfo_query_part = o_prefix + str(link_id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'."""
                    title_query_part = o_prefix + str(link_id_) +""" dct:title '""" +  str(title_) + "'."
                    url_query_part = o_prefix + str(link_id_) + """ dct:source '""" + str(url_) + """'."""
                    related_query_part = s_prefix + str(concept_id_) + ' ' + relation_ + o_prefix + str(link_id_) + "."
                else : 
                    continue
            else :
                  related_query_part = s_prefix + str(concept_id_) + ' ' + relation_ + o_prefix + str(link_id_) + "."

            # Construct the query 
            query = """
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SED: <https://nlp4statref/knowledge/resource/statistics-explained-data#>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                """ + prefix_part + """

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                   """ + title_query_part + """
                   """ + url_query_part + """
                   """ + resourcetype_query_part + """
                   """ + resourceinfo_query_part + """
                   """ + related_query_part + """

                } }
                ;
            """
            if (i  == 0): print(query)
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()

In [363]:
insert_glossary_elements_relations(DB = ESTAT_V1_dat_further_info,
                                   resource_types_prefix_table = resource_types_prefix_table,
                                   resource_table = ESTAT_V1_dat_resource,
                                   link_info_table = ESTAT_V1_dat_link_info,
                                   SEG_table = ESTAT_V1_dat_glossary,  
                                   SBA_table=ESTAT_V1_dat_background_article,
                                   SA_table=ESTAT_V1_dat_article, 
                                   SED_table=ESTAT_V1_dat_article_core_data,
                                   method = 'POST')


                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SED: <https://nlp4statref/knowledge/resource/statistics-explained-data#>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                PREFIX SDS: <https://nlp4statref/knowledge/ontology/StatisticDataset#>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                    SDS:2 dct:title 'Accidents at work (ESAW, 2008 onwards)'.
                    SDS:2 dct:source 'http://ec.europa.eu/eurostat/cache/metadata/en/hsw_acc_work_esms.htm'.
                    SDS:2 dct:type 'https://nlp4statref/knowledge/resource/au

In [364]:
insert_glossary_elements_relations(DB = ESTAT_V1_dat_statistical_data,
                                   resource_types_prefix_table = resource_types_prefix_table,
                                   resource_table = ESTAT_V1_dat_resource,
                                   link_info_table = ESTAT_V1_dat_link_info,
                                   SEG_table = ESTAT_V1_dat_glossary,  
                                   SBA_table=ESTAT_V1_dat_background_article,
                                   SA_table=ESTAT_V1_dat_article, 
                                   SED_table=ESTAT_V1_dat_article_core_data,
                                   method = 'POST')

In [365]:
def insert_SE_glossary_sources(sources_concepts_table, link_info_table, resource_types_prefix_table, resource_table, method): 
    for i, row in sources_concepts_table.iterrows():
        concept_id_ = row["glossary_id"]
        link_id_ = row["link_id"] 
        #Get ressource type from the eurostat_links
        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == link_id_]['resource_type_id'].item()
        resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
        #
        if (resourcetype_ != None):
            ontology_element, o_prefix = find_element_prefix_from_type(resourcetype_, 
                                                               resource_types_prefix_table = resource_types_prefix_table)

            prefix_part = "PREFIX"+ o_prefix + " " + ontology_element 

            related_concept_query_part = "SEG:" + str(concept_id_) + """ estat:sourceInformation"""+ o_prefix +  str(link_id_) + "."
            # Construct the query 
            query = """
                PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                """ + prefix_part + """
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                  """ + related_concept_query_part + """
            } }
            ;
                """
            #Take out special characters : 
            query.encode('latin-1')
            if (i == 0): print(query)
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()

In [366]:
insert_SE_glossary_sources(ESTAT_V1_dat_sources,
                           ESTAT_V1_dat_link_info, 
                           resource_types_prefix_table,
                           ESTAT_V1_dat_resource,
                           'POST')

###  3 - Insertion of Statistical Explained elements into the KDB. 

###  3.1 - Insertion of Statistical Explained articles

In [367]:
## Find the homepage : 
SA_homepage_id = ESTAT_V1_dat_article.loc[ESTAT_V1_dat_article['homepage']==1]['id'].item()

In [368]:
def insert_dat_article(DB, article_shared_link_table, link_info_table, resource_table, homepage_id, method): 

    for i, row in DB.iterrows():

        id_ = row['id']
        if id_ != homepage_id :    
            last_update_ = str(row["last_update"])

            # Get info from the eurostat_links
            title_ = link_info_table[link_info_table['id'] == id_].title.item()
            title_ = title_.replace("'","").strip()
            url_ = link_info_table[link_info_table['id'] == id_].url.item()
 
            resourcetype_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_type_id'].item()
            resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_information_id'].item() 
            #get the label from resourcetype and info  id
            resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
            if resourcetype_id_ in resource_table['id']:
                resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
                resourcetype_query_part = "SA:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
            else : resourcetype_query_part = ""
            # add source data 
            shared_links = article_shared_link_table[article_shared_link_table['article_id'] == id_]
            if (1 in shared_links['article_division_id'].values) : 
                source_query_part = []
                for link_id_ in shared_links.loc[shared_links['article_division_id']==1]['link_id'].values:
                    subpart = "SA:" + str(id_) + """ estat:sourceData SDR:""" +  str(link_id_) + "."
                    source_query_part.append(subpart)
                source_query_part = '\n'.join(source_query_part)
            else : 
                source_query_part = ""
          
            # Construct the query 
            query = """
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                  SA:""" + str(id_) + """ estat:StatisticalArticle SA:""" + str(id_) + """.
                  """ + resourcetype_query_part + """
                  SA:""" + str(id_) + """ dct:title '""" + str(title_) + """'.  
                  SA:""" + str(id_) + """ dct:description 'Statistics Explained article'.
                  SA:""" + str(id_) + """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
                  SA:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.
                  SA:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
                  SA:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
                  """ + source_query_part + """
            } }
            ;
                """
            if (i == 0): print(query)
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()

In [369]:
insert_dat_article(DB = ESTAT_V1_dat_article,
                   article_shared_link_table = ESTAT_V1_dat_article_shared_link,
                              link_info_table = ESTAT_V1_dat_link_info,
                   resource_table = ESTAT_V1_dat_resource,
                              homepage_id =  SA_homepage_id,
                              method = "POST")


                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                  SA:7 estat:StatisticalArticle SA:7.
                  SA:7 dct:type 'https://nlp4statref/knowledge/resource/authority/resource-type#statistical-article'.
                  SA:7 dct:title 'Accidents at work statistics'.  
                  SA:7 dct:description 'Statistics Explained article'.
                  SA:7 dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
                  SA:7 estat:resourceInformation 'https://nlp4statref/knowledge/resource/authority/resource-information#eurostat'.
                  SA:7 dct:modified '2020-11-26 16:06:00'.
  

###  3.2 - Insertion of Statistical Explained core data 

Core data --> Excel part of SArticles 

In [370]:
def insert_dat_article_core_data(DB, link_info_table, article_paragraph_table, article_table, resource_table, method): 
  
    for i, row in DB.iterrows():

        id_ = row['link_id']
        
        article_id_ = row["article_id"]
        #Get last update from article table : 
        last_update_ = str( article_table[article_table['id'] == article_id_].last_update.item())
        #Get info from the eurostat_links
        url_ = link_info_table[link_info_table['id'] == id_].url.item()
        title_ = link_info_table[link_info_table['id'] == article_id_].title.item()
        title_ = title_.replace("'","").strip()
        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_type_id'].item()
        resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_information_id'].item() 
        
        #get the label from resourcetype and info  id
        resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
        if resourcetype_id_ in resource_table['id']:
            resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
            resourcetype_query_part = "SDR:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
        else : resourcetype_query_part = ""        
        

        # Construct the query 
        query = """

            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
            PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              SDR:""" + str(id_) + """ estat:StatisticalDataReport SDR:""" + str(id_) + """.
              """ + resourcetype_query_part + """
              SDR:""" + str(id_) + """ dct:title '""" + str(title_) + """'.  
              SDR:""" + str(id_) + """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
              SDR:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.
              SDR:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
              SDR:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
              SDR:""" + str(id_) + """ estat:dataInformation SA:""" + str(article_id_) + """.  

        } }
        ;
            """
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [371]:
insert_dat_article_core_data(DB = ESTAT_V1_dat_article_core_data,
                              link_info_table = ESTAT_V1_dat_link_info,
                             article_paragraph_table= ESTAT_V1_dat_article_paragraph,
                             article_table = ESTAT_V1_dat_article,
                             resource_table = ESTAT_V1_dat_resource,
                              method = "POST")



            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
            PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              SDR:2907 estat:StatisticalDataReport SDR:2907.
              SDR:2907 dct:type 'https://nlp4statref/knowledge/resource/authority/resource-type#statistic-reference-metadata'.
              SDR:2907 dct:title 'Absences from work - quarterly statistics'.  
              SDR:2907 dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
              SDR:2907 estat:resourceInformation 'https://nlp4statref/knowledge/resource/authority/resource-information#eurostat'.
              SDR:2907 dct:modified '2021-06-28 09:09:00'.
              SDR:2907 dct:source 'https://ec.eur

###  3.3 - Insertion of Statistical Explained articles related elements 

In the following, some elements are not yet added to the KDB (SBA, SA) and are added inside this function 

In [373]:
#V7.1
def insert_articles_elements_relations(DB, link_info_table, resource_table, resource_types_prefix_table, SEG_table, SBA_table, SA_table, SED_table, method):      
    
    for i, row in DB.iterrows():
        article_id = row["article_id"]
        link_id_ = row["link_id"] #concept et link sont de la meme sequence unique. 
  
        #Get elements from the eurostat_links
        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == link_id_]['resource_type_id'].item()
        resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == link_id_]['resource_information_id'].item() 
        title_ = link_info_table.loc[link_info_table['id'] == link_id_]['title'].item()
        title_ = title_.replace("'","").strip()
        url_ = link_info_table.loc[link_info_table['id'] == link_id_]['url'].item()
        
        #get the label from resourcetype and info  id
        resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item() 
        resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
        
        relation_ = ""
        #print(resourcetype_id_)
        
        #Init query parts : 
        prefix_part = ""
        title_query_part = ""
        url_query_part = ""
        resourcetype_query_part = ""
        resourceinfo_query_part = ""
        
        ##Get the type of link : 
        if (resourcetype_id_ in [37]): relation_ = "estat:relatedLegallnformation"
        if (resourcetype_id_ in [0,41,42,43,44,45]): relation_ = "estat:relatedEditorialContent"
        if (resourcetype_id_ in [46,47,48,49]): relation_ = "estat:relatedStatisticData"
        if (relation_ != "") :
            #Get the prefix   
            s_prefix = find_element_prefix_from_id(article_id, 
                                           SEG_table = SEG_table,  
                                           SBA_table=SBA_table,
                                           SA_table=SA_table, 
                                           SED_table=SED_table)
            o_prefix = find_element_prefix_from_id(link_id_, 
                               SEG_table = SEG_table,  
                               SBA_table=SBA_table,
                               SA_table=SA_table, 
                               SED_table=SED_table)
            if (o_prefix == "") :
                if (resourcetype_ != None):
                    ontology_element, o_prefix = find_element_prefix_from_type(resourcetype_, 
                                                                               resource_types_prefix_table = resource_types_prefix_table)
                    prefix_part = "PREFIX"+ o_prefix + " " + ontology_element 
                    resourcetype_query_part = o_prefix + str(link_id_) + """ dct:type '""" + str(resourcetype_) + """'."""
                    resourceinfo_query_part = o_prefix + str(link_id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'."""
                    title_query_part = o_prefix + str(link_id_) +""" dct:title '""" +  str(title_) + "'."
                    url_query_part = o_prefix + str(link_id_) + """ dct:source '""" + str(url_) + """'."""
                    related_query_part = s_prefix + str(article_id) + ' ' + relation_ + o_prefix + str(link_id_) + "."
                else : 
                    continue
            else :
                  related_query_part = s_prefix + str(article_id) + ' ' + relation_ + o_prefix + str(link_id_) + "."

            # Construct the query 
            query = """
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SED: <https://nlp4statref/knowledge/resource/statistics-explained-data#>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                """ + prefix_part + """

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                   """ + title_query_part + """
                   """ + url_query_part + """
                   """ + resourcetype_query_part + """
                   """ + resourceinfo_query_part + """
                   """ + related_query_part + """
                } }
                ;
            """
            if (i == 0): print(query)
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()

In [374]:
insert_articles_elements_relations(DB = ESTAT_V1_dat_article_shared_link,
                                   resource_types_prefix_table = resource_types_prefix_table,
                                   resource_table = ESTAT_V1_dat_resource,
                                   link_info_table = ESTAT_V1_dat_link_info,
                                   SEG_table = ESTAT_V1_dat_glossary,  
                                   SBA_table=ESTAT_V1_dat_background_article,
                                   SA_table=ESTAT_V1_dat_article, 
                                   SED_table=ESTAT_V1_dat_article_core_data,
                                   method = 'POST')


                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX SEG: <https://nlp4statref/knowledge/resource/statistics-explained-glossary#>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SA: <https://nlp4statref/knowledge/resource/statistical-article#>
                PREFIX SED: <https://nlp4statref/knowledge/resource/statistics-explained-data#>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                PREFIX SRM: <https://nlp4statref/knowledge/ontology/StatisticReferenceMetadata#>

                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
                    SRM:2907 dct:title 'Tables and Figures - update April 2021 -.xlsx'.
                    SRM:2907 dct:source 'https://ec.europa.eu/eurostat/statistics-explained/images/3/32/Tables_and_Figures_-_update_April_2021_-.xlsx'.
              

###  4 - Insertion of Background Articles elements into the KDB. 

###  4.1 - Insertion of Background Articles

In [375]:
#a mettre à jour 
## Find the homepage : 
SBA_homepage_id = ESTAT_V1_dat_background_article.loc[ESTAT_V1_dat_background_article['homepage']==1]['id'].item()

In [376]:
SBA_homepage_id

10556

In [377]:
#a mettre à jour 
def insert_dat_background_article(DB, article_shared_link_table, link_info_table, resource_table, homepage_id, method): 

    for i, row in DB.iterrows():

        id_ = row['id']
        if id_ != homepage_id :    
            last_update_ = str(row["last_update"])

            # Get info from the eurostat_links
            title_ = link_info_table[link_info_table['id'] == id_].title.item()
            title_ = title_.replace("'","").strip()
            url_ = link_info_table[link_info_table['id'] == id_].url.item()
            
            resourcetype_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_type_id'].item()
            resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_information_id'].item() 
            #get the label from resourcetype and info  id
            resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
            if resourcetype_id_ in resource_table['id']:
                resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
                resourcetype_query_part = "SBA:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
            else : resourcetype_query_part = ""
        
            # add source data 
            shared_links = article_shared_link_table[article_shared_link_table['article_id'] == id_]
            if (1 in shared_links['article_division_id'].values) : 
                shared_links_source = shared_links.loc[shared_links['article_division_id']==1]['link_id'].values
                source_query_part = []
                for val in shared_links_source: # if multiple values 
                    subpart = """SBA:""" + str(id_) + """ estat:sourceData SDR:"""   + str(val) + """."""
                    source_query_part.append(subpart)
                source_query_part = '\n'.join(source_query_part)      
            else : source_query_part = ""
            # Construct the query 
            query = """

                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                  SBA:""" + str(id_) + """ estat:BackgroundArticle SBA:""" + str(id_) + """.
                  """ + resourcetype_query_part + """
                  SBA:""" + str(id_) + """ dct:title '""" + str(title_) + """'.  
                  SBA:""" + str(id_) + """ dct:description 'Background article'.
                  SBA:""" + str(id_) + """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
                  SBA:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.
                  SBA:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
                  SBA:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
                  """ + source_query_part + """

            } }
            ;
                """
            if (i == 0): print(query)
            sparql.setQuery(query)
            sparql.method = method
            sparql.setReturnFormat(JSON)
            results = sparql.query().response.read()

In [378]:
#a mettre à jour 
insert_dat_background_article(DB = ESTAT_V1_dat_background_article,
                   article_shared_link_table = ESTAT_V1_dat_article_shared_link,
                              link_info_table = ESTAT_V1_dat_link_info,
                              resource_table = ESTAT_V1_dat_resource,
                              homepage_id =  SBA_homepage_id,
                              method = "POST")



                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX estat: <https://nlp4statref/knowledge/ontology/>
                PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
                PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
                INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

                  SBA:25 estat:BackgroundArticle SBA:25.
                  SBA:25 dct:type 'https://nlp4statref/knowledge/resource/authority/resource-type#background-article'.
                  SBA:25 dct:title 'Short-term business statistics introduced'.  
                  SBA:25 dct:description 'Background article'.
                  SBA:25 dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
                  SBA:25 estat:resourceInformation 'https://nlp4statref/knowledge/resource/authority/resource-information#eurostat'.
                  SBA:25 dct:modified '2020-

###  4.2 - Insertion of Background Articles core data 

Core data --> Excel part of SArticles 

In [379]:
#a mettre à jour 
def insert_dat_article_core_data(DB, link_info_table, article_paragraph_table, article_table, resource_table, method): 
  
    for i, row in DB.iterrows():

        id_ = row['link_id']
        
        article_id_ = row["article_id"]
        #Get last update from article table : 
        last_update_ = str( article_table[article_table['id'] == article_id_].last_update.item())
        #Get info from the eurostat_links
        url_ = link_info_table[link_info_table['id'] == id_].url.item()
        title_ = link_info_table[link_info_table['id'] == article_id_].title.item()
        title_ = title_.replace("'","").strip()

        resourcetype_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_type_id'].item()
        resourceinfo_id_ = link_info_table.loc[link_info_table['id'] == id_]['resource_information_id'].item() 
        #get the label from resourcetype and info  id
        resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
        if resourcetype_id_ in resource_table['id']:
            resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
            resourcetype_query_part = "SDR:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
        else : resourcetype_query_part = ""
        

        # Construct the query 
        query = """

            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
            PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              SDR:""" + str(id_) + """ estat:StatisticalDataReport SDR:""" + str(id_) + """.
              """ + resourcetype_query_part + """
              SDR:""" + str(id_) + """ dct:title '""" + str(title_) + """'.  
              SDR:""" + str(id_) + """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
              SDR:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.
              SDR:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
              SDR:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
              SDR:""" + str(id_) + """ estat:dataInformation SBA:""" + str(article_id_) + """.  

        } }
        ;
            """
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [380]:
#a mettre à jour 
insert_dat_article_core_data(DB = ESTAT_V1_dat_background_article_core_data,
                              link_info_table = ESTAT_V1_dat_link_info,
                             article_paragraph_table= ESTAT_V1_dat_article_paragraph,
                             article_table = ESTAT_V1_dat_background_article,
                              resource_table = ESTAT_V1_dat_resource,
                              method = "POST")



            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX SDR: <https://nlp4statref/knowledge/resource/statistical-data-report#>
            PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
            
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 

              SDR:4382 estat:StatisticalDataReport SDR:4382.
              SDR:4382 dct:type 'None'.
              SDR:4382 dct:title 'Migrant integration statistics introduced'.  
              SDR:4382 dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
              SDR:4382 estat:resourceInformation 'https://nlp4statref/knowledge/resource/authority/resource-information#eurostat'.
              SDR:4382 dct:modified '2021-02-26 14:02:00'.
              SDR:4382 dct:source 'http://ec.europa.eu/eurostat/web/migrant-integration/statistics-illustrated'.  
              SDR:438

In [381]:
SelectStatements = """

PREFIX SBA: <https://nlp4statref/knowledge/resource/background-article#>
SELECT * FROM <https://nlp4statref/knowledge/ontology/>
WHERE { SBA:25 ?s ?o

}       
"""
sparql.setQuery(SelectStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
print(results['o.value'].values[0])

2020-11-18 18:09:00


###  5 - Insertion of Estat Glossary elements into the KDB. 

This part is a temporary version based on a bulk download of the glossary. Some elements are missing from the CDB.

In [382]:
## Find the homepage : 
EurostatGlossary_homepage_id = ESTAT_V1_dat_link_info.loc[ESTAT_V1_dat_link_info['title']=="Eurostat's Concepts and Definitions Database"]['id'].item()

In [383]:
EurostatGlossary_homepage_link_info = ESTAT_V1_dat_link_info.loc[ESTAT_V1_dat_link_info['id']==EurostatGlossary_homepage_id]

Attention, la homepage de ce glossaire est differente du reste

In [384]:
EurostatGlossary_homepage_link_info

Unnamed: 0,id,title,url,resource_information_id,resource_type_id
9025,9026,Eurostat's Concepts and Definitions Database,https://ec.europa.eu/eurostat/ramon/nomenclatu...,1,40


In [385]:
###for the homepage 

def insert_estat_glossary_homepage(link_info_table_homepage_subset, resource_table
, method): 
       
    id_ = link_info_table_homepage_subset["id"].item()
    title_ = link_info_table_homepage_subset["title"].item()
    title_ = title_.replace("'","").strip()
    url_ = link_info_table_homepage_subset["url"].item()
    resourcetype_id_ = link_info_table_homepage_subset.loc[link_info_table_homepage_subset['id'] == id_]['resource_type_id'].item()
    resourceinfo_id_ = link_info_table_homepage_subset.loc[link_info_table_homepage_subset['id'] == id_]['resource_information_id'].item() 
    #get the label from resourcetype and info  id
    resourceinfo_ = resource_table.loc[resource_table['id'] == resourceinfo_id_]['uri'].item()
    if resourcetype_id_ in resource_table['id']:
        resourcetype_ = resource_table.loc[resource_table['id'] == resourcetype_id_]['uri'].item() 
        resourcetype_query_part = "EG:" + str(id_) + """ dct:type '""" +  str(resourcetype_) + "'."
    else : resourcetype_query_part = ""
    concept_query_part = "EG:" + str(id_) + """ skos:ConceptScheme  'https://nlp4statref/knowledge/resource/vocabulary/glossary/eurostat-glossary'."""
    title_query_part = "EG:" + str(id_) + """ dct:title '""" +  str(title_) + "'."
    publisher_query_part = """EG:""" + str(id_) +  """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'."""


    # Construct the query 
    query = """
        PREFIX dct: <http://purl.org/dc/terms/>
        PREFIX estat: <https://nlp4statref/knowledge/ontology/>
        PREFIX EG: <https://nlp4statref/knowledge/resource/terminology/glossary/eurostat-glossary#>
        INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
          """ + concept_query_part + """     
          """ + title_query_part + """
          """ + publisher_query_part + """
          EG:""" + str(id_) + """ dct:source '""" + str(url_) + """'.  
          """ + resourcetype_query_part + """
          EG:""" + str(id_) + """ estat:resourceInformation '""" + str(resourceinfo_) + """'.  
    } }
    ;
        """
    #Take out special characters : 
    query.encode('latin-1')
    print(query)
    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    results = sparql.query().response.read()

In [386]:
###run the function 
insert_estat_glossary_homepage(link_info_table_homepage_subset = EurostatGlossary_homepage_link_info, 
                               resource_table = ESTAT_V1_dat_resource,
                               method= 'POST')


        PREFIX dct: <http://purl.org/dc/terms/>
        PREFIX estat: <https://nlp4statref/knowledge/ontology/>
        PREFIX EG: <https://nlp4statref/knowledge/resource/terminology/glossary/eurostat-glossary#>
        INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
          EG:9026 skos:ConceptScheme  'https://nlp4statref/knowledge/resource/vocabulary/glossary/eurostat-glossary'.     
          EG:9026 dct:title 'Eurostats Concepts and Definitions Database'.
          EG:9026 dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'.
          EG:9026 dct:source 'https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=LST_NOM_DTL_GLOSSARY&StrNom=CODED2&StrLanguageCode=EN'.  
          EG:9026 dct:type 'https://nlp4statref/knowledge/resource/authority/resource-type#glossary-home-page'.
          EG:9026 estat:resourceInformation 'https://nlp4statref/knowledge/resource/authority/resource-information#eurostat'.  
    } }
    ;
    

In [387]:
# This is done separetely for memory issues : 
def insert_estat_glossary_homepage_top_concepts(estat_glossary_table, method): 
       
    id_ = 0
    concept_query_part = "EG:" + str(id_) + """ skos:ConceptScheme  'https://nlp4statref/knowledge/resource/vocabulary/glossary/eurostat-glossary'."""
    for concept_id_ in estat_glossary_table.id:
        topconcept_query_part = "EG:" + str(id_) + """ skos:hasTopConcept EG:""" +  str(concept_id_) + "."
        # Construct the query 
        query = """
            PREFIX EG: <https://nlp4statref/knowledge/resource/terminology/glossary/eurostat-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              """ + topconcept_query_part + """
        } }
        ;
            """
        #Take out special characters : 
        query.encode('latin-1')
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [393]:
###run the function 
insert_estat_glossary_homepage_top_concepts(estat_glossary_table = ESTAT_V1_dat_estat_glossary, 
                                            method = 'POST')
       

In [394]:
def insert_estat_glossary_elements(DB, method): 

    
    for i, row in DB.iterrows():
        id_ = row["id"]
        title_ = row["term"]
        title_ = title_.replace("'","").strip()
        definition_ = row["definition"]
        definition_ = definition_.replace("'","").strip()
        definition_ = definition_.replace("\\","").strip()  
        last_update_ = str(row["date_update"])

        concept_query_part = "EG:" + str(id_) + """ skos:Concept EG:""" +  str(id_) + "."
        title_query_part = "EG:" + str(id_) + """ skos:prefLabel '""" +  str(title_) + "'."
        def_query_part = "EG:" + str(id_) + """ skos:definition '""" +  str(definition_) + "'."
        inscheme_query_part = """EG:""" + str(id_) +  """ skos:inScheme 'https://nlp4statref/knowledge/resource/vocabulary/glossary/eurostat-glossary'."""
        publisher_query_part = """EG:""" + str(id_) +  """ dct:publisher 'http://publications.europa.eu/resource/authority/corporate-body/ESTAT'."""


        # Construct the query 
        query = """
            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX EG: <https://nlp4statref/knowledge/resource/terminology/glossary/eurostat-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              """ + concept_query_part + """ 
              """ + inscheme_query_part + """       
              """ + title_query_part + """
              """ + def_query_part + """
              EG:""" + str(id_) + """ dct:modified '""" + str(last_update_) + """'.
        } }
        ;
            """
        #Take out special characters : 
        query.encode('latin-1')
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [395]:
###run the function 
insert_estat_glossary_elements(DB = ESTAT_V1_dat_estat_glossary,
                         method = 'POST') 


            PREFIX dct: <http://purl.org/dc/terms/>
            PREFIX estat: <https://nlp4statref/knowledge/ontology/>
            PREFIX EG: <https://nlp4statref/knowledge/resource/terminology/glossary/eurostat-glossary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              EG:1 skos:Concept EG:1. 
              EG:1 skos:inScheme 'https://nlp4statref/knowledge/resource/vocabulary/glossary/eurostat-glossary'.       
              EG:1 skos:prefLabel '(n,k) rule'.
              EG:1 skos:definition 'A cell is regarded as confidential, if the n largest units contribute more than k % to the cell total, e.g. n=2 and k=85 means that a cell is defined as risky if the two largest units contribute more than 85 % to the cell total. The n and k are given by the statistical authority. In some NSIs the values of n and k are confidential.'.
              EG:1 dct:modified '2019-05-10 00:00:00'.
     

###  6 - Insertion of Code List elements into the KDB. 

In [397]:
def insert_code_list_elements(DB, method): 

    
    for i, row in DB.iterrows():
        id_ = row["id"]
        code_ = row["code"]
        label_ = row["label"]
        label_ = label_.replace("'","").strip()

        concept_query_part = "CD:" + str(id_) + """ skos:Concept CD:""" +  str(id_) + "."
        code_query_part = "CD:" + str(id_) + """ skos:notation '""" +  str(code_) + "'."
        label_query_part = "CD:" + str(id_) + """ skos:prefLabel '""" +  str(label_) + "'."
        


        # Construct the query 
        query = """
            PREFIX CD: <https://nlp4statref/knowledge/resource/authority/code-list/CodeDictionary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              """ + concept_query_part + """ 
              """ + code_query_part + """
              """ + label_query_part + """
        } }
        ;
            """
        #Take out special characters : 
        query.encode('latin-1')
        if (i == 0): print(query)
        sparql.setQuery(query)
        sparql.method = method
        sparql.setReturnFormat(JSON)
        results = sparql.query().response.read()

In [398]:
insert_code_list_elements(ESTAT_V1_dat_code_dicos, method = 'POST')


            PREFIX CD: <https://nlp4statref/knowledge/resource/authority/code-list/CodeDictionary#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            INSERT { GRAPH <https://nlp4statref/knowledge/ontology/> { 
              CD:1 skos:Concept CD:1. 
              CD:1 skos:notation 'TOTAL'.
              CD:1 skos:prefLabel 'Total'.
        } }
        ;
            


EndPointNotFound: EndPointNotFound: it was impossible to connect with the endpoint in that address, check if it is correct. 

Response:
b'<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">\n<html>\n  <head>\n    <title>Error HTTP/1.1 404 File not found</title>\n  </head>\n  <body>\n    <h3>Error HTTP/1.1 404 File not found</h3><pre>\nThe requested URL was not found    URI  = \'/sparql/\'\n  </pre></body></html>\n'

In [None]:
### See added statements 

In [None]:

SelectStatements = """
PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
PREFIX dct: <http://purl.org/dc/terms/>
SELECT * FROM <https://nlp4statref/knowledge/ontology/>
WHERE { estat:1487 ?s ?o

}
"""

#statements_df = sparql_dataframe.get(endpoint, SelectStatements)
#print(statements_df.shape)
#print(statements_df.tail())

sparql.setQuery(SelectStatements)
sparql.method = "POST"
sparql.setReturnFormat(JSON)
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
print(results['o.value'].values[0])

In [None]:
results = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(results)
results