# Explore the Content DB tables

### Load libraries and connect to Virtuoso

In [3]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe
from itertools import chain

In [4]:
def connect_db(DSN, DBA, UID, PWD):
    """Open an ODBC connection and return it together with a cursor.

    Parameters
    ----------
    DSN, DBA, UID, PWD :
        Values placed, in that order, into the
        ``DSN=...;DBA=...;UID=...;PWD=...`` string given to ``pyodbc.connect``.

    Returns
    -------
    tuple
        ``(connection, cursor)``, the cursor being created from the connection.
    """
    # The values are interpolated as-is; nothing is escaped or validated here.
    # NOTE(review): some text outputs further down in this notebook show
    # mojibake (e.g. 'europÃ©en'); the text decoding used on this connection
    # may be worth checking -- TODO confirm where the mis-encoding comes from.
    settings = (('DSN', DSN), ('DBA', DBA), ('UID', UID), ('PWD', PWD))
    odbc_string = ';'.join('{}={}'.format(key, value) for key, value in settings)

    db_connection = pyodbc.connect(odbc_string)

    return db_connection, db_connection.cursor()


def connect_virtuoso(DSN, UID, PWD):
    """Create a ``SPARQLWrapper`` client for the endpoint ``DSN``.

    The client is set up with HTTP digest authentication using the
    ``UID``/``PWD`` credentials and with GET as its request method.

    Returns
    -------
    SPARQLWrapper
        The configured client.
    """
    client = SPARQLWrapper(DSN)
    client.setMethod(GET)
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)
    return client


Provide your own credentials:

In [5]:
# Credentials are read from the environment so that no secret is stored in
# (or committed with) the notebook. Set VIRTUOSO_UID / VIRTUOSO_PWD before
# starting the kernel; the user name falls back to "dba" when unset.
user = os.environ.get("VIRTUOSO_UID", "dba")
login = os.environ.get("VIRTUOSO_PWD", "")
if not login:
    # Do not fail here: warn so the reader understands a later authentication error.
    logging.warning("VIRTUOSO_PWD is not set; the database connections will not authenticate.")

In [6]:
# CDB: ODBC connection through the 'Virtuoso All' DSN, on the 'ESTAT' database.
connection, cursor = connect_db(DSN='Virtuoso All', DBA='ESTAT', UID=user, PWD=login)

# KDB: SPARQL client bound to the endpoint below, using the same credentials.
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(DSN=endpoint, UID=user, PWD=login)


### Define content selection functions

In [7]:
def select_query(columns, table, conditions=None):
    """Build a ``SELECT`` statement as a string.

    Parameters
    ----------
    columns : str or iterable of str
        Column list placed after ``SELECT``. A string is used verbatim
        (e.g. ``'*'`` or ``'id, title'``); any other iterable is joined
        with ``', '``.
    table : str
        Table expression placed after ``FROM``.
    conditions : str, optional
        Expression placed after ``WHERE``. ``None``, ``''`` or a
        whitespace-only string means "no filter": the clause is omitted.

    Returns
    -------
    str
        The query text, one clause per line.

    Notes
    -----
    The arguments are interpolated into the SQL text without any escaping,
    so they must never be built from untrusted input.
    """
    if not isinstance(columns, str):
        columns = ', '.join(columns)

    clauses = ['SELECT {}'.format(columns), 'FROM {}'.format(table)]

    # A blank condition would otherwise yield a dangling, invalid "WHERE".
    where = str(conditions).strip() if conditions else ''
    if where:
        clauses.append('WHERE {}'.format(where))

    return '\n'.join(clauses)

#### Get all table names from the CDB

In [8]:
# Read the ESTAT information schema, then show the name of every table it lists.
ESTAT_V1_tables_names = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.information_schema.tables', conditions=''),
    con=connection,
)
ESTAT_V1_tables_names['TABLE_NAME']

0                     dat_article
1           dat_article_paragraph
2         dat_article_shared_link
3                dat_code_dataset
4                   dat_code_dico
5                  dat_collection
6         dat_collection_resource
7                     dat_dataset
8              dat_estat_glossary
9     dat_estatg_measurement_unit
10           dat_estatg_stat_unit
11               dat_further_info
12                   dat_glossary
13                  dat_link_info
14           dat_paragraph_figure
15               dat_redirections
16           dat_related_concepts
17                   dat_resource
18          dat_resource_altlabel
19                    dat_sources
20           dat_statistical_data
21           mod_article_division
22               mod_concept_type
23                mod_dictionnary
24                   mod_infotype
25               mod_lexical_type
26           mod_measurement_unit
27             mod_ramon_category
28       mod_resource_information
29            

    "ESTAT.V1.mod_article_division" # types of article sections
    "ESTAT.V1.mod_concept_type" # types of concepts (term or var)
    "ESTAT.V1.mod_dictionnary" # dictionary codes

    "ESTAT.V1.mod_infotype" # resource type or info
    "ESTAT.V1.mod_lexical_type" # types of lexicons
    "ESTAT.V1.mod_measurement_unit" # types of measurement units
    "ESTAT.V1.mod_ramon_category" # RAMON nomenclatures
    "ESTAT.V1.mod_resource_information" # resource info (DEPRECATED)
    "ESTAT.V1.mod_resource_type" # resource type (DEPRECATED)
    "ESTAT.V1.mod_statistical_unit" # types of statistical units (Estat glossary)
    "ESTAT.V1.mod_status" # current or deprecated (status in resources)

    # Topic-related tables:
    "ESTAT.V1.tm_articles_to_topics" #
    "ESTAT.V1.tm_terms" #
    "ESTAT.V1.tm_topics" #
    "ESTAT.V1.tm_values" #

    "ESTAT.V1.dat_article" # SA articles
    "ESTAT.V1.dat_article_paragraph" # paragraphs inside articles
    "ESTAT.V1.dat_article_shared_link" # links inside articles
    "ESTAT.V1.dat_code_dataset" # datasets linked to code list elements
    "ESTAT.V1.dat_code_dico" # everything contained in the dictionary
    "ESTAT.V1.dat_collection" # describes collections of resource types (editorial, legal, ...)
    "ESTAT.V1.dat_collection_resource" # links between collections and resources
    "ESTAT.V1.dat_dataset" # dataset labels
    "ESTAT.V1.dat_estat_glossary" # Estat glossary
    "ESTAT.V1.dat_estatg_measurement_unit" # measurement units for each element
    "ESTAT.V1.dat_estatg_stat_unit" # statistical units for each element
    "ESTAT.V1.dat_further_info" # further links of SEG
    "ESTAT.V1.dat_glossary" # SEG elements
    "ESTAT.V1.dat_link_info" # URLs and titles
    "ESTAT.V1.dat_paragraph_figure" # figures inside article paragraphs
    "ESTAT.V1.dat_redirections" # redirections from SEG elements
    "ESTAT.V1.dat_related_concepts" # related SEG concepts
    "ESTAT.V1.dat_resource" # table of resources
    "ESTAT.V1.dat_resource_altlabel" # other labels for the resources
    "ESTAT.V1.dat_sources" # sources of the SEG elements
    "ESTAT.V1.dat_statistical_data" # statistical data of the SEG elements

##### Load the resource info table

In [9]:
# Read the whole resource table and print its shape.
ESTAT_V1_dat_resource = pd.read_sql(
    sql=select_query(columns='*', table=' ESTAT.V1.dat_resource', conditions=''),
    con=connection,
)
print(ESTAT_V1_dat_resource.shape)

(55, 14)


In [10]:
# Display the resource table read in the previous cell.
ESTAT_V1_dat_resource

Unnamed: 0,id,label_en,label_fr,label_de,uri,date_created,date_modified,status_id,date_deprecated,definition,editorial_note,change_note,scope_note,infotype_id
0,0,Other,Autre,,,NaT,,,,,,,,
1,1,Eurostat,Eurostat,,https://nlp4statref/knowledge/resource/authori...,2021-06-01,,1.0,,Eurostat resource.,,18/05/2021 - Creation,,1.0
2,2,European Agency for Safety and Health at Work,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
3,3,European Asylum support office,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
4,4,European Central Bank,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
5,5,European Centre for the Development of vocatio...,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
6,6,European Council and council of the European U...,Conseil europÃ©en et Conseil de l'Union europÃ...,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, Acronyms, Abbreviations, Syntact...",29/06/2021 - Creation,,1.0
7,7,European Court of Auditors,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
8,8,European Environment Agency,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0
9,9,European Environment Information and Observati...,,,https://nlp4statref/knowledge/resource/authori...,2021-06-29,,1.0,,,"ToDo : Notes, prefLabel @fr, Acronyms, Abbrevi...",29/06/2021 - Creation,,1.0


##### Load the Link info table 

In [11]:
# Read the whole link info table and print its shape.
ESTAT_V1_dat_link_info = pd.read_sql(
    sql=select_query(columns='*', table=' ESTAT.V1.dat_link_info', conditions=''),
    con=connection,
)
print(ESTAT_V1_dat_link_info.shape)


(10556, 5)


In [12]:
# Preview the first rows of the link info table.
ESTAT_V1_dat_link_info.head()

Unnamed: 0,id,title,url,resource_information_id,resource_type_id
0,1,Accident at work,https://ec.europa.eu/eurostat/statistics-expla...,1,39
1,2,"Accidents at work (ESAW, 2008 onwards)",http://ec.europa.eu/eurostat/cache/metadata/en...,1,46
2,3,Commission Regulation (EU) No 349/2011,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,16,37
3,4,European Statistics on Accidents at Work (E...,http://ec.europa.eu/eurostat/product?code=KS-R...,1,0
4,5,Fatal accident at work,https://ec.europa.eu/eurostat/statistics-expla...,1,39


##### Load Glossary related tables

In [13]:
# Each statement below reads one glossary-related table in full (no WHERE filter).
ESTAT_V1_dat_glossary = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_glossary', conditions=''),
    con=connection,
)

ESTAT_V1_dat_related_concepts = pd.read_sql(
    sql=select_query(columns='*', table="ESTAT.V1.dat_related_concepts", conditions=''),
    con=connection,
)

ESTAT_V1_dat_sources = pd.read_sql(
    sql=select_query(columns='*', table="ESTAT.V1.dat_sources", conditions=''),
    con=connection,
)

ESTAT_V1_dat_redirections = pd.read_sql(
    sql=select_query(columns='*', table="ESTAT.V1.dat_redirections", conditions=''),
    con=connection,
)

ESTAT_V1_dat_further_info = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_further_info', conditions=''),
    con=connection,
)

ESTAT_V1_dat_statistical_data = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_statistical_data', conditions=''),
    con=connection,
)

##### Load Statistical Articles related tables

In [14]:
# Read the whole table of links shared inside articles.
ESTAT_V1_dat_article_shared_link = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_article_shared_link', conditions=''),
    con=connection,
)


In [15]:
# Read the whole article paragraph table.
ESTAT_V1_dat_article_paragraph = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_article_paragraph', conditions=''),
    con=connection,
)

In [16]:
# Articles whose background_article flag is 0.
ESTAT_V1_dat_article = pd.read_sql(
    sql=select_query(
        columns='*',
        table='ESTAT.V1.dat_article',
        conditions='background_article = 0',
    ),
    con=connection,
)

# Shared links of those articles joined to their link info,
# restricted to article_division_id = 1.
ESTAT_V1_dat_article_core_data = pd.read_sql(
    sql=(
        'SELECT article_id, link_id, url, data_sources '
        'FROM ESTAT.V1.dat_article_shared_link '
        'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
        'INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id '
        'WHERE background_article = 0 AND article_division_id=1'
    ),
    con=connection,
)

##### Load Statistical Background Articles related tables

In [17]:
# Articles whose background_article flag is 1.
ESTAT_V1_dat_background_article = pd.read_sql(
    sql=select_query(
        columns='*',
        table='ESTAT.V1.dat_article',
        conditions='background_article = 1',
    ),
    con=connection,
)

# Shared links of those articles joined to their link info,
# restricted to article_division_id = 1.
ESTAT_V1_dat_background_article_core_data = pd.read_sql(
    sql=(
        'SELECT article_id, link_id, url, data_sources '
        'FROM ESTAT.V1.dat_article_shared_link '
        'INNER JOIN ESTAT.V1.dat_article ON ESTAT.V1.dat_article.id = article_id '
        'INNER JOIN ESTAT.V1.dat_link_info ON link_id = ESTAT.V1.dat_link_info.id '
        'WHERE background_article = 1 AND article_division_id=1'
    ),
    con=connection,
)


##### Load Eurostat Glossary related tables 

In [18]:
# Read the whole Eurostat glossary table.
ESTAT_V1_dat_estat_glossary = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_estat_glossary', conditions=''),
    con=connection,
)

##### Load Code List related tables

In [19]:
# Read the whole code dictionary table.
ESTAT_V1_dat_code_dicos = pd.read_sql(
    sql=select_query(columns='*', table='ESTAT.V1.dat_code_dico', conditions=''),
    con=connection,
)


### Explore

In [20]:
# Count the links of each resource type: join every link to the resource whose
# id equals its resource_type_id, then tally the English labels.
temp = pd.merge(
    ESTAT_V1_dat_link_info,
    ESTAT_V1_dat_resource,
    left_on='resource_type_id',
    right_on='id',
)
resource_type_count = temp['label_en'].value_counts()
resource_type_count

Other                            5942
Glossary concept                 2049
Statistical article               642
Statistics Reference Metadata     578
European Union Law                560
Statistics Database               257
Background article                250
News                              128
Miscellaneous                      61
Publication                        43
Infography                         29
Article                             8
Legal context                       3
Statistics Explained Data           3
Glossary home page                  2
Statistics Table                    1
Name: label_en, dtype: int64

In [21]:
ESTAT_V1_dat_glossary  # display the glossary table (not just a head() extract)

Unnamed: 0,id,definition,redirection,original_title,homepage,last_update
0,1,An accident at work in the framework ...,0,,0,2021-03-10 11:13:00
1,5,A fatal accident at work refers to an...,0,,0,2019-03-06 12:08:00
2,6,A non-fatal accident at work is...,0,,0,2018-09-12 15:42:00
3,8,Aggregate demand is the total amount of ...,0,,0,2017-07-28 11:15:00
4,9,The goods and services account shows ...,0,,0,2017-08-03 16:48:00
...,...,...,...,...,...,...
1309,2319,"Actual individual consumption , abbrevia...",0,,0,2017-07-28 11:21:00
1310,2321,Activity rate is the percentage of a...,0,,0,2020-11-05 15:00:00
1311,2322,The activation policies are policies ...,0,,0,2018-08-24 17:02:00
1312,2324,"<Brief user-oriented definition, one or a fe...",0,,0,2020-08-05 16:44:00


In [22]:
ESTAT_V1_dat_code_dicos.shape  # (rows, columns) of the code dictionary table

(241367, 4)

In [23]:
ESTAT_V1_dat_estat_glossary.head()  # preview the first rows only

Unnamed: 0,id,order_id,code_id,term,ramon_cat_id,concept_type_id,definition,context,remark,date_create,date_update
0,1,753479,12789,"(n,k) rule",33,1,"A cell is regarded as confidential, if the n l...",,,2010-05-03,2019-05-10
1,2,753489,12799,"(p,q) rule",33,1,It is assumed that out of publicly available i...,,,2010-05-03,2019-05-10
2,3,1744196,19247,Âµ-ARGUS,33,1,Software that creates safe micro-data files.,,,2017-06-15,2019-05-10
3,4,336744,5545,Abandoned wine-growing area,12,1,Total area of land under vines which is no lon...,,,2005-05-03,2019-05-10
4,5,1854753,20003,Abduction by a legal guardian,151,1,Abduction of a minor by a legal guardian who i...,,,2017-12-05,2019-05-10
