# Extraction for Knowledge Graph creation #

### Load libraries and connect to Virtuoso

In [19]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

In [20]:
def connect_db(DSN, DBA, UID, PWD):
    """Open an ODBC connection through pyodbc and return it with a cursor.

    The four arguments are inserted verbatim into the connection string
    ``DSN=...;DBA=...;UID=...;PWD=...``.

    Parameters
    ----------
    DSN : value of the ``DSN`` connection-string attribute.
    DBA : value of the ``DBA`` connection-string attribute
        (presumably the database name; confirm against the driver setup).
    UID : user name.
    PWD : password.

    Returns
    -------
    tuple
        ``(connection, cursor)`` where ``cursor`` was created from
        ``connection``.
    """
    connection_string = f"DSN={DSN};DBA={DBA};UID={UID};PWD={PWD}"
    db_connection = pyodbc.connect(connection_string)
    return db_connection, db_connection.cursor()


def connect_virtuoso(DSN, UID, PWD):
    """Build a SPARQLWrapper client for the endpoint ``DSN``.

    The client is configured with HTTP digest authentication, the given
    credentials, and the GET request method.

    Parameters
    ----------
    DSN : URL of the SPARQL endpoint (passed to ``SPARQLWrapper``).
    UID : user name.
    PWD : password.

    Returns
    -------
    SPARQLWrapper
        The configured client; no request is sent by this function.
    """
    client = SPARQLWrapper(DSN)
    client.setMethod(GET)
    client.setCredentials(UID, PWD)
    client.setHTTPAuth(DIGEST)
    return client


Provide your own credentials (user name and password) below:

In [21]:
# Credentials are read from the environment so that no secret is stored in the
# notebook (or in its version history). Export VIRTUOSO_USER and
# VIRTUOSO_PASSWORD before starting Jupyter.
# `login` holds the password; the name is kept because later cells use it.
user = os.environ.get("VIRTUOSO_USER")
login = os.environ.get("VIRTUOSO_PASSWORD")
if not user or not login:
    # Fail here with a clear message rather than later with an opaque
    # authentication error from the database driver.
    raise RuntimeError(
        "Missing credentials: set the VIRTUOSO_USER and VIRTUOSO_PASSWORD "
        "environment variables before running this notebook."
    )

In [22]:
# Connection to CDB (ODBC data source, via pyodbc)
connection, cursor = connect_db(DSN='Virtuoso All', DBA='ESTAT', UID=user, PWD=login)

# Connection to the KDB (SPARQL endpoint, via SPARQLWrapper)
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(DSN=endpoint, UID=user, PWD=login)


### Select triples based on specific properties

The `relations` list specifies which relations are taken into account for the knowledge graph.

In [31]:
# Predicates whose triples are extracted for the knowledge graph.
# NOTE: "relatedLegallnformation" (lowercase "l" where "I" would be expected)
# is kept exactly as written: the query on this predicate returns rows, so it
# matches the name stored in the graph. Do not "correct" the spelling here.
relations = [
    f"estat:{local_name}"
    for local_name in (
        "relatedLegallnformation",
        "relatedEditorialContent",
        "relatedStatisticData",
        "sourceInformation",
        "sourceData",
        "dataInformation",
    )
] + ["skos:related"]

In [32]:
# The request method and return format are identical for every query, so they
# are configured once, through the SPARQLWrapper setters.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per predicate; they are concatenated once after the loop.
# This avoids re-copying the accumulated rows at every iteration and does not
# rely on comparing `rel` with `relations[0]` to detect the first pass.
relation_frames = []
for rel in relations:
    # Select every subject/object pair linked by the current predicate.
    RelationsStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + rel + """ ?o .
    }
    """
    print(RelationsStatements)
    sparql.setQuery(RelationsStatements)
    bindings = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(bindings)
    # The query only binds ?s and ?o; record which predicate produced the rows.
    results["p.value"] = rel
    print(results.shape)
    relation_frames.append(results)

# Raises ValueError if `relations` is empty (nothing to concatenate).
results_relations = pd.concat(relation_frames)
print(results_relations.shape)

# Make sure the output directory exists before writing (fresh checkout).
os.makedirs("data", exist_ok=True)
results_relations.to_excel("data/results_relations.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:relatedLegallnformation ?o .
    }
    
(1317, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:relatedEditorialContent ?o .
    }
    
(398, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:relatedStatisticData ?o .
    }
    
(1607, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/ter

### Get the titles of those elements 

The titles are used to name the elements of the graph and also to filter for elements of interest. This second use can be complemented by other fields, such as the description. The selection of elements will eventually be replaced by enrichment from the knowledge database.

In [29]:
# Predicates that carry a human-readable title for an element.
titles = ["skos:prefLabel", "dct:title"]

# The request method and return format are identical for every query, so they
# are configured once, through the SPARQLWrapper setters.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per title predicate; concatenated once after the loop instead
# of growing a frame inside it and testing `tit == titles[0]` for the first pass.
title_frames = []
for tit in titles:
    # Select every subject together with its value for the current predicate.
    TitlesStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + tit + """ ?o
    }
    """
    print(TitlesStatements)
    sparql.setQuery(TitlesStatements)
    bindings = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(bindings)
    print(results.shape)
    title_frames.append(results)

results_titles = pd.concat(title_frames)

# Make sure the output directory exists before writing (fresh checkout).
os.makedirs("data", exist_ok=True)
results_titles.to_excel("data/results_titles.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:prefLabel ?o
    }
    
(1313, 4)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s dct:title ?o
    }
    
(3175, 4)


### Get the resource type of those elements

The types of the elements are used to characterize them in the graph.

In [30]:
# Select every subject together with its dct:type value.
TypesStatements = """
PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
WHERE {
    ?s dct:type ?o
}
"""
print(TypesStatements)
sparql.setQuery(TypesStatements)
# Configure the request through the SPARQLWrapper setters rather than by
# assigning to the `method` attribute directly.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)
type_bindings = sparql.query().convert()['results']['bindings']
results = pd.json_normalize(type_bindings)
print(results.shape)

# Make sure the output directory exists before writing (fresh checkout).
os.makedirs("data", exist_ok=True)
results.to_excel("data/results_types.xlsx")


PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
WHERE {
    ?s dct:type ?o
}

(4458, 4)


Let's switch now to R for formatting