# Extraction for Knowledge Graph creation #

### Load libraries and connect to Virtuoso

In [19]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

In [20]:
def connect_db(DSN, DBA, UID, PWD):
    """Open an ODBC connection and return it together with a new cursor.

    The four arguments are substituted, in order, into the
    ``DSN=...;DBA=...;UID=...;PWD=...`` connection string that is handed
    to ``pyodbc.connect``.

    Returns:
        A ``(connection, cursor)`` tuple.
    """
    conn_string = f'DSN={DSN};DBA={DBA};UID={UID};PWD={PWD}'
    connection = pyodbc.connect(conn_string)
    return connection, connection.cursor()


def connect_virtuoso(DSN, UID, PWD):
    """Build a ``SPARQLWrapper`` client for the endpoint URL ``DSN``.

    The client is configured with HTTP Digest authentication using the
    ``UID``/``PWD`` pair and with GET as its request method.

    Returns:
        The configured ``SPARQLWrapper`` instance.
    """
    client = SPARQLWrapper(DSN)
    # The three setters below are independent of each other.
    client.setMethod(GET)
    client.setCredentials(UID, PWD)
    client.setHTTPAuth(DIGEST)
    return client


Replace the values below with your own login credentials:

In [21]:
# Credentials are read from the environment so that no secret is stored in
# the notebook. Set VIRTUOSO_USER and VIRTUOSO_PASSWORD before running.
# `user` is the account name and `login` is the matching password; both are
# reused for the ODBC and the SPARQL connections.
user = os.environ.get("VIRTUOSO_USER", "pierre")
login = os.environ.get("VIRTUOSO_PASSWORD", "")

In [22]:
# Connection to CDB: ODBC connection and cursor, opened with the shared
# `user` / `login` credentials.
connection, cursor = connect_db(DSN='Virtuoso All', DBA='ESTAT',
                                UID=user, PWD=login)

# Connection to the KDB: SPARQL client bound to the endpoint below, using
# the same credentials.
endpoint = "http://virtuoso-test.kapcode.fr:8890/sparql/"
sparql = connect_virtuoso(DSN=endpoint, UID=user, PWD=login)


### Select triples based on specific properties

In [23]:
# Extract every (subject, object) pair linked by one of the predicates below
# and save the combined result to data/results_relations.xlsx.
#
# NOTE(review): "estat:relatedLegallnformation" is spelled with a lowercase
# "l" where "LegalInformation" would have a capital "I" -- confirm the
# spelling against the ontology before changing it.
relations = ["skos:inScheme",
             "skos:ConceptScheme",
             "skos:hasTopConcept",
             "estat:relatedLegallnformation",
             "estat:relatedEditorialContent",
             "estat:relatedStatisticData",
             "estat:sourceInformation",
             "estat:sourceData",
             "estat:dataInformation"]
# Not used by this cell.
titles = ['skos:prefLabel',
          "dct:title"]

# Request method and return format do not depend on the query, so they are
# configured once instead of on every iteration.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per predicate; concatenated once after the loop.
relation_frames = []
for rel in relations:
    # Query all triples that use the current predicate.
    RelationsStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + rel + """ ?o .
    }
    """
    print(RelationsStatements)
    sparql.setQuery(RelationsStatements)
    results = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(results)
    # The predicate is not a query variable, so record it explicitly.
    results["p.value"] = rel
    # NOTE(review): recorded runs returned exactly 10000 rows for several
    # predicates, which suggests the endpoint caps result sets and that
    # these results may be truncated -- confirm and paginate if needed.
    print(results.shape)
    relation_frames.append(results)

# An empty predicate list yields an empty frame instead of an error.
results_relations = pd.concat(relation_frames) if relation_frames else pd.DataFrame()
print(results_relations.shape)
# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
results_relations.to_excel("data/results_relations.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:inScheme ?o .
    }
    
(10000, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:ConceptScheme ?o .
    }
    
(2, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT * FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:hasTopConcept ?o .
    }
    
(10000, 5)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w

### Get the titles of those elements 

In [24]:
# For each element kind, fetch its title through the paired predicate and
# save the combined result to data/results_titles.xlsx.
titles = {'elements':["skos:Concept", "skos:ConceptScheme", "estat:StatisticalArticle", "estat:StatisticalDataReport", "estat:BackgroundArticle"],
          'title':["skos:prefLabel", "dct:title", "dct:title",  "dct:title", "dct:title"]}
titles = pd.DataFrame(titles)

# Request method and return format do not depend on the query, so they are
# configured once instead of on every iteration.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per element kind; concatenated once after the loop.
title_frames = []
for ele, tit in zip(titles['elements'], titles['title']):
    # NOTE(review): the element IRI is used as a predicate with ?s on both
    # sides ("?s <element> ?s"); "?s a <element>" may have been intended --
    # confirm against how the graph is modelled.
    TitlesStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + ele + """ ?s .
        ?s """ + tit + """ ?o
    }
    """
    print(TitlesStatements)
    sparql.setQuery(TitlesStatements)
    results = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(results)
    title_frames.append(results)
    # NOTE(review): a recorded run returned exactly 10000 rows for
    # skos:Concept, which suggests the endpoint caps result sets -- confirm.
    print(results.shape)

# An empty element list yields an empty frame instead of an error.
title_results = pd.concat(title_frames) if title_frames else pd.DataFrame()
print(title_results.shape)
# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
title_results.to_excel("data/results_titles.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:Concept ?s .
        ?s skos:prefLabel ?o
    }
    
(10000, 4)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:ConceptScheme ?s .
        ?s dct:title ?o
    }
    
(0, 0)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:StatisticalArticle ?s .
        ?s dct:title ?o
    }
    
(641, 4)

    PREFIX estat: <https://nlp4statref/know

### Get the resource type of those elements

In [25]:
# For each element kind, fetch its dct:type value and save the combined
# result to data/results_types.xlsx.
types = {'elements':["skos:Concept", "skos:ConceptScheme", "estat:StatisticalArticle", "estat:StatisticalDataReport", "estat:BackgroundArticle"],
          'type':["dct:type", "dct:type", "dct:type",  "dct:type", "dct:type"]}
types = pd.DataFrame(types)

# Request method and return format do not depend on the query, so they are
# configured once instead of on every iteration.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per element kind; concatenated once after the loop.
type_frames = []
for ele, typ in zip(types['elements'], types['type']):
    # NOTE(review): the element IRI is used as a predicate with ?s on both
    # sides ("?s <element> ?s"); "?s a <element>" may have been intended --
    # confirm against how the graph is modelled.
    TypesStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + ele + """ ?s .
        ?s """ + typ + """ ?o
    }
    """
    print(TypesStatements)
    sparql.setQuery(TypesStatements)
    results = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(results)
    type_frames.append(results)
    print(results.shape)

# An empty element list yields an empty frame instead of an error.
types_results = pd.concat(type_frames) if type_frames else pd.DataFrame()
print(types_results.shape)
# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
types_results.to_excel("data/results_types.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:Concept ?s .
        ?s dct:type ?o
    }
    
(1314, 4)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:ConceptScheme ?s .
        ?s dct:type ?o
    }
    
(0, 0)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:StatisticalArticle ?s .
        ?s dct:type ?o
    }
    
(641, 4)

    PREFIX estat: <https://nlp4statref/knowledge/ont

### Get the resource information of those elements

In [26]:
# For each element kind, fetch its estat:resourceInformation value and save
# the combined result to data/results_infos.xlsx.
infos = {'elements':["skos:Concept", "skos:ConceptScheme", "estat:StatisticalArticle", "estat:StatisticalDataReport", "estat:BackgroundArticle"],
          'info':["estat:resourceInformation", "estat:resourceInformation", "estat:resourceInformation",
                  "estat:resourceInformation", "estat:resourceInformation"]}
infos = pd.DataFrame(infos)

# Request method and return format do not depend on the query, so they are
# configured once instead of on every iteration.
sparql.setMethod(POST)
sparql.setReturnFormat(JSON)

# One DataFrame per element kind; concatenated once after the loop. Using a
# list means this cell depends only on `infos`, not on a table defined by
# another cell.
info_frames = []
for ele, inf in zip(infos['elements'], infos['info']):
    # NOTE(review): the element IRI is used as a predicate with ?s on both
    # sides ("?s <element> ?s"); "?s a <element>" may have been intended --
    # confirm against how the graph is modelled.
    InfosStatements = """
    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s """ + ele + """ ?s .
        ?s """ + inf + """ ?o
    }
    """
    print(InfosStatements)
    sparql.setQuery(InfosStatements)
    results = sparql.query().convert()['results']['bindings']
    results = pd.json_normalize(results)
    info_frames.append(results)
    print(results.shape)

# An empty element list yields an empty frame instead of an error.
infos_results = pd.concat(info_frames) if info_frames else pd.DataFrame()
print(infos_results.shape)
# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
infos_results.to_excel("data/results_infos.xlsx")


    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:Concept ?s .
        ?s estat:resourceInformation ?o
    }
    
(1314, 4)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s skos:ConceptScheme ?s .
        ?s estat:resourceInformation ?o
    }
    
(0, 0)

    PREFIX estat: <https://nlp4statref/knowledge/ontology/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?s ?o FROM <https://nlp4statref/knowledge/ontology/>
    WHERE {
        ?s estat:StatisticalArticle ?s .
        ?s estat:resourceInformation ?o
    }
    
(641, 4)

 