In [43]:
#!python -m spacy download pt_core_news_lg
#!python -m spacy download en_core_web_lg
#%pip install -U scikit-learn

In [1]:
import os
import io
import stardog
import pandas as pd
import spacy
from spacy import displacy
from sklearn.cluster import DBSCAN
import numpy as np
import requests
from unidecode import unidecode
import re

### Specify Stardog connection details

In [2]:
# Stardog connection settings, read from environment variables so that no
# credentials are stored in the notebook. A variable that is not set yields None.
STARDOG_ENDPOINT = os.environ.get("STARDOG_ENDPOINT")
STARDOG_USERNAME = os.environ.get("STARDOG_USERNAME")
STARDOG_PASSWORD = os.environ.get("STARDOG_PASSWORD")

# Keyword arguments later unpacked into stardog.Connection.
connection_details = dict(
    endpoint=STARDOG_ENDPOINT,
    username=STARDOG_USERNAME,
    password=STARDOG_PASSWORD,
)

In [3]:
# Open the connection to the Stardog database that is queried for theses below.
database_name = "IndigenousSlavery"
conn = stardog.Connection(
    database_name,
    endpoint=connection_details["endpoint"],
    username=connection_details["username"],
    password=connection_details["password"],
)

In [4]:
# SPARQL query that fetches each thesis together with its abstract and the
# abstract's language tag. No language filter is active; to restrict the
# results, add one of these lines inside the WHERE block:
#   FILTER (langMatches(lang(?abstract),"pt"))     -> Portuguese abstracts only
#   FILTER (langMatches(lang(?abstract),"en"))     -> English abstracts only
#   FILTER (lang(?abstract) NOT IN("pt", "en"))    -> any other language
query = """
SELECT ?thesis ?abstract (lang(?abstract) AS ?lang) WHERE {
  ?thesis a <http://purl.org/ontology/bibo/Thesis>.
  ?thesis <http://purl.org/ontology/bibo/abstract> ?abstract.
  
}
LIMIT 20
"""

# Request the result as CSV and parse it from an in-memory buffer.
csv_results = conn.select(query, content_type="text/csv")
csv_buffer = io.BytesIO(csv_results)
thesis_abstract = pd.read_csv(csv_buffer)
thesis_abstract


Unnamed: 0,thesis,abstract,lang
0,tag:stardog:api:com_a_condicao_de_servir_gratu...,A presente pesquisa parte da emancipação de es...,pt
1,tag:stardog:api:capitaes_e_mateus__relacoes_so...,Orientador: Robert Wayne Andrew Slenes,de
2,tag:stardog:api:liberdade_sob_tensao_negros_e_...,Post-abolition can be conceptualized as a hist...,en
3,tag:stardog:api:o_colegio_sao_benedito_e_a_esc...,In the post-abolition period and in the First ...,en
4,tag:stardog:api:d_pedro_ii_e_a_emancipacao_dos...,When you mention the topic abolition of slaver...,en
5,tag:stardog:api:identidades_na_perspectiva_da_...,This study presents the results of an ethnogra...,en
6,tag:stardog:api:dos_lacos_entre_jose_e_innocen...,In order to comprehend the complex process of ...,en
7,tag:stardog:api:o_processo_de_emancipacao_da_e...,This paper aims to analyze the process of eman...,en
8,tag:stardog:api:animais_como_pessoas_a_abordag...,The present study addresses the Abolitionist A...,en
9,tag:stardog:api:clubes_negros_de_futebol_em_sa...,"In this paper, through the investigation of tw...",en


In [5]:
# Extract the named entities (people, places, organisations) of every abstract.

# spaCy pipelines for English and Portuguese.
nlp_en = spacy.load("en_core_web_lg")
nlp_pt = spacy.load("pt_core_news_lg")

# For each supported language: the pipeline to run and how its NER labels map
# onto the shared 'PER' / 'LOC' / 'ORG' buckets (the two models tag people and
# places with different label names).
ner_by_lang = {
    'pt': (nlp_pt, {'PER': 'PER', 'LOC': 'LOC', 'ORG': 'ORG'}),
    'en': (nlp_en, {'PERSON': 'PER', 'GPE': 'LOC', 'ORG': 'ORG'}),
}

# Thesis identifier (URI without the Stardog prefix) -> entities grouped by type.
instances_dic = {}

for n in range(len(thesis_abstract)):
    lang = thesis_abstract['lang'][n]

    # Abstracts in any other language are left out of the dictionary.
    if lang not in ner_by_lang:
        continue

    nlp, label_map = ner_by_lang[lang]
    doc = nlp(thesis_abstract['abstract'][n])

    found = {'PER': [], 'LOC': [], 'ORG': []}
    for ent in doc.ents:
        bucket = label_map.get(ent.label_)
        if bucket is not None:
            found[bucket].append(ent)

    thesis_id = thesis_abstract['thesis'][n].replace('tag:stardog:api:', '')
    instances_dic[thesis_id] = found

In [6]:
def entidades_consolidadas(list_ent, th):
    """Merge entity mentions that refer to the same instance.

    The mentions are clustered on their vectors with DBSCAN using cosine
    distance and ``eps = 1 - th``; ``min_samples=1`` means every mention ends
    up in some cluster. Each cluster is then represented by its most frequent
    surface form.

    Parameters
    ----------
    list_ent : list
        Entity objects exposing ``.vector`` and ``.text`` (the callers in this
        notebook pass spaCy entity spans).
    th : float
        Similarity threshold; a larger value yields tighter clusters.

    Returns
    -------
    dict or None
        ``None`` when ``list_ent`` is empty or ``None``. Otherwise a dict
        mapping the canonical label of each cluster to
        ``{'labels': [distinct surface forms], 'vector': mean vector}``.
        Ties for the canonical label go to the form seen first, and
        ``'labels'`` keeps first-seen order, so the result is reproducible
        across kernel restarts.
    """
    if not list_ent:
        return None

    # One row per mention.
    list_vectors = np.array([ent.vector for ent in list_ent])

    # Cluster the mentions according to the threshold th.
    clustering = DBSCAN(eps=1 - th, min_samples=1, metric='cosine').fit(list_vectors)

    # Group the mentions by cluster id, keeping their original order.
    clusters = {}
    for cluster_id, ent in zip(clustering.labels_, list_ent):
        clusters.setdefault(cluster_id, []).append(ent)

    ents_dic = {}
    for members in clusters.values():
        # Count surface forms; dict insertion order records which came first.
        counts = {}
        for ent in members:
            counts[ent.text] = counts.get(ent.text, 0) + 1

        # max() returns the first maximal key, so ties resolve to the first-seen form.
        canonical = max(counts, key=counts.get)
        ents_dic[canonical] = {
            'labels': list(counts),
            'vector': np.average([ent.vector for ent in members], axis=0),
        }

    return ents_dic



In [7]:
# Replace each thesis's raw entity lists with consolidated entities
# (duplicates merged, one vector per entity).
# Similarity threshold passed to entidades_consolidadas.
th = 0.75

for key in instances_dic:
    for type_ent in instances_dic[key]:
        list_ent = instances_dic[key][type_ent]
        # Only raw lists are processed. After a first run the values are
        # already dicts (or None), so re-running this cell leaves them
        # untouched instead of failing inside entidades_consolidadas.
        if isinstance(list_ent, list):
            instances_dic[key][type_ent] = entidades_consolidadas(list_ent, th)


### Coletando dados da DBpedia

In [8]:
# Gather every surface form of the entities extracted from the abstracts,
# one collection per entity type.
labels_by_type = {'PER': set(), 'LOC': set(), 'ORG': set()}

for key in instances_dic:
    for type_ent in instances_dic[key]:

        list_ent = instances_dic[key][type_ent]
        # A type with no entities is stored as None; other types are ignored.
        if list_ent is None or type_ent not in labels_by_type:
            continue
        for ent in list_ent:
            labels_by_type[type_ent].update(list_ent[ent]['labels'])

# Sort so that the order is the same on every run; plain set order changes
# between kernel restarts, which would change what slices such as
# people_labels[:50] select.
people_labels = sorted(labels_by_type['PER'])
local_labels = sorted(labels_by_type['LOC'])
org_labels = sorted(labels_by_type['ORG'])

In [9]:
# Number of distinct labels per type, in the order: organisations, people, places.
len(org_labels), len(people_labels), len(local_labels)

(45, 72, 28)

In [10]:
def _dbpedia_query(tokens, classe):
    """Build the full-text SPARQL query that searches DBpedia for all ``tokens``."""
    ent_vector = ", ".join("'" + tok + "'" for tok in tokens)
    ent_contains = " AND ".join(tokens)

    # Natural-language search whose results are ranked by the score ?sc.
    return """ 
            define input:ifp "IFP_OFF"  
            select ?s1 as ?c1, (bif:search_excerpt (bif:vector (""" + ent_vector + """), ?o1)) as ?c2, ?sc, ?rank, ?g, ?abstract 
            where 
            { 
            select ?s1, (?sc * 3e-1) as ?sc, ?o1, (sql:rnk_scale (<LONG::IRI_RANK> (?s1))) as ?rank, ?g, ?abstract 
            
            where  
            { 
                quad map virtrdf:DefaultQuadMap 
                { 
                graph ?g 
                { 
                    ?s1 ?s1textp ?o1 .
                    ?o1 bif:contains  '(""" + ent_contains + """)'  option (score ?sc)  .
                    ?s1 a """ + classe + """.
                    ?s1 dbo:abstract ?abstract.
                    FILTER (lang(?abstract) IN("pt", "en"))
                }
                } 
            }

            order by desc (?sc * 3e-1 + sql:rnk_scale (<LONG::IRI_RANK> (?s1)))  limit 5  offset 0 
            } 
            """


def DBPEDIA_search(labels, classe, sc_th, timeout=60):
    """Search DBpedia for each label and return the matches above a score threshold.

    Parameters
    ----------
    labels : iterable of str
        Raw entity labels to look up.
    classe : str
        Class the matched resource must belong to, written as it should appear
        in the query (e.g. ``'dbo:Person'``).
    sc_th : float
        Only results whose search score is strictly greater than this value
        are returned.
    timeout : float, optional
        Seconds to wait for each HTTP request.

    Returns
    -------
    list of tuple
        ``(raw label, score, resource URI, abstract)`` tuples; the score is
        the string value returned by the endpoint. Duplicates for the same
        label are dropped and the endpoint's result order is kept.

    Raises
    ------
    requests.HTTPError
        When the endpoint answers with an error status.
    """
    # DBpedia SPARQL endpoint.
    url = 'http://dbpedia.org/sparql'

    dbpedia_ent = []

    for ent_bruto in labels:
        # Strip accents and anything that is not alphanumeric, underscore or
        # space, then split on whitespace so that repeated, leading or trailing
        # spaces never produce empty search terms (which would give a
        # malformed "AND  AND" expression).
        tokens = re.sub('[^a-zA-Z0-9_ ]', '', unidecode(ent_bruto)).split()
        if not tokens:
            # Nothing searchable is left in this label.
            continue

        response = requests.get(
            url,
            params={'format': 'json', 'query': _dbpedia_query(tokens, classe)},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()

        # Keep only the results whose score exceeds the threshold.
        bindings = [
            (ent_bruto, row['sc']['value'], row['c1']['value'], row['abstract']['value'])
            for row in data['results']['bindings']
            if float(row['sc']['value']) > sc_th
        ]

        # dict.fromkeys removes duplicates while preserving order.
        dbpedia_ent.extend(dict.fromkeys(bindings))

    return dbpedia_ent



In [34]:
# Look up the first 50 person labels on DBpedia, keeping matches with a score above 5.0.
# The commented alternatives run the same search for places or organisations.
ex_DB = DBPEDIA_search(labels=people_labels[:50], classe='dbo:Person', sc_th=5.0)
#ex_DB = DBPEDIA_search(local_labels[:5], 'dbo:Place', 5.0)
#ex_DB = DBPEDIA_search(org_labels[:5], 'dbo:Organisation', 5.0) 



In [50]:
# DBPEDIA_search returns a flat list of (label, score, URI, abstract) tuples,
# so the list itself is sliced; ex_DB[0] would be a single 4-tuple.
ex_DB[35:]

[('Pedro II',
  '11.4',
  'http://dbpedia.org/resource/Pedro_II_of_Brazil',
  'Dom Pedro II (2 December 1825 – 5 December 1891), nicknamed "the Magnanimous" (Portuguese: O Magnânimo), was the second and last monarch of the Empire of Brazil, reigning for over 58 years. He was born in Rio de Janeiro, the seventh child of Emperor Dom Pedro I of Brazil and Empress Dona Maria Leopoldina and thus a member of the Brazilian branch of the House of Braganza. His father\'s abrupt abdication and departure to Europe in 1831 left the five-year-old as emperor and led to a grim and lonely childhood and adolescence, obliged to spend his time studying in preparation for rule. His experiences with court intrigues and political disputes during this period greatly affected his later character; he grew into a man with a strong sense of duty and devotion toward his country and his people, yet increasingly resentful of his role as monarch. Pedro II inherited an empire on the verge of disintegration, but he tu

In [63]:

#org_labels[:15]

{'head': {'link': [], 'vars': ['c1', 'c2', 'sc', 'rank', 'g', 'abstract']},
 'results': {'distinct': False, 'ordered': True, 'bindings': []}}

In [10]:
# Exact-match lookup: selects the resource typed dbo:Person whose Portuguese
# rdfs:label is 'José do Patrocínio', together with its Portuguese abstract.
query = """     
SELECT ?person ?abstract
WHERE{
?person rdfs:label 'José do Patrocínio'@pt.
?person a dbo:Person.
?person dbo:abstract ?abstract.
FILTER (langMatches(lang(?abstract),"pt"))
}
"""

In [134]:
from unidecode import unidecode

# Worked example of the label preparation used for the full-text search:
# strip the accents, then derive the two forms of the search terms.
ex = unidecode("Barão de Cotegipe")
ex_words = ex.split(" ")
# Quoted, comma-separated words, e.g. 'Barao', 'de', 'Cotegipe'
ex_vector = ", ".join("'" + word + "'" for word in ex_words)
# Words joined with AND, e.g. Barao AND de AND Cotegipe
ex_contains = " AND ".join(ex_words)
unidecode(ex)

'Barao de Cotegipe'

In [130]:
# Full-text search query for the example label prepared in ex_vector / ex_contains.
# The inner select matches resources typed dbo:Person that have a literal
# containing all the words and scales the text score by 0.3; results are
# ordered by that scaled score plus the IRI rank and limited to 10 rows.
query = """ 
 define input:ifp "IFP_OFF"  
 select ?s1 as ?c1, (bif:search_excerpt (bif:vector (""" + ex_vector + """), ?o1)) as ?c2, ?sc, ?rank, ?g, ?abstract 
 where 
 { 
 select ?s1, (?sc * 3e-1) as ?sc, ?o1, (sql:rnk_scale (<LONG::IRI_RANK> (?s1))) as ?rank, ?g, ?abstract 
 
 where  
  { 
    quad map virtrdf:DefaultQuadMap 
    { 
      graph ?g 
      { 
        ?s1 ?s1textp ?o1 .
        ?o1 bif:contains  '(""" + ex_contains + """)'  option (score ?sc)  .
        ?s1 a dbo:Person.
        ?s1 dbo:abstract ?abstract.
      }
     }
    
  }

 order by desc (?sc * 3e-1 + sql:rnk_scale (<LONG::IRI_RANK> (?s1)))  limit 10  offset 0 
 } 


"""

#?s1 a dbo:Person.
#?s1 dbo:abstract ?abstract.

In [131]:
import requests

# Send the query to the public DBpedia SPARQL endpoint and decode the JSON reply.
url = 'http://dbpedia.org/sparql'

# The timeout stops the cell from hanging indefinitely on an unresponsive endpoint.
r = requests.get(url, params={'format': 'json', 'query': query}, timeout=60)
# Surface HTTP errors (e.g. a rejected query) instead of failing later while parsing JSON.
r.raise_for_status()
data = r.json()

In [132]:
# Show the result rows of the last request; an empty list means the query matched nothing.
data['results']['bindings']
#data['results']['bindings'][0]['person']['value']
#data['results']['bindings'][0]['abstract']['value']

[]