In [62]:
#!python -m spacy download pt_core_news_lg
#%pip install -U scikit-learn

In [163]:
import os
import io
import stardog
import pandas as pd
import spacy
from spacy import displacy
from sklearn.cluster import DBSCAN
import numpy as np
import requests
from unidecode import unidecode
import re

### Specify Stardog connection details

In [136]:
# Stardog variables
STARDOG_ENDPOINT = os.getenv('STARDOG_ENDPOINT')
STARDOG_USERNAME = os.getenv("STARDOG_USERNAME")
STARDOG_PASSWORD = os.getenv("STARDOG_PASSWORD")

connection_details = {
    'endpoint': STARDOG_ENDPOINT,
    'username': STARDOG_USERNAME,
    'password': STARDOG_PASSWORD
}

In [137]:
database_name = 'IndigenousSlavery'
conn = stardog.Connection(database_name, **connection_details)

In [138]:
query = """
SELECT ?thesis ?abstract (lang(?abstract) AS ?lang) WHERE {
  ?thesis a <http://purl.org/ontology/bibo/Thesis>.
  ?thesis <http://purl.org/ontology/bibo/abstract> ?abstract.
  FILTER (langMatches(lang(?abstract),"pt"))
}
LIMIT 100
"""
#  FILTER (langMatches(lang(?abstract),"en"))
# FILTER (lang(?abstract) NOT IN("pt", "en"))

csv_results = conn.select(query, content_type='text/csv')
thesis_abstract = pd.read_csv(io.BytesIO(csv_results))
thesis_abstract


Unnamed: 0,thesis,abstract,lang
0,tag:stardog:api:com_a_condicao_de_servir_gratu...,A presente pesquisa parte da emancipação de es...,pt
1,tag:stardog:api:educando_libertos_escravizados...,Esta tese parte da trajetória de Cincinato Fra...,pt
2,tag:stardog:api:marambaia_historia_memoria_e_d...,A presente tese se desdobra em dois objetivos ...,pt
3,tag:stardog:api:mulheres_negras_politicas_iden...,O objetivo do presente estudo é conhecer as in...,pt
4,tag:stardog:api:os_sentidos_da_liberdade_traje...,Esta tese acompanhará trajetórias individuais ...,pt
...,...,...,...
95,tag:stardog:api:reforma_agraria_e_preco_justo_...,Dissertação (mestrado)—Universidade de Brasíli...,pt
96,tag:stardog:api:o_trabalho_domestico_e_a_segun...,Dissertação (mestrado)—Universidade de Brasíli...,pt
97,tag:stardog:api:outras_faces_do_abolicionismo_...,"Tese (doutorado)—Universidade de Brasília, Ins...",pt
98,tag:stardog:api:para_depois_da_escravidao_os_p...,O objetivo desta pesquisa é realizar uma refle...,pt


In [139]:
#nlp_en = spacy.load("en_core_web_trf")
nlp_pt = spacy.load("pt_core_news_lg")

instances_dic = {}

for n in range(len(thesis_abstract)):

    doc_pt = nlp_pt(thesis_abstract['abstract'][n])
    persons = []
    gpes = []
    orgs = []
    for ent in doc_pt.ents:
        if ent.label_ == "PER": #"PERSON":
            persons.append(ent)
        if ent.label_ == "LOC": #"GPE":
            gpes.append(ent)
        if ent.label_ == "ORG":
            orgs.append(ent)

    instances_dic[thesis_abstract['thesis'][n].replace('tag:stardog:api:','')] = {'PER':persons, 'LOC':gpes, 'ORG':orgs}
    
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

#for ent in doc_pt.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [140]:
#total = 0
#total_set = 0
#for i in instances_dic:

#    if instances_dic[i]['PER'] != []:
#        total = total + len(instances_dic[i]['PER'])
#        total_set = total_set + len(set(instances_dic[i]['PER']))
#        print (instances_dic[i]['PER'])
#print (total, total_set)


In [141]:
# Para uma lista de entidades verificar se são a mesma instância e qual o termo mais comum para representá-las

def entidades_consolidadas(list_ent, th):
    
    if list_ent == []:
        return None

    # lista com os vetores das entidades
    list_vectors = np.array([i.vector for i in list_ent])

    #clusterizando os vetores de acordo com threshold th
    clustering = DBSCAN(eps=1-th, min_samples=1, metric='cosine').fit(list_vectors)

    #processnado os clusters 
    ents_dic ={}

    for i in set(clustering.labels_):
        clus_index = np.where(clustering.labels_ == i)[0]
        clus_ent = np.take(np.array(list_ent, dtype="object"), clus_index)

        label = []
        vec = []
        for ent in clus_ent:
            vec.append(ent.vector)
            label.append(ent.text)

        ents_dic[max(set(label), key=label.count)] = {'labels': list(set(label)), 'vector': np.average(vec, axis=0)}

    return ents_dic



In [142]:
th = 0.75
#list_ent = instances_dic['vivencias_e_experiencias_associativas_negras_em_bagers_no_posabolicao_imprensa_carnaval_e_clubes_sociais_negros_na_fronteira_sul_do_brasil_19131980']['LOC']
#list_ent = instances_dic['com_a_condicao_de_servir_gratuitamente_a_mim_ou_a_meus_herdeiros__alforrias_contratos_e_experiencias_de_trabalho_de_libertos_porto_alegre_1884__1888']['LOC']

for key in instances_dic:
    for type_ent in instances_dic[key]:
        list_ent = instances_dic[key][type_ent]
        instances_dic[key][type_ent] = entidades_consolidadas(list_ent, th)
#entidades_consolidadeas(list_ent, th)


Coletando dados da Wikidata

In [171]:
people_labels = []

for key in instances_dic:
    #for type_ent in instances_dic[key]:
    type_ent = 'PER'
    list_ent = instances_dic[key][type_ent]
    if list_ent != None:
        for ent in list_ent:
            people_labels = people_labels + list_ent[ent]['labels']

dbpedia_people = []

for per_bruto in set(people_labels):

    #ex = "Barão de Cotegipe"
    per = re.sub('[^a-zA-Z0-9_ ]', '', unidecode(per_bruto))
    per_vector = "'" + per.replace(" ", "', '") + "'"
    per_contains = per.replace(" ", " AND ")

    #query = """     
    #    SELECT ?person ?abstract
    #    WHERE{
    #    ?person rdfs:label '""" + per + """'@pt.
    #    ?person a dbo:Person.
    #    ?person dbo:abstract ?abstract.
    #    FILTER (langMatches(lang(?abstract),"pt"))
    #    }
    #    """

    query = """ 
        define input:ifp "IFP_OFF"  
        select ?s1 as ?c1, (bif:search_excerpt (bif:vector (""" + per_vector + """), ?o1)) as ?c2, ?sc, ?rank, ?g, ?abstract 
        where 
        { 
        select ?s1, (?sc * 3e-1) as ?sc, ?o1, (sql:rnk_scale (<LONG::IRI_RANK> (?s1))) as ?rank, ?g, ?abstract 
        
        where  
        { 
            quad map virtrdf:DefaultQuadMap 
            { 
            graph ?g 
            { 
                ?s1 ?s1textp ?o1 .
                ?o1 bif:contains  '(""" + per_contains + """)'  option (score ?sc)  .
                ?s1 a dbo:Person.
                ?s1 dbo:abstract ?abstract.
                FILTER (langMatches(lang(?abstract),"pt"))
            }
            } 
        }

        order by desc (?sc * 3e-1 + sql:rnk_scale (<LONG::IRI_RANK> (?s1)))  limit 10  offset 0 
        } 
        """

    url = 'http://dbpedia.org/sparql'

    r = requests.get(url, params = {'format': 'json', 'query': query})
    data = r.json()

    if data['results']['bindings'] != []:

        bindings = []
        for r in data['results']['bindings']:
            bindings.append((r['c1']['value'], r['abstract']['value']))
        #dbpedia_people.append((data['results']['bindings'][0]['person']['value'], data['results']['bindings'][0]['abstract']['value']))
        dbpedia_people = dbpedia_people + list(set(bindings))
        


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [173]:
dbpedia_people
#per_bruto

33

In [10]:
query = """     
SELECT ?person ?abstract
WHERE{
?person rdfs:label 'José do Patrocínio'@pt.
?person a dbo:Person.
?person dbo:abstract ?abstract.
FILTER (langMatches(lang(?abstract),"pt"))
}
"""

In [134]:
from unidecode import unidecode
ex = "Barão de Cotegipe"
ex = unidecode(ex)
ex_vector = "'" + ex.replace(" ", "', '") + "'"
ex_contains = ex.replace(" ", " AND ")
unidecode(ex)

'Barao de Cotegipe'

In [130]:
query = """ 
 define input:ifp "IFP_OFF"  
 select ?s1 as ?c1, (bif:search_excerpt (bif:vector (""" + ex_vector + """), ?o1)) as ?c2, ?sc, ?rank, ?g, ?abstract 
 where 
 { 
 select ?s1, (?sc * 3e-1) as ?sc, ?o1, (sql:rnk_scale (<LONG::IRI_RANK> (?s1))) as ?rank, ?g, ?abstract 
 
 where  
  { 
    quad map virtrdf:DefaultQuadMap 
    { 
      graph ?g 
      { 
        ?s1 ?s1textp ?o1 .
        ?o1 bif:contains  '(""" + ex_contains + """)'  option (score ?sc)  .
        ?s1 a dbo:Person.
        ?s1 dbo:abstract ?abstract.
      }
     }
    
  }

 order by desc (?sc * 3e-1 + sql:rnk_scale (<LONG::IRI_RANK> (?s1)))  limit 10  offset 0 
 } 


"""

#?s1 a dbo:Person.
#?s1 dbo:abstract ?abstract.

In [131]:
import requests

url = 'http://dbpedia.org/sparql'

r = requests.get(url, params = {'format': 'json', 'query': query})
data = r.json()

In [132]:
data['results']['bindings']
#data['results']['bindings'][0]['person']['value']
#data['results']['bindings'][0]['abstract']['value']

[]