In [1]:
#%pip install beautifulsoup4
#%pip install pip-system-certs

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import json
import stardog
import os
import io
from langchain_community.graphs import Neo4jGraph
from PyPDF2 import PdfReader
from langdetect import detect
import spacy
import re
from unidecode import unidecode

Conectando ao Neo4j 

In [2]:
from neo4j import GraphDatabase

# Neo4j variables
NEO4J_URL = os.getenv("NEO4J_URL")
NEO4J_USERNAME =os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")


graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

- Buscar os links para a tese no Neo4j
- Buscar o link do PDF no repositório da universidade
- Baixar o PDF
- Extrair o texto do PDF 

Buscar links para a tese no Neo4j

In [3]:
query1 = """
MATCH (t:Thesis)-[:publisher]-(uni:University{uri:'tag:stardog:api:USP'})
WHERE NOT EXISTS {(t{processed:true})}
RETURN distinct(t["uri"]) as Thesis, t["repository"] as repository 
"""

graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
results = graph.execute_query(query1).records
graph.close()

#results = graph.query(query2)
# Convert the results to a DataFrame
thesis_repository = pd.DataFrame(results, columns=["Thesis", "repository"])
thesis_repository = thesis_repository[200:300].reset_index(drop=True) 
thesis_repository

Unnamed: 0,Thesis,repository
0,tag:stardog:api:comunalismo_e_educacao_comunit...,https://www.teses.usp.br/teses/disponiveis/48/...
1,tag:stardog:api:contando_historias_revelando_t...,https://www.teses.usp.br/teses/disponiveis/27/...
2,tag:stardog:api:indigenas_e_ensino_medio_em_ro...,https://www.teses.usp.br/teses/disponiveis/48/...
3,tag:stardog:api:tan_criolla_criolla_como_yo_id...,https://www.teses.usp.br/teses/disponiveis/8/8...
4,tag:stardog:api:a_jurisdicao_especial_indigena...,https://www.teses.usp.br/teses/disponiveis/107...
...,...,...
95,tag:stardog:api:brincadeira_de_feira_por_uma_p...,http://www.teses.usp.br/teses/disponiveis/8/81...
96,tag:stardog:api:a_idade_da_terra_glauber_rocha...,http://www.teses.usp.br/teses/disponiveis/8/81...
97,tag:stardog:api:frontal_system_changes_in_the_...,http://www.teses.usp.br/teses/disponiveis/21/2...
98,tag:stardog:api:colonialismo_e_ocupacao_tupini...,http://www.teses.usp.br/teses/disponiveis/71/7...


Criando as utils functions

In [4]:

# Funcao para coletar link do pdf

def get_pdf_link(url):
    
    #Fazer requisição e parsear o arquivo html
    f = requests.get(url, verify=True).text#, verify=False) 
    soup = bs(f, "html.parser")
    
    #Coletando link para as teses
    for doc in soup.find_all('a', href=True):
        if doc['href'].endswith('.pdf'):
            path = doc['href']
            prefix_uni = 'https://www.teses.usp.br'
            link = prefix_uni + path

            return link
    
    return None

#fazer download do arquivo
def download_file(pdf_link, thesis_uri):
    # NOTE the stream=True parameter below
    r = requests.get(pdf_link, verify=True, stream=True)
    # raise_for_status() is not needed with stream=True
    # r.raise_for_status()
    with open('tese 3.pdf', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    return 

#Carregando os modelos SpaCy para inglês e português 
nlp_en = spacy.load("en_core_web_lg")
nlp_pt = spacy.load("pt_core_news_lg")

# Funcão que recebe uma string e a limpar para ficar no formato aceitável para uma URI
def process_uri(x):
    return (re.sub('[^a-zA-Z0-9_ ]', '',
            unidecode(x.strip())
                        .replace(" ", "_")
                        .replace("[","")
                        .replace("]","")
                        .replace("?","")
                        .replace("'","")
                        .lower()))


# Função para extrair texto de uma tese em PDF e identificar entidades
def extratc_text(tese_path):
    
    # Processando o arquivo em PDF
    reader = PdfReader(tese_path)
    number_of_pages = len(reader.pages)
    excerpt = {}
    

    # Iterando por cada página
    for page_number in range(number_of_pages):
        
        
        # Extraindo o texto da página
        if page_number == 0:
            page_text = (reader.pages[page_number].extract_text() + '\n ' 
                        + reader.pages[page_number+1].extract_text()[:400])
        else:
            
            if page_number == number_of_pages -1:
                page_text = (reader.pages[page_number-1].extract_text()[-400:] + '\n ' 
                            + reader.pages[page_number].extract_text())
            
            else:
                page_text = (reader.pages[page_number-1].extract_text()[-400:] + '\n ' 
                            + reader.pages[page_number].extract_text() + '\n ' 
                            + reader.pages[page_number+1].extract_text()[:400])  
      
        # Detectando o idioma do texto
        try:
            lang = detect(page_text)
            
        except:
            lang = 'Não_detectado'
        
        # Extraindo as entidades
        page_extracted = False
        if lang == 'pt':
            doc = nlp_pt(page_text)
            page_extracted = True
        if lang == 'en':
            doc = nlp_en(page_text)
            page_extracted = True
        
        if page_extracted:

            persons = []
            gpes = []
            #orgs = []
            for ent in doc.ents:
                if ent.label_ == "PER": #"PERSON":
                    persons.append(process_uri(ent.text))
                if ent.label_ == "LOC": #"GPE":
                    gpes.append(process_uri(ent.text))
                #if ent.label_ == "ORG":
                #    orgs.append(process_uri(ent.text))
                    
            excerpt[page_number] = {'text': page_text, 
                                    'lang': lang, 
                                    'persons': list(set(persons)), 
                                    'gpes': list(set(gpes)),
                                #'orgs': list(set(orgs))
                                }

    return excerpt

Iterando por todas as URI para extrair o texto e gravar no grafo NEO4j

In [5]:
# Gravar hora de início do processamento
start_time = pd.Timestamp.now()
parcial_time = pd.Timestamp.now()

for i in range(0,len(thesis_repository)):

    # Coletando o link do PDF
    url = thesis_repository['repository'][i]
    thesis_uri = thesis_repository['Thesis'][i][16:]
    pdf_link = get_pdf_link(url)
    if pdf_link is not None:
        print(f"{i} - Downloading and extracting: {thesis_uri} from {pdf_link}")
        download_file(pdf_link, thesis_uri)
        
        # Extraindo o texto e entidades
        tese_path = 'tese 3.pdf'
        excerpt = extratc_text(tese_path)

        excerpts = []
        persons = []
        gpes = []

        # Salvando as triplas em CSV
        for n in excerpt.keys():

            excerpt_uri = thesis_repository['Thesis'][i] + '_p_' + str(n)
            excerpt_text = excerpt[n]['text']
            if "'" in excerpt_text:
                excerpt_text = excerpt_text.replace("'", "")

            excerpt_page = n
            excerpt_lang = excerpt[n]['lang']
            
            excerpts.append([thesis_repository['Thesis'][i], excerpt_uri, excerpt_text, str(excerpt_page), excerpt_lang])
                
            #Criando as relações com as entidades
            for person in excerpt[n]['persons']:
                persons.append([excerpt_uri, 'tag:stardog:api:' + person])
            
            for gpe in excerpt[n]['gpes']:
                gpes.append([excerpt_uri, 'tag:stardog:api:' + gpe])
        
        # Salvando os dados em CSV no diretório de importação do Neo4j
        pd.DataFrame(excerpts).to_csv('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\excerpts3.csv', index=False, header=False)
        pd.DataFrame(persons).to_csv('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\persons3.csv', index=False, header=False)
        pd.DataFrame(gpes).to_csv('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\gpes3.csv', index=False, header=False)
        
        #Deletando o arquivo PDF após o processamento
        os.remove('tese 3.pdf')

        print(f"Loading triplas: {thesis_uri}")

        # Importanto os dados para o Neo4j
        #Excerpts
        query = """
        LOAD CSV FROM 'file:///excerpts3.csv'
        AS row
        MATCH  (t:Thesis{uri: row[0]})
        MERGE  (e:Exerpt{
                uri: row[1],
                text: row[2],
                page: row[3],
                lang: row[4]
                })-[r:BFO_0000050]-(t);
        """
        graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        results = graph.execute_query(query)
        graph.close()

        #Persons
        query = """
        LOAD CSV FROM 'file:///persons3.csv'
        AS row
        MATCH  (e:Exerpt{uri: row[0]})
        MERGE  (p:Person{uri: row[1]})
        MERGE  (e)-[r:mentions]-(p);
        """
        graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        results = graph.execute_query(query)
        graph.close()

        #GPEs
        query = """
        LOAD CSV FROM 'file:///gpes3.csv'
        AS row
        MATCH  (e:Exerpt{uri: row[0]})
        MERGE  (p:Place{uri: row[1]})
        MERGE  (e)-[r:mentions]-(p);
        """
        graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        results = graph.execute_query(query)
        graph.close()

        # Removendo os arquivos CSV após a importação
        os.remove('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\excerpts3.csv')
        os.remove('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\persons3.csv')
        os.remove('C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import\\gpes3.csv')

        print(f"Total time: {pd.Timestamp.now() - start_time}")
        print(f"Partial time: {pd.Timestamp.now() - parcial_time}")
        # Atualizando a hora parcial
        parcial_time = pd.Timestamp.now()
    else:
        print(f"No PDF link found for {thesis_uri}")

    # gravando no grafo que a tese foi processada
    query = """
    MATCH  (t:Thesis{uri: \'""" + thesis_repository['Thesis'][i] +  """\'})
    SET t.processed = true
    """
    graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
    results = graph.execute_query(query)
    graph.close()

0 - Downloading and extracting: comunalismo_e_educacao_comunitaria_em_oaxaca_a_aprendizagem_propria_em_tempos_de_militarismo_e_reforma_empresarial_da_educacao from https://www.teses.usp.br/teses/disponiveis/48/48134/tde-05122019-161248/publico/JOAO_FRANCISCO_MIGLIARI_BRANCO_rev.pdf
Loading triplas: comunalismo_e_educacao_comunitaria_em_oaxaca_a_aprendizagem_propria_em_tempos_de_militarismo_e_reforma_empresarial_da_educacao


Transaction failed and will be retried in 1.1096565450705378s (ForsetiClient[transactionId=230976, clientId=3] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=230966, clientId=2]} on NODE_RELATIONSHIP_GROUP_DELETE(12402) because holders of that lock are waiting for ForsetiClient[transactionId=230976, clientId=3].
 Wait list:ExclusiveLock[
Client[230966] waits for [ForsetiClient[transactionId=230976, clientId=3]]])
Transaction failed and will be retried in 1.0285972459208235s (ForsetiClient[transactionId=231003, clientId=2] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=230996, clientId=4]} on NODE_RELATIONSHIP_GROUP_DELETE(107562) because holders of that lock are waiting for ForsetiClient[transactionId=231003, clientId=2].
 Wait list:ExclusiveLock[
Client[230996] waits for [ForsetiClient[transactionId=231003, clientId=2]]])


Total time: 0 days 00:09:58.181749
Partial time: 0 days 00:09:58.181749
1 - Downloading and extracting: contando_historias_revelando_tradicoes_encontros_com_os_indigenas_no_amazonas from https://www.teses.usp.br/teses/disponiveis/27/27155/tde-05032021-212508/publico/VanessaBenitesBordin.pdf
Loading triplas: contando_historias_revelando_tradicoes_encontros_com_os_indigenas_no_amazonas


Transaction failed and will be retried in 1.1198106729718922s (ForsetiClient[transactionId=231079, clientId=4] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=231076, clientId=3]} on NODE_RELATIONSHIP_GROUP_DELETE(108070) because holders of that lock are waiting for ForsetiClient[transactionId=231079, clientId=4].
 Wait list:ExclusiveLock[
Client[231076] waits for [ForsetiClient[transactionId=231079, clientId=4]]])


Total time: 0 days 00:19:00.624828
Partial time: 0 days 00:09:02.443079
2 - Downloading and extracting: indigenas_e_ensino_medio_em_roraima_demandas_de_estudantes_macuxi from https://www.teses.usp.br/teses/disponiveis/48/48134/tde-01022021-141247/publico/MARCOS_ANTONIO_DE_OLIVEIRA_rev.pdf
Loading triplas: indigenas_e_ensino_medio_em_roraima_demandas_de_estudantes_macuxi
Total time: 0 days 00:22:45.612313
Partial time: 0 days 00:03:44.987485
3 - Downloading and extracting: tan_criolla_criolla_como_yo_identidade_politica_e_genero_nas_correspondencias_de_gabriela_mistral_e_victoria_ocampo_19261956 from https://www.teses.usp.br/teses/disponiveis/8/8138/tde-03032020-160001/publico/2019_AnaBeatrizMauaNunes_VCorr.pdf
Loading triplas: tan_criolla_criolla_como_yo_identidade_politica_e_genero_nas_correspondencias_de_gabriela_mistral_e_victoria_ocampo_19261956
Total time: 0 days 00:30:36.914304
Partial time: 0 days 00:07:51.300966
4 - Downloading and extracting: a_jurisdicao_especial_indigena_uma

Transaction failed and will be retried in 1.1974315021729185s (ForsetiClient[transactionId=232034, clientId=2] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232036, clientId=1]} on NODE_RELATIONSHIP_GROUP_DELETE(124973) because holders of that lock are waiting for ForsetiClient[transactionId=232034, clientId=2].
 Wait list:ExclusiveLock[
Client[232036] waits for [ForsetiClient[transactionId=232034, clientId=2]]])


Total time: 0 days 02:33:59.554550
Partial time: 0 days 00:17:52.547170
27 - Downloading and extracting: a_oferta_de_areas_para_o_mercado_de_terras_de_compensacao_de_reserva_legal_para_a_consolidacao_territorial_das_unidades_de_conservacao_um_estudo_para_o_bioma_cerrado from https://www.teses.usp.br/teses/disponiveis/106/106132/tde-19112018-154831/publico/AnaKarinaTESEfinalcorrigida2018.pdf
Loading triplas: a_oferta_de_areas_para_o_mercado_de_terras_de_compensacao_de_reserva_legal_para_a_consolidacao_territorial_das_unidades_de_conservacao_um_estudo_para_o_bioma_cerrado


Transaction failed and will be retried in 1.158479159761557s (ForsetiClient[transactionId=232189, clientId=2] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232188, clientId=1]} on NODE_RELATIONSHIP_GROUP_DELETE(108070) because holders of that lock are waiting for ForsetiClient[transactionId=232189, clientId=2].
 Wait list:ExclusiveLock[
Client[232188] waits for [ForsetiClient[transactionId=232189, clientId=2]]])


Total time: 0 days 02:41:38.693548
Partial time: 0 days 00:07:39.138998
28 - Downloading and extracting: a_questao_indigena_na_comissao_da_verdade_e_reconciliacao_do_peru from https://www.teses.usp.br/teses/disponiveis/100/100135/tde-02052018-231643/publico/FAVARIFEG_A_questao_indigena_na_CVR_versao_corrigida.pdf
Loading triplas: a_questao_indigena_na_comissao_da_verdade_e_reconciliacao_do_peru
Total time: 0 days 02:49:08.825722
Partial time: 0 days 00:07:30.132174
29 - Downloading and extracting: ibirapitanga_rastros_amp_impressoes from https://www.teses.usp.br/teses/disponiveis/27/27159/tde-01112017-103506/publico/FabiolaSallesMariano.pdf
Loading triplas: ibirapitanga_rastros_amp_impressoes
Total time: 0 days 02:51:01.177287
Partial time: 0 days 00:01:52.351565
30 - Downloading and extracting: recantando_muitos_cantos_a_musica_caipira_como_espaco_de_articulacao_de_encontros from https://www.teses.usp.br/teses/disponiveis/27/27157/tde-31052017-101546/publico/FABIOLAROSA.pdf
Loading tr

Transaction failed and will be retried in 1.043034853318518s (ForsetiClient[transactionId=232314, clientId=3] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232306, clientId=4]} on NODE_RELATIONSHIP_GROUP_DELETE(30328) because holders of that lock are waiting for ForsetiClient[transactionId=232314, clientId=3].
 Wait list:ExclusiveLock[
Client[232306] waits for [ForsetiClient[transactionId=232314, clientId=3]]])


Total time: 0 days 02:56:26.067308
Partial time: 0 days 00:05:24.889023
31 - Downloading and extracting: o_labirinto_da_colonizacao_mexico_territorio_e_destino_manifesto from https://www.teses.usp.br/teses/disponiveis/8/8161/tde-02062017-094102/publico/2017_MateusDeSaBarretoBarros_VCorr.pdf
Loading triplas: o_labirinto_da_colonizacao_mexico_territorio_e_destino_manifesto
Total time: 0 days 03:01:15.793125
Partial time: 0 days 00:04:49.725817
32 - Downloading and extracting: traduzindo_um_pais_um_estudo_da_traducao_de_marcadores_culturais_a_partir_de_corpora_paralelos_de_textos_portuguesingles_sobre_o_censo_demografico_brasileiro from https://www.teses.usp.br/teses/disponiveis/8/8160/tde-08042019-113333/publico/2018_AlineMilaniRomeiroPereira_VCorr.pdf
Loading triplas: traduzindo_um_pais_um_estudo_da_traducao_de_marcadores_culturais_a_partir_de_corpora_paralelos_de_textos_portuguesingles_sobre_o_censo_demografico_brasileiro
Total time: 0 days 03:03:59.455403
Partial time: 0 days 00:02:43

Transaction failed and will be retried in 1.0841658163055903s (ForsetiClient[transactionId=232495, clientId=3] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232487, clientId=2]} on NODE_RELATIONSHIP_GROUP_DELETE(122455) because holders of that lock are waiting for ForsetiClient[transactionId=232495, clientId=3].
 Wait list:ExclusiveLock[
Client[232487] waits for [ForsetiClient[transactionId=232495, clientId=3]]])


Total time: 0 days 03:14:17.915417
Partial time: 0 days 00:05:35.351418
36 - Downloading and extracting: a_expulsao_do_sertao_elites_paulistas_entre_o_imaginario_e_o_territorio_17491841 from https://www.teses.usp.br/teses/disponiveis/16/16133/tde-21122018-162438/publico/TEamaliacristovaodossantos.pdf
Loading triplas: a_expulsao_do_sertao_elites_paulistas_entre_o_imaginario_e_o_territorio_17491841
Total time: 0 days 03:36:09.384621
Partial time: 0 days 00:21:51.469204
37 - Downloading and extracting: arqueobotanica_e_mudancas_socioeconomicas_durante_o_holoceno_medio_no_sudoeste_da_amazonia from https://www.teses.usp.br/teses/disponiveis/71/71131/tde-30112018-102517/publico/laurafurquimcorrigida.pdf
Loading triplas: arqueobotanica_e_mudancas_socioeconomicas_durante_o_holoceno_medio_no_sudoeste_da_amazonia
Total time: 0 days 03:42:53.529154
Partial time: 0 days 00:06:44.144533
38 - Downloading and extracting: do_plano_economico_a_estruturacao_do_territorio_o_eixo_rio_de_janeiro__sao_paulo

Transaction failed and will be retried in 1.0284723537063443s (ForsetiClient[transactionId=232838, clientId=4] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232831, clientId=3]} on NODE_RELATIONSHIP_GROUP_DELETE(107804) because holders of that lock are waiting for ForsetiClient[transactionId=232838, clientId=4].
 Wait list:ExclusiveLock[
Client[232831] waits for [ForsetiClient[transactionId=232838, clientId=4]]])


Total time: 0 days 04:03:53.164521
Partial time: 0 days 00:10:21.834342
41 - Downloading and extracting: a_historia_indigena_profunda_do_sambaqui_monte_castelo_um_ensaio_sobre_a_longa_duracao_da_ceramica_e_das_paisagens_no_sudoeste_amazonico from https://www.teses.usp.br/teses/disponiveis/71/71131/tde-02102018-162628/publico/FranciscoPugliese_corrigida.pdf
Loading triplas: a_historia_indigena_profunda_do_sambaqui_monte_castelo_um_ensaio_sobre_a_longa_duracao_da_ceramica_e_das_paisagens_no_sudoeste_amazonico
Total time: 0 days 04:09:10.340813
Partial time: 0 days 00:05:17.176292
42 - Downloading and extracting: tarja_preta_um_estudo_antropologico_sobre_estados_alterados_diagnosticados_pela_biomedicina_como_transtornos_mentais_nos_wajapi_do_amapari from https://www.teses.usp.br/teses/disponiveis/8/8134/tde-17092019-141715/publico/2017_JulianaRosalen_VCorr.pdf
Loading triplas: tarja_preta_um_estudo_antropologico_sobre_estados_alterados_diagnosticados_pela_biomedicina_como_transtornos_ment

Transaction failed and will be retried in 0.8320308583061817s (ForsetiClient[transactionId=233011, clientId=2] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=232998, clientId=1]} on NODE_RELATIONSHIP_GROUP_DELETE(109131) because holders of that lock are waiting for ForsetiClient[transactionId=233011, clientId=2].
 Wait list:ExclusiveLock[
Client[232998] waits for [ForsetiClient[transactionId=233011, clientId=2]]])
Transaction failed and will be retried in 1.0280823355084225s (ForsetiClient[transactionId=233031, clientId=3] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=233028, clientId=4]} on NODE_RELATIONSHIP_GROUP_DELETE(134670) because holders of that lock are waiting for ForsetiClient[transactionId=233031, clientId=3].
 Wait list:ExclusiveLock[
Client[233028] waits for [ForsetiClient[transactionId=233031, clientId=3]]])


Total time: 0 days 04:22:53.856918
Partial time: 0 days 00:07:29.828947
44 - Downloading and extracting: paisagens_do_mediobaixo_xingu_arqueologia_temporalidade_e_historicidade from https://www.teses.usp.br/teses/disponiveis/71/71131/tde-31082017-142914/publico/LorenaLuanaGarciaREVISADA.pdf
Loading triplas: paisagens_do_mediobaixo_xingu_arqueologia_temporalidade_e_historicidade


Transaction failed and will be retried in 0.8881643357095959s (ForsetiClient[transactionId=233084, clientId=4] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=233063, clientId=1]} on NODE_RELATIONSHIP_GROUP_DELETE(175815) because holders of that lock are waiting for ForsetiClient[transactionId=233084, clientId=4].
 Wait list:ExclusiveLock[
Client[233063] waits for [ForsetiClient[transactionId=233084, clientId=4]]])


Total time: 0 days 04:37:53.103160
Partial time: 0 days 00:14:59.245251
45 - Downloading and extracting: da_boa_guerra_nasce_a_boa_paz_a_expulsao_dos_portugueses_do_planalto_do_zambeze__reino_do_monomotapa_africa_austral_16931695 from https://www.teses.usp.br/teses/disponiveis/8/8138/tde-19122017-193824/publico/2017_IvanaPanseraDeOliveiraMuscalu_VCorr.pdf
Loading triplas: da_boa_guerra_nasce_a_boa_paz_a_expulsao_dos_portugueses_do_planalto_do_zambeze__reino_do_monomotapa_africa_austral_16931695
Total time: 0 days 04:50:02.002969
Partial time: 0 days 00:12:08.899809
46 - Downloading and extracting: o_valor_dos_servicos_ecossistemicos_nas_bacias_hidrograficas_dos_rios_itaguare_e_guaratuba_em_bertioga_sp from https://www.teses.usp.br/teses/disponiveis/8/8135/tde-30052018-105644/publico/2017_VivianeCoelhoBuchianeri_VCorr.pdf
Loading triplas: o_valor_dos_servicos_ecossistemicos_nas_bacias_hidrograficas_dos_rios_itaguare_e_guaratuba_em_bertioga_sp
Total time: 0 days 04:57:31.974933
Partial t

Transaction failed and will be retried in 0.9431454920329034s (ForsetiClient[transactionId=233454, clientId=4] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=233450, clientId=1]} on NODE_RELATIONSHIP_GROUP_DELETE(62213) because holders of that lock are waiting for ForsetiClient[transactionId=233454, clientId=4].
 Wait list:ExclusiveLock[
Client[233450] waits for [ForsetiClient[transactionId=233454, clientId=4]]])


Total time: 0 days 05:19:10.461868
Partial time: 0 days 00:04:09.389867
50 - Downloading and extracting: luzes_apagadas_a_educacao_escolar_indigena_na_amazonia_colonial from https://www.teses.usp.br/teses/disponiveis/48/48134/tde-14122018-093140/publico/JONAS_ARAUJO_DA_CUNHA_rev.pdf
Loading triplas: luzes_apagadas_a_educacao_escolar_indigena_na_amazonia_colonial
Total time: 0 days 05:26:32.461036
Partial time: 0 days 00:07:21.999168
51 - Downloading and extracting: reciclagem_de_paineis_fotovoltaicos_e_recuperacao_de_metais from https://www.teses.usp.br/teses/disponiveis/3/3137/tde-30012019-141410/publico/PedroForastierideAlmeidaPradoCorr18.pdf
Loading triplas: reciclagem_de_paineis_fotovoltaicos_e_recuperacao_de_metais
Total time: 0 days 05:28:01.723105
Partial time: 0 days 00:01:29.262069
52 - Downloading and extracting: pardos_livres_em_um_campo_de_tensoes_milicia_trabalho_e_poder_sao_paulo_17971831 from https://www.teses.usp.br/teses/disponiveis/8/8138/tde-10092019-154951/publico/2

UnicodeEncodeError: 'utf-8' codec can't encode character '\udf07' in position 1: surrogates not allowed