In [1]:
#%pip install beautifulsoup4
#%pip install pip-system-certs

In [6]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
from langchain_community.graphs import Neo4jGraph
from PyPDF2 import PdfReader
from langdetect import detect
import spacy
import re
from unidecode import unidecode
import time

Conectando ao Neo4j 

In [7]:
from neo4j import GraphDatabase

# Neo4j connection settings are read from environment variables so that no
# credentials are stored in the notebook itself.
NEO4J_URL = os.getenv("NEO4J_URL")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Fail fast with an explicit message: os.getenv() returns None for unset
# variables, and passing None to the driver produces a much less obvious error.
_missing_neo4j_settings = [
    name
    for name, value in (
        ("NEO4J_URL", NEO4J_URL),
        ("NEO4J_USERNAME", NEO4J_USERNAME),
        ("NEO4J_PASSWORD", NEO4J_PASSWORD),
    )
    if value is None
]
if _missing_neo4j_settings:
    raise EnvironmentError(
        "Missing Neo4j environment variable(s): " + ", ".join(_missing_neo4j_settings)
    )

# Driver handle for ad-hoc use; the cells below open (and close) their own drivers.
graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

- Buscar os links para a tese no Neo4j
- Buscar o link do PDF no repositório da universidade
- Baixar o PDF
- Extrair o texto do PDF 

Buscar links para a tese no Neo4j

In [None]:
# Query to get theses and their repositories from the Neo4j database.
# It retrieves theses published by the Federal Rural University of Rio de Janeiro (UFRRJ)
# that have not been processed yet.
query1 = """
MATCH (t:Thesis)-[:publisher]-(uni:University{uri:'tag:stardog:api:UFRRJ'})
WHERE NOT EXISTS {(t{processed:true})}
RETURN distinct(t["uri"]) as Thesis, t["repository"] as repository 
"""

# Alternative query: theses that were already processed but ended up with no excerpts.
query2 = """ 
match (t:Thesis)-[:publisher]-(uni:University{uri:'tag:stardog:api:UFRRJ'})
where (not (t:Thesis)-[:BFO_0000050]-(:Excerpt)) and  exists {(t{processed:true})}
return distinct(t["uri"]) as Thesis, t["repository"] as repository
"""

# Alternative query: the 40 theses with the fewest excerpts (pages), with their repositories.
# NOTE: this one returns three columns (pages, Thesis, repository), so the DataFrame
# below must be built with columns=["pages", "Thesis", "repository"] when it is used.
query3 = """
MATCH (t:Thesis)-[:publisher]-(uni:University{uri:'tag:stardog:api:UFRRJ'})
Match (t:Thesis)-[:BFO_0000050]-(e:Excerpt)
RETURN count(e) as pages, t["uri"] as Thesis, t["repository"] as repository
 order by (pages) limit 40
"""

# The context manager closes the driver even when the query raises.
with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as graph:
    results = graph.execute_query(query1).records

# Convert the returned records to a DataFrame with one row per thesis.
thesis_repository = pd.DataFrame(results, columns=["Thesis", "repository"])
thesis_repository = thesis_repository.reset_index(drop=True) 
thesis_repository

Unnamed: 0,Thesis,repository


Criando as funções utilitárias

In [4]:
# Function that collects the bitstream (PDF) links of a thesis from its repository page

def get_pdf_link(url, timeout=60, retry_delays=(5, 30)):
    """Return the bitstream links found on a repository page.

    Args:
        url: Address of the thesis page in the university repository.
        timeout: Seconds to wait for the server before a request attempt is
            considered failed (without it a stalled connection blocks forever).
        retry_delays: Seconds to sleep before each retry; one extra attempt is
            made for every entry.

    Returns:
        A list of absolute URLs, de-duplicated and in the order they appear on
        the page, or None when the page contains no bitstream link.

    Raises:
        requests.RequestException: if the last attempt also fails.
    """
    # Request the page, retrying after each configured delay; the final
    # attempt (delay None) lets the error propagate to the caller.
    html = None
    for delay in (*retry_delays, None):
        try:
            html = requests.get(url, verify=True, timeout=timeout).text
            break
        except requests.RequestException:
            if delay is None:
                raise
            print(f"Sleeping for {delay} seconds before retrying...")
            time.sleep(delay)
    soup = bs(html, "html.parser")

    # Collect every anchor that points to a repository bitstream.
    prefix_uni = 'https://rima.ufrrj.br'
    link_list = [
        prefix_uni + anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/jspui/bitstream')
    ]

    # dict.fromkeys() removes duplicates while keeping page order, so the
    # position of a link in the result is stable from one run to the next.
    unique_links = list(dict.fromkeys(link_list))
    return unique_links if unique_links else None

# Download a file to local disk
def download_file(pdf_link, thesis_uri, dest_path='tese UFRRJ.pdf', timeout=60, retry_delays=(5, 30)):
    """Stream the content at ``pdf_link`` into ``dest_path``.

    Args:
        pdf_link: URL of the file to download.
        thesis_uri: Identifier of the thesis; not used by this function and
            kept only so existing callers keep working.
        dest_path: File that receives the downloaded bytes (overwritten).
        timeout: Seconds to wait for the server before a request attempt is
            considered failed.
        retry_delays: Seconds to sleep before each retry; one extra attempt is
            made for every entry.

    Returns:
        None.

    Raises:
        requests.RequestException: if the last attempt also fails.
    """
    # Open the streamed response, retrying after each configured delay; the
    # final attempt (delay None) lets the error propagate to the caller.
    response = None
    for delay in (*retry_delays, None):
        try:
            response = requests.get(pdf_link, verify=True, stream=True, timeout=timeout)
            break
        except requests.RequestException:
            if delay is None:
                raise
            print(f"Sleeping for {delay} seconds before retrying...")
            time.sleep(delay)

    # NOTE: the HTTP status is deliberately not checked here, so an error page
    # (e.g. 404) is written to dest_path as-is and only fails later, when the
    # file is parsed as a PDF.
    # A streamed response keeps its connection open until it is closed, hence
    # the context manager around the copy loop.
    with response:
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return

# Load the spaCy pipelines for English and Portuguese
nlp_en = spacy.load("en_core_web_lg")
nlp_pt = spacy.load("pt_core_news_lg")

# Turn an arbitrary string into a token that is acceptable inside a URI
def process_uri(x):
    """Return ``x`` stripped, transliterated to ASCII, lower-cased and sanitised.

    Spaces become underscores; square brackets, question marks and apostrophes
    are dropped; finally every character other than ASCII letters, digits,
    underscore and space is removed.
    """
    # One-pass equivalent of the chained single-character replacements.
    char_map = str.maketrans({" ": "_", "[": None, "]": None, "?": None, "'": None})
    ascii_text = unidecode(x.strip())
    normalized = ascii_text.translate(char_map).lower()
    return re.sub('[^a-zA-Z0-9_ ]', '', normalized)


# Function that extracts the text of a thesis PDF and identifies named entities

# The Portuguese pipeline labels people/places as PER/LOC while the English one
# uses PERSON/GPE (plus LOC), so both spellings must be accepted.
_PERSON_LABELS = frozenset({"PER", "PERSON"})
_PLACE_LABELS = frozenset({"LOC", "GPE"})
# Number of characters borrowed from each neighbouring page as context.
_CONTEXT_CHARS = 400


def _extract_page_texts(reader):
    """Extract the text of every page exactly once; None marks a page that failed."""
    page_texts = []
    for page_number in range(len(reader.pages)):
        try:
            page_texts.append(reader.pages[page_number].extract_text())
        except Exception:
            page_texts.append(None)
    return page_texts


def _page_with_context(page_texts, page_number):
    """Return a page's text padded with the tail/head of its neighbouring pages.

    Falls back to the page's own text when any neighbour could not be
    extracted, and to '' when the page itself could not be extracted.
    """
    current = page_texts[page_number]
    if current is None:
        return ''

    has_previous = page_number > 0
    has_next = page_number < len(page_texts) - 1
    previous_text = page_texts[page_number - 1] if has_previous else None
    next_text = page_texts[page_number + 1] if has_next else None
    if (has_previous and previous_text is None) or (has_next and next_text is None):
        return current

    parts = []
    if has_previous:
        parts.append(previous_text[-_CONTEXT_CHARS:])
    parts.append(current)
    if has_next:
        parts.append(next_text[:_CONTEXT_CHARS])
    return '\n '.join(parts)


def extratc_text(tese_path):
    """Extract per-page text and named entities from a PDF.

    Args:
        tese_path: Path of the PDF file to read.

    Returns:
        A dict keyed by zero-based page number. Each value is a dict with
        'text' (cleaned page text including neighbouring-page context),
        'lang' (detected language code), 'persons' and 'gpes' (sorted lists of
        sanitised entity names). Only pages detected as Portuguese ('pt') or
        English ('en') are included. An empty dict is returned when the file
        cannot be read.
    """
    # Read the PDF and extract each page once; the context window below reuses
    # these texts instead of re-extracting the neighbouring pages.
    try:
        reader = PdfReader(tese_path)
        page_texts = _extract_page_texts(reader)
    except Exception:
        print(f"Erro ao ler o arquivo {tese_path}. ")
        return {}

    models = {'pt': nlp_pt, 'en': nlp_en}
    excerpt = {}

    for page_number in range(len(page_texts)):
        page_text = _page_with_context(page_texts, page_number)

        # Clean the text: drop quotes, a single trailing backslash, and any
        # lone surrogate code points.
        page_text = page_text.replace("'", "").replace('"', '')
        if page_text.endswith('\\'):
            page_text = page_text[:-1]
        page_text = re.sub(r"[\ud800-\udfff]", " ", page_text)

        # Detect the language of the page; detection errors (e.g. on empty
        # text) are recorded as 'not detected'.
        try:
            lang = detect(page_text)
        except Exception:
            lang = 'Não_detectado'

        # Pages in any other language are skipped entirely.
        nlp = models.get(lang)
        if nlp is None:
            continue

        # Collect the people and places mentioned on the page.
        persons = set()
        gpes = set()
        for ent in nlp(page_text).ents:
            if ent.label_ in _PERSON_LABELS:
                target = persons
            elif ent.label_ in _PLACE_LABELS:
                target = gpes
            else:
                continue
            name = process_uri(ent.text)
            # Names that sanitise to '' would all collapse into one meaningless URI.
            if name:
                target.add(name)

        excerpt[page_number] = {'text': page_text,
                                'lang': lang,
                                'persons': sorted(persons),
                                'gpes': sorted(gpes),
                                }

    return excerpt

Iterando por todas as URIs para extrair o texto e gravar no grafo Neo4j

In [None]:
# Record when processing started; parcial_time is reset after every PDF so the
# per-PDF duration can be reported next to the total.
start_time = pd.Timestamp.now()
parcial_time = pd.Timestamp.now()

# Prefix shared by the node URIs; it is stripped from thesis URIs for log messages.
URI_PREFIX = 'tag:stardog:api:'
# Scratch file written by download_file() and read back by extratc_text().
PDF_PATH = 'tese UFRRJ.pdf'
# The CSV files are written into the Neo4j import directory so that the
# LOAD CSV 'file:///...' statements below can read them. The location can be
# overridden with the NEO4J_IMPORT_DIR environment variable.
NEO4J_IMPORT_DIR = os.getenv(
    "NEO4J_IMPORT_DIR",
    'C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import',
)
EXCERPTS_CSV = os.path.join(NEO4J_IMPORT_DIR, 'excerptsUFRRJ.csv')
PERSONS_CSV = os.path.join(NEO4J_IMPORT_DIR, 'personsUFRRJ.csv')
GPES_CSV = os.path.join(NEO4J_IMPORT_DIR, 'gpesUFRRJ.csv')

# Excerpts: one node per page, linked to its thesis.
LOAD_EXCERPTS_QUERY = """
LOAD CSV FROM 'file:///excerptsUFRRJ.csv'
AS row
MATCH  (t:Thesis{uri: row[0]})
MERGE  (e:Excerpt{
        uri: row[1],
        text: row[2],
        page: row[3],
        lang: row[4]
        })-[r:BFO_0000050]-(t);
"""

# Persons mentioned by each excerpt.
LOAD_PERSONS_QUERY = """
LOAD CSV FROM 'file:///personsUFRRJ.csv'
AS row
MATCH  (e:Excerpt{uri: row[0]})
MERGE  (p:Person{uri: row[1]})
MERGE  (e)-[r:mentions]-(p);
"""

# Places mentioned by each excerpt.
LOAD_GPES_QUERY = """
LOAD CSV FROM 'file:///gpesUFRRJ.csv'
AS row
MATCH  (e:Excerpt{uri: row[0]})
MERGE  (p:Place{uri: row[1]})
MERGE  (e)-[r:mentions]-(p);
"""

# The thesis URI is passed as a query parameter instead of being concatenated
# into the Cypher text, so quotes in a URI cannot break or alter the query.
MARK_PROCESSED_QUERY = """
MATCH  (t:Thesis{uri: $uri})
SET t.processed = true
"""

# A single driver serves the whole run; the context manager closes it even if
# a download, extraction or query fails midway.
with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as graph:
    for i, thesis_row in enumerate(thesis_repository.itertuples(index=False)):

        # Look up the PDF link(s) on the repository page of the thesis.
        thesis_full_uri = thesis_row.Thesis
        thesis_uri = thesis_full_uri[len(URI_PREFIX):]
        pdf_link = get_pdf_link(thesis_row.repository)

        if pdf_link is not None:
            # Iterate over every link found (some records hold other material besides the thesis).
            for pdf_num, link in enumerate(pdf_link):
                print(f"{i} - Downloading and extracting: {thesis_uri} from {link}")

                # Download the file, extract text and entities, and always
                # delete the scratch PDF afterwards.
                try:
                    download_file(link, thesis_uri)
                    excerpt = extratc_text(PDF_PATH)
                finally:
                    if os.path.exists(PDF_PATH):
                        os.remove(PDF_PATH)

                excerpts = []
                persons = []
                gpes = []

                # Build the CSV rows: one per excerpt and one per excerpt/entity pair.
                for n, page in excerpt.items():
                    excerpt_page = 'pdf_' + str(pdf_num) + '_p_' + str(n)
                    excerpt_uri = thesis_full_uri + '_' + excerpt_page

                    excerpts.append([thesis_full_uri, excerpt_uri, page['text'], excerpt_page, page['lang']])
                    persons.extend([excerpt_uri, URI_PREFIX + person] for person in page['persons'])
                    gpes.extend([excerpt_uri, URI_PREFIX + gpe] for gpe in page['gpes'])

                csv_rows = ((EXCERPTS_CSV, excerpts), (PERSONS_CSV, persons), (GPES_CSV, gpes))
                try:
                    # Write the CSV files (no header, no index) and import them into Neo4j.
                    for csv_path, rows in csv_rows:
                        pd.DataFrame(rows).to_csv(csv_path, index=False, header=False)

                    print(f"Loading triplas: {thesis_uri}")

                    for load_query in (LOAD_EXCERPTS_QUERY, LOAD_PERSONS_QUERY, LOAD_GPES_QUERY):
                        graph.execute_query(load_query)
                finally:
                    # Remove the CSV files whether or not the import succeeded.
                    for csv_path, _ in csv_rows:
                        if os.path.exists(csv_path):
                            os.remove(csv_path)

                print(f"Total time: {pd.Timestamp.now() - start_time}")
                print(f"Partial time: {pd.Timestamp.now() - parcial_time}")
                # Restart the per-PDF timer.
                parcial_time = pd.Timestamp.now()
        else:
            print(f"No PDF link found for {thesis_uri}")

        # Flag the thesis as processed in the graph (also when no PDF link was found).
        graph.execute_query(MARK_PROCESSED_QUERY, parameters_={"uri": thesis_full_uri})

0 - Downloading and extracting: sateremawe_a_identidade_indigena_no_espaco_escolar from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/12970/3/2016%20-%20Romulo%20Ribeiro%20Machado.pdf
Loading triplas: sateremawe_a_identidade_indigena_no_espaco_escolar
Total time: 0 days 00:00:15.935376
Partial time: 0 days 00:00:15.936365
1 - Downloading and extracting: poder_e_territorio_no_baixo_sul_da_bahia_os_discursos_e_os_arranjos_politicos_de_desenvolvimento from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/9464/3/2016%20-%20Jose%20Renato%20Sant%27Anna%20Porto.pdf
Loading triplas: poder_e_territorio_no_baixo_sul_da_bahia_os_discursos_e_os_arranjos_politicos_de_desenvolvimento
Total time: 0 days 00:01:40.177427
Partial time: 0 days 00:01:24.241062
2 - Downloading and extracting: contos_populares_de_tradicao_indigena_uma_proposta_de_leitura_em_sequencia_didatica_para_as_turmas_do_9o_ano_do_ensino_fundamental from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/15446/3/2017%20-%20Rosa

PdfReadError('Invalid Elementary Object starting with b\'\\xb3\' @923628: b\'                    \\xb3\\xdeu\\x9f\\xe7\\x9ezv}\\xf2\\xe4\\xfa\\xba0\\xb9\\xb9\\xbe=\\xc9\\x0b\\x93\\xe1\\x87\\xcf*\\xdb\\xce\\xb3\\x8dYl\\xe6\\xcb,\\xc3\\\\s\\xf2\\xe4"\\xacL\\x19xAfBy\\xf2\\xe4\\xc5w\\xb9y\\x13N2s]\\xd2\'')


Loading triplas: diversidade_biocultural_direitos_de_propriedade_intelectual_versus_direitos_dos_recursos_tradicionais
Total time: 0 days 01:14:25.301771
Partial time: 0 days 00:00:28.825880
155 - Downloading and extracting: o_terceiro_lado_da_fronteira_processos_e_transformacoes_dos_espacos_agrarios_no_pantanal_dois_estudos_de_caso_em_comunidades_da_bolivia_e_brasil from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/9497/3/2006%20-%20Jos%c3%a9%20Luis%20Guti%c3%a9rrez%20Angulo.pdf
Loading triplas: o_terceiro_lado_da_fronteira_processos_e_transformacoes_dos_espacos_agrarios_no_pantanal_dois_estudos_de_caso_em_comunidades_da_bolivia_e_brasil
Total time: 0 days 01:14:46.758601
Partial time: 0 days 00:00:21.456830
156 - Downloading and extracting: cultura_e_sustentabilidade_a_sociedade_potiguara_e_um_novo_malestar_na_civilizacao from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/9531/3/2008%20-%20Antonio%20Ricardo%20Pereira%20de%20Andrade.pdf
Loading triplas: cultura_e_sustentabi

unknown widths : 
[0, IndirectObject(464, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(459, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(454, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(449, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(444, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(464, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(459, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(454, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(449, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(444, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(464, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(459, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(454, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(449, 0, 1776564493072)]
unknown widths : 
[0, IndirectObject(444, 0, 1776564493072)]


Loading triplas: problema_com_escola_eu_tenho_mil_uma_investigacao_sobre_o_racismo_institucional_em_escolas_publicas_de_angra_dos_reis
Total time: 0 days 01:27:22.955867
Partial time: 0 days 00:00:15.598358
190 - Downloading and extracting: racismo_institucional_desafios_e_perspectivas_na_implementacao_de_politicas_publicas_antirracistas_na_faetc from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/12994/3/2017%20-%20Lilian%20do%20Carmo%20de%20Oliveira%20Cunha.pdf
Loading triplas: racismo_institucional_desafios_e_perspectivas_na_implementacao_de_politicas_publicas_antirracistas_na_faetc
Total time: 0 days 01:27:38.921799
Partial time: 0 days 00:00:15.965932
191 - Downloading and extracting: a_lei_106392003_estrategias_docentes_de_promocao_da_igualdade_racial_atraves_do_ensino_de_historia from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/13086/3/2016%20-%20Amanda%20Arag%c3%a3o%20Barreto.pdf
Loading triplas: a_lei_106392003_estrategias_docentes_de_promocao_da_igualdade_racial_at

unknown widths : 
[0, IndirectObject(438, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(433, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(438, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(433, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(427, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(422, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(438, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(433, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(427, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(422, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(427, 0, 1776392698064)]
unknown widths : 
[0, IndirectObject(422, 0, 1776392698064)]


Loading triplas: racismo_digital_limites_e_desafios_de_um_campo_em_construcao
Total time: 0 days 01:34:54.750150
Partial time: 0 days 00:00:17.664822
210 - Downloading and extracting: afrociberdelia_mocamba_baobaxia_na_rota_digital_do_campinho_da_independencia from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/11690/3/2020%20-%20Daniel%20Cardoso%20de%20Andrade.pdf
Loading triplas: afrociberdelia_mocamba_baobaxia_na_rota_digital_do_campinho_da_independencia
Total time: 0 days 01:35:20.228828
Partial time: 0 days 00:00:25.478678
211 - Downloading and extracting: pedagogia_antirracista_uma_proposta_de_formacao_continuada_de_professores_para_o_enfrentamento_do_racismo_institucional_na_escola from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/15039/3/2022%20-%20Leandro%20Bulh%c3%b5es%20dos%20Santos.Pdf
Loading triplas: pedagogia_antirracista_uma_proposta_de_formacao_continuada_de_professores_para_o_enfrentamento_do_racismo_institucional_na_escola
Total time: 0 days 01:35:36.310247

 impossible to decode XFormObject /Image-1
 impossible to decode XFormObject /Image-1
 impossible to decode XFormObject /Image-1


Loading triplas: criancas_no_cativeiro_agencia_infantil_africana_nos_oitocentos_na_cidade_do_rio_de_janeiro
Total time: 0 days 01:57:27.817727
Partial time: 0 days 00:00:36.817387
259 - Downloading and extracting: divisao_naval_da_costa_dleste_a_expansao_da_guerra_cisplatina_para_o_litoral_africano_18251830 from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/13857/3/2017%20-%20Marcelo%20Rodrigues%20de%20%20Oliveira.pdf
Loading triplas: divisao_naval_da_costa_dleste_a_expansao_da_guerra_cisplatina_para_o_litoral_africano_18251830
Total time: 0 days 01:57:58.014247
Partial time: 0 days 00:00:30.196520
260 - Downloading and extracting: as_praticas_juridicas_nas_acoes_de_liberdade_no_tribunal_da_relacao_do_rio_de_janeiro_no_periodo_entre_1871_e_1888 from https://rima.ufrrj.br/jspui/bitstream/20.500.14407/14004/3/2015%20-%20Carlos%20Henrique%20Antunes%20da%20Silva.pdf
Loading triplas: as_praticas_juridicas_nas_acoes_de_liberdade_no_tribunal_da_relacao_do_rio_de_janeiro_no_periodo_entre_