In [1]:
#%pip install beautifulsoup4
#%pip install pip-system-certs

In [52]:
import io
import json
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import spacy
import stardog
from bs4 import BeautifulSoup as bs
from langchain_community.graphs import Neo4jGraph
from langdetect import detect
from PyPDF2 import PdfReader
from unidecode import unidecode

Conectando ao Neo4j 

In [53]:
from neo4j import GraphDatabase

# Neo4j connection settings are read from the environment so that no
# credentials are stored in the notebook.
NEO4J_URL = os.getenv("NEO4J_URL")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# os.getenv returns None for an unset variable; stop here with a readable
# message instead of an obscure error raised from inside the driver.
if not NEO4J_URL:
    raise RuntimeError("Environment variable NEO4J_URL is not set")

# Module-level driver handle for the configured database.
graph = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

- Buscar os links para a tese no Neo4j
- Buscar o link do PDF no repositório da universidade
- Baixar o PDF
- Extrair o texto do PDF 

Buscar links para a tese no Neo4j

In [54]:
# Theses linked to the USP university node that are not yet flagged as
# processed, together with the address of their repository page.
query1 = """
MATCH (t:Thesis)-[:publisher]-(uni:University{uri:'tag:stardog:api:USP'})
WHERE NOT EXISTS {(t{processed:true})}
RETURN distinct(t["uri"]) as Thesis, t["repository"] as repository 
"""

# Positional window of the result set handled by this run.
BATCH_START = 206
BATCH_END = 300

# The context manager closes the driver even when the query raises.
with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as graph:
    results = graph.execute_query(query1).records

# Convert the records to a DataFrame and keep only the rows of this batch,
# renumbered from zero.
thesis_repository = pd.DataFrame(results, columns=["Thesis", "repository"])
thesis_repository = thesis_repository.iloc[BATCH_START:BATCH_END].reset_index(drop=True)
thesis_repository

Unnamed: 0,Thesis,repository
0,tag:stardog:api:catequese_e_evasao_etnografia_...,https://www.teses.usp.br/teses/disponiveis/8/8...
1,tag:stardog:api:efeitos_de_macronutrientes_no_...,https://www.teses.usp.br/teses/disponiveis/11/...
2,tag:stardog:api:ritmos_biologicos_em_indios_gu...,http://www.teses.usp.br/teses/disponiveis/47/4...
3,tag:stardog:api:delivery_systems_for_simultane...,https://www.teses.usp.br/teses/disponiveis/74/...
4,tag:stardog:api:nem_cidadaos_nem_brasileiros_i...,http://www.teses.usp.br/teses/disponiveis/8/81...
...,...,...
89,tag:stardog:api:analise_dos_marcadores_convers...,http://www.teses.usp.br/teses/disponiveis/8/81...
90,tag:stardog:api:elementos_para_uma_teoria_poli...,http://www.teses.usp.br/teses/disponiveis/8/81...
91,tag:stardog:api:diarios_musicais_o_musicar_na_...,https://www.teses.usp.br/teses/disponiveis/27/...
92,tag:stardog:api:necroolhar_um_olhar_nos_corpos...,https://www.teses.usp.br/teses/disponiveis/27/...


Criando as funções utilitárias

In [55]:

# Helper that finds the PDF link on a thesis landing page

def get_pdf_link(url, timeout=60):
    """Return the URL of the first PDF linked from a thesis landing page.

    Args:
        url: Address of the thesis page in the university repository.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The URL of the first ``<a href>`` whose target ends in ``.pdf``
        (case-insensitive), or ``None`` when the page links to no PDF.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the timeout expires.
    """
    prefix_uni = 'https://www.teses.usp.br'

    # Fetch and parse the landing page. The timeout keeps one unresponsive
    # server from blocking the whole batch indefinitely.
    response = requests.get(url, verify=True, timeout=timeout)
    soup = bs(response.text, "html.parser")

    for doc in soup.find_all('a', href=True):
        href = doc['href'].strip()
        if not href.lower().endswith('.pdf'):
            continue
        if href.startswith('/') and not href.startswith('//'):
            # Site-relative path: anchor it on the repository host.
            return prefix_uni + href
        # Absolute, protocol-relative or page-relative link: resolve it
        # against the page address instead of gluing the host in front of it.
        return urljoin(response.url, href)

    return None

# Download the thesis PDF to a local file
def download_file(pdf_link, thesis_uri, dest_path='tese 2.pdf', timeout=60):
    """Stream the PDF at ``pdf_link`` to ``dest_path``.

    Args:
        pdf_link: URL of the PDF to download.
        thesis_uri: Identifier of the thesis. Not used by the download itself;
            kept so existing calls keep working.
        dest_path: File the PDF is written to (overwritten if it exists).
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """
    # stream=True defers the body so it can be written chunk by chunk; the
    # `with` block releases the connection once the download is finished.
    with requests.get(pdf_link, verify=True, stream=True, timeout=timeout) as r:
        # Streaming does not check the status code: without this call an HTTP
        # error page would be saved as if it were the PDF.
        r.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return

# Load the spaCy models for English and Portuguese
nlp_en = spacy.load("en_core_web_lg")
nlp_pt = spacy.load("pt_core_news_lg")

# Clean a free-text string so that it can be used as the local part of a URI
def process_uri(x):
    """Return ``x`` as a lower-case ASCII slug.

    Surrounding whitespace is trimmed, the text is transliterated with
    ``unidecode``, spaces become underscores, and square brackets, question
    marks and apostrophes are dropped. After lower-casing, every character
    that is not an ASCII letter, a digit, an underscore or a space is removed.
    """
    slug = unidecode(x.strip()).replace(" ", "_")
    for unwanted in ("[", "]", "?", "'"):
        slug = slug.replace(unwanted, "")
    return re.sub('[^a-zA-Z0-9_ ]', '', slug.lower())


# Extract the text of a thesis PDF page by page and tag the entities it mentions
def extratc_text(tese_path, context_chars=400):
    """Extract per-page text from a PDF and identify the people and places mentioned.

    Every page is analysed together with some surrounding context: up to
    ``context_chars`` characters from the end of the previous page and from
    the start of the next page, joined by a newline followed by a space.
    Single and double quotes are removed from the text, its language is
    detected, and pages detected as Portuguese or English are run through the
    matching spaCy model. Pages in any other language are skipped.

    Args:
        tese_path: Location of the PDF, passed to ``PdfReader``.
        context_chars: Number of characters borrowed from each neighbouring
            page.

    Returns:
        A dict keyed by zero-based page number. Each value is a dict with the
        keys ``'text'`` (page text plus context), ``'lang'`` (``'pt'`` or
        ``'en'``), ``'persons'`` and ``'gpes'`` (sorted lists of unique
        entity names already cleaned with ``process_uri``).
    """
    reader = PdfReader(tese_path)

    # Extract each page exactly once. The texts are reused as context for the
    # neighbouring pages, so extracting on demand would repeat the work.
    page_texts = [page.extract_text() or '' for page in reader.pages]
    number_of_pages = len(page_texts)

    models = {'pt': nlp_pt, 'en': nlp_en}
    # The Portuguese model labels people and places PER / LOC, whereas the
    # English model uses PERSON and GPE / LOC; accept both schemes so English
    # pages yield entities too.
    person_labels = {'PER', 'PERSON'}
    place_labels = {'LOC', 'GPE'}

    excerpt = {}

    for page_number, current_text in enumerate(page_texts):

        # Assemble the page with whatever neighbours exist. A one-page
        # document has neither, so only its own text is used.
        parts = []
        if page_number > 0:
            previous_text = page_texts[page_number - 1]
            parts.append(previous_text[max(len(previous_text) - context_chars, 0):])
        parts.append(current_text)
        if page_number < number_of_pages - 1:
            parts.append(page_texts[page_number + 1][:context_chars])
        page_text = '\n '.join(parts)

        # Remove quotes from the text
        page_text = page_text.replace("'", "").replace('"', '')

        # Detect the language. Detection fails on pages without usable text;
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
        try:
            lang = detect(page_text)
        except Exception:
            lang = 'Não_detectado'

        nlp = models.get(lang)
        if nlp is None:
            # Neither Portuguese nor English: the page is not recorded.
            continue

        doc = nlp(page_text)

        persons = set()
        gpes = set()
        for ent in doc.ents:
            if ent.label_ in person_labels:
                bucket = persons
            elif ent.label_ in place_labels:
                bucket = gpes
            else:
                continue
            name = process_uri(ent.text)
            # An entity made only of stripped characters cleans to an empty
            # string, which would not identify anything.
            if name:
                bucket.add(name)

        excerpt[page_number] = {'text': page_text,
                                'lang': lang,
                                'persons': sorted(persons),
                                'gpes': sorted(gpes),
                                }

    return excerpt

Iterando por todas as URIs para extrair o texto e gravar no grafo Neo4j

In [56]:
# Prefix carried by every thesis URI; it is stripped for log messages and
# prepended to entity names.
URI_PREFIX = 'tag:stardog:api:'

# Temporary file the downloaded thesis is written to by download_file.
PDF_PATH = 'tese 2.pdf'

# Directory used to stage CSV files for LOAD CSV. The NEO4J_IMPORT_DIR
# environment variable overrides the default so the notebook is not tied to
# one machine.
NEO4J_IMPORT_DIR = os.getenv(
    'NEO4J_IMPORT_DIR',
    'C:\\Users\\facordei\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-792eabfc-5517-4220-ab29-30ef6ec3236e\\import',
)

# Staging files: each LOAD CSV query below reads the file of the same name.
EXCERPTS_CSV = os.path.join(NEO4J_IMPORT_DIR, 'excerpts2.csv')
PERSONS_CSV = os.path.join(NEO4J_IMPORT_DIR, 'persons2.csv')
GPES_CSV = os.path.join(NEO4J_IMPORT_DIR, 'gpes2.csv')

# Creates one Exerpt node per CSV row and links it to its thesis.
LOAD_EXCERPTS_QUERY = """
        LOAD CSV FROM 'file:///excerpts2.csv'
        AS row
        MATCH  (t:Thesis{uri: row[0]})
        MERGE  (e:Exerpt{
                uri: row[1],
                text: row[2],
                page: row[3],
                lang: row[4]
                })-[r:BFO_0000050]-(t);
        """

# Links each excerpt to the Person nodes it mentions.
LOAD_PERSONS_QUERY = """
        LOAD CSV FROM 'file:///persons2.csv'
        AS row
        MATCH  (e:Exerpt{uri: row[0]})
        MERGE  (p:Person{uri: row[1]})
        MERGE  (e)-[r:mentions]-(p);
        """

# Links each excerpt to the Place nodes it mentions.
LOAD_GPES_QUERY = """
        LOAD CSV FROM 'file:///gpes2.csv'
        AS row
        MATCH  (e:Exerpt{uri: row[0]})
        MERGE  (p:Place{uri: row[1]})
        MERGE  (e)-[r:mentions]-(p);
        """

# Flags a thesis as processed. The URI is passed as a query parameter rather
# than concatenated into the Cypher text.
MARK_PROCESSED_QUERY = """
    MATCH  (t:Thesis{uri: $uri})
    SET t.processed = true
    """


def _remove_if_exists(path):
    """Delete ``path`` when it exists; do nothing otherwise."""
    if os.path.exists(path):
        os.remove(path)


def _load_thesis(graph, thesis_full_uri, thesis_uri, pdf_link):
    """Download one thesis, extract its excerpts and entities, and import them.

    Args:
        graph: Open Neo4j driver used to run the LOAD CSV queries.
        thesis_full_uri: URI of the Thesis node.
        thesis_uri: The same URI without its prefix, used in log messages.
        pdf_link: URL of the PDF to download.
    """
    # Download and extract; the PDF is removed even when extraction fails.
    try:
        download_file(pdf_link, thesis_uri)
        excerpt = extratc_text(PDF_PATH)
    finally:
        _remove_if_exists(PDF_PATH)

    excerpts = []
    persons = []
    gpes = []
    for n, page in excerpt.items():
        excerpt_uri = thesis_full_uri + '_p_' + str(n)
        excerpts.append([thesis_full_uri, excerpt_uri, page['text'], str(n), page['lang']])
        # Relations between the excerpt and the entities it mentions
        persons.extend([excerpt_uri, URI_PREFIX + person] for person in page['persons'])
        gpes.extend([excerpt_uri, URI_PREFIX + gpe] for gpe in page['gpes'])

    try:
        # Stage the rows as CSV files in the Neo4j import directory
        pd.DataFrame(excerpts).to_csv(EXCERPTS_CSV, index=False, header=False)
        pd.DataFrame(persons).to_csv(PERSONS_CSV, index=False, header=False)
        pd.DataFrame(gpes).to_csv(GPES_CSV, index=False, header=False)

        print(f"Loading triplas: {thesis_uri}")

        # Excerpts go first: the other two queries MATCH the Exerpt nodes.
        for query in (LOAD_EXCERPTS_QUERY, LOAD_PERSONS_QUERY, LOAD_GPES_QUERY):
            graph.execute_query(query)
    finally:
        # Remove the staging files whether or not the import succeeded
        for csv_path in (EXCERPTS_CSV, PERSONS_CSV, GPES_CSV):
            _remove_if_exists(csv_path)


# Record the start time of the run
start_time = pd.Timestamp.now()
parcial_time = pd.Timestamp.now()

# One driver serves the whole run; the context manager closes it on exit,
# including when the run is interrupted.
with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as graph:

    for i, row in enumerate(thesis_repository.itertuples(index=False)):

        thesis_full_uri = row.Thesis
        url = row.repository
        thesis_uri = thesis_full_uri[len(URI_PREFIX):]

        try:
            # Look up the PDF link on the repository page
            pdf_link = get_pdf_link(url)
            if pdf_link is not None:
                print(f"{i} - Downloading and extracting: {thesis_uri} from {pdf_link}")
                _load_thesis(graph, thesis_full_uri, thesis_uri, pdf_link)

                print(f"Total time: {pd.Timestamp.now() - start_time}")
                print(f"Partial time: {pd.Timestamp.now() - parcial_time}")
                # Reset the partial timer
                parcial_time = pd.Timestamp.now()
            else:
                print(f"No PDF link found for {thesis_uri}")
        except Exception as exc:
            # One unreadable PDF or failed request must not abort the batch.
            # The thesis is left unflagged so a later run picks it up again.
            print(f"{i} - Failed to process {thesis_uri}: {exc!r}")
            continue

        # Record in the graph that the thesis has been processed
        graph.execute_query(MARK_PROCESSED_QUERY, parameters_={'uri': thesis_full_uri})

0 - Downloading and extracting: catequese_e_evasao_etnografia_do_aldeamento_indigena_de_sao_pedro_de_alcantara_parana_18451855 from https://www.teses.usp.br/teses/disponiveis/8/8134/tde-16042024-150101/publico/1998_MartaRosaAmoroso_V1.pdf
Loading triplas: catequese_e_evasao_etnografia_do_aldeamento_indigena_de_sao_pedro_de_alcantara_parana_18451855
Total time: 0 days 00:14:11.361810
Partial time: 0 days 00:14:11.361810
1 - Downloading and extracting: efeitos_de_macronutrientes_no_crescimento_de_mudas_e_na_producao_do_guaranazeiro_paullinia_cupana_var_sorbilis from https://www.teses.usp.br/teses/disponiveis/11/11140/tde-20240301-154402/publico/725130.pdf


unknown widths : 
[0, IndirectObject(6236, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6233, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6230, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6227, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6224, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6221, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6218, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6215, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(3326, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(3323, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(3320, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(3317, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6236, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6233, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6230, 0, 1641710652304)]
unknown widths : 
[0, IndirectObject(6227, 0, 1641710652304)]
unknown 

Loading triplas: efeitos_de_macronutrientes_no_crescimento_de_mudas_e_na_producao_do_guaranazeiro_paullinia_cupana_var_sorbilis
Total time: 0 days 00:16:28.185169
Partial time: 0 days 00:02:16.823359
2 - Downloading and extracting: ritmos_biologicos_em_indios_guarani_adultos from https://www.teses.usp.br/teses/disponiveis/47/47135/tde-29052008-143531/publico/wey_do.pdf
Loading triplas: ritmos_biologicos_em_indios_guarani_adultos
Total time: 0 days 00:18:34.863083
Partial time: 0 days 00:02:06.678795
3 - Downloading and extracting: delivery_systems_for_simultaneous_intake_of_probiotics_and_plant_extracts from https://www.teses.usp.br/teses/disponiveis/74/74132/tde-22022022-082809/publico/DO5883165COR.pdf
Loading triplas: delivery_systems_for_simultaneous_intake_of_probiotics_and_plant_extracts
Total time: 0 days 00:20:04.792505
Partial time: 0 days 00:01:29.928541
4 - Downloading and extracting: nem_cidadaos_nem_brasileiros_indigenas_na_formacao_do_estado_nacional_brasileiro_e_conflitos

unknown widths : 
[0, IndirectObject(21066, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21069, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21072, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21075, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21078, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21054, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21057, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21060, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21063, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21066, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21069, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21072, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21075, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21078, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21054, 0, 1641933853520)]
unknown widths : 
[0, IndirectObject(21057, 0, 16419338

Loading triplas: xavante_e_a_politica_indigenista_no_brasil_nos_seculos_xviii_e_xix
Total time: 0 days 00:47:00.024193
Partial time: 0 days 00:12:14.751895
8 - Downloading and extracting: paranambuco_ocupacao_espacial_e_trabalho_indigena_na_capitania_de_pernambuco_nos_seculos_xvi_e_xvii from https://www.teses.usp.br/teses/disponiveis/8/8137/tde-06042023-114244/publico/2003_BartiraFerrazBarbosa.pdf
Loading triplas: paranambuco_ocupacao_espacial_e_trabalho_indigena_na_capitania_de_pernambuco_nos_seculos_xvi_e_xvii
Total time: 0 days 01:07:10.249979
Partial time: 0 days 00:20:10.224787
9 - Downloading and extracting: pelas_fronteiras_e_trincheiras_do_indigenismo_e_do_sanitarismo_a_atencao_as_dst_em_comunidades_indigenas_no_contexto_das_politicas_e_praticas_indigenistas_e_de_saude_na_preamazonia from https://www.teses.usp.br/teses/disponiveis/6/6135/tde-09042021-091521/publico/DR_545_Varga_2002.pdf
Loading triplas: pelas_fronteiras_e_trincheiras_do_indigenismo_e_do_sanitarismo_a_atencao_as

KeyboardInterrupt: 