In [40]:
# Importação de bibliotecas
import os
import polars as pl
import pandas as pd
import logging
import logging.config
import time
import psutil
import sys
import datetime
import requests
import json
import warnings
import fitz
import re

In [41]:
def search_openalex(entity='works', **kwargs):
    """
    Query an OpenAlex API endpoint and return the decoded JSON response.

    Args:
    - entity (str): Entity collection to query; inserted verbatim as the URL
      path segment (e.g. 'works' or 'concepts').
    - **kwargs: Query-string parameters forwarded unchanged to the API.

    Returns:
    - dict: Parsed JSON body of the response.

    Raises:
    - requests.exceptions.RequestException: On connection problems, timeouts
      or HTTP error statuses (4xx/5xx). The error is logged and re-raised.
    """
    base_url = f'https://api.openalex.org/{entity}'
    # Kept as a local constant rather than a keyword argument: every keyword
    # argument of this function is sent to the API as a query parameter.
    request_timeout = 30  # seconds

    try:
        # Without a timeout, requests can wait indefinitely on a stalled server.
        response = requests.get(base_url, params=kwargs, timeout=request_timeout)

        # Fail here on 4xx/5xx instead of handing back an error payload that
        # callers would trip over later when indexing into the result.
        response.raise_for_status()

        return response.json()

    except requests.exceptions.RequestException as e:
        logging.error(f'Request failed: {str(e)}')
        # Bare raise keeps the original traceback intact.
        raise

def get_pdf_url(work):
    """
    Return the PDF link of a work's primary location if it points at a PDF.

    The link is accepted when the last path segment of the URL ends in the
    extension 'pdf' (case-insensitive), or is exactly 'pdf'. Query strings
    and fragments are ignored for this check, so links such as
    '.../paper.pdf?download=1' are recognised.

    Args:
    - work (dict): Work metadata; read via work['primary_location']['pdf_url'].

    Returns:
    - str: The unmodified PDF URL, or None when the work has no primary
      location, no PDF URL, or the URL does not look like a PDF file.
    """
    primary_location = work.get('primary_location')
    if not primary_location:
        return None

    pdf_url = primary_location.get('pdf_url')
    if not pdf_url:
        return None

    # Judge the link by its path only: drop the fragment, then the query string.
    path = pdf_url.split('#', 1)[0].split('?', 1)[0]

    # Text after the last dot of the last path segment (the whole segment when
    # it contains no dot, which is why a trailing '/pdf' is accepted too).
    extension = path.split('/')[-1].split('.')[-1]

    return pdf_url if extension.lower() == 'pdf' else None

def extract_text_from_pdf_url(pdf_url, timeout=60):
    """
    Download a PDF from a URL and return the concatenated text of its pages.

    This is best-effort: any failure (network error, HTTP error status,
    unreadable PDF, ...) is logged and not raised. The text gathered up to the
    point of failure is returned, which is an empty string when the download
    itself failed.

    Args:
    - pdf_url (str): URL of the PDF.
    - timeout (float): Seconds passed to requests.get as its timeout, so a
      stalled server cannot block the caller forever.

    Returns:
    - str: Text extracted from the PDF (possibly empty).
    """
    # Collect per-page text and join once, instead of growing a string in a loop.
    page_texts = []
    try:
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # Open the PDF straight from memory; nothing is written to disk.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Deliberately broad: one bad document must not abort a batch run.
        logging.error(f"Error: {e}")
    return "".join(page_texts)

def clean_text(text, keywords=None):
    """
    Normalise text extracted from a PDF and cut off its reference section.

    Steps, in order:
    1. Remove http/https URLs.
    2. Collapse every run of whitespace into a single space.
    3. Truncate the text at the last occurrence of the first keyword (in the
       given order) that appears in it.
    4. Strip leading and trailing whitespace.

    Args:
    - text (str): Text extracted from the PDF. None or '' yields ''.
    - keywords (iterable of str, optional): Case-sensitive headings that mark
      the start of the reference section, in priority order. Defaults to
      'References', 'Bibliography', 'Works Cited', 'Literature Cited'.

    Returns:
    - str: Cleaned text.
    """
    if not text:
        return ''

    if keywords is None:
        keywords = ('References', 'Bibliography', 'Works Cited', 'Literature Cited')

    # Remove URLs before collapsing whitespace; doing it afterwards leaves a
    # double space wherever a URL used to be.
    text = re.sub(r'http[s]?://\S+', '', text)

    # \s matches Unicode whitespace for str patterns, so non-breaking spaces
    # (\xa0) and em spaces (\u2003) are normalised by this step as well.
    text = re.sub(r'\s+', ' ', text)

    # Keywords are searched after whitespace collapsing so that a multi-word
    # heading split across lines (e.g. "Works\nCited") is still found.
    for keyword in keywords:
        last_occurrence = text.rfind(keyword)
        if last_occurrence != -1:
            text = text[:last_occurrence]
            break

    return text.strip()

In [42]:
# Search OpenAlex for concepts matching "health"
concept_params = {'search': 'health'}
concept_result = search_openalex(entity='concepts', **concept_params)

# Map each concept's display name to the last '/'-separated segment of its id
concepts = {
    concept['display_name']: concept['id'].rsplit('/', 1)[-1]
    for concept in concept_result['results']
}

concepts

{'Environmental health': 'C99454951',
 'Health care': 'C160735492',
 'Public health': 'C138816342',
 'Mental health': 'C134362201',
 'Occupational safety and health': 'C187155963',
 'Human health': 'C2987857752',
 'Health equity': 'C2250968',
 'Health psychology': 'C155164915',
 'Global health': 'C46578552',
 'Health policy': 'C47344431',
 'Health promotion': 'C185618831',
 'Health informatics': 'C145642194',
 'Oral health': 'C2992672162',
 'Reproductive health': 'C121752807',
 'Health services': 'C2986740045',
 'Health professionals': 'C3019806175',
 'Health benefits': 'C3018122547',
 'Health administration': 'C137992405',
 'Health education': 'C113807197',
 'National Health and Nutrition Examination Survey': 'C2779874844',
 'Health insurance': 'C2983635472',
 'Population health': 'C2778149918',
 'Social determinants of health': 'C78491826',
 'Health literacy': 'C2778843546',
 'Community health': 'C2775951005'}

In [43]:
# Alternative selection combining several concepts (currently disabled):
# concept_id = ' | '.join([concepts['Health psychology'], concepts['Mental health']])

# Concept whose works are queried
concept_id = concepts['Health psychology']

# Page size requested from the API
per_page = 200

# English-language works published from 2022 through 2024 for the selected
# concept. Only the one page returned by this single request is retrieved;
# no further pages are requested.
filter_clauses = [
    'language:en',
    'from_publication_date:2022-01-01',
    'to_publication_date:2024-12-31',
    f'concepts.id:{concept_id}',
]
work_params = dict(filter=','.join(filter_clauses), per_page=per_page)

work_result = search_openalex(entity='works', **work_params)

In [44]:
# Collect the cleaned full text of every work that exposes a usable PDF link,
# keyed by the last '/'-separated segment of the work's id.
works = {}
for work in work_result['results']:
    # Resolve the id defensively: calling .split on a missing id would raise
    # AttributeError before any emptiness check could run.
    work_url = work.get('id')
    open_alex_id = work_url.split('/')[-1] if work_url else None
    if not open_alex_id:
        continue

    pdf_url = get_pdf_url(work)
    if not pdf_url:
        continue

    # extract_text_from_pdf_url logs failures and returns '' for them, so an
    # empty result means there is nothing usable and the work is skipped.
    text = clean_text(extract_text_from_pdf_url(pdf_url))
    if text:
        works[open_alex_id] = text

ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/abm/advance-article-pdf/doi/10.1093/abm/kaac039/45037045/kaac039.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/tbm/advance-article-pdf/doi/10.1093/tbm/ibad014/50031314/ibad014.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/abm/article-pdf/56/8/781/45214298/kaac023.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/tbm/article-pdf/13/6/400/51881524/ibad010.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/tbm/advance-article-pdf/doi/10.1093/tbm/ibad034/50698147/ibad034.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/tbm/advance-article-pdf/doi/10.1093/tbm/ibac105/48846927/ibac105.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/abm/article-pdf/57/11/929/52174619/kaad054.pdf
ERROR:root:Error: 403 Cli

In [45]:
# Preview the text of the first collected work. Evaluates to None (no cell
# output) instead of raising IndexError when no PDF could be extracted.
next(iter(works.values()), None)

'Vol.:(0123456789) 1 3 J Behav Med (2023) 46:153–166  Addressing racial/ethnic inequities in vaccine hesitancy and uptake: lessons learned from the California alliance against COVID‑19 Mona AuYoung1 · Patricia Rodriguez Espinosa2,3 · Wei‑ting Chen2 · Preeti Juturu4 · Maria‑Elena De Trinidad Young5 · Alejandra Casillas6 · Paris Adkins‑Jackson7,8 · Suellen Hopfer9 · Ed Kissam10,11 · Audrey Kawaiopua Alo12 · Roberto A. Vargas13 · Arleen F. Brown6 · And the STOP COVID-19 C. A. Communications Working Group Received: 3 September 2021 / Accepted: 3 January 2022 / Published online: 22 January 2022 Working Group, we demonstrate the wide range of strate- gies, communication methods, languages, and trusted mes- sengers that have been effective in reaching diverse com- munities across the state. We also showcase challenges and lessons learned, such as the importance of including trusted community partners to share information or provide vac- cines. These approaches, rooted in community engagement,