In [10]:
# Importação de bibliotecas
import os
import sys
import time
import polars as pl
import pandas as pd
import logging
import logging.config
import requests
import fitz
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn.functional import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [11]:
def search_openalex(entity='works', **kwargs):
    """
    Função genérica para buscar na OpenAlex.

    Args:
    - entity (str): Tipo de entidade a ser buscada ('works' ou 'concepts').
    - **kwargs: Parâmetros de consulta para a API da OpenAlex.

    Returns:
    - dict: Resposta da API da OpenAlex em formato JSON.
    """
    base_url = f'https://api.openalex.org/{entity}'
    try:
        response = requests.get(base_url, params=kwargs)

        return response.json()
    
    except requests.exceptions.RequestException as e:
        logging.error(f'Request failed: {str(e)}')
        raise e

def get_pdf_url(work):
    """
    Verifica e retorna o URL de PDF de acesso aberto para um trabalho.

    Args:
    - work (dict): Metadados de um trabalho.

    Returns:
    - str: URL de PDF de acesso aberto ou None.
    """
    try:
        if 'primary_location' in work and work['primary_location']:
            pdf_url = work['primary_location'].get('pdf_url')

            if pdf_url:
                str_pdf_url = pdf_url.split('/')[-1]
                str_pdf_url = str_pdf_url.split('.')[-1]

                if str_pdf_url == 'pdf':
                    return pdf_url
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f'Error: {str(e)}')
        raise e

def extract_text_from_pdf_url(pdf_url):
    """
    Extrai o texto de um PDF diretamente de uma URL.

    Args:
    - pdf_url (str): URL do PDF.

    Returns:
    - str: Texto extraído do PDF.
    """
    text = ""
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
        pdf_bytes = response.content
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text += page.get_text()
    except Exception as e:
        logging.error(f"Error: {e}")
    return text

def clean_text(text):
    """
    Limpa o texto extraído de um PDF e extrai apenas o abstract.

    Args:
    - text (str): Texto extraído do PDF.

    Returns:
    - str: Texto limpo contendo apenas o abstract.
    """
    try:
        # Substituir múltiplas quebras de linha e espaços por um único espaço
        text = re.sub(r'\s+', ' ', text)

        # Remover caracteres de controle como \xa0 e \u2003
        text = text.replace(u'\xa0', u' ').replace(u'\u2003', u' ')

        # Remover URLs (opcional)
        text = re.sub(r'http[s]?://\S+', '', text)

        # Extrair apenas o abstract
        abstract_start = re.search(r'\bAbstract\b', text, re.IGNORECASE)
        if abstract_start:
            # Encontrar o início do abstract
            start_pos = abstract_start.end()
            # Encontrar o próximo parágrafo em branco
            end_pos = re.search(r'\n\s*\n', text[start_pos:])
            if end_pos:
                end_pos = start_pos + end_pos.start()
                text = text[start_pos:end_pos].strip()
            else:
                text = text[start_pos:].strip()
        else:
            # Se não encontrar "Abstract", retorna o texto original limpo
            text = text.strip()

        return text
    except Exception as e:
        logging.error(f'Error: {str(e)}')
        return ""


def preprocess_text(text, nlp_model):
    """
    Pré-processa o texto para uso em um modelo de PLN.

    Args:
    - text (str): Texto a ser pré-processado.

    Returns:
    - str: Texto pré-processado.
    """
    # Remover caracteres especiais, números e sinais de pontuação
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Converter para minúsculas
    text = text.lower()

    # Processar o texto com spaCy
    doc = nlp_model(text)

    # Tokenizar, remover stopwords e lematizar
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Reunir tokens em uma string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

def download_spacy_model():
    """
    Baixa o modelo de linguagem spaCy se não estiver presente.
    """
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        from spacy.cli import download
        download('en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    return nlp

def get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5):
    """
    Gera embeddings contextuais para o texto e extrai as palavras mais representativas.

    Args:
    - text (str): Texto a ser processado.
    - tfidf_scores: Pontuações TF-IDF para o texto.
    - top_n (int): Número de palavras representativas a serem extraídas.

    Returns:
    - list: Lista de palavras mais representativas.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    word_embeddings = outputs.last_hidden_state.squeeze(0)

    similarities = cosine_similarity(word_embeddings, cls_embedding, dim=-1)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))

    word_similarities = list(zip(tokens, similarities.tolist(), tfidf_scores.tolist()))
    word_similarities.sort(key=lambda x: (x[1], x[2]), reverse=True)

    representative_words = [word for word, sim, tfidf in word_similarities if word not in ["[CLS]", "[SEP]", "[PAD]"] and len(word) > 3][:top_n]

    return representative_words

In [12]:
# Buscar conceitos relacionados à saúde
concept_params = {
    'search': 'health'
}
concept_result = search_openalex(entity='concepts', **concept_params)

concepts = {}
for concept in concept_result['results']:
    concepts[concept['display_name']] = concept['id'].split('/')[-1]

concepts

{'Environmental health': 'C99454951',
 'Health care': 'C160735492',
 'Public health': 'C138816342',
 'Mental health': 'C134362201',
 'Occupational safety and health': 'C187155963',
 'Human health': 'C2987857752',
 'Health equity': 'C2250968',
 'Health psychology': 'C155164915',
 'Global health': 'C46578552',
 'Health policy': 'C47344431',
 'Health promotion': 'C185618831',
 'Health informatics': 'C145642194',
 'Oral health': 'C2992672162',
 'Reproductive health': 'C121752807',
 'Health services': 'C2986740045',
 'Health professionals': 'C3019806175',
 'Health benefits': 'C3018122547',
 'Health administration': 'C137992405',
 'Health education': 'C113807197',
 'National Health and Nutrition Examination Survey': 'C2779874844',
 'Health insurance': 'C2983635472',
 'Population health': 'C2778149918',
 'Social determinants of health': 'C78491826',
 'Health literacy': 'C2778843546',
 'Community health': 'C2775951005'}

In [13]:
concept_id = concepts['Health psychology']

# Definir parâmetros de paginação
per_page = 40

# Buscar artigos em inglês de 2022 a 2024 dos conceitos selecionados, com paginação
work_params = {
    'filter': f'language:en,from_publication_date:2022-01-01,to_publication_date:2024-12-31,concepts.id:{concept_id}',
    'per_page': per_page,
}

work_result = search_openalex(entity='works', **work_params)

In [14]:
texts = {}
pdfs = {}
for work in work_result['results']:
    open_alex_id = work.get('id').split('/')[-1]
    pdf_url = get_pdf_url(work)

    text = False
    if pdf_url:
        text = extract_text_from_pdf_url(pdf_url)
        text = clean_text(text)
    
    if open_alex_id and text:
        texts[open_alex_id] = text
        pdfs[open_alex_id] = pdf_url

ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/abm/advance-article-pdf/doi/10.1093/abm/kaac039/45037045/kaac039.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/tbm/advance-article-pdf/doi/10.1093/tbm/ibad014/50031314/ibad014.pdf
ERROR:root:Error: 403 Client Error: Forbidden for url: https://academic.oup.com/abm/article-pdf/56/8/781/45214298/kaac023.pdf


In [15]:
nlp = download_spacy_model()

prep_texts = {}
for article_id, text in texts.items():
    prep_texts[article_id] = preprocess_text(text, nlp)

In [16]:
corpus = list(prep_texts.values())
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

representative_words_dict = {}

for i, (article_id, text) in enumerate(prep_texts.items()):
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    representative_words = get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5)
    representative_words_dict[article_id] = representative_words

representative_words_dict

{'W4207080146': ['research', 'lack', 'comprise', 'govern', 'solution'],
 'W4206998862': ['administer', 'include', 'evenly', 'speak', 'survey'],
 'W4205630067': ['validation', 'model', 'reliability', 'examine', 'extant'],
 'W4226453222': ['analysis', 'research', 'examine', 'prevalence', 'complete'],
 'W4210338148': ['social', 'initiative', 'research', 'suggest', 'develop'],
 'W4220657291': ['evidence', '##le', 'propose', 'report', 'evidence'],
 'W4210916174': ['regression', 'examine', 'model', 'association', 'examine'],
 'W4225113282': ['widespread', 'percent', 'remain', 'perceive', 'compare'],
 'W4213426836': ['include', 'access', 'proportion', 'include', 'include'],
 'W4313424320': ['follow', '##ml', 'support', 'compare', 'examine'],
 'W4315491255': ['individual', 'model', 'analysis', 'hypothetical', 'sample'],
 'W4321434852': ['include', 'organize', 'address', 'particularly', 'develop'],
 'W4321850362': ['change',
  'prevention',
  'approach',
  'currently',
  'participant'],
 'W4361