In [None]:
# Importação de bibliotecas
import os
import sys
import time
import polars as pl
import pandas as pd
import logging
import logging.config
import requests
import fitz
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn.functional import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree as ET

In [None]:
def search_openalex(entity='works', **kwargs):
    """
    Query an OpenAlex API endpoint and return the decoded JSON response.

    Args:
    - entity (str): Path segment appended to https://api.openalex.org/
      (for example 'works' or 'concepts').
    - **kwargs: Forwarded unchanged as URL query parameters.

    Returns:
    - dict: The JSON body of the response, decoded.

    Raises:
    - requests.exceptions.RequestException: On connection failure, timeout,
      or an HTTP error status. The error is logged and then re-raised.
    """
    base_url = f'https://api.openalex.org/{entity}'
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(base_url, params=kwargs, timeout=30)
        # Turn 4xx/5xx answers into exceptions instead of returning an error
        # payload that callers would index as if it were a result set.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f'Request failed: {str(e)}')
        # Bare raise keeps the original traceback intact.
        raise

def preprocess_text(text, nlp_model):
    """
    Clean a text and reduce it to a space-separated string of lemmas.

    Args:
    - text (str): Raw text to clean.
    - nlp_model: Callable applied to the cleaned text. Iterating its result
      must yield tokens exposing `lemma_`, `is_stop` and `is_alpha`
      (for example a loaded spaCy pipeline).

    Returns:
    - str: Lemmas of the kept tokens, joined by single spaces.
    """
    # Drop everything that is not an ASCII letter or whitespace, then lowercase
    letters_only = re.sub(r'[^A-Za-z\s]', '', text).lower()

    # Keep the lemma of every alphabetic token that is not a stop word
    lemmas = []
    for token in nlp_model(letters_only):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)

    return ' '.join(lemmas)

def get_spacy_model():
    """
    Return the spaCy 'en_core_web_sm' pipeline.

    If loading raises OSError, the model is downloaded through spacy.cli
    and loaded again.
    """
    model_name = 'en_core_web_sm'
    try:
        return spacy.load(model_name)
    except OSError:
        # Imported lazily: only needed when the model has to be fetched
        from spacy.cli import download
        download(model_name)
        return spacy.load(model_name)

def get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5):
    """
    Rank the tokens of a text by how close their contextual embedding is to
    the embedding at position 0 of the sequence, and return the top ones.

    Args:
    - text (str): Text to process.
    - tokenizer: Callable producing the model inputs (must return
      'input_ids') and offering `convert_ids_to_tokens`.
    - model: Callable whose output exposes `last_hidden_state`.
    - tfidf_scores: Object with `.tolist()`. Its values are paired with the
      tokenizer's tokens BY POSITION and only break ties between equal
      similarities. If it is shorter than the token list, the extra tokens
      are dropped by the pairing.
    - top_n (int): Maximum number of words to return.

    Returns:
    - list: Up to `top_n` tokens, best first. The markers [CLS], [SEP] and
      [PAD] and tokens of three characters or fewer are excluded.
    """
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 serves as the reference vector; squeeze(0) drops the batch axis
    reference_vector = hidden_states[:, 0, :]
    token_vectors = hidden_states.squeeze(0)

    closeness = cosine_similarity(token_vectors, reference_vector, dim=-1).tolist()
    token_strings = tokenizer.convert_ids_to_tokens(encoded['input_ids'].squeeze(0))

    # Highest similarity first; the positional TF-IDF value breaks ties
    ranked = sorted(
        zip(token_strings, closeness, tfidf_scores.tolist()),
        key=lambda entry: (entry[1], entry[2]),
        reverse=True,
    )

    selected = []
    for token, _similarity, _tfidf in ranked:
        if token in special_tokens or len(token) <= 3:
            continue
        selected.append(token)

    return selected[:top_n]

def search_pubmed(doi):
    """
    Look up an article in PubMed through the NCBI E-utilities ESearch endpoint.

    Args:
    - doi (str): Value sent as the search `term`.

    Returns:
    - The first entry of the returned `idlist`, or None when the response
      contains no ids (or no `esearchresult`/`idlist` section at all).

    Raises:
    - requests.exceptions.RequestException: On connection failure, timeout,
      or an HTTP error status.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': doi,
        'retmode': 'json'
    }
    # Timeout: never block forever on a stalled connection
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP errors explicitly instead of failing later with a KeyError
    # on an error payload
    response.raise_for_status()
    data = response.json()

    # Treat a missing result section the same as "no match"
    idlist = data.get('esearchresult', {}).get('idlist')
    return idlist[0] if idlist else None

def fetch_abstract(pmid):
    """
    Retrieve the abstract text of a PubMed record through the NCBI
    E-utilities EFetch endpoint.

    Args:
    - pmid: PubMed identifier sent as the `id` parameter.

    Returns:
    - str: Text of every `AbstractText` element found in the XML response,
      joined by single spaces and stripped. Empty string if there is none.

    Raises:
    - requests.exceptions.RequestException: On connection failure, timeout,
      or an HTTP error status.
    - xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    params = {
        'db': 'pubmed',
        'id': pmid,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # itertext() gathers the full text of each section, including text inside
    # and after inline child tags. Element.text alone is None for an empty
    # element or one that starts with a child tag, which made the previous
    # string concatenation raise TypeError.
    sections = (''.join(node.itertext()) for node in root.findall(".//AbstractText"))
    return ' '.join(sections).strip()

In [None]:
# Search OpenAlex for concepts matching "health"
concept_params = {'search': 'health'}
concept_result = search_openalex(entity='concepts', **concept_params)

# Map each concept's display name to the last path segment of its OpenAlex id
concepts = {
    entry['display_name']: entry['id'].split('/')[-1]
    for entry in concept_result['results']
}

concepts

In [None]:
# OpenAlex id of the "Health psychology" concept taken from the concepts mapping
concept_id = concepts['Health psychology']

# Number of results requested per page
per_page = 200

# English-language works published from 2022 to 2024 and tagged with the
# selected concept. Only this single request is issued, so at most one page
# of results is retrieved.
work_params = dict(
    filter=f'language:en,from_publication_date:2022-01-01,to_publication_date:2024-12-31,concepts.id:{concept_id}',
    per_page=per_page,
)

work_result = search_openalex(entity='works', **work_params)

In [None]:
# Map OpenAlex work id -> bare DOI for every work that has both
doi_dict = {}
for work in work_result['results']:
    work_url = work.get('id')
    doi_url = work.get('doi')

    # Skip works lacking an id or a DOI BEFORE touching the strings:
    # calling .split on a missing (None) DOI raised AttributeError.
    if not work_url or not doi_url:
        continue

    open_alex_id = work_url.split('/')[-1]
    # Split once on the resolver host so a DOI whose suffix itself contains
    # "org/" is kept whole.
    doi = doi_url.split('doi.org/', 1)[-1]

    if open_alex_id and doi:
        doi_dict[open_alex_id] = doi

doi_dict

In [None]:
# NCBI asks clients without an API key to stay at or below 3 requests per
# second; pause after every E-utilities call so the loop is not rejected
# part-way through.
PUBMED_REQUEST_DELAY = 0.34

# Map OpenAlex work id -> abstract text for every DOI that PubMed knows
articles = {}
for article_id, doi in doi_dict.items():
    pmid = search_pubmed(doi)
    time.sleep(PUBMED_REQUEST_DELAY)

    if pmid:
        abstract = fetch_abstract(pmid)
        time.sleep(PUBMED_REQUEST_DELAY)
        articles[article_id] = abstract
    else:
        print(f'Article of ID {article_id} not found in PubMed')

In [None]:
# Display the abstracts collected from PubMed, keyed by OpenAlex work id
articles

In [None]:
# Load the spaCy pipeline once and reuse it for every abstract
nlp = get_spacy_model()

# Map article id -> pre-processed abstract
prep_texts = {
    article_id: preprocess_text(text, nlp)
    for article_id, text in articles.items()
}

In [None]:
# Fit TF-IDF on the pre-processed abstracts; matrix rows follow the
# insertion order of prep_texts.
corpus = list(prep_texts.values())
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# Fitted mapping term -> column index of tfidf_matrix
tfidf_vocabulary = tfidf_vectorizer.vocabulary_

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

representative_words_dict = {}

for i, (article_id, text) in enumerate(prep_texts.items()):
    doc_scores = tfidf_matrix[i].toarray().flatten()

    # get_representative_words pairs tfidf_scores with the tokenizer's tokens
    # by position. The raw matrix row is indexed by vocabulary column, so
    # passing it directly attached an unrelated term's score to each token.
    # Build one score per token instead, tokenizing with the same truncation
    # settings the function uses; tokens absent from the TF-IDF vocabulary
    # (special markers, sub-word pieces) get 0.0.
    token_ids = tokenizer(text, truncation=True, max_length=512)['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    tfidf_scores = torch.tensor([
        float(doc_scores[tfidf_vocabulary[token]]) if token in tfidf_vocabulary else 0.0
        for token in tokens
    ])

    representative_words = get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5)
    representative_words_dict[article_id] = representative_words

representative_words_dict