In [24]:
# Importação de bibliotecas
import os
import sys
import polars as pl
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn.functional import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree as ET
from transformers import AutoTokenizer, AutoModel


# Definicao da raiz do projeto
PROJECT_ROOT = 'G:/Csouza/nlp/topic_modeling'
os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

from src.etl.abstracts_med import AbstractOpenAlexPubMed

In [20]:
abs = AbstractOpenAlexPubMed()

abstracts = abs.abstracts()

In [25]:
abstracts

{'W4394814112': 'Artificial Intelligence for Detecting Cephalometric Landmarks: A Systematic Review and Meta-analysis. J Digit Imaging. 2023 Jun;36(3):1158-1179. doi:10.1007/s10278-022-00766-w. The study was financed in part by the Coordenacao de Aperfeicoamentode Pessoal de Nivel Superior-Brazil (CAPES)-Finance Code 001. Systematic review and meta-analysis.',
 'W4318671511': 'Classic theories of stress and health are largely based on assumptions regarding how different psychosocial stressors influence biological processes that, in turn, affect human health and behavior. Although theoretically rich, this work has yielded little consensus and led to numerous conceptual, measurement, and reproducibility issues. Social Safety Theory aims to address these issues by using the primary goal and regulatory logic of the human brain and immune system as the basis for specifying the social-environmental situations to which these systems should respond most strongly to maximize reproductive succes

In [30]:
def preprocess_text(text, nlp_model):
    """
    Pré-processa o texto para uso em um modelo de PLN.

    Args:
    - text (str): Texto a ser pré-processado.

    Returns:
    - str: Texto pré-processado.
    """
    # Remover caracteres especiais, números e sinais de pontuação
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Converter para minúsculas
    text = text.lower()

    # Processar o texto com spaCy
    doc = nlp_model(text)

    # Tokenizar, remover stopwords e lematizar
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Reunir tokens em uma string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

def get_spacy_model():
    """
    Baixa o modelo de linguagem spaCy se não estiver presente.
    """
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        from spacy.cli import download
        download('en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    return nlp

def get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5):
    """
    Gera embeddings contextuais para o texto e extrai as palavras mais representativas.

    Args:
    - text (str): Texto a ser processado.
    - tfidf_scores: Pontuações TF-IDF para o texto.
    - top_n (int): Número de palavras representativas a serem extraídas.

    Returns:
    - list: Lista de palavras mais representativas.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    word_embeddings = outputs.last_hidden_state.squeeze(0)

    similarities = cosine_similarity(word_embeddings, cls_embedding, dim=-1)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))

    word_similarities = list(zip(tokens, similarities.tolist(), tfidf_scores.tolist()))
    word_similarities.sort(key=lambda x: (x[1], x[2]), reverse=True)

    representative_words = [word for word, sim, tfidf in word_similarities if word not in ["[CLS]", "[SEP]", "[PAD]"] and len(word) > 3][:top_n]

    return representative_words

In [28]:
nlp = get_spacy_model()

prep_texts = {}
for article_id, text in abstracts.items():
    prep_texts[article_id] = preprocess_text(text, nlp)

In [23]:
corpus = list(prep_texts.values())
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

representative_words_dict = {}

for i, (article_id, text) in enumerate(prep_texts.items()):
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    representative_words = get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5)
    representative_words_dict[article_id] = representative_words

representative_words_dict

{'W4394814112': ['artificial', 'study', '##sw', 'finance', 'code'],
 'W4318671511': ['formulation',
  'theory',
  'describe',
  'integrate',
  'transform'],
 'W4207080146': ['lack', 'highlight', 'access', 'comprise', 'exist'],
 'W4206998862': ['include', 'administer', 'survey', 'report', 'distribute'],
 'W4210831730': ['describe',
  'incorporate',
  'implementation',
  'generally',
  'initiate'],
 'W4205630067': ['validation', 'study', 'bergen', 'analysis', 'participant'],
 'W4226453222': ['analysis', 'research', 'public', 'examine', 'prevalence'],
 'W4210338148': ['social', '##or', 'consistently', 'current', 'research'],
 'W4210494543': ['artificial', 'study', '##sw', 'finance', 'code'],
 'W4220657291': ['evidence', '##led', 'propose', 'report', 'evidence'],
 'W4288053855': ['collect', 'develop', 'tailor', 'intervention', 'initiative'],
 'W4210916174': ['##person', 'association', 'people', 'result', 'expansion'],
 'W4225113282': ['widespread', 'perceive', 'reside', 'appear', 'liberal'

['and', 'and', 'vaccine', 'vaccine', 'vaccine']