In [2]:
# Importação de bibliotecas

import os
import sys
import polars as pl
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn.functional import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree as ET
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Project root definition.
# The root can be overridden through the TOPIC_MODELING_ROOT environment
# variable so the notebook is not tied to one machine's drive layout; the
# original absolute path is kept as the default.
PROJECT_ROOT = os.environ.get('TOPIC_MODELING_ROOT', 'G:/Csouza/nlp/topic_modeling')

# Switch the working directory to the project root.
os.chdir(PROJECT_ROOT)

# Put the project root on the import path. The membership check keeps
# re-running this cell from stacking duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [4]:
from src.etl.abstracts_med import AbstractOpenAlexPubMedETL

In [5]:
# Instantiate the ETL class under the name that is actually imported
# (AbstractOpenAlexPubMedETL); the previous spelling was never imported.
# The instance is not called `abs` so the builtin abs() is not shadowed.
abstracts_etl = AbstractOpenAlexPubMedETL()

load_path = os.path.join(PROJECT_ROOT, 'data', 'processed', 'open_alex', 'pubmed')
file_name = 'abstracts'
# Build the file path with os.path.join, consistently with load_path.
full_path = os.path.join(load_path, f'{file_name}.parquet')

# Only invoke the ETL when the parquet file is not on disk yet.
# NOTE(review): presumably load_abstracts writes <load_path>/<file_name>.parquet — confirm.
if not os.path.exists(full_path):
    abstracts_etl.load_abstracts(load_path=load_path, file_name=file_name)

abstracts = pl.read_parquet(full_path)

In [7]:
# Display the loaded abstracts.
# NOTE(review): the rendered output below is a dict of one-element polars Series
# keyed by article id, whereas the preceding cell assigns the result of
# pl.read_parquet. The execution counter also jumps from 5 to 7, so a conversion
# cell appears to have been removed — confirm, and restore it if so.
abstracts

{'W4394814112': shape: (1,)
 Series: 'W4394814112' [str]
 [
 	"In the current study, high-thr…
 ],
 'W4318671511': shape: (1,)
 Series: 'W4318671511' [str]
 [
 	"Classic theories of stress and…
 ],
 'W4207080146': shape: (1,)
 Series: 'W4207080146' [str]
 [
 	"Lack of trust in biomedical re…
 ],
 'W4206998862': shape: (1,)
 Series: 'W4206998862' [str]
 [
 	"We administered a survey durin…
 ],
 'W4210831730': shape: (1,)
 Series: 'W4210831730' [str]
 [
 	"Hospital-based addiction care …
 ],
 'W4205630067': shape: (1,)
 Series: 'W4205630067' [str]
 [
 	"The validation of the Bergen S…
 ],
 'W4210338148': shape: (1,)
 Series: 'W4210338148' [str]
 [
 	"Social support is consistently…
 ],
 'W4226453222': shape: (1,)
 Series: 'W4226453222' [str]
 [
 	"COVID-19 vaccination is recomm…
 ],
 'W4210494543': shape: (1,)
 Series: 'W4210494543' [str]
 [
 	"In the current study, high-thr…
 ],
 'W4220657291': shape: (1,)
 Series: 'W4220657291' [str]
 [
 	"There is evidence that univers…
 ],
 'W4288053

In [None]:
def preprocess_text(text, nlp_model):
    """
    Pre-process a text for use in an NLP model.

    Steps: replace every character that is not an ASCII letter or whitespace
    with a space, lowercase, run the spaCy pipeline, drop stop words and
    non-alphabetic tokens, and join the remaining lemmas with single spaces.

    Args:
    - text (str | None): Text to pre-process. None is treated as empty text.
    - nlp_model: Callable (e.g. a loaded spaCy pipeline) that takes a string
      and returns an iterable of tokens exposing `lemma_`, `is_stop` and
      `is_alpha`.

    Returns:
    - str: Pre-processed text (space-separated lemmas); '' for None input.
    """
    if text is None:
        return ''

    # Replace special characters, digits and punctuation with a space rather
    # than deleting them, so that words joined by a hyphen or slash
    # ("high-throughput") stay separate instead of fusing into one token.
    text = re.sub(r'[^A-Za-z\s]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Run the text through the spaCy pipeline
    doc = nlp_model(text)

    # Keep the lemma of every alphabetic token that is not a stop word
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Join the lemmas back into a single string
    return ' '.join(tokens)

def get_spacy_model(model_name='en_core_web_sm'):
    """
    Load a spaCy language model, downloading it first if it is not installed.

    Args:
    - model_name (str): Name of the spaCy model package to load. Defaults to
      'en_core_web_sm', the model this notebook used originally.

    Returns:
    - The loaded spaCy pipeline object returned by `spacy.load`.
    """
    try:
        nlp = spacy.load(model_name)
    except OSError:
        # spacy.load raised OSError: fetch the model, then load it again.
        # The import is local because it is only needed on this fallback path.
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    return nlp

def _tfidf_per_token(tokens, tfidf_scores, feature_names=None):
    """Return one TF-IDF value per token, aligned with `tokens` (0.0 when unknown)."""
    if tfidf_scores is None:
        return [0.0] * len(tokens)

    if isinstance(tfidf_scores, dict):
        lookup = tfidf_scores
    else:
        values = tfidf_scores.tolist() if hasattr(tfidf_scores, 'tolist') else list(tfidf_scores)
        if feature_names is not None:
            names = feature_names.tolist() if hasattr(feature_names, 'tolist') else list(feature_names)
            lookup = dict(zip(names, values))
        elif len(values) == len(tokens):
            # Already one score per token: use it positionally.
            return [float(v) for v in values]
        else:
            # A vocabulary-indexed vector cannot be matched to tokens without
            # the vocabulary itself; use a neutral tie-breaker instead of
            # pairing tokens with scores that belong to unrelated terms.
            return [0.0] * len(tokens)

    return [float(lookup.get(token, 0.0)) for token in tokens]


def get_representative_words(text, tokenizer, model, tfidf_scores, top_n=5, feature_names=None):
    """
    Build contextual embeddings for the text and extract its most representative words.

    Every token is scored by the cosine similarity between its last-layer
    embedding and the embedding at position 0 (the [CLS] slot); the token's
    TF-IDF value is used only to break ties between equal similarities.

    Args:
    - text (str): Text to process.
    - tokenizer: Callable tokenizer returning a mapping with an 'input_ids'
      tensor and providing `convert_ids_to_tokens`.
    - model: Callable returning an object with a `last_hidden_state` tensor.
      The code assumes a single text per call (batch dimension of size 1).
    - tfidf_scores: TF-IDF information for the text. Either a {term: score}
      dict, or an array-like of scores. An array-like is matched to tokens by
      term when `feature_names` is given, positionally when it has exactly one
      entry per token, and otherwise ignored (all tie-breakers are 0.0).
    - top_n (int): Number of representative words to return.
    - feature_names: Optional sequence of vocabulary terms, parallel to an
      array-like `tfidf_scores` (e.g. TfidfVectorizer.get_feature_names_out()).

    Returns:
    - list: Up to `top_n` distinct tokens, most representative first.
      Special tokens and tokens of three characters or fewer are excluded.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    word_embeddings = outputs.last_hidden_state.squeeze(0)

    # cls_embedding has a leading dimension of 1 and broadcasts against every token row.
    similarities = cosine_similarity(word_embeddings, cls_embedding, dim=-1)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))

    # Align TF-IDF with the tokens explicitly; zipping the raw vocabulary
    # vector would both mis-pair scores and truncate the token list.
    token_tfidf = _tfidf_per_token(tokens, tfidf_scores, feature_names)

    word_similarities = list(zip(tokens, similarities.tolist(), token_tfidf))
    word_similarities.sort(key=lambda x: (x[1], x[2]), reverse=True)

    candidates = [
        word for word, sim, tfidf in word_similarities
        if word not in ["[CLS]", "[SEP]", "[PAD]"] and len(word) > 3
    ]

    # A word that occurs several times in the text yields several tokens;
    # keep only its best-ranked occurrence so the result holds distinct words.
    representative_words = list(dict.fromkeys(candidates))[:top_n]

    return representative_words

In [None]:
nlp = get_spacy_model()

# The load cell assigns `abstracts` from pl.read_parquet, i.e. a polars
# DataFrame, which does not offer dict-style .items(). Convert it to a plain
# {column name: list of values} dict; an object that is already a dict
# (as in the displayed output) is used unchanged.
if isinstance(abstracts, pl.DataFrame):
    abstract_columns = abstracts.to_dict(as_series=False)
else:
    abstract_columns = abstracts

prep_texts = {}
for article_id, column in abstract_columns.items():
    # Each entry wraps the abstract in a one-element container (Series or
    # list, per the displayed output); unwrap it because preprocess_text
    # needs a plain str. NOTE(review): only the first element is used —
    # confirm there is exactly one row per article.
    if column is None or isinstance(column, str):
        text = column
    else:
        text = column[0] if len(column) else None

    # A missing abstract is pre-processed as empty text.
    prep_texts[article_id] = preprocess_text(text if text is not None else '', nlp)

In [None]:
# Fit TF-IDF on the pre-processed abstracts. Row k of the matrix corresponds
# to the k-th entry of prep_texts, since both follow the dict's insertion order.
corpus = [*prep_texts.values()]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Pre-trained DistilBERT tokenizer and encoder used for the contextual embeddings.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

representative_words_dict = {}

for row_idx, article_id in enumerate(prep_texts):
    # Dense 1-D TF-IDF vector for this abstract: one entry per vocabulary
    # term (parallel to tfidf_feature_names), not one entry per BERT token.
    dense_row = tfidf_matrix[row_idx].toarray().flatten()
    representative_words_dict[article_id] = get_representative_words(
        prep_texts[article_id], tokenizer, model, dense_row, top_n=5
    )

representative_words_dict