<a href="https://colab.research.google.com/github/citlaline/ecmi_teste/blob/main/Projeto_7_Modelagem_de_T%C3%B3picos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelagem de Tópicos

In [1]:
# Para o grupo que usar o BERTopic, é preciso ativar a T4 GPU (Ambiente de Execução > Alterar o Tipo de Ambiente de Execução > T4 GPU > Salvar)
# e rodar essa célula para instalar o BERTopic
!pip install bertopic keybert sentence-transformers umap-learn

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keybert-0.8.5-py3-none-any.whl (37 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x8

### Para todos (menos BERTopic)

In [None]:
import nltk
import pandas as pd

# NLTK resources used by the cleaning steps of the cells below.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load 'punkt_tab' (not the pickled 'punkt' models) in
# word_tokenize, so fetch it as well to avoid a LookupError when tokenizing.
nltk.download('punkt_tab')

artigos = pd.read_csv('articles.csv')

# Only the last expression of a cell is rendered automatically, so the preview
# needs an explicit display() to be shown alongside the category counts.
display(artigos.head())
artigos.value_counts('category')

## Algoritmo K-Means

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Cleaning: lowercase the documents here; stopwords are removed by the vectorizer.
# NOTE(review): this uses the 'title' column (the same text the BERTopic cell
# embeds) and assumes a Portuguese corpus -- confirm both against articles.csv.
stop_words_pt = stopwords.words('portuguese')
textos = artigos['title'].fillna('').astype(str).str.lower().tolist()

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words_pt)
X = vectorizer.fit_transform(textos)

num_clusters = 10
num_top_terms = 15  # same number of terms as the LDA and NMF cells print

# Fixed seed so the clusters (and therefore the topics) are the same on every run.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Show the topics: the highest-weighted terms of each cluster centroid.
termos = vectorizer.get_feature_names_out()

for i in range(num_clusters):
    print(f"Tópico {i + 1}:")
    # argsort is ascending; reverse it to get the heaviest terms first.
    sorted_indices = kmeans.cluster_centers_[i].argsort()[::-1]
    top_terms = [termos[index] for index in sorted_indices[:num_top_terms]]
    print(", ".join(top_terms))

## Algoritmo LDA



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Cleaning: lowercase the documents here; stopwords are removed by the vectorizer.
# NOTE(review): this uses the 'title' column (the same text the BERTopic cell
# embeds) and assumes a Portuguese corpus -- confirm both against articles.csv.
stop_words_pt = stopwords.words('portuguese')
textos = artigos['title'].fillna('').astype(str).str.lower().tolist()

# Vectorization: LDA is a generative model over term counts, so it is fitted on
# a bag-of-words matrix rather than on TF-IDF weights.
vectorizer = CountVectorizer(stop_words=stop_words_pt)
X = vectorizer.fit_transform(textos)

# Modelling
num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Show the topics
termos = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    print(f"Tópico {i + 1}:")
    # [:-16:-1] walks the ascending argsort backwards: the 15 heaviest terms.
    top_terms = [termos[j] for j in topic.argsort()[:-16:-1]]  # top 15 terms
    print(", ".join(top_terms))

## Algoritmo NMF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Cleaning: lowercase the documents here; stopwords are removed by the vectorizer.
# NOTE(review): this uses the 'title' column (the same text the BERTopic cell
# embeds) and assumes a Portuguese corpus -- confirm both against articles.csv.
stop_words_pt = stopwords.words('portuguese')
textos = artigos['title'].fillna('').astype(str).str.lower().tolist()

# Vectorization
vectorizer = TfidfVectorizer(stop_words=stop_words_pt)
X = vectorizer.fit_transform(textos)

# Modelling
num_topics = 10
# Fixed seed (as in the LDA cell) so the factorization is reproducible.
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(X)

# Show the topics
terms = vectorizer.get_feature_names_out()
for i, topic in enumerate(nmf.components_):
    print(f"Tópico {i + 1}:")
    # [:-16:-1] walks the ascending argsort backwards: the 15 heaviest terms.
    top_terms = [terms[index] for index in topic.argsort()[:-16:-1]]  # top 15 terms
    print(" ".join(top_terms))


## Algoritmo BERTopic

In [None]:
import pandas as pd

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, ZeroShotClassification, MaximalMarginalRelevance
from hdbscan import HDBSCAN
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

artigos = pd.read_csv('articles.csv')

# A bare expression in the middle of a cell is not rendered, so the preview
# needs an explicit display().
display(artigos.head())

# Embedding model: encode every article title once; the resulting vectors are
# handed to BERTopic in the next cell.
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(artigos['title'].tolist(), show_progress_bar=True)

In [None]:
# Topic vocabulary: unigrams and bigrams.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

# Additional topic representations, stored under their own aspect names.
keybert = KeyBERTInspired()
representation_models = dict(
    KeyBERT=keybert,
    MMR=MaximalMarginalRelevance(diversity=0.3),
)

# Dimensionality reduction of the embeddings down to 5 components.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Density-based clustering; a cluster needs at least 400 documents.
hdbscan_model = HDBSCAN(
    min_cluster_size=400,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_models,
    verbose=True,
)

# Fit on the titles, passing the embeddings computed in the previous cell.
titulos = artigos['title'].tolist()
topics, probs = topic_model.fit_transform(titulos, embeddings)

In [None]:
# Raise pandas' row limit so the topic table is not truncated, then show it.
pd.set_option('display.max_rows', 999)
topicos = topic_model.get_topic_info()
topicos