# Parte 4 - Sentence embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Alternative checkpoints tried by the author (swap MODEL_NAME to experiment):
#   "distiluse-base-multilingual-cased-v2"
#   "neuralmind/bert-base-portuguese-cased"
#   "rufimelo/Legal-BERTimbau-sts-base"  (author's note: Brazilian Portuguese)
# The checkpoint below is the one in use; the author's note describes it as
# lighter and more efficient.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)

In [None]:
import pandas as pd

# Load the hymn frame produced by the previous analysis step.
# Forward slashes keep this relative path portable across Windows, Linux and
# macOS (a backslash-separated literal only resolves to ../assets on Windows).
# NOTE(review): unpickling can execute arbitrary code — only load artifacts
# generated by this project.
hinos_analise: pd.DataFrame = pd.read_pickle("../assets/hinos_analise_word_embeddings.pkl")

In [None]:
import numpy as np

# Encode every hymn directly from its full cleaned text: one vector per hymn.
hymn_texts = hinos_analise["texto_limpo"].tolist()
embeddings = model.encode(hymn_texts, show_progress_bar=True)
X_sent = np.array(embeddings)

# Store each row of the embedding matrix next to its hymn in the frame.
hinos_analise["sent_embeddings"] = [row for row in X_sent]

print(X_sent.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all hymn embeddings (rows of X_sent).
similarity_matrix = cosine_similarity(X_sent)

# Row position of the reference hymn. It is used positionally, both for the
# similarity matrix row and for the .iloc name lookup below.
# NOTE(review): whether position 443 is also hymn number 443 depends on how
# hinos_analise is ordered/indexed — confirm.
QUERY_POS = 443
TOP_N = 5

# Rank every *other* hymn by similarity to the reference hymn. The reference
# row is excluded by position instead of dropping the first sorted entry:
# duplicated texts or floating-point rounding can let another hymn tie with
# (or sort ahead of) the self-similarity, which would otherwise hide a real
# neighbour and list the reference hymn itself.
similarities = [
    (idx, score)
    for idx, score in enumerate(similarity_matrix[QUERY_POS])
    if idx != QUERY_POS
]
similarities.sort(key=lambda pair: pair[1], reverse=True)

print(f"Mais parecidos com o hino {QUERY_POS}:")
for idx, score in similarities[:TOP_N]:
    print(f"Hino {idx}: {hinos_analise['nome'].iloc[idx]} → similaridade {score:.3f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wrap the similarity matrix in a DataFrame whose rows and columns carry the
# hymn frame's index, so the heatmap axes are labelled with it.
hymn_index = hinos_analise.index
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=hymn_index, columns=hymn_index)

# Draw the full hymn-by-hymn similarity heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(similarity_matrix_df, cmap="viridis", annot=False, cbar=True, ax=ax)
ax.set_title("Similaridade entre hinos (Sentence Embeddings)")
plt.show()

In [None]:
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the sentence embeddings down to 2-D with two different reducers,
# both seeded so the layouts are repeatable.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_sent)

umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
X_umap = umap_model.fit_transform(X_sent)

# Attach both sets of 2-D coordinates to the hymn frame (one column per axis).
hinos_analise["sent_tsne1"], hinos_analise["sent_tsne2"] = X_tsne[:, 0], X_tsne[:, 1]
hinos_analise["sent_umap1"], hinos_analise["sent_umap2"] = X_umap[:, 0], X_umap[:, 1]

# Side-by-side scatter plots: t-SNE on the left, UMAP on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 7))
panels = (
    ("sent_tsne1", "sent_tsne2", "t-SNE"),
    ("sent_umap1", "sent_umap2", "UMAP"),
)
for axis, (x_col, y_col, panel_title) in zip(ax, panels):
    sns.scatterplot(data=hinos_analise, x=x_col, y=y_col, ax=axis)
    axis.set_title(panel_title)
fig.suptitle("Mapa dos hinos com Sentence Embeddings")
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt


# Sweep candidate cluster counts over the 2-D UMAP coordinates and record the
# mean silhouette coefficient obtained for each k.
range_n_clusters = range(2, 12)
silhouette_scores = []

for k in range_n_clusters:
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_umap)
    score = silhouette_score(X_umap, labels)
    silhouette_scores.append(score)
    print(f"k = {k}, silhouette = {score:.4f}")

# Plot silhouette against k to support choosing the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range_n_clusters, silhouette_scores, marker="o")
ax.set_title("Análise de Silhouette para seleção de k")
ax.set_xlabel("Número de clusters (k)")
ax.set_ylabel("Coeficiente médio de Silhouette")
ax.grid(True)
plt.show()

In [None]:
# Final K-Means fit on the UMAP coordinates. The cluster count is a knob worth
# experimenting with (e.g. 4 or 6).
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(X_umap)
hinos_analise["sent_cluster"] = cluster_labels

# UMAP scatter coloured by the assigned cluster.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=hinos_analise,
    x="sent_umap1",
    y="sent_umap2",
    hue="sent_cluster",
    palette="tab10",
    s=80,
    ax=ax,
)
ax.set_title("Mapa dos hinos com Sentence Embeddings (UMAP)")
plt.show()

In [None]:
from collections import Counter

# For each cluster: print its 10 most frequent tokens and the names of its
# first five hymns, then the overall cluster sizes.
for c in sorted(hinos_analise["sent_cluster"].unique()):
    # Select the cluster's rows once and reuse them for tokens and names.
    members = hinos_analise.loc[hinos_analise["sent_cluster"] == c]

    # Count tokens hymn by hymn. Summing the column would repeatedly
    # concatenate lists (quadratic in the number of tokens) just to feed the
    # counter; updating it incrementally yields the same counts and ordering.
    # Assumes each "tokens_no_stops" entry is a list of tokens — TODO confirm.
    term_counts = Counter()
    for tokens in members["tokens_no_stops"]:
        term_counts.update(tokens)

    top_terms = term_counts.most_common(10)
    print(f"\nCluster {c}:")
    print([t for t, _ in top_terms])
    print(members["nome"].head(5))

print(hinos_analise["sent_cluster"].value_counts().sort_index())

In [None]:
# Free-text semantic search: embed the query with the same model and rank the
# hymns by cosine similarity to it.
query = "a palavra é alimento para a alma"
query_vec = model.encode([query])

# cosine_similarity returns a (1, n_hymns) matrix; keep its single row.
scores = cosine_similarity(query_vec, X_sent)[0]

# Indices ordered from highest to lowest score, truncated to the ten best.
ranking = np.argsort(scores)[::-1]
top_idx = ranking[:10]

print("Top hinos para a busca:")
top_names = hinos_analise["nome"].iloc[top_idx]
for i, nome in zip(top_idx, top_names):
    print(f"Hino {i}: {nome} → score {scores[i]:.3f}")

# Tópicos

In [None]:
from bertopic import BERTopic

# Build the BERTopic model on top of the sentence-transformer already loaded.
# The UMAP reducer mirrors the settings BERTopic documents as its defaults,
# with a fixed random_state added: without a seed the discovered topics change
# from run to run, unlike every other stochastic step in this notebook.
topic_umap = umap.UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
)
topic_model = BERTopic(embedding_model=model, umap_model=topic_umap)

# Fit the model. Documents are passed as a plain list of strings (the same form
# used when encoding them earlier), and the embeddings already computed for
# those texts (X_sent, same row order) are reused so the corpus is not encoded
# a second time.
topics, probs = topic_model.fit_transform(
    hinos_analise["texto_limpo"].tolist(), embeddings=X_sent
)

# Attach each hymn's topic id to the frame.
hinos_analise["BERT_topic"] = topics

In [None]:
# How many hymns landed in each topic.
print("\nDistribuição de tópicos por hino:")
print(hinos_analise["BERT_topic"].value_counts())

# List the words of every discovered topic.
print("\nTópicos extraídos:")
for topic_num in set(topics):
    # -1 is the "outlier" bucket (documents that did not fit any cluster),
    # so it has no topic words worth listing.
    if topic_num == -1:
        continue
    palavras = topic_model.get_topic(topic_num)
    topic_words = [word for word, _ in palavras]
    print(f"Tópico {topic_num}:")
    print(topic_words)

In [None]:
# UMAP scatter of the hymns, coloured by the topic id BERTopic assigned
# (the "BERT_topic" column).
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=hinos_analise,
    x="sent_umap1",
    y="sent_umap2",
    hue="BERT_topic",
    palette="tab10",
    s=80,
    ax=ax,
)
ax.set_title("Mapa dos hinos com Sentence Embeddings (BERTopic)")
plt.show()

# Salvamento de informações novas

In [None]:
# Persist the enriched hymn frame (embeddings, projections, clusters, topics).
# Forward slashes keep this relative path portable across Windows, Linux and
# macOS (a backslash-separated literal only resolves to ../assets on Windows).
hinos_analise.to_pickle("../assets/hinos_analise_embeddings_complete.pkl")

In [None]:
# Persist the hymn-by-hymn cosine similarity matrix.
# Forward slashes keep this relative path portable across Windows, Linux and
# macOS (a backslash-separated literal only resolves to ../assets on Windows).
similarity_matrix_df.to_pickle("../assets/similarity_matrix_sentence_embeddings.pkl")