# Parte 4 - Embeddings 

In [None]:
import fasttext
from pathlib import Path

# Optional one-off download of the Portuguese vectors (left disabled):
# fasttext.util.download_model('pt', if_exists='ignore')

# Build the path with pathlib instead of a hard-coded "..\\assets\\..." string:
# the backslash form only resolves on Windows, whereas this resolves to the
# same file there and also works on POSIX systems.
model = fasttext.load_model(str(Path("..") / "assets" / "cc.pt.300.bin"))

In [None]:
import pandas as pd
from pathlib import Path

# Build the path with pathlib so it resolves on any OS (the original
# "..\\assets\\..." literal is Windows-only).
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
hinos_analise: pd.DataFrame = pd.read_pickle(
    Path("..") / "assets" / "hinos_analise_tokens.pkl"
)

In [None]:
import numpy as np
from collections import Counter

def embed_text_weighted(tokens, model, method="tfidf", corpus=None):
    """Embed a tokenized document as a weighted average of its word vectors.

    Args:
        tokens: Sequence of word tokens for one document.
        model: Object exposing ``get_word_vector(word)`` and
            ``get_dimension()``.
        method: Weighting strategy.

            * ``"tfidf"``: every token occurrence is weighted by ``tf * idf``,
              with ``tf = count(word) / len(tokens)`` and
              ``idf = log(N / (1 + df(word)))`` computed over ``corpus``.
            * ``"uniform"``: plain mean of the word vectors.
            * ``"length_penalty"``: every token gets ``1 / sqrt(len(tokens))``.
              NOTE(review): ``np.average`` normalises by the sum of the
              weights, so a constant weight yields the same vector as
              ``"uniform"``; kept as-is for backward compatibility.
        corpus: Sized collection of token sequences used for the document
            frequencies of ``"tfidf"``. When ``None`` the module-level
            ``hinos_analise["tokens_no_stops"]`` column is used.

    Returns:
        A 1-D array with the weighted mean vector, or a zero vector of
        ``model.get_dimension()`` entries when ``tokens`` is empty.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
    """
    # Fail loudly on a typo instead of hitting an opaque ZeroDivisionError
    # from np.average on empty vector/weight lists.
    if method not in ("tfidf", "uniform", "length_penalty"):
        raise ValueError(
            f"Unknown method {method!r}; expected 'tfidf', 'uniform' or 'length_penalty'"
        )

    if tokens is None or len(tokens) == 0:
        return np.zeros(model.get_dimension())

    vectors = [model.get_word_vector(word) for word in tokens]
    n_tokens = len(tokens)

    if method == "tfidf":
        if corpus is None:
            corpus = hinos_analise["tokens_no_stops"]
        total_docs = len(corpus)

        token_counts = Counter(tokens)
        vocabulary = set(token_counts)

        # Document frequency in a single pass over the corpus, rather than
        # rescanning every document for every token occurrence.
        doc_freq = Counter()
        for doc_tokens in corpus:
            doc_freq.update(vocabulary.intersection(doc_tokens))

        weights = [
            (token_counts[word] / n_tokens)
            * np.log(total_docs / (1 + doc_freq[word]))
            for word in tokens
        ]
    elif method == "uniform":
        weights = [1.0] * n_tokens
    else:  # "length_penalty"
        weights = [1.0 / np.sqrt(n_tokens)] * n_tokens

    # np.average cannot normalise weights that sum to zero (e.g. every idf is
    # log(1) == 0) and propagates non-finite weights; use a plain mean then.
    weight_total = float(np.sum(weights))
    if not np.isfinite(weight_total) or np.isclose(weight_total, 0.0):
        weights = [1.0] * n_tokens

    return np.average(vectors, axis=0, weights=weights)


# Compare the weighting strategies: one embedding column per method, built
# from the stop-word-free tokens of each row.
for column_name, weighting in (
    ("embedding_tfidf", "tfidf"),
    ("embedding_length_penalty", "length_penalty"),
    ("embedding", "uniform"),
):
    hinos_analise[column_name] = hinos_analise["tokens_no_stops"].apply(
        # Bind the current strategy as a default to avoid late binding.
        lambda doc_tokens, weighting=weighting: embed_text_weighted(
            doc_tokens, model, weighting
        )
    )

# Last expression of the cell: display the resulting frame.
hinos_analise

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between rows, one matrix per weighting strategy.
sims_tfidf = cosine_similarity(list(hinos_analise["embedding_tfidf"]))
sims_lp = cosine_similarity(list(hinos_analise["embedding_length_penalty"]))
sims = cosine_similarity(list(hinos_analise["embedding"]))

# Positional index (matrix row / .iloc position) of the reference hymn and
# how many neighbours to list.
target_pos = 443
top_n = 5

# (position, score) pairs for the reference hymn, most similar first.
similarities_tfidf = sorted(
    enumerate(sims_tfidf[target_pos]), key=lambda pair: pair[1], reverse=True
)
similarities_lp = sorted(
    enumerate(sims_lp[target_pos]), key=lambda pair: pair[1], reverse=True
)
similarities = sorted(
    enumerate(sims[target_pos]), key=lambda pair: pair[1], reverse=True
)

print(f"Mais parecidos com o hino {target_pos}: {hinos_analise['nome'].iloc[target_pos]}")

for label, ranking in (
    ("TF-IDF:", similarities_tfidf),
    ("Length Penalty:", similarities_lp),
    ("Uniform:", similarities),
):
    print(label)
    # Drop the reference hymn by position instead of assuming it sorts first:
    # ties at similarity 1.0 (duplicate texts) or float rounding can put
    # another row ahead of it, which a blind [1:6] slice would mishandle.
    neighbours = [(idx, score) for idx, score in ranking if idx != target_pos]
    for idx, score in neighbours[:top_n]:
        print(f"Hino {idx}: {hinos_analise['nome'].iloc[idx]} → similaridade {score:.3f}")

In [None]:
# Wrap each similarity matrix in a DataFrame labelled on both axes with the
# index of hinos_analise.
sims_tfidf_df, sims_lp_df, sims_df = (
    pd.DataFrame(matrix, index=hinos_analise.index, columns=hinos_analise.index)
    for matrix in (sims_tfidf, sims_lp, sims)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(1, 3, figsize=(24, 5))

# One heatmap per weighting strategy. Each panel gets its own title;
# plt.title() alone would only label the last axes and leave the other two
# unidentified.
for axis, (label, matrix) in zip(
    ax,
    (
        ("TF-IDF", sims_tfidf_df),
        ("Length Penalty", sims_lp_df),
        ("Uniform", sims_df),
    ),
):
    sns.heatmap(matrix, cmap="viridis", annot=False, ax=axis)
    axis.set_title(label)

# Figure-level title shared by the three panels.
fig.suptitle("Similaridade entre hinos (Word Embeddings)")
plt.show()

In [None]:
# Keep pairs with similarity in (0.5, 1.0). This must use the labelled
# DataFrame: `sims` is a plain NumPy array, which has no .stack() method.
# dropna() removes the masked-out cells explicitly, because newer pandas
# stack implementations no longer drop NaN on their own.
high_similarity_word2vec = (
    sims_df.where((sims_df > 0.5) & (sims_df < 1.0)).stack().dropna()
)

# The matrix is symmetric: keep each pair once (row label < column label),
# which also discards the diagonal.
high_similarity_word2vec = high_similarity_word2vec[
    high_similarity_word2vec.index.get_level_values(0)
    < high_similarity_word2vec.index.get_level_values(1)
]
high_similarity_word2vec.sort_values(ascending=False)

## Clustering

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap


# Stack the per-row "uniform" embeddings into one 2-D matrix.
X = np.vstack(hinos_analise["embedding"].values)

# Three 2-D projections of the same matrix, used for visualisation.
pca = PCA(n_components=2)
tsne = TSNE(
    n_components=2,
    # Balances how many neighbours are considered (20-50 usually works well).
    perplexity=30,
    # n_iter=1000  (disabled; the library default applies)
    random_state=42,
)
umap_model = umap.UMAP(
    # How "local" the grouping is (10-50 are good values).
    n_neighbors=15,
    # Point density in the 2-D space (0 = tightly packed, 0.5 = more spread).
    min_dist=0.1,
    n_components=2,
    random_state=42,
)

X_pca = pca.fit_transform(X)
X_tsne = tsne.fit_transform(X)
X_umap = umap_model.fit_transform(X)

# Store both coordinates of every projection as "<prefix>1" / "<prefix>2".
for prefix, coords in (("pca", X_pca), ("tsne", X_tsne), ("umap", X_umap)):
    hinos_analise[prefix + "1"] = coords[:, 0]
    hinos_analise[prefix + "2"] = coords[:, 1]

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# X is the feature matrix being clustered (here the stacked embeddings).
# The sweep scores k on X itself, the matrix the final KMeans model in this
# notebook is fitted on, rather than on the 2-D t-SNE projection: t-SNE
# distorts distances, so a k chosen there need not suit the clusters that are
# actually produced.

range_n_clusters = range(2, 12)
silhouette_scores = []

for k in range_n_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_scores.append(score)
    print(f"k = {k}, silhouette = {score:.4f}")

# Plot the mean silhouette coefficient against k.
plt.figure(figsize=(8, 5))
plt.plot(range_n_clusters, silhouette_scores, marker="o")
plt.title("Análise de Silhouette para seleção de k")
plt.xlabel("Número de clusters (k)")
plt.ylabel("Coeficiente médio de Silhouette")
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import silhouette_samples
import numpy as np

k = 9  # example: the chosen number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
# Fit and score on X, the matrix the final clustering uses, instead of the
# 2-D t-SNE projection.
labels = kmeans.fit_predict(X)

silhouette_vals = silhouette_samples(X, labels)
y_lower = 10
tick_positions = []

plt.figure(figsize=(8, 6))
for i in range(k):
    # Sorted per-sample silhouette values of cluster i, drawn as one band.
    ith_vals = np.sort(silhouette_vals[labels == i])
    size_cluster_i = ith_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_vals)
    tick_positions.append(y_lower + size_cluster_i / 2)
    y_lower = y_upper + 10  # 10-unit gap between consecutive bands

# Dashed red line: mean silhouette over all samples.
plt.axvline(x=np.mean(silhouette_vals), color="red", linestyle="--")
# Label every band with its cluster id so the "Clusters" axis is readable.
plt.yticks(tick_positions, [str(i) for i in range(k)])
plt.title(f"Silhouette plot para k={k}")
plt.xlabel("Coeficiente de Silhouette")
plt.ylabel("Clusters")
plt.show()

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering

# Number of clusters (experiment with it, e.g. 4 or 6).
# Disabled alternative: n_clusters = len(hinos_analise["categoria_id"].unique())
n_clusters = 9

# Both estimators are configured with the same cluster count.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
agg_clust = AgglomerativeClustering(n_clusters=n_clusters)

# Fit each one on the embedding matrix and keep the labels as columns.
kmeans_labels = kmeans.fit_predict(X)
hinos_analise["cluster"] = kmeans_labels

agg_labels = agg_clust.fit_predict(X)
hinos_analise["agg_cluster"] = agg_labels

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One figure with three side-by-side panels (PCA, t-SNE, UMAP), each coloured
# by the KMeans cluster label.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panels = (
    ("pca1", "pca2", "PCA - Hinos agrupados por embeddings"),
    ("tsne1", "tsne2", "t-SNE - Hinos agrupados por embeddings"),
    ("umap1", "umap2", "UMAP - Hinos agrupados por embeddings"),
)
for panel_ax, (x_col, y_col, panel_title) in zip(axes, panels):
    sns.scatterplot(
        data=hinos_analise,
        x=x_col,
        y=y_col,
        hue="cluster",
        palette="tab10",
        s=80,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    # Anchor the legend outside the plotting area, at the upper right.
    panel_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
from collections import Counter

# For every cluster: its 10 most frequent tokens and its first five names.
for c in sorted(hinos_analise["cluster"].unique()):
    members = hinos_analise.loc[hinos_analise["cluster"] == c]

    # Count tokens document by document. Summing a Series of lists
    # concatenates them pairwise (quadratic copying); feeding the Counter
    # incrementally gives the same counts and the same tie order.
    term_counts = Counter()
    for doc_tokens in members["tokens_no_stops"]:
        term_counts.update(doc_tokens)

    top_terms = term_counts.most_common(10)
    print(f"\nCluster {c}:")
    print([t for t, _ in top_terms])
    print(members["nome"][:5])

# Number of rows per cluster, ordered by cluster id.
print(hinos_analise["cluster"].value_counts().sort_index())

## Tópicos

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Use as many topics as there are clusters.
n_topics = n_clusters

# The tokens already had stop words removed; join them back into one string
# per document, which is the input the vectorizer is given below.
texts_for_topics = list(map(" ".join, hinos_analise["tokens_no_stops"]))

# TF-IDF representation used only for the topic analysis.
tfidf_options = dict(
    max_features=500,
    stop_words=None,  # stop words were already removed upstream
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,  # a term must occur in at least 2 documents
)
vectorizer = TfidfVectorizer(**tfidf_options)
X_tfidf = vectorizer.fit_transform(texts_for_topics)

# Both topic models are fitted on the same TF-IDF matrix.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights; confirm that TF-IDF is intended here.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10, random_state=42)
nmf = NMF(n_components=n_topics, max_iter=100, random_state=42)

lda_topics = lda.fit_transform(X_tfidf)
nmf_topics = nmf.fit_transform(X_tfidf)

def display_topics(model, feature_names, n_top_words=10):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object with a ``components_`` attribute that iterates as one
            weight vector per topic (each vector must support ``argsort``).
        feature_names: Indexable collection mapping a column position of
            ``components_`` to its term.
        n_top_words: How many terms to print per topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending: reverse it and keep the leading positions.
        best_positions = weights.argsort()[::-1][:n_top_words]
        keywords = " | ".join(feature_names[pos] for pos in best_positions)
        print(f"\nTópico {number}:")
        print("Palavras-chave: " + keywords)

feature_names = vectorizer.get_feature_names_out()

# Print the top terms of every topic, first for LDA and then for NMF.
for header, fitted_model in (
    ("=== LDA (Latent Dirichlet Allocation) ===", lda),
    ("\n=== NMF (Non-negative Matrix Factorization) ===", nmf),
):
    print(header)
    display_topics(fitted_model, feature_names)

# Assign every row the topic with the largest value in its row of the
# document-topic matrix.
for topic_column, doc_topic_matrix in (
    ("LDA_topic", lda_topics),
    ("NMF_topic", nmf_topics),
):
    hinos_analise[topic_column] = doc_topic_matrix.argmax(axis=1)

# Number of rows per topic, ordered by topic id.
for heading, topic_column in (
    ("\nDistribuição LDA:", "LDA_topic"),
    ("\nDistribuição NMF:", "NMF_topic"),
):
    print(heading)
    print(hinos_analise[topic_column].value_counts().sort_index())

In [None]:
# Visualizar a distribuição de tópicos
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One bar chart per topic model: number of rows assigned to each topic.
for panel_ax, topic_column, panel_title in zip(
    axes,
    ("LDA_topic", "NMF_topic"),
    ("LDA (TF-IDF)", "NMF (TF-IDF)"),
):
    topic_sizes = hinos_analise[topic_column].value_counts().sort_index()
    topic_sizes.plot(kind="bar", ax=panel_ax, title=panel_title)
    panel_ax.set_xlabel("Tópico")
    panel_ax.set_ylabel("Número de Hinos")

plt.tight_layout()
plt.show()

In [None]:
# One figure with three side-by-side panels (PCA, t-SNE, UMAP), each coloured
# by the NMF topic of the row.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# The titles name the NMF topic colouring: the previous "agrupados por
# embeddings" wording was carried over from the cluster plots and did not
# match hue="NMF_topic".
panels = (
    ("pca1", "pca2", "PCA - Hinos coloridos por tópico NMF"),
    ("tsne1", "tsne2", "t-SNE - Hinos coloridos por tópico NMF"),
    ("umap1", "umap2", "UMAP - Hinos coloridos por tópico NMF"),
)
for panel_ax, (x_col, y_col, panel_title) in zip(axes, panels):
    sns.scatterplot(
        data=hinos_analise,
        x=x_col,
        y=y_col,
        hue="NMF_topic",
        palette="tab10",
        s=80,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    # Anchor the legend outside the plotting area, at the upper right.
    panel_ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", title="NMF_topic")

plt.tight_layout()
plt.show()