# Part 3 - texto

- maior/menor louvor
- word length (maybe)
- bag-of-words with frequency
- gensim corpus
- tf-idf
- named entity recognition (spacy or polyglot)
- next: category classification for avulsos

## Maior e menor louvor

In [None]:
import pandas as pd
from pathlib import Path

# Load the pickled hymn table and index it by hymn number.
assets_folder = Path("../assets")
hinos_analise = pd.read_pickle(assets_folder / "hinos_analise.pkl").set_index("numero")


def _abreviar_categoria(nome):
    """Return ``nome`` cut to 13 characters plus "..." when it is longer than 15."""
    if len(nome) > 15:
        return nome[:13] + "..."
    return nome


# Shortened category names (used further down as axis tick labels).
hinos_analise["categoria_abr"] = hinos_analise["categoria"].apply(_abreviar_categoria)
hinos_analise

In [None]:
import nltk
from tqdm import tqdm

nltk.download("stopwords")
# TODO: try tokenizing directly with spaCy

stopwords = nltk.corpus.stopwords.words("portuguese")
stopwords.extend(["ó", "ti", "pra", "lo", "oh"])
# Set copy for O(1) membership tests in the loop; the list itself is left untouched.
stopwords_set = set(stopwords)

text = []
text_no_stops = []

for texto_limpo in tqdm(hinos_analise["texto_limpo"]):
    tokens = []
    for palavra in nltk.tokenize.regexp_tokenize(texto_limpo, r"\w+"):
        palavra = palavra.lower()
        # Normalise the token "minh" to "minha". A plain equality test is
        # equivalent to the anchored regex on a \w+ token and avoids relying on
        # `re` being reachable through the nltk namespace.
        tokens.append("minha" if palavra == "minh" else palavra)

    # Drop stopwords and any token that is not purely alphabetic (digits,
    # underscores); punctuation was already excluded by the \w+ tokenizer.
    tokens_no_stops = [
        palavra
        for palavra in tokens
        if palavra not in stopwords_set and palavra.isalpha()
    ]
    text.append(tokens)
    text_no_stops.append(tokens_no_stops)

hinos_analise["tokens"] = text
hinos_analise["tokens_no_stops"] = text_no_stops
# Hymn size counts every word (stopwords included): all of them have to be
# sung, so they all contribute to the practical length of the hymn.
hinos_analise["num_tokens"] = hinos_analise["tokens"].apply(len)
hinos_analise

In [None]:
# Show the hymns ordered by token count: longest first, then shortest first.
for crescente in (False, True):
    display(hinos_analise.sort_values("num_tokens", ascending=crescente))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Treat 'categoria_id' as a categorical variable so each id gets its own box.
hinos_analise["categoria_id"] = hinos_analise["categoria_id"].astype("category")

# One abbreviated label per category id, ordered by category order. The same
# order is passed to the plot below so tick i always describes box i (labels
# taken in first-appearance order could be attached to the wrong boxes).
categoria_mapping = hinos_analise.groupby("categoria_id", observed=True)[
    "categoria_abr"
].first()
category_order = list(categoria_mapping.index)

# Box plot of hymn length per category. (A violin plot with
# inner="quartile" was tried as an alternative view of the same data.)
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=hinos_analise,
    x="categoria_id",
    y="num_tokens",
    order=category_order,
    palette="viridis",
)

# Replace the numeric x-ticks with the corresponding category names.
plt.xticks(
    ticks=range(len(category_order)),
    labels=categoria_mapping.tolist(),
    rotation=90,
    ha="right",
)

# Axis labels and title (the title now names the plot type actually drawn).
plt.xlabel("Categoria")
plt.ylabel("Number of Tokens")
plt.title("Relationship Between Number of Tokens and Categoria (Box Plot)")

plt.tight_layout()
plt.show()

## Word length (maybe)

- bag-of-words with frequency + word map
- gensim corpus
- tf-idf
- named entity recognition (spacy or polyglot)

In [None]:
# Peek at the first ten non-stopword tokens of the first hymn in the table.
tokenized_lines = hinos_analise["tokens_no_stops"].iloc[0]
tokenized_lines[:10]

In [None]:
# Length in characters of every non-stopword token in the corpus.
# `explode` yields NaN for hymns whose token list is empty, and len(NaN)
# would raise, so those rows are dropped first.
# NOTE: the name `line_num_words` is kept unchanged, but the values are word
# lengths, not words per line.
all_tokens = hinos_analise["tokens_no_stops"].explode().dropna()
line_num_words = [len(token) for token in all_tokens]

# One bin per integer length, with each bar centred on its length.
if line_num_words:
    bins = list(range(min(line_num_words), max(line_num_words) + 2))
else:
    bins = 10
plt.hist(line_num_words, bins=bins, align="left")
plt.xlabel("Word length (characters)")
plt.ylabel("Number of tokens")

# Show the plot
plt.show()

## Palavras

In [None]:
# Flat list of every non-stopword token in the corpus. `explode` yields NaN for
# hymns whose token list is empty; those are dropped so the list holds only words.
palavras = hinos_analise["tokens_no_stops"].explode().dropna().tolist()

### Palavras mais longas

In [None]:
# Distinct words, longest first. Ties on length are broken alphabetically so
# the top 10 is reproducible (set iteration order is not stable across runs).
# Non-string entries (e.g. NaN left over from `explode`) are ignored.
palavras_unique = sorted(
    {palavra for palavra in palavras if isinstance(palavra, str)},
    key=lambda palavra: (-len(palavra), palavra),
)
print(len(palavras_unique))

# Table with the ten longest words and their lengths.
maiores_palavras = palavras_unique[:10]
pd.DataFrame(
    {
        "palavra": maiores_palavras,
        "tamanho": [len(palavra) for palavra in maiores_palavras],
    }
)

### Bag-of-words with frequency

In [None]:
from collections import Counter

print(len(palavras))

# Count every word in a single pass; calling list.count once per distinct word
# rescans the whole corpus each time (quadratic).
word_counts = Counter(palavras)
set_words_full = list(word_counts)
count_words = list(word_counts.values())

contagem_palav = pd.DataFrame({"palavra": set_words_full, "contagem": count_words})
# Stable sort: words with equal counts keep first-occurrence order, so the
# table is the same on every run.
contagem_palav = contagem_palav.sort_values(
    "contagem", ascending=False, kind="stable"
)
contagem_palav

In [None]:
# Share of each word in the whole corpus, expressed in percent.
total_palavras = len(palavras)
contagem_palav["percentual"] = contagem_palav["contagem"] / total_palavras * 100
contagem_palav

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word -> count lookup built from the frequency table.
word_freq_dict = dict(zip(contagem_palav["palavra"], contagem_palav["contagem"]))

# Word cloud settings: at most 100 words, fixed random_state.
cloud_options = {
    "width": 800,
    "height": 400,
    "background_color": "white",
    "max_words": 100,
    "colormap": "viridis",
    "relative_scaling": 0.5,
    "random_state": 42,
}
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq_dict)

# Render the cloud as an image, without axes.
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud - Palavras mais frequentes nos hinos", fontsize=16, pad=20)
plt.tight_layout()
plt.show()

# Horizontal bar chart of the first 20 rows of the frequency table.
top_20 = contagem_palav.head(20)
bar_positions = range(len(top_20))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_20["contagem"], color="skyblue")
plt.yticks(bar_positions, top_20["palavra"])
plt.xlabel("Frequência")
plt.title("Top 20 Palavras Mais Frequentes")
# Flip the y axis so the first row of the table is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## ~~Named Entity Recognition~~

Tentei:
- Gensim corpus (problemas de compatibilidade)
- NLTK NER (ruim)
- Polyglot (não consegui instalar)
- SpaCy (péssimos resultados): comparei tokenização + lematização com a anterior, prefiro a minha.

## N-grams

In [None]:
from collections import Counter

# Example: build the bigrams of one token sequence (applied to every hymn below).
def get_bigrams(tokens):
    """Return the bigrams of ``tokens`` produced by ``nltk.ngrams`` as a list."""
    bigrams = nltk.ngrams(tokens, 2)
    return list(bigrams)


# Bigrams of every hymn, stored per row.
hinos_analise["bigrams"] = hinos_analise["tokens_no_stops"].apply(get_bigrams)

# Flatten the per-hymn lists into one corpus-wide list of bigrams.
all_bigrams = []
for hino_bigrams in hinos_analise["bigrams"]:
    all_bigrams.extend(hino_bigrams)

# Frequency of each bigram over the whole corpus; show the ten most common.
bigram_freq = Counter(all_bigrams)

bigram_freq.most_common(10)

In [None]:
# Example: build the trigrams of one token sequence (applied to every hymn below).
def get_trigrams(tokens):
    """Return the trigrams of ``tokens`` produced by ``nltk.ngrams`` as a list."""
    trigrams = nltk.ngrams(tokens, 3)
    return list(trigrams)


# Trigrams of every hymn, stored per row.
hinos_analise["trigrams"] = hinos_analise["tokens_no_stops"].apply(get_trigrams)

# Flatten the per-hymn lists into one corpus-wide list of trigrams.
all_trigrams = []
for hino_trigrams in hinos_analise["trigrams"]:
    all_trigrams.extend(hino_trigrams)

# Frequency of each trigram over the whole corpus; show the ten most common.
trigram_freq = Counter(all_trigrams)

trigram_freq.most_common(10)

## Matriz de frequência e similaridade

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# CountVectorizer works on text, so each token list is joined back into one
# space-separated string.
hinos_analise["tokens_str"] = hinos_analise["tokens_no_stops"].apply(" ".join)

# Term-count matrix over unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# NOTE(review): CountVectorizer re-tokenizes the string with its default
# token_pattern, which skips single-character tokens - confirm that is intended.
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(hinos_analise["tokens_str"])

# Pairwise cosine similarity between hymns, labelled on both axes with the
# DataFrame index.
similarity = cosine_similarity(X)
hymn_ids = hinos_analise.index
similarity_df = pd.DataFrame(similarity, index=hymn_ids, columns=hymn_ids)
similarity_df.head()

In [None]:
# Hymn pairs whose count-based cosine similarity is above 0.5.
# There is deliberately no "< 1.0" upper bound: self-pairs are removed by the
# first < second filter below, while an upper bound would also discard hymns
# with identical vectors and behaves erratically when rounding puts a value
# just above or below 1.0.
# `dropna` removes the masked cells regardless of how `stack` treats NaN.
high_similarity = similarity_df[similarity_df > 0.5].stack().dropna()

# Keep each unordered pair once (upper triangle of the symmetric matrix).
first_hymn = high_similarity.index.get_level_values(0)
second_hymn = high_similarity.index.get_level_values(1)
high_similarity = high_similarity[first_hymn < second_hymn]
high_similarity.sort_values(ascending=False)

## Matriz com TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF weights over unigrams and bigrams. Note that this rebinds
# `vectorizer`, replacing the CountVectorizer created earlier.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(hinos_analise["tokens_str"])

# Dense copy for inspection: one row per hymn, one column per term.
tfidf_terms = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    index=hinos_analise.index,
    columns=tfidf_terms,
)

tfidf_df.head()

In [None]:
def top_terms_for_hymn(row, features, top_n=5):
    """Return the ``top_n`` highest-scoring ``(feature, score)`` pairs.

    ``features`` and ``row`` are paired positionally. The pairs are ordered by
    descending score; pairs with equal scores keep their original relative order.
    """
    ranked = sorted(zip(features, row), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


features = vectorizer.get_feature_names_out()

# Print the highest-weighted TF-IDF terms of the first five hymns.
# Each block is labelled with the hymn's value in the DataFrame index rather
# than its row position, and only the five needed rows are densified.
primeiros_numeros = hinos_analise.index[:5]
primeiras_linhas = X_tfidf[:5].toarray()

for numero, row in zip(primeiros_numeros, primeiras_linhas):
    top_terms = top_terms_for_hymn(row, features, top_n=5)
    print(f"\n🎵 Hino {numero}:")
    for term, score in top_terms:
        print(f"  {term}: {score:.3f}")

In [None]:
# Pairwise cosine similarity between hymns on the TF-IDF matrix, labelled on
# both axes with the DataFrame index.
similarity_tfidf = cosine_similarity(X_tfidf)

hymn_ids_tfidf = hinos_analise.index
similarity_df_tfidf = pd.DataFrame(
    similarity_tfidf,
    index=hymn_ids_tfidf,
    columns=hymn_ids_tfidf,
)

similarity_df_tfidf.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the full hymn-by-hymn TF-IDF similarity matrix (no cell annotations).
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(similarity_df_tfidf, cmap="viridis", annot=False)
heatmap_ax.set_title("Similaridade entre hinos (TF-IDF)")
plt.show()

In [None]:
# Hymn pairs whose TF-IDF cosine similarity is above 0.5.
# There is deliberately no "< 1.0" upper bound: self-pairs are removed by the
# first < second filter below, while an upper bound would also discard hymns
# with identical vectors and behaves erratically when rounding puts a value
# just above or below 1.0.
# `dropna` removes the masked cells regardless of how `stack` treats NaN.
high_similarity_tfidf = (
    similarity_df_tfidf[similarity_df_tfidf > 0.5].stack().dropna()
)

# Keep each unordered pair once (upper triangle of the symmetric matrix).
first_hymn_tfidf = high_similarity_tfidf.index.get_level_values(0)
second_hymn_tfidf = high_similarity_tfidf.index.get_level_values(1)
high_similarity_tfidf = high_similarity_tfidf[first_hymn_tfidf < second_hymn_tfidf]
high_similarity_tfidf.sort_values(ascending=False)

In [None]:
# Save the enriched table (with the columns added in this notebook) to the assets folder.
tokens_pickle_path = assets_folder / "hinos_analise_tokens.pkl"
hinos_analise.to_pickle(tokens_pickle_path)