In [2]:
import pandas as pd
merged_df = pd.read_parquet("data/merged_df.parquet")
merged_df.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount', 'Id',
       'Price', 'User_id', 'profileName', 'score', 'time', 'summary', 'text'],
      dtype='object')

In [3]:
harry_df = merged_df[merged_df['Id'] == 'B000IEZE3G']
harry_df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,Id,Price,User_id,profileName,score,time,summary,text
721440,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926640000,"A great book, I couldn't stop reading it.","Famous before he can talk, Harry Potter the so..."
721441,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926640000,I believe this is the greatest book in history.,I believe this is the greatest book in history...
721442,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926553600,"It was great, it inspired me","This book, Harry Potter and the sorcerers ston..."
721443,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926467200,A suspenseful story,I enjoyed this book so much because it was so ...
721444,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926467200,Rowling is magnificent!,Harry Potter takes your imagination on a wild ...


In [4]:
from transformers import pipeline
from tqdm import tqdm

def analyze_sentiment(input_df: pd.DataFrame, text_column: str = 'text') -> pd.DataFrame:
    """
    Realiza a análise de sentimento em uma coluna de texto de um DataFrame.

    Args:
        input_df (pd.DataFrame): O DataFrame que contém os dados.
        text_column (str): O nome da coluna com o texto a ser analisado.

    Returns:
        pd.DataFrame: O DataFrame original com duas novas colunas: 
                      'sentiment_label' (POSITIVE/NEGATIVE) e 'sentiment_score'.
    """
    print("Iniciando a análise de sentimento...")
    
    # Faz uma cópia para evitar modificar o DataFrame original fora da função
    df = input_df.copy()

    # 1. Carrega o modelo pré-treinado do Hugging Face
    try:
        sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    except Exception as e:
        print(f"Erro ao carregar o modelo do Hugging Face: {e}")
        print("Verifique sua conexão com a internet ou as bibliotecas instaladas.")
        return input_df # Retorna o DF original em caso de erro

    # 2. Garante que a coluna de texto exista e não tenha valores nulos
    if text_column not in df.columns:
        print(f"ERRO: A coluna '{text_column}' não foi encontrada no DataFrame.")
        return input_df

    df[text_column] = df[text_column].fillna('')
    texts_to_analyze = df[text_column].tolist()
    
    if not texts_to_analyze:
        print("AVISO: A coluna de texto está vazia. Nenhuma análise a ser feita.")
        return df

    # 3. Executa a análise com uma barra de progresso
    print(f"Analisando {len(texts_to_analyze)} reviews...")
    # O parâmetro truncation=True garante que textos muito longos não causem erro
    # O parâmetro batch_size pode ser ajustado para otimizar a velocidade em GPUs
    results = []
    for out in tqdm(sentiment_pipeline(texts_to_analyze, truncation=True, batch_size=8), total=len(texts_to_analyze)):
        results.append(out)

    # 4. Adiciona os resultados ao DataFrame
    df['sentiment_label'] = [result['label'] for result in results]
    df['sentiment_score'] = [result['score'] for result in results]

    print("\nAnálise de sentimento concluída!")
    return df

df_sentiment = analyze_sentiment(harry_df)
df_sentiment.head()

  from .autonotebook import tqdm as notebook_tqdm


Iniciando a análise de sentimento...


Device set to use cpu


Analisando 6796 reviews...


100%|██████████| 6796/6796 [00:00<00:00, 3391776.53it/s]


Análise de sentimento concluída!





Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,Id,Price,User_id,profileName,score,time,summary,text,sentiment_label,sentiment_score
721440,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926640000,"A great book, I couldn't stop reading it.","Famous before he can talk, Harry Potter the so...",POSITIVE,0.986109
721441,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926640000,I believe this is the greatest book in history.,I believe this is the greatest book in history...,POSITIVE,0.999635
721442,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926553600,"It was great, it inspired me","This book, Harry Potter and the sorcerers ston...",POSITIVE,0.996753
721443,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926467200,A suspenseful story,I enjoyed this book so much because it was so ...,POSITIVE,0.999263
721444,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,B000IEZE3G,,,,5.0,926467200,Rowling is magnificent!,Harry Potter takes your imagination on a wild ...,POSITIVE,0.999353


In [6]:
import pandas as pd
# Bibliotecas do BERTopic e Sentence-Transformers
# Certifique-se de que estão instaladas com os comandos do artefato anterior.
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from typing import Tuple, Optional
import numpy as np
import re
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer

def run_bertopic_analysis(
    input_df: pd.DataFrame, 
    text_column: str = 'text', 
    save_path: Optional[str] = None
) -> Tuple[Optional[pd.DataFrame], Optional[BERTopic]]:
    """
    Realiza uma análise de tópicos otimizada para gerar resultados de alta qualidade.

    Melhorias principais:
    1.  Limpeza de texto para remover ruídos.
    2.  Uso de CountVectorizer para remover stopwords e considerar n-gramas.
    3.  Uso de KeyBERTInspired para criar nomes de tópicos mais significativos.
    4.  Ajuste de parâmetros do BERTopic para tópicos mais robustos.

    Args:
        input_df (pd.DataFrame): O DataFrame de entrada.
        text_column (str): A coluna com o texto a ser analisado.
        save_path (Optional[str]): Caminho para salvar o modelo treinado.

    Returns:
        Uma tupla contendo o DataFrame com os resultados e o modelo BERTopic treinado.
    """
    print("--- Iniciando Análise de Tópicos Otimizada ---")
    
    df = input_df.copy()

    # --- 1. Preparação e Limpeza dos Dados ---
    print("1. Preparando e limpando os textos...")
    if text_column not in df.columns:
        print(f"ERRO: A coluna '{text_column}' não foi encontrada.")
        return None, None
    
    # Função simples para limpar o texto: remove caracteres não alfabéticos e converte para minúsculas.
    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', '', text)
        return text

    df[text_column] = df[text_column].fillna('').apply(clean_text)
    docs = df[text_column].tolist()

    if not docs or all(s == '' for s in docs):
        print("AVISO: A coluna de texto está vazia. Análise cancelada.")
        return df, None

    # --- 2. Configuração dos Componentes do BERTopic ---
    print("2. Configurando os componentes do modelo...")

    # A) CountVectorizer: Essencial para limpar as palavras dos tópicos.
    #    - stop_words="english": Remove palavras comuns (ex: 'the', 'a', 'is').
    #    - ngram_range=(1, 3): Permite que os tópicos sejam formados por palavras únicas,
    #      bigramas (ex: 'harry potter') e trigramas (ex: 'game of thrones').
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))

    # B) Representation Model: Para criar nomes de tópicos melhores.
    #    KeyBERTInspired usa um modelo de extração de palavras-chave para encontrar
    #    os termos mais representativos, resultando em nomes mais coerentes.
    representation_model = KeyBERTInspired()

    # C) Modelo de Embedding
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # --- 3. Instanciação e Treinamento do BERTopic ---
    print("3. Instanciando e treinando o modelo BERTopic...")
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        language="english",
        calculate_probabilities=True,
        min_topic_size=30,  # Aumentamos para garantir tópicos mais robustos e gerais
        verbose=True
    )
    
    print(f"\nTreinando o modelo em {len(docs)} reviews. Isso pode levar um tempo...")
    topics, probs = topic_model.fit_transform(docs)

    # --- 4. Processamento dos Resultados ---
    print("\n4. Processando e adicionando resultados ao DataFrame...")
    df['topic'] = topics
    df['topic_probability'] = np.max(probs, axis=1) if probs is not None and probs.ndim > 1 else probs
    
    print("\nAnálise de tópicos concluída com sucesso!")

    # --- 5. Salvamento e Exibição ---
    if save_path:
        print(f"\nSalvando o modelo em: {save_path}")
        topic_model.save(save_path, serialization="safetensors")
        print("Modelo salvo com sucesso!")

    print("\n--- Informações dos Tópicos Encontrados (Otimizados) ---")
    print(topic_model.get_topic_info())

    return df, topic_model
    
# Define o caminho onde o modelo será salvo
model_save_path = "modelo_bertopic"
# Chama a função para analisar os tópicos e salvar o modelo
df_topics, trained_model = run_bertopic_analysis(df_sentiment, save_path=model_save_path)
df_topics.head()
    



--- Iniciando Análise de Tópicos Otimizada ---
1. Preparando e limpando os textos...
2. Configurando os componentes do modelo...


2025-06-29 19:12:58,325 - BERTopic - Embedding - Transforming documents to embeddings.


3. Instanciando e treinando o modelo BERTopic...

Treinando o modelo em 6796 reviews. Isso pode levar um tempo...


Batches: 100%|██████████| 213/213 [01:32<00:00,  2.30it/s]
2025-06-29 19:14:31,076 - BERTopic - Embedding - Completed ✓
2025-06-29 19:14:31,077 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-29 19:14:55,448 - BERTopic - Dimensionality - Completed ✓
2025-06-29 19:14:55,450 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-29 19:14:55,906 - BERTopic - Cluster - Completed ✓
2025-06-29 19:14:55,918 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-29 19:15:00,458 - BERTopic - Representation - Completed ✓



4. Processando e adicionando resultados ao DataFrame...

Análise de tópicos concluída com sucesso!

Salvando o modelo em: modelo_bertopic
Modelo salvo com sucesso!

--- Informações dos Tópicos Encontrados (Otimizados) ---
   Topic  Count                                               Name  \
0      0   4917  0_harry potter books_potter books_book harry_p...   
1      1   1656   1_best book read_loved book_best book_book great   
2      2    136  2_harry potter books_potter books_audio books_...   
3      3     87  3_awesome awesome awesome_book came_described_...   

                                      Representation  \
0  [harry potter books, potter books, book harry,...   
1  [best book read, loved book, best book, book g...   
2  [harry potter books, potter books, audio books...   
3  [awesome awesome awesome, book came, described...   

                                 Representative_Docs  
0  [harry potter and the sorcerers stone is a boo...  
1  [awesome book best ive ever read

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,...,User_id,profileName,score,time,summary,text,sentiment_label,sentiment_score,topic,topic_probability
721440,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,...,,,5.0,926640000,"A great book, I couldn't stop reading it.",famous before he can talk harry potter the son...,POSITIVE,0.986109,0,0.778921
721441,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,...,,,5.0,926640000,I believe this is the greatest book in history.,i believe this is the greatest book in history...,POSITIVE,0.999635,0,0.932609
721442,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,...,,,5.0,926553600,"It was great, it inspired me",this book harry potter and the sorcerers stone...,POSITIVE,0.996753,0,0.867016
721443,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,...,,,5.0,926467200,A suspenseful story,i enjoyed this book so much because it was so ...,POSITIVE,0.999263,0,0.960082
721444,Harry Potter and The Sorcerer's Stone,Celebrate 20 years of Harry Potter magic! Harr...,['J. K. Rowling'],http://books.google.com/books/content?id=HksgD...,http://books.google.com/books?id=HksgDQAAQBAJ&...,Bloomsbury Publishing,2014-01-09,http://books.google.com/books?id=HksgDQAAQBAJ&...,['Juvenile Fiction'],1.0,...,,,5.0,926467200,Rowling is magnificent!,harry potter takes your imagination on a wild ...,POSITIVE,0.999353,0,0.85111


In [8]:
info_topicos = trained_model.get_topic_info()
print(info_topicos.values)


[[0 4917
  '0_harry potter books_potter books_book harry_potter sorcerers stone'
  list(['harry potter books', 'potter books', 'book harry', 'potter sorcerers stone', 'read harry', 'harry potter', 'potter', 'harry potter sorcerers', 'potter sorcerers', 'hogwarts'])
  list(['harry potter and the sorcerers stone is a book about an  year old wizard named harry the only thing is harry doesnt know he is a wizard until the keeper of keys at hogwarts school of witchcraft and wizardry hagrid tells harry that he was accepted into hogwarts while at hogwarts harry discovers that the sorcerers stone is being kept there and someone is after it harry and his two friends ron and hermione must work together to try to stop who ever is after the stone harry potter and the sorcerers stone is a book full of magic and fantasy it is the kind of book that can keep you on the edge of your seat and you never want it to end', 'harry potter  harry potter and the sorcerers stone by j k rowlingive been a fan of ha