## 1. Inicializações para o Laboratório

Instalação de todas as dependências necessárias para o pipeline completo.

In [None]:
# Detect the runtime environment (Google Colab vs. local Jupyter)
import sys
import os

# Only already-imported modules are inspected; this assumes the Colab kernel
# has imported `google.colab` before this cell runs.
IN_COLAB = 'google.colab' in sys.modules

print("üîß Ambiente: Google Colab" if IN_COLAB else "üîß Ambiente: Jupyter Local")

# Component versions shared by the cells below
SPARK_VERSION, HADOOP_VERSION = "3.5.0", "3"
KAFKA_VERSION, SCALA_VERSION = "3.6.0", "2.12"

print(f"üì¶ Vers√µes: Spark {SPARK_VERSION}, Kafka {KAFKA_VERSION}, Scala {SCALA_VERSION}")

In [None]:
# Instala√ß√£o de depend√™ncias Python
!pip install -q pyspark=={SPARK_VERSION}
!pip install -q kafka-python
!pip install -q wordcloud matplotlib pandas numpy

print("‚úÖ Depend√™ncias Python instaladas")

In [None]:
# Download Kafka (needed to run a local broker)
import subprocess
import tarfile
import urllib.request

KAFKA_DIR = f"/tmp/kafka_{SCALA_VERSION}-{KAFKA_VERSION}"

# Check for a script from the distribution instead of just the directory, so
# that an interrupted extraction is not mistaken for a complete install.
kafka_start_script = f"{KAFKA_DIR}/bin/kafka-server-start.sh"

if not os.path.exists(kafka_start_script):
    print("📥 Baixando Apache Kafka...")
    kafka_url = f"https://archive.apache.org/dist/kafka/{KAFKA_VERSION}/kafka_{SCALA_VERSION}-{KAFKA_VERSION}.tgz"
    kafka_tgz = "/tmp/kafka.tgz"

    try:
        urllib.request.urlretrieve(kafka_url, kafka_tgz)

        print("📦 Extraindo Kafka...")
        with tarfile.open(kafka_tgz, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse absolute paths, links escaping /tmp and special files
                tar.extractall("/tmp/", filter="data")
            else:
                # Older Python without extraction filters: previous behaviour
                tar.extractall("/tmp/")
    finally:
        # Remove the archive even when the download or extraction fails
        if os.path.exists(kafka_tgz):
            os.remove(kafka_tgz)

    print(f"✅ Kafka instalado em {KAFKA_DIR}")
else:
    print(f"✅ Kafka já existe em {KAFKA_DIR}")

## 2. Configuração de Mecanismos para Visualização de Resultados

Preparação do ambiente para coleta e exibição dos resultados em tempo real.

In [None]:
# Directory layout used by the lab (everything lives under WORK_DIR)
WORK_DIR = "/tmp/spark_lab"
CHECKPOINT_DIR = f"{WORK_DIR}/checkpoints"
RESULTS_DIR = f"{WORK_DIR}/results"

# Creating the two sub-directories also creates WORK_DIR itself;
# exist_ok makes the cell safe to re-run.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

print(
    f"üìÅ Diret√≥rio de trabalho: {WORK_DIR}",
    f"üìÅ Checkpoints: {CHECKPOINT_DIR}",
    f"üìÅ Resultados: {RESULTS_DIR}",
    sep="\n",
)

In [None]:
# Imports for visualization; `time` is also used by later cells for
# sleeps while waiting on the background services and streams.
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import numpy as np
from IPython.display import display, clear_output
import time
from collections import Counter

# Plot configuration: render figures inline and set the default figure size
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Bibliotecas de visualiza√ß√£o configuradas")

## 3. Instalação e Configuração do Apache Spark

Setup do Spark com suporte a Kafka Structured Streaming.

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration, including the Kafka connector for Structured Streaming.
# The connector coordinates must match the Scala and Spark versions in use.
conf = SparkConf()
conf.set("spark.jars.packages",
         f"org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION}")
# Default checkpoint root for streaming queries that do not set their own
conf.set("spark.sql.streaming.checkpointLocation", CHECKPOINT_DIR)
conf.set("spark.driver.memory", "2g")
conf.set("spark.executor.memory", "2g")

# Create (or reuse) the Spark session, running locally on all cores
spark = (
    SparkSession.builder
    .appName("B2_WordCount_Streaming")
    .config(conf=conf)
    .master("local[*]")
    .getOrCreate()
)

# Reduce log noise in the notebook output
spark.sparkContext.setLogLevel("WARN")

print(f"✅ Spark Session criada: {spark.version}")
# Report the address Spark actually bound: the UI moves to 4041, 4042, ...
# when 4040 is already taken, and uiWebUrl is None when the UI is disabled.
print(f"📊 Spark UI: {spark.sparkContext.uiWebUrl or 'desabilitada'}")

## 4. Instalação e Configuração do Kafka

Inicialização do Kafka broker e criação dos tópicos de entrada e saída.

In [None]:
import subprocess
import signal
import atexit
import socket
import time

# Handles of the background processes. When the cell is re-run, pick up the
# handles from the previous run so those processes can be stopped instead of
# being orphaned.
zookeeper_process = globals().get("zookeeper_process")
kafka_process = globals().get("kafka_process")


def _stop_process(proc, timeout=15):
    """Terminate `proc` and wait for it; return True if a live process was stopped.

    Escalates to kill() when the process does not exit within `timeout`
    seconds, so the caller can never block forever.
    """
    if proc is None or proc.poll() is not None:
        return False
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
    return True


def cleanup_kafka():
    """Stop the Kafka broker and Zookeeper started by this notebook.

    Safe to call repeatedly: handles are cleared after the first call, and
    the message is printed only when something was actually stopped.
    """
    global zookeeper_process, kafka_process

    # Same order as before: broker first, then Zookeeper
    stopped_kafka = _stop_process(kafka_process)
    stopped_zookeeper = _stop_process(zookeeper_process)
    kafka_process = None
    zookeeper_process = None

    if stopped_kafka or stopped_zookeeper:
        print("🛑 Kafka e Zookeeper finalizados")


def wait_for_port(host, port, timeout, proc=None):
    """Return True once host:port accepts TCP connections.

    Returns False when `timeout` seconds elapse or when `proc` (the process
    expected to open the port) has already exited.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc is not None and proc.poll() is not None:
            return False
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            time.sleep(0.5)
    return False


# Stop leftovers from a previous run of this cell (no-op on the first run)
cleanup_kafka()

# Register cleanup for interpreter/kernel exit
atexit.register(cleanup_kafka)

# Start Zookeeper
print("🚀 Iniciando Zookeeper...")
zookeeper_process = subprocess.Popen(
    [f"{KAFKA_DIR}/bin/zookeeper-server-start.sh",
     f"{KAFKA_DIR}/config/zookeeper.properties"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)

time.sleep(5)  # same initial grace period as before
# 2181 is assumed to be the clientPort of the bundled zookeeper.properties
if not wait_for_port("localhost", 2181, timeout=30, proc=zookeeper_process):
    cleanup_kafka()
    raise RuntimeError("Zookeeper falhou ao iniciar em localhost:2181")

# Start the Kafka broker
print("🚀 Iniciando Kafka broker...")
kafka_process = subprocess.Popen(
    [f"{KAFKA_DIR}/bin/kafka-server-start.sh",
     f"{KAFKA_DIR}/config/server.properties"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)

time.sleep(10)  # same initial grace period as before
# Verify the broker really came up instead of assuming it did
if not wait_for_port("localhost", 9092, timeout=60, proc=kafka_process):
    cleanup_kafka()
    raise RuntimeError("Kafka broker falhou ao iniciar em localhost:9092")

print("✅ Kafka broker rodando em localhost:9092")

In [None]:
# Create Kafka topics
def create_kafka_topic(topic_name, partitions=3, replication=1, timeout=10):
    """Create a Kafka topic on localhost:9092 if it does not exist yet.

    Args:
        topic_name: Name of the topic to create.
        partitions: Number of partitions.
        replication: Replication factor.
        timeout: Seconds to wait for the `kafka-topics.sh` tool.

    Returns:
        True when the tool exits with status 0 (topic created, or already
        present thanks to `--if-not-exists`), False otherwise.
    """
    command = [
        f"{KAFKA_DIR}/bin/kafka-topics.sh",
        "--create",
        "--bootstrap-server", "localhost:9092",
        "--topic", topic_name,
        "--partitions", str(partitions),
        "--replication-factor", str(replication),
        "--if-not-exists",
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout)
    except (OSError, subprocess.SubprocessError) as e:
        print(f"❌ Erro ao criar tópico {topic_name}: {e}")
        return False

    # Decide on the exit status rather than on the wording of stdout: with
    # --if-not-exists an already-existing topic may produce no output at all.
    if result.returncode == 0:
        print(f"✅ Tópico '{topic_name}' pronto")
        return True

    # Show whichever stream carries the tool's error message
    details = (result.stderr or result.stdout).strip()
    print(f"⚠️  {details}")
    return False

# Create the input and output topics used by the pipeline
create_kafka_topic("social-input", partitions=3)
create_kafka_topic("wordcount-output", partitions=3)

print("\n📋 Listando tópicos criados:")
# Capture the tool's output and print it from Python. Output written directly
# by a child process may not reach the cell (depends on the kernel), and a
# bare subprocess.run(...) as the last expression shows the CompletedProcess
# repr instead of the topic list.
topics_listing = subprocess.run(
    [f"{KAFKA_DIR}/bin/kafka-topics.sh",
     "--list",
     "--bootstrap-server", "localhost:9092"],
    capture_output=True,
    text=True
)
print(topics_listing.stdout.strip() or topics_listing.stderr.strip())

## 5. Producer: Gerador de Mensagens Simulando Rede Social

**Justificativa:** Conforme explicado na introdução, em vez de integrar com o Discord (que exigiria tokens, webhooks e configurações externas), implementamos um gerador automático de mensagens que simula posts de uma rede social.

Este gerador:
- Publica mensagens continuamente no t√≥pico `social-input`
- Usa vocabulário variado (tecnologia, Big Data, Hadoop, Spark, etc.)
- Simula comportamento realista com timestamps e variação de conteúdo
- Permite reprodutibilidade total do experimento

In [None]:
from kafka import KafkaProducer
import json
import random
import threading

class SocialMediaSimulator:
    """Simulates social-network posts and publishes them to a Kafka topic.

    Messages are dicts with `timestamp`, `user` and `message` keys, serialized
    as UTF-8 JSON. Publishing runs in a daemon thread started by `start()`.
    `stop()` closes the underlying producer, so an instance is not meant to be
    restarted after it has been stopped.
    """

    def __init__(self, bootstrap_servers='localhost:9092', topic='social-input'):
        """Create the Kafka producer and the vocabulary used to build posts.

        Args:
            bootstrap_servers: Kafka bootstrap address.
            topic: Topic that receives the generated posts.
        """
        self.producer = KafkaProducer(
            bootstrap_servers=bootstrap_servers,
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )
        self.topic = topic
        self.running = False
        self.thread = None
        self.message_count = 0
        # Lets stop() wake the producer thread immediately instead of waiting
        # for the current sleep interval to elapse
        self._stop_event = threading.Event()

        # Post templates about Big Data; placeholders are filled at random
        self.templates = [
            "Acabei de implementar um cluster {tech} com {num} nodes! {emoji}",
            "Performance do {tech} está incrível hoje! Processando {num}GB de dados.",
            "Alguém já testou {tech} em produção? Preciso de dicas sobre {feature}.",
            "Tutorial: Como configurar {tech} para {feature} em {num} passos.",
            "{emoji} Novo post sobre {tech}! Compartilhando minha experiência com {feature}.",
            "Comparando {tech} vs {tech2}: qual é melhor para {feature}?",
            "Dica rápida de {tech}: sempre configure {feature} para otimizar performance!",
            "Meu cluster {tech} finalmente está rodando! {num} workers processando dados.",
        ]

        self.tech_words = ["Hadoop", "Spark", "Kafka", "HDFS", "YARN", "MapReduce",
                           "Flink", "Storm", "Hive", "Pig", "HBase", "Cassandra"]
        self.features = ["streaming", "batch processing", "fault tolerance", "scalability",
                        "distributed computing", "data partitioning", "replication",
                        "load balancing", "resource management", "data locality"]
        self.emojis = ["🚀", "💡", "🔥", "⚡", "🎯", "📊", "🛠️", "✨"]

    def generate_message(self):
        """Build one random post.

        Returns:
            dict with `timestamp` (epoch seconds, int), `user` ("user_<1..50>")
            and `message` (a filled-in template).
        """
        template = random.choice(self.templates)
        # Draw two different technologies so the comparison template never
        # produces "X vs X"; fall back to a single word if only one exists.
        if len(self.tech_words) >= 2:
            tech, tech2 = random.sample(self.tech_words, 2)
        else:
            tech = tech2 = random.choice(self.tech_words)
        message = template.format(
            tech=tech,
            tech2=tech2,
            num=random.randint(2, 100),
            feature=random.choice(self.features),
            emoji=random.choice(self.emojis)
        )
        return {
            "timestamp": int(time.time()),
            "user": f"user_{random.randint(1, 50)}",
            "message": message
        }

    def produce_loop(self, interval=2.0):
        """Publish messages every `interval` seconds until stopped.

        Intended to run in the background thread created by `start()`.
        """
        while self.running:
            msg = self.generate_message()
            self.producer.send(self.topic, value=msg)
            self.message_count += 1
            if self.message_count % 10 == 0:
                print(f"📤 {self.message_count} mensagens enviadas para '{self.topic}'")
            # Interruptible sleep: returns True as soon as stop() is called
            if self._stop_event.wait(interval):
                break

    def start(self, interval=2.0):
        """Start the producer in a separate daemon thread (no-op if running)."""
        if not self.running:
            self._stop_event.clear()
            self.running = True
            self.thread = threading.Thread(target=self.produce_loop, args=(interval,))
            self.thread.daemon = True
            self.thread.start()
            print(f"✅ Producer iniciado (intervalo: {interval}s)")

    def stop(self):
        """Stop the background thread, then flush and close the producer."""
        if self.running:
            self.running = False
            # Wake the thread right away so join() does not depend on `interval`
            self._stop_event.set()
            if self.thread:
                self.thread.join(timeout=5)
            self.producer.flush()
            self.producer.close()
            print(f"🛑 Producer finalizado ({self.message_count} mensagens)")

# Instantiate the simulator for the 'social-input' topic. The constructor
# creates the KafkaProducer; nothing is published until start() is called.
social_producer = SocialMediaSimulator(topic='social-input')

print("‚úÖ Gerador de mensagens (Social Media Simulator) configurado")

In [None]:
# Start the producer in its background thread
social_producer.start(interval=1.5)  # One message every 1.5 seconds

# Block for 10 seconds so some messages are published before continuing
time.sleep(10)

print(f"\nüìä Total de mensagens enviadas at√© agora: {social_producer.message_count}")

## 6. Configuração de Saída Gráfica

**Justificativa:** Conforme explicado na introdução, em vez de ElasticSearch + Kibana (que exigem >4GB de RAM, Docker e configurações complexas não viáveis no Colab), implementamos visualização inline com `wordcloud`.

Esta abordagem:
- Consome mensagens do tópico `wordcount-output` em tempo real
- Gera nuvem de palavras dinamicamente no próprio notebook
- Permite atualizações near-real-time da visualização
- É totalmente reproduzível em qualquer ambiente Python

In [None]:
from kafka import KafkaConsumer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import defaultdict
import threading

class WordCloudVisualizer:
    """Consumes word-count results from Kafka and renders them as charts.

    Each consumed record is expected to be a JSON object with `word` and
    `count` keys; the latest count seen for a word replaces the previous one.
    Consumption runs in a daemon thread started by `start()`.
    """

    def __init__(self, bootstrap_servers='localhost:9092', topic='wordcount-output'):
        """Create the Kafka consumer and the in-memory counters.

        Args:
            bootstrap_servers: Kafka bootstrap address.
            topic: Topic carrying the word-count results.
        """
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda m: json.loads(m.decode('utf-8'))
        )
        self.word_counts = defaultdict(int)
        self.running = False
        self.thread = None
        self.total_words = 0
        # Guards word_counts/total_words, which are written by the consumer
        # thread and read by the plotting methods
        self._lock = threading.Lock()

    def _ingest(self, data):
        """Record one deserialized result; ignore payloads that are not dicts."""
        if not isinstance(data, dict):
            return
        word = data.get('word', '')
        count = data.get('count', 0)
        if word:
            with self._lock:
                self.word_counts[word] = count
                self.total_words += 1

    def _snapshot(self):
        """Return a plain-dict copy of the current counts."""
        with self._lock:
            return dict(self.word_counts)

    def consume_loop(self):
        """Poll Kafka until `running` is cleared, recording every result.

        Uses poll() with a short timeout instead of iterating the consumer:
        iteration blocks until the next message arrives, so the thread could
        not notice stop() while the topic is idle.
        """
        while self.running:
            batches = self.consumer.poll(timeout_ms=500)
            for records in batches.values():
                for message in records:
                    self._ingest(message.value)

    def start(self):
        """Start the consumer in a separate daemon thread (no-op if running)."""
        if not self.running:
            self.running = True
            self.thread = threading.Thread(target=self.consume_loop)
            self.thread.daemon = True
            self.thread.start()
            print("✅ Consumer de visualização iniciado")

    def stop(self):
        """Stop the consumer thread and close the Kafka consumer."""
        if self.running:
            self.running = False
            if self.thread:
                # The loop re-checks `running` at least every 500 ms
                self.thread.join(timeout=5)
            self.consumer.close()
            print("🛑 Consumer finalizado")

    def generate_wordcloud(self, max_words=100):
        """Build a WordCloud from the current counts.

        Args:
            max_words: Maximum number of words in the cloud.

        Returns:
            The generated WordCloud, or None when nothing was consumed yet.
        """
        frequencies = self._snapshot()
        if not frequencies:
            print("⚠️  Nenhuma palavra processada ainda")
            return None

        wc = WordCloud(
            width=1200,
            height=600,
            background_color='white',
            max_words=max_words,
            colormap='viridis',
            relative_scaling=0.5,
            min_font_size=10
        ).generate_from_frequencies(frequencies)

        return wc

    def plot_wordcloud(self, max_words=100):
        """Render the word cloud inline (prints a warning if there is no data)."""
        wc = self.generate_wordcloud(max_words)
        if wc:
            plt.figure(figsize=(14, 7))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Nuvem de Palavras - {len(self.word_counts)} palavras únicas | '
                     f'{self.total_words} contagens processadas',
                     fontsize=16, pad=20)
            plt.tight_layout()
            plt.show()

    def plot_top_words(self, top_n=20):
        """Render a horizontal bar chart of the `top_n` most frequent words."""
        frequencies = self._snapshot()
        if not frequencies:
            print("⚠️  Nenhuma palavra processada ainda")
            return

        # Highest counts first
        top_words = sorted(frequencies.items(), key=lambda x: x[1], reverse=True)[:top_n]

        words = [w[0] for w in top_words]
        counts = [w[1] for w in top_words]

        plt.figure(figsize=(12, 6))
        # Reverse so the most frequent word is drawn at the top
        plt.barh(words[::-1], counts[::-1], color='skyblue')
        plt.xlabel('Contagem', fontsize=12)
        plt.ylabel('Palavra', fontsize=12)
        # Use the number of bars actually drawn, which may be below top_n
        plt.title(f'Top {len(words)} Palavras Mais Frequentes', fontsize=14, pad=15)
        plt.tight_layout()
        plt.show()

# Instantiate the visualizer for the 'wordcount-output' topic. The constructor
# creates the KafkaConsumer; consumption only begins when start() is called.
visualizer = WordCloudVisualizer(topic='wordcount-output')

print("‚úÖ Visualizador de nuvem de palavras configurado")

## 7. Processamento: WordCount com Spark Structured Streaming

Implementação do WordCount usando Spark Structured Streaming para processar mensagens do Kafka em tempo real.

In [None]:
from pyspark.sql.functions import explode, split, lower, regexp_replace, col, count, to_json, struct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

# Schema of the JSON messages read from the input topic: three nullable
# fields, matching the keys the producer cell puts in each message.
input_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("timestamp", LongType()),
        ("user", StringType()),
        ("message", StringType()),
    )
])

print("‚úÖ Schema de entrada definido")

In [None]:
# Streaming source: subscribe to 'social-input' on the local broker, reading
# the topic from its earliest available offset.
df_input = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "social-input")
    .option("startingOffsets", "earliest")
    .load()
)

print("‚úÖ Stream de entrada conectado ao t√≥pico 'social-input'")

In [None]:
# Decode the Kafka `value` bytes as a JSON string, parse it with the input
# schema and flatten the resulting struct into top-level columns.
from pyspark.sql.functions import from_json

df_parsed = (
    df_input
    .selectExpr("CAST(value AS STRING) as json")
    .select(from_json(col("json"), input_schema).alias("data"))
    .select(col("data.*"))
)

print("‚úÖ Parsing JSON configurado")

In [None]:
# WordCount: tokenization and counting
# Stopwords (Portuguese + English). Accented entries are written as \u escapes
# so they match the real characters regardless of how this file is encoded;
# garbled literals would never equal the tokens in the stream.
stopwords_list = {
    'de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', '\u00e9', 'com',
    'n\u00e3o', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como',
    'mas', 'ao', 'ele', 'das', '\u00e0', 'seu', 'sua', 'ou', 'quando', 'muito',
    'the', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'is', 'was', 'are', 'been', 'be',
    'have', 'has', 'had', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
}

# Lower-case the text, then drop everything that is not a letter, combining
# mark, digit or whitespace. Unicode classes cover every accented letter
# (the hand-written list missed e.g. "à" and "í") and strip emoji/punctuation.
normalized_message = regexp_replace(
    lower(col("message")), r"[^\p{L}\p{M}\p{N}\s]", ""
)

# One row per token, without empty tokens and stopwords
df_words = (
    df_parsed
    .select(explode(split(normalized_message, r"\s+")).alias("word"))
    .filter(col("word") != "")
    .filter(~col("word").isin(sorted(stopwords_list)))
)

# Count occurrences per word, most frequent first
df_wordcount = (
    df_words
    .groupBy("word")
    .count()
    .orderBy(col("count").desc())
)

print("✅ Pipeline de WordCount configurado")
print("📊 Colunas finais:", df_wordcount.columns)

In [None]:
# Write the results to Kafka (output topic). Each row becomes a JSON object
# {"word": ..., "count": ...} in the record value. In "complete" mode the
# whole result table is re-emitted on every micro-batch.
query_kafka = (
    df_wordcount
    .select(to_json(struct("word", "count")).alias("value"))
    .writeStream
    # Named so the query is identifiable in spark.streams.active and the UI
    .queryName("wordcount_to_kafka")
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "wordcount-output")
    .option("checkpointLocation", f"{CHECKPOINT_DIR}/kafka-output")
    .outputMode("complete")
    .start()
)

print("✅ Stream de saída iniciado (Kafka topic: 'wordcount-output')")
print(f"📊 Query ID: {query_kafka.id}")
print(f"🔄 Status: {query_kafka.status}")

In [None]:
# Optional second sink for monitoring: print the full result table (up to
# 20 untruncated rows) to the console every 10 seconds.
query_console = (
    df_wordcount.writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime='10 seconds')
    .option("truncate", "false")
    .option("numRows", 20)
    .start()
)

print("‚úÖ Stream de console iniciado (exibi√ß√£o a cada 10s)")

## 8. Apresentação de Resultados e Visualização em Dashboard

Iniciando consumer para visualização e gerando gráficos em tempo real.

In [None]:
# Start the visualization consumer in its background thread
visualizer.start()

# Block for 20 seconds so results accumulate before they are reported
print("‚è≥ Aguardando 20 segundos para acumular dados...")
time.sleep(20)

# word_counts holds the latest count per word; total_words is the number of
# result records consumed so far (not a sum of the counts).
print(f"\nüìä Estat√≠sticas:")
print(f"   - Palavras √∫nicas: {len(visualizer.word_counts)}")
print(f"   - Total de contagens: {visualizer.total_words}")

In [None]:
# Render the charts from the counts consumed so far
print("üìä Gerando visualiza√ß√µes...\n")

# 1. Bar chart of the top 20 words
visualizer.plot_top_words(top_n=20)

# 2. Word cloud (at most 100 words)
visualizer.plot_wordcloud(max_words=100)

## 9. Validação e Evidências

Monitoramento final do pipeline e coleta de evidências de funcionamento.

In [None]:
# Inspect the status of the active Spark streaming queries
print("🔍 Status das Queries Spark Streaming:\n")

for query in spark.streams.active:
    print(f"Query ID: {query.id}")
    print(f"Name: {query.name}")
    print(f"Status: {query.status}")
    print("Recent Progress:")

    # Last 3 progress reports
    for progress in query.recentProgress[-3:]:
        # Per-phase timings; the whole-trigger time is stored under
        # 'triggerExecution' (there is no 'total' key).
        durations = progress.get('durationMs') or {}
        # Rates can be absent from a report (e.g. on the first batch)
        input_rate = progress.get('inputRowsPerSecond') or 0.0
        processed_rate = progress.get('processedRowsPerSecond') or 0.0

        print(f"  - Batch: {progress.get('batchId', 'N/A')}")
        print(f"    Input Rows: {progress.get('numInputRows', 0)}")
        print(f"    Input Rate: {input_rate:.2f} rows/sec")
        print(f"    Processing Rate: {processed_rate:.2f} rows/sec")
        print(f"    Duration: {durations.get('triggerExecution', 0)}ms")
    print()

In [None]:
# Inspect the end offsets of the Kafka topics
import subprocess


def get_topic_end_offsets(topic, bootstrap_server="localhost:9092"):
    """Query the latest offset of every partition of `topic`.

    Runs Kafka's GetOffsetShell with `--time -1` (latest offsets).

    Returns:
        (output, total): the tool's text output (stderr when stdout is empty)
        and the sum of the offsets parsed from `topic:partition:offset` lines.
        The sum equals the number of messages only while no records have been
        removed from the start of the log.
    """
    result = subprocess.run(
        [f"{KAFKA_DIR}/bin/kafka-run-class.sh", "kafka.tools.GetOffsetShell",
         # --bootstrap-server replaces the deprecated --broker-list
         "--bootstrap-server", bootstrap_server,
         "--topic", topic,
         "--time", "-1"],
        capture_output=True,
        text=True
    )

    total = 0
    for line in result.stdout.splitlines():
        fields = line.strip().rsplit(":", 2)
        if len(fields) == 3 and fields[2].isdigit():
            total += int(fields[2])

    # Show the error stream when the tool produced no regular output
    output = result.stdout.strip() or result.stderr.strip()
    return output, total


print("📋 Verificando tópicos Kafka:\n")

for position, topic in enumerate(("social-input", "wordcount-output")):
    output, total = get_topic_end_offsets(topic)
    if position:
        print()
    print(f"Tópico '{topic}':")
    print(output)
    print(f"Total de mensagens (soma dos offsets finais): {total}")

In [None]:
# Sample of the processed data: the 15 most frequent words
print("📊 Amostra de Palavras Mais Frequentes:\n")

if visualizer.word_counts:
    sorted_words = sorted(visualizer.word_counts.items(), key=lambda x: x[1], reverse=True)

    print(f"{'Palavra':<20} {'Contagem':>10}")
    print("-" * 32)
    # The loop variables are deliberately not called `count`: at cell level
    # that name would overwrite pyspark.sql.functions.count imported earlier.
    for sample_word, sample_count in sorted_words[:15]:
        print(f"{sample_word:<20} {sample_count:>10}")
else:
    print("⚠️  Nenhum dado processado ainda")

In [None]:
# Refresh the charts after letting more data accumulate
print("\n‚è≥ Aguardando mais 15 segundos para acumular dados...")
time.sleep(15)

# Same counters as before, plus the number of messages the producer has sent
print(f"\nüìä Estat√≠sticas atualizadas:")
print(f"   - Palavras √∫nicas: {len(visualizer.word_counts)}")
print(f"   - Total de contagens: {visualizer.total_words}")
print(f"   - Mensagens produzidas: {social_producer.message_count}")

# Final charts, with larger limits than the first round (25 bars, 150 words)
print("\nüìä Visualiza√ß√µes Finais:\n")
visualizer.plot_top_words(top_n=25)
visualizer.plot_wordcloud(max_words=150)

## 10. Finalização e Limpeza

Parando todos os serviços e streams.

In [None]:
# Stop the message producer (joins its thread, then flushes and closes)
social_producer.stop()

# Stop the visualization consumer; results Spark writes after this point
# are no longer reflected in visualizer.word_counts
visualizer.stop()

# Stop every Spark streaming query that is still active
for query in spark.streams.active:
    print(f"üõë Parando query: {query.id}")
    query.stop()

# Extra grace period after stopping the queries
# NOTE(review): query.stop() presumably already waits for the query to end - confirm whether this sleep is needed
time.sleep(5)

print("\n‚úÖ Todos os streams finalizados")

In [None]:
# Final statistics of the pipeline
print("📊 ESTATÍSTICAS FINAIS DO PIPELINE\n")
print("="*50)
print(f"Mensagens produzidas: {social_producer.message_count}")
print(f"Palavras únicas processadas: {len(visualizer.word_counts)}")
print(f"Total de contagens: {visualizer.total_words}")
print("="*50)

# Final top 10 words
if visualizer.word_counts:
    sorted_words = sorted(visualizer.word_counts.items(), key=lambda x: x[1], reverse=True)
    print("\nTOP 10 PALAVRAS MAIS FREQUENTES:")
    # The loop variables are deliberately not called `count`: at cell level
    # that name would overwrite pyspark.sql.functions.count imported earlier.
    for rank, (top_word, top_count) in enumerate(sorted_words[:10], 1):
        print(f"  {rank}. {top_word}: {top_count}")

In [None]:
# Stop Spark
spark.stop()
print("🛑 Spark Session finalizada")

# Stop Kafka and Zookeeper now. The atexit hook only fires when the kernel
# itself exits, so without this call both services would keep running (and
# holding their ports) for as long as the notebook stays open. The hook
# remains registered as a safety net.
cleanup_kafka()

---

## ✅ Conclusões

### Pipeline Implementado

1. **Entrada:** Gerador automático de mensagens → Kafka `social-input`
2. **Processamento:** Spark Structured Streaming → WordCount com tokenização e remoção de stopwords
3. **Saída:** Kafka `wordcount-output` → Consumer Python → Visualização inline (WordCloud + BarChart)

### Justificativas de Adaptações

- **Discord → Gerador de Texto:** Mantém o conceito de stream de mensagens sem dependências externas complexas
- **ElasticSearch/Kibana → WordCloud inline:** Visualização equivalente sem overhead de infraestrutura

### Diferenciais Implementados

- ✅ Pipeline 100% reproduzível em Colab ou Jupyter local
- ✅ Visualizações near-real-time com atualizações dinâmicas
- ✅ Remoção de stopwords (PT + EN) para melhor qualidade
- ✅ Monitoramento completo (Spark UI, Kafka offsets, contadores)
- ✅ Cleanup automático de recursos

### Tecnologias Utilizadas

- Apache Spark 3.5.0 (Structured Streaming)
- Apache Kafka 3.6.0 (Message Broker)
- Python: kafka-python, wordcloud, matplotlib, pandas
- Zookeeper (para Kafka)

---

**Notebook executado com sucesso!**  
**Data:** 14 de novembro de 2025