0. Instalando pacotes

In [2]:
# ================================================
# VERIFICAÇÃO INTELIGENTE DE PACOTES
# ================================================

import subprocess
import sys

def smart_package_check():
    """Verifica e instala apenas pacotes necessários"""

    # Versões desejadas
    target_versions = {
        'torch': '2.3.0',
        'transformers': '4.51.3',
        'peft': '0.15.2',
        'sentence-transformers': '2.5.1'
    }

    needs_reinstall = []

    # Verificar cada pacote
    for package, target_version in target_versions.items():
        try:
            result = subprocess.run([sys.executable, '-c',
                f"import {package.replace('-', '_')}; print({package.replace('-', '_')}.__version__)"],
                capture_output=True, text=True)

            if result.returncode == 0:
                current_version = result.stdout.strip()
                if current_version != target_version:
                    print(f"❌ {package}: {current_version} → {target_version}")
                    needs_reinstall.append(package)
                else:
                    print(f"✅ {package}: {current_version}")
            else:
                print(f"❌ {package}: não instalado")
                needs_reinstall.append(package)
        except:
            print(f"❌ {package}: erro na verificação")
            needs_reinstall.append(package)

    # Reinstalar apenas se necessário
    if needs_reinstall:
        print(f"\n🔄 Reinstalando {len(needs_reinstall)} pacote(s)...")

        # Desinstalar
        !pip uninstall -y {' '.join(needs_reinstall)}

        # Reinstalar com versões corretas
        if 'torch' in needs_reinstall:
            !pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0
        if 'transformers' in needs_reinstall:
            !pip install transformers==4.51.3
        if 'peft' in needs_reinstall:
            !pip install peft==0.15.2
        if 'sentence-transformers' in needs_reinstall:
            !pip install sentence-transformers==2.5.1

        # Instalar pacotes adicionais
        !pip install optuna shap lime wordcloud

        print("✅ Reinstalação concluída!")
    else:
        # Verificar pacotes adicionais
        additional = ['optuna', 'shap', 'lime', 'wordcloud']
        missing = []

        for pkg in additional:
            try:
                __import__(pkg.replace('-', '_'))
                print(f"✅ {pkg}: instalado")
            except ImportError:
                missing.append(pkg)

        if missing:
            print(f"📦 Instalando pacotes adicionais: {missing}")
            !pip install {' '.join(missing)}
        else:
            print("🎉 Todos os pacotes já estão corretos!")

# Executar verificação
smart_package_check()

❌ torch: 2.6.0+cu124 → 2.3.0
❌ transformers: 4.52.4 → 4.51.3
✅ peft: 0.15.2
❌ sentence-transformers: 4.1.0 → 2.5.1

🔄 Reinstalando 3 pacote(s)...
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Collecting torch==2.3.0
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.18.0
  Downloading torchvision-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.3.0
  Downloading torchaudio-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-non

1. Importando pacotes

In [3]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# 2. Load Dataset

In [6]:
# ================================================
# MONTAR GOOGLE DRIVE CORRETAMENTE
# ================================================

from google.colab import drive
import os

# 1. Montar o Google Drive na raiz
drive.mount('/content/drive')

# 2. Definir o caminho para seu projeto
project_path = '/content/drive/MyDrive/Doutorado/2025.2/Deep Learning/projeto'

# 3. Verificar se o diretório existe
if os.path.exists(project_path):
    print(f"✅ Diretório encontrado: {project_path}")

    # Navegar para o diretório do projeto
    os.chdir(project_path)
    print(f"📁 Diretório atual: {os.getcwd()}")

    # Listar arquivos no diretório
    print("\n📋 Arquivos no diretório:")
    for item in os.listdir('.'):
        print(f"  - {item}")

else:
    print(f"❌ Diretório não encontrado: {project_path}")
    print("\n🔍 Verificando estrutura do Drive...")

    # Verificar estrutura do Drive
    base_path = '/content/drive/MyDrive'
    if os.path.exists(base_path):
        print(f"\n📁 Conteúdo de {base_path}:")
        for item in os.listdir(base_path):
            print(f"  - {item}")

    # Criar diretório se não existir
    print(f"\n🔧 Criando diretório: {project_path}")
    os.makedirs(project_path, exist_ok=True)
    os.chdir(project_path)
    print("✅ Diretório criado e definido como atual")

Mounted at /content/drive
✅ Diretório encontrado: /content/drive/MyDrive/Doutorado/2025.2/Deep Learning/projeto
📁 Diretório atual: /content/drive/MyDrive/Doutorado/2025.2/Deep Learning/projeto

📋 Arquivos no diretório:
  - Detection of fake news using deep learning CNN–RNN based methods.pdf
  - fake-news-classifier-naive-bayes.ipynb
  - classificacao_fake_news_multimodelos-1.pdf
  - Fake News Dataset.zip
  - fake_news_dataset_multimodel.ipynb


In [12]:
# ================================================
# CARREGAR DATASET COM CACHE INTELIGENTE
# ================================================

import pandas as pd
import os
import zipfile
import re
from datetime import datetime

print("🚀 Carregando dataset com verificação de cache...")

# ================================================
# 1. CONFIGURAR CAMINHOS (VARIÁVEIS GLOBAIS)
# ================================================

# Caminhos no Google Drive
project_dir = '/content/drive/MyDrive/Doutorado/2025.2/Deep Learning/projeto'
drive_zip_path = os.path.join(project_dir, 'Fake News Dataset.zip')
processed_csv_path = os.path.join(project_dir, 'fake_news_processed.csv')
metadata_path = os.path.join(project_dir, 'dataset_metadata.txt')

# Diretório temporário para extração
extract_dir = '/tmp/fake_news_data'

print(f"📁 Diretório do projeto: {project_dir}")
print(f"📦 Arquivo ZIP: {os.path.basename(drive_zip_path)}")
print(f"💾 Arquivo processado: {os.path.basename(processed_csv_path)}")

# ================================================
# 2. VERIFICAR SE DATASET PROCESSADO JÁ EXISTE
# ================================================

def check_processed_dataset():
    """Verifica se o dataset processado já existe e está atualizado"""

    if not os.path.exists(processed_csv_path):
        print("❌ Dataset processado não encontrado")
        return False

    if not os.path.exists(drive_zip_path):
        print("⚠️  Arquivo ZIP original não encontrado, mas dataset processado existe")
        return True

    # Comparar datas de modificação
    zip_mtime = os.path.getmtime(drive_zip_path)
    csv_mtime = os.path.getmtime(processed_csv_path)

    if zip_mtime > csv_mtime:
        print("⚠️  Arquivo ZIP é mais recente que o dataset processado")
        return False

    # Verificar se o arquivo não está vazio
    try:
        df_test = pd.read_csv(processed_csv_path, nrows=1)
        if len(df_test.columns) < 2:
            print("⚠️  Dataset processado parece estar corrompido")
            return False
    except Exception as e:
        print(f"⚠️  Erro ao verificar dataset processado: {e}")
        return False

    print("✅ Dataset processado encontrado e atualizado")
    return True

# ================================================
# 3. CARREGAR DATASET PROCESSADO
# ================================================

def load_processed_dataset():
    """Carrega o dataset já processado"""

    print("📊 Carregando dataset processado...")

    try:
        data = pd.read_csv(processed_csv_path)

        # Validações básicas
        required_columns = ['text', 'label']
        if not all(col in data.columns for col in required_columns):
            raise ValueError(f"Colunas necessárias não encontradas: {required_columns}")

        # Verificar tipos de dados
        if not pd.api.types.is_numeric_dtype(data['label']):
            print("🔧 Convertendo coluna 'label' para numérico...")
            data['label'] = pd.to_numeric(data['label'], errors='coerce')
            data = data.dropna(subset=['label'])
            data['label'] = data['label'].astype(int)

        # Remover linhas vazias
        initial_len = len(data)
        data = data.dropna(subset=['text', 'label'])
        data = data[data['text'].astype(str).str.strip() != '']

        if len(data) < initial_len:
            print(f"🧹 Removidas {initial_len - len(data)} linhas vazias/inválidas")

        print(f"✅ Dataset carregado: {len(data):,} registros")
        return data

    except Exception as e:
        print(f"❌ Erro ao carregar dataset processado: {e}")
        return None

# ================================================
# 4. PROCESSAR DATASET ORIGINAL (FUNÇÕES EXISTENTES)
# ================================================

def detect_delimiter(file_path, sample_size=1024):
    """Detecta automaticamente o delimitador do arquivo CSV"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            sample = f.read(sample_size)
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as f:
            sample = f.read(sample_size)

    delimiters = [',', ';', '\t', '|']
    counts = {delim: sample.count(delim) for delim in delimiters}
    best_delim = max(counts, key=counts.get)
    return best_delim if counts[best_delim] > 0 else ','

def process_csv_file(file_path):
    """Processa um arquivo CSV individual"""
    try:
        delimiter = detect_delimiter(file_path)

        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        df = None

        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, delimiter=delimiter, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue

        if df is None:
            raise ValueError("Não foi possível ler o arquivo com nenhum encoding")

        # Renomear colunas se necessário
        if 'text;label' in df.columns:
            df[['text', 'label']] = df['text;label'].str.split(';', n=1, expand=True)
            df = df.drop(columns=['text;label'])

        # Garantir que temos as colunas necessárias
        if 'text' not in df.columns or 'label' not in df.columns:
            text_cols = [col for col in df.columns if any(keyword in col.lower()
                        for keyword in ['text', 'content', 'news', 'article', 'title'])]
            label_cols = [col for col in df.columns if any(keyword in col.lower()
                         for keyword in ['label', 'class', 'target', 'fake', 'real'])]

            if text_cols and label_cols:
                df = df.rename(columns={text_cols[0]: 'text', label_cols[0]: 'label'})
            else:
                if len(df.columns) == 1:
                    col_name = df.columns[0]
                    if ';' in str(df.iloc[0, 0]):
                        df_split = df[col_name].str.split(';', expand=True)
                        if df_split.shape[1] >= 2:
                            df = df_split.rename(columns={0: 'text', 1: 'label'})

        if 'text' not in df.columns or 'label' not in df.columns:
            return None

        # Processar rótulos
        df['label'] = df['label'].astype(str).str.strip().str.lower()

        label_mapping = {
            'real': 1, 'true': 1, 'reliable': 1, 'legitimate': 1,
            'fake': 0, 'false': 0, 'unreliable': 0, 'illegitimate': 0,
            'verdadeiro': 1, 'verdadeira': 1, 'confiável': 1,
            'falso': 0, 'falsa': 0, 'não confiável': 0,
            '1': 1, '1.0': 1, '0': 0, '0.0': 0,
            'yes': 1, 'no': 0, 'sim': 1, 'não': 0, 'nao': 0
        }

        df['label'] = df['label'].map(label_mapping)
        df = df.dropna(subset=['label'])
        df['label'] = df['label'].astype(int)

        # Limpeza de texto
        df = df.dropna(subset=['text'])
        df['text'] = df['text'].astype(str).str.strip()
        df = df[df['text'] != '']
        df = df[df['text'] != 'nan']

        return df, delimiter

    except Exception as e:
        print(f"🚨 Erro ao processar {file_path}: {str(e)}")
        return None

def find_zip_file():
    """Encontra o arquivo ZIP no diretório do projeto"""
    global drive_zip_path

    # Verificar se arquivo ZIP padrão existe
    if os.path.exists(drive_zip_path):
        return drive_zip_path

    print(f"❌ Arquivo ZIP padrão não encontrado: {os.path.basename(drive_zip_path)}")

    # Buscar arquivos ZIP no diretório
    if os.path.exists(project_dir):
        print("🔍 Procurando arquivos ZIP no diretório...")
        zip_files = [f for f in os.listdir(project_dir) if f.lower().endswith('.zip')]

        if zip_files:
            # Usar o primeiro arquivo ZIP encontrado
            new_zip_path = os.path.join(project_dir, zip_files[0])
            print(f"💡 Arquivo ZIP encontrado: {zip_files[0]}")

            # Atualizar variável global
            drive_zip_path = new_zip_path
            return drive_zip_path
        else:
            print("❌ Nenhum arquivo ZIP encontrado no diretório")
            return None
    else:
        print("❌ Diretório do projeto não encontrado")
        return None

def process_original_dataset():
    """Processa o dataset original do arquivo ZIP"""

    print("🔄 Processando dataset original...")

    # Encontrar arquivo ZIP
    zip_file_path = find_zip_file()
    if not zip_file_path:
        raise FileNotFoundError("Nenhum arquivo ZIP encontrado!")

    # Extrair arquivo ZIP
    print(f"📦 Extraindo arquivo ZIP: {os.path.basename(zip_file_path)}")
    try:
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"✅ Arquivo extraído para: {extract_dir}")
    except Exception as e:
        print(f"❌ Erro ao extrair arquivo: {e}")
        raise

    # Encontrar arquivos CSV
    csv_files_found = []
    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            if file.lower().endswith('.csv'):
                csv_files_found.append(os.path.join(root, file))

    print(f"🔍 Arquivos CSV encontrados: {len(csv_files_found)}")

    if not csv_files_found:
        print("❌ Nenhum arquivo CSV encontrado no ZIP")
        # Listar conteúdo do ZIP para debug
        print("📋 Conteúdo extraído:")
        for root, dirs, files in os.walk(extract_dir):
            level = root.replace(extract_dir, '').count(os.sep)
            indent = ' ' * 2 * level
            print(f"{indent}📁 {os.path.basename(root)}/")
            subindent = ' ' * 2 * (level + 1)
            for file in files:
                print(f"{subindent}📄 {file}")
        raise ValueError("Nenhum arquivo CSV encontrado!")

    # Processar arquivos CSV
    all_dfs = []
    processed_files = []

    for file_path in csv_files_found:
        print(f"📄 Processando: {os.path.basename(file_path)}")

        result = process_csv_file(file_path)

        if result is not None:
            df, delimiter = result
            all_dfs.append(df)
            processed_files.append(file_path)
            print(f"✅ Processado: {os.path.basename(file_path)} | Registros: {len(df)}")
        else:
            print(f"❌ Falha ao processar: {os.path.basename(file_path)}")

    if not all_dfs:
        raise ValueError("Nenhum dataset válido foi processado!")

    # Combinar datasets
    print(f"🔗 Combinando {len(all_dfs)} datasets...")
    data = pd.concat(all_dfs, ignore_index=True)
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)

    return data, processed_files

def save_processed_dataset(data, processed_files):
    """Salva o dataset processado e metadados"""

    try:
        # Salvar dataset
        data.to_csv(processed_csv_path, index=False, encoding='utf-8')
        print(f"💾 Dataset salvo em: {os.path.basename(processed_csv_path)}")

        # Salvar metadados
        metadata = {
            'processed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_records': len(data),
            'processed_files': len(processed_files),
            'label_distribution': data['label'].value_counts().to_dict(),
            'text_stats': {
                'avg_length': float(data['text'].str.len().mean()),
                'min_length': int(data['text'].str.len().min()),
                'max_length': int(data['text'].str.len().max())
            }
        }

        with open(metadata_path, 'w', encoding='utf-8') as f:
            f.write("FAKE NEWS DATASET - METADATA\n")
            f.write("="*40 + "\n")
            for key, value in metadata.items():
                f.write(f"{key}: {value}\n")

        print(f"📋 Metadados salvos em: {os.path.basename(metadata_path)}")

    except Exception as e:
        print(f"⚠️  Erro ao salvar arquivos: {e}")

# ================================================
# 5. LÓGICA PRINCIPAL COM CACHE
# ================================================

def load_dataset_with_cache():
    """Carrega dataset usando cache inteligente"""

    print("\n" + "="*60)
    print("🧠 SISTEMA DE CACHE INTELIGENTE")
    print("="*60)

    # Verificar se dataset processado existe e está atualizado
    if check_processed_dataset():
        print("⚡ Usando dataset em cache...")

        data = load_processed_dataset()

        if data is not None:
            print("✅ Dataset carregado do cache com sucesso!")
            return data, True  # True indica que foi carregado do cache
        else:
            print("❌ Falha ao carregar do cache, processando novamente...")

    # Se chegou aqui, precisa processar o dataset original
    print("🔄 Processando dataset original...")

    try:
        data, processed_files = process_original_dataset()

        # Salvar dataset processado para uso futuro
        save_processed_dataset(data, processed_files)

        print("✅ Dataset processado e salvo com sucesso!")
        return data, False  # False indica que foi processado agora

    except Exception as e:
        print(f"❌ Erro no processamento: {e}")
        raise

# ================================================
# 6. EXECUTAR CARREGAMENTO
# ================================================

try:
    # Carregar dataset
    data, from_cache = load_dataset_with_cache()

    # ================================================
    # 7. EXIBIR RESULTADOS
    # ================================================

    print("\n" + "="*60)
    print("📊 DATASET CARREGADO COM SUCESSO!")
    print("="*60)

    # Indicar fonte dos dados
    source_info = "💾 CACHE" if from_cache else "🔄 PROCESSAMENTO ORIGINAL"
    print(f"📍 Fonte: {source_info}")

    # Estatísticas do dataset
    print(f"📊 Estatísticas:")
    print(f"   - Total de registros: {len(data):,}")

    label_counts = data['label'].value_counts().sort_index()
    for label, count in label_counts.items():
        label_name = "Real" if label == 1 else "Fake"
        percentage = (count / len(data)) * 100
        print(f"   - {label} ({label_name}): {count:,} registros ({percentage:.1f}%)")

    print(f"   - Comprimento médio do texto: {data['text'].str.len().mean():.1f} caracteres")

    # Mostrar informações de cache
    if from_cache:
        print(f"\n⚡ Vantagens do cache:")
        print(f"   - Carregamento instantâneo")
        print(f"   - Sem necessidade de reprocessamento")
        print(f"   - Dados já limpos e validados")
    else:
        print(f"\n🔄 Dataset processado e salvo:")
        print(f"   - Próxima execução será mais rápida")
        print(f"   - Cache criado automaticamente")

    print("="*60)

    # ================================================
    # 8. EXEMPLOS DOS DADOS
    # ================================================

    print("\n📋 Exemplos dos dados:")
    print("-" * 80)

    for label in [0, 1]:
        label_name = "FAKE NEWS" if label == 0 else "REAL NEWS"
        sample = data[data['label'] == label].sample(n=1, random_state=42)

        print(f"\n🏷️  {label_name}:")
        for idx, row in sample.iterrows():
            text_preview = row['text'][:200] + "..." if len(row['text']) > 200 else row['text']
            print(f"   📄 {text_preview}")

    print("-" * 80)
    print(f"🎉 Dataset pronto para uso! Variável 'data' contém {len(data):,} registros")

except Exception as e:
    print(f"\n❌ ERRO CRÍTICO: {e}")
    print("💡 Verifique se:")
    print("   - O Google Drive está montado corretamente")
    print("   - O arquivo ZIP existe no diretório do projeto")
    print("   - Você tem permissões de leitura/escrita")

# ================================================
# 9. FUNÇÃO PARA FORÇAR REPROCESSAMENTO (OPCIONAL)
# ================================================

def force_reprocess():
    """Força o reprocessamento do dataset, ignorando cache"""

    print("🔄 Forçando reprocessamento...")

    # Remover arquivos de cache
    files_to_remove = [processed_csv_path, metadata_path]

    for file_path in files_to_remove:
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                print(f"🗑️  Removido: {os.path.basename(file_path)}")
            except Exception as e:
                print(f"⚠️  Erro ao remover {os.path.basename(file_path)}: {e}")

    # Reprocessar
    return load_dataset_with_cache()

# Para forçar reprocessamento, descomente a linha abaixo:
# data, from_cache = force_reprocess()

print(f"\n💡 Para forçar reprocessamento, execute: data, from_cache = force_reprocess()")

🚀 Carregando dataset com verificação de cache...
📁 Diretório do projeto: /content/drive/MyDrive/Doutorado/2025.2/Deep Learning/projeto
📦 Arquivo ZIP: Fake News Dataset.zip
💾 Arquivo processado: fake_news_processed.csv

🧠 SISTEMA DE CACHE INTELIGENTE
✅ Dataset processado encontrado e atualizado
⚡ Usando dataset em cache...
📊 Carregando dataset processado...
✅ Dataset carregado: 76,537 registros
✅ Dataset carregado do cache com sucesso!

📊 DATASET CARREGADO COM SUCESSO!
📍 Fonte: 💾 CACHE
📊 Estatísticas:
   - Total de registros: 76,537
   - 0 (Fake): 38,434 registros (50.2%)
   - 1 (Real): 38,103 registros (49.8%)
   - Comprimento médio do texto: 2197.3 caracteres

⚡ Vantagens do cache:
   - Carregamento instantâneo
   - Sem necessidade de reprocessamento
   - Dados já limpos e validados

📋 Exemplos dos dados:
--------------------------------------------------------------------------------

🏷️  FAKE NEWS:
   📄 finally get work post november davidswanson dear democrat find suddenly bit doub

3. Data Cleaning

In [13]:
#used for data cleaning
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  #removes punctuation
    text = re.sub(r'\d+', '', text)  #removes numbers
    text = text.lower()  #turns everything to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words]) #gets rid of stop words
    return text

data['text'] = data['text'].apply(clean_text) #applies changes

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3. Preprocessing

In [14]:
#some train test splitting, 20%
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=142857)

# 4. Bag of Words + Naive Bayes

In [15]:
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)
bow_model = MultinomialNB()
bow_model.fit(X_train_bow, y_train)
y_pred_bow = bow_model.predict(X_test_bow)

5. TF-IDF + Logistic Regression

In [16]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
tfidf_model = LogisticRegression(max_iter=1000)
tfidf_model.fit(X_train_tfidf, y_train)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

In [17]:
# Carrega o modelo com vetores pré-treinados (~100MB, mais leve que Word2Vec do Google)
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


6. Word2Vec + Random Forest

In [18]:
import spacy
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Carrega o modelo com embeddings pré-treinados (~100MB)
nlp = spacy.load("en_core_web_md")

# Função para vetorizar textos com spaCy
def vectorize_spacy(texts):
    vectors = []
    for doc in nlp.pipe(texts, disable=["ner", "parser"]):
        vectors.append(doc.vector)
    return np.array(vectors)

# Vetorização dos conjuntos de treino e teste
X_train_w2v = vectorize_spacy(X_train)
X_test_w2v = vectorize_spacy(X_test)

# Classificador com Random Forest
w2v_model_clf = RandomForestClassifier()
w2v_model_clf.fit(X_train_w2v, y_train)
y_pred_w2v = w2v_model_clf.predict(X_test_w2v)

# 7. GloVe + XGBoost

In [19]:
# Simulando GloVe com Word2Vec para simplificar, mas pode-se usar GloVe real
X_train_glove = X_train_w2v
X_test_glove = X_test_w2v
glove_model = XGBClassifier(eval_metric='logloss')
glove_model.fit(X_train_glove, y_train)
y_pred_glove = glove_model.predict(X_test_glove)

8. BERT + Fine Tuning

In [None]:
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len)
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
train_dataset = FakeNewsDataset(X_train[:2000], y_train[:2000], tokenizer)
test_dataset = FakeNewsDataset(X_test[:500], y_test[:500], tokenizer)

training_args = TrainingArguments(output_dir="./results", per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=2, logging_dir="./logs", logging_steps=10)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
trainer.train()
preds = trainer.predict(test_dataset)
y_pred_bert = np.argmax(preds.predictions, axis=1)
y_true_bert = y_test[:500].values

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mffcs[0m ([33mffcs-cin-ufpe[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# 9. Evaluation Results

In [None]:
from sklearn.model_selection import cross_val_score

def evaluate(name, y_true, y_pred):
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1-score:", f1_score(y_true, y_pred))

evaluate("Bag of Words + NB", y_test, y_pred_bow)
evaluate("TF-IDF + LR", y_test, y_pred_tfidf)
evaluate("Word2Vec + RF", y_test, y_pred_w2v)
evaluate("GloVe + XGBoost", y_test, y_pred_glove)
evaluate("BERT", y_true_bert, y_pred_bert)

10. Matriz de confusao

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Função para plotar a matriz de confusão
def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Fake', 'True'],
                yticklabels=['Fake', 'True'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Matriz de Confusão - {model_name}')
    plt.show()

# Plotar as matrizes de confusão para cada modelo
plot_confusion_matrix(y_test, y_pred_bow, 'BoW + NB')
plot_confusion_matrix(y_test, y_pred_tfidf, 'TF-IDF + LR')
plot_confusion_matrix(y_test, y_pred_w2v, 'Word2Vec + RF')
plot_confusion_matrix(y_test, y_pred_glove, 'GloVe + XGB')
plot_confusion_matrix(y_true_bert, y_pred_bert, 'BERT')

11. Gerando gráficos

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Função para plotar as métricas dos modelos
def plot_metrics(models, metrics):
    fig, ax = plt.subplots(figsize=(12, 6))
    x = range(len(models))
    width = 0.2

    for i, (metric_name, metric_values) in enumerate(metrics.items()):
        ax.bar([pos + width * i for pos in x], metric_values, width, label=metric_name)

    ax.set_xticks([pos + width for pos in x])
    ax.set_xticklabels(models)
    ax.set_ylabel('Score')
    ax.set_title('Performance dos Modelos')
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.ylim(0, 1.1)
    plt.show()

# Dados dos modelos e métricas
models = ['BoW + NB', 'TF-IDF + LR', 'Word2Vec + RF', 'GloVe + XGB', 'BERT']
metrics = {
    'Accuracy': [
        accuracy_score(y_test, y_pred_bow),
        accuracy_score(y_test, y_pred_tfidf),
        accuracy_score(y_test, y_pred_w2v),
        accuracy_score(y_test, y_pred_glove),
        accuracy_score(y_true_bert, y_pred_bert)
    ],
    'Precision': [
        precision_score(y_test, y_pred_bow),
        precision_score(y_test, y_pred_tfidf),
        precision_score(y_test, y_pred_w2v),
        precision_score(y_test, y_pred_glove),
        precision_score(y_true_bert, y_pred_bert)
    ],
    'Recall': [
        recall_score(y_test, y_pred_bow),
        recall_score(y_test, y_pred_tfidf),
        recall_score(y_test, y_pred_w2v),
        recall_score(y_test, y_pred_glove),
        recall_score(y_true_bert, y_pred_bert)
    ],
    'F1-score': [
        f1_score(y_test, y_pred_bow),
        f1_score(y_test, y_pred_tfidf),
        f1_score(y_test, y_pred_w2v),
        f1_score(y_test, y_pred_glove),
        f1_score(y_true_bert, y_pred_bert)
    ]
}

# Plotar as métricas
plot_metrics(models, metrics)

12. Otimizando os hiperparametros com Optuna para BoW + Naive Bayes

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Função de objetivo para o Naive Bayes
def objective_nb(trial):
    # Hiperparâmetros a serem otimizados
    params = {
        'alpha': trial.suggest_float('alpha', 0.01, 10.0, log=True),  # Suavização de Laplace
        'fit_prior': trial.suggest_categorical('fit_prior', [True, False])  # Aprender priors
    }

    # Modelo e avaliação com validação cruzada
    model = MultinomialNB(**params)
    score = cross_val_score(model, X_train_bow, y_train, cv=3, scoring='f1').mean()
    return score

# Executar otimização
study_nb = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_nb.optimize(objective_nb, n_trials=20, n_jobs=-1)

# Melhores hiperparâmetros
print("Melhores parâmetros para Naive Bayes:", study_nb.best_params)
optuna.visualization.plot_optimization_history(study_nb)

13. Otimizando o modelo TF-IDF com Regressao Logistica

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_lr(trial):
    # Hiperparâmetros a serem otimizados
    params = {
        'C': trial.suggest_float('C', 0.1, 10.0, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga'])
    }

    # Modelo e avaliação com validação cruzada
    model = LogisticRegression(**params, max_iter=100)
    score = cross_val_score(model, X_train_tfidf, y_train, cv=3, scoring='f1').mean()
    return score

# Otimização
study_lr = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_lr.optimize(objective_lr, n_trials=10, n_jobs=-1)

# Melhores hiperparâmetros
print("Melhores parâmetros para LR:", study_lr.best_params)
optuna.visualization.plot_optimization_history(study_lr)

14. Otimizando o modelo Random Forest + Word2Vec

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
    }

    model = RandomForestClassifier(**params)
    score = cross_val_score(model, X_train_w2v, y_train, cv=3, scoring='f1').mean()
    return score

study_rf = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_rf.optimize(objective_rf, n_trials=5, n_jobs=-1)
optuna.visualization.plot_optimization_history(study_rf)

15. Otimizando o modelo do XGBoost + GloVe

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_xgb(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200)
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(model, X_train_glove, y_train, cv=3, scoring='f1').mean()
    return score

study_xgb = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_xgb.optimize(objective_xgb, n_trials=10, n_jobs=-1)
optuna.visualization.plot_optimization_history(study_xgb)

16. Otimizando o modelo BERT + Fine-Tunning

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_bert(trial):
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        per_device_train_batch_size=trial.suggest_categorical('batch_size', [8, 16]),
        num_train_epochs=trial.suggest_int('num_epochs', 1, 3),
        weight_decay=0.01,
        eval_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    trainer.train()
    results = trainer.evaluate()
    return results['eval_loss']  # Minimizar a perda

study_bert = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
study_bert.optimize(objective_bert, n_trials=5, n_jobs=-1)
optuna.visualization.plot_optimization_history(study_bert)

17. Exibição das melhorias

In [None]:
# ================================================
# 1. Naive Bayes (NB) - Melhores Parâmetros
# ================================================
best_nb_params = study_nb.best_params
best_nb_model = MultinomialNB(**best_nb_params)
best_nb_model.fit(X_train_bow, y_train)
y_pred_nb = best_nb_model.predict(X_test_bow)

# Avaliação
print("=== Naive Bayes (Otimizado) ===")
print("Acurácia:", accuracy_score(y_test, y_pred_nb))
print("F1-score:", f1_score(y_test, y_pred_nb))

# ================================================
# 2. Logistic Regression (TF-IDF) - Melhores Parâmetros
# ================================================
best_lr_params = study_lr.best_params
best_lr_model = LogisticRegression(**best_lr_params, max_iter=100)
best_lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = best_lr_model.predict(X_test_tfidf)

# Avaliação
print("=== Logistic Regression (Otimizado) ===")
print("Acurácia:", accuracy_score(y_test, y_pred_lr))
print("F1-score:", f1_score(y_test, y_pred_lr))

# ================================================
# 3. Random Forest (Word2Vec) - Melhores Parâmetros
# ================================================
best_rf_params = study_rf.best_params
best_rf_model = RandomForestClassifier(**best_rf_params)
best_rf_model.fit(X_train_w2v, y_train)
y_pred_rf = best_rf_model.predict(X_test_w2v)

# Avaliação
print("\\n=== Random Forest (Otimizado) ===")
print("Acurácia:", accuracy_score(y_test, y_pred_rf))
print("F1-score:", f1_score(y_test, y_pred_rf))

# ================================================
# 4. XGBoost (GloVe) - Melhores Parâmetros
# ================================================
best_xgb_params = study_xgb.best_params
best_xgb_model = XGBClassifier(**best_xgb_params, eval_metric='logloss')
best_xgb_model.fit(X_train_glove, y_train)
y_pred_xgb = best_xgb_model.predict(X_test_glove)

# Avaliação
print("\\n=== XGBoost (Otimizado) ===")
print("Acurácia:", accuracy_score(y_test, y_pred_xgb))
print("F1-score:", f1_score(y_test, y_pred_xgb))

# ================================================
# 5. BERT (Fine-Tuning) - Melhores Parâmetros
# ================================================
best_bert_params = study_bert.best_params

# Configurar os argumentos de treino com os melhores parâmetros
best_training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_bert_params['learning_rate'],
    per_device_train_batch_size=best_bert_params['batch_size'],
    num_train_epochs=best_bert_params['num_epochs'],
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

# Treinar o modelo final com todos os dados (sem validação cruzada)
final_trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=X_train,  # Use o dataset completo
    eval_dataset=test_dataset
)

final_trainer.train()

# Fazer previsões finais
preds = final_trainer.predict(test_dataset)
y_pred_bert = np.argmax(preds.predictions, axis=1)

# Avaliação
print("\\n=== BERT (Otimizado) ===")
print("Acurácia:", accuracy_score(y_true_bert, y_pred_bert))
print("F1-score:", f1_score(y_true_bert, y_pred_bert))

Explicabilidade

In [None]:
# ================================================
# EXPLICABILIDADE DOS MODELOS
# ================================================

import shap
import lime
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from wordcloud import WordCloud

# Configurar SHAP
shap.initjs()

print("="*60)
print("ANÁLISE DE EXPLICABILIDADE DOS MODELOS")
print("="*60)

# ================================================
# 1. PREPARAÇÃO DOS DADOS PARA EXPLICABILIDADE
# ================================================

# Selecionar algumas amostras para explicar
n_samples_explain = 10
sample_indices = np.random.choice(len(X_test), n_samples_explain, replace=False)
sample_texts = [df_test.iloc[i]['text'] for i in sample_indices]  # Assumindo que você tem o texto original
sample_labels = y_test.iloc[sample_indices] if hasattr(y_test, 'iloc') else y_test[sample_indices]

print(f"Analisando {n_samples_explain} amostras selecionadas aleatoriamente...")

# ================================================
# 2. EXPLICABILIDADE COM SHAP
# ================================================

def explain_with_shap():
    print("\n" + "="*50)
    print("EXPLICABILIDADE COM SHAP")
    print("="*50)

    # 2.1 Logistic Regression (TF-IDF) - Linear Explainer
    print("\n--- Logistic Regression (TF-IDF) ---")
    try:
        # Criar explainer para modelo linear
        explainer_lr = shap.LinearExplainer(best_lr_model, X_train_tfidf)
        shap_values_lr = explainer_lr.shap_values(X_test_tfidf[sample_indices])

        # Obter nomes das features (palavras do vocabulário)
        feature_names_tfidf = vectorizer_tfidf.get_feature_names_out()

        # Visualização - Summary Plot
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values_lr, X_test_tfidf[sample_indices],
                         feature_names=feature_names_tfidf, show=False, max_display=20)
        plt.title("SHAP Summary Plot - Logistic Regression")
        plt.tight_layout()
        plt.show()

        # Waterfall plot para primeira amostra
        plt.figure(figsize=(12, 6))
        shap.waterfall_plot(explainer_lr.expected_value, shap_values_lr[0],
                           X_test_tfidf[sample_indices[0]], feature_names=feature_names_tfidf)
        plt.title(f"SHAP Waterfall Plot - Amostra 1 (Label: {sample_labels[0]})")
        plt.show()

    except Exception as e:
        print(f"Erro na explicabilidade LR: {e}")

    # 2.2 Random Forest (Word2Vec) - Tree Explainer
    print("\n--- Random Forest (Word2Vec) ---")
    try:
        # Tree explainer para Random Forest
        explainer_rf = shap.TreeExplainer(best_rf_model)
        shap_values_rf = explainer_rf.shap_values(X_test_w2v[sample_indices])

        # Se classificação binária, pegar apenas uma classe
        if len(shap_values_rf) == 2:
            shap_values_rf = shap_values_rf[1]

        # Summary plot
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values_rf, X_test_w2v[sample_indices], show=False, max_display=20)
        plt.title("SHAP Summary Plot - Random Forest")
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Erro na explicabilidade RF: {e}")

    # 2.3 XGBoost (GloVe) - Tree Explainer
    print("\n--- XGBoost (GloVe) ---")
    try:
        # Tree explainer para XGBoost
        explainer_xgb = shap.TreeExplainer(best_xgb_model)
        shap_values_xgb = explainer_xgb.shap_values(X_test_glove[sample_indices])

        # Summary plot
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values_xgb, X_test_glove[sample_indices], show=False, max_display=20)
        plt.title("SHAP Summary Plot - XGBoost")
        plt.tight_layout()
        plt.show()

        # Feature importance global
        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values_xgb, X_test_glove[sample_indices],
                         plot_type="bar", show=False, max_display=15)
        plt.title("SHAP Feature Importance - XGBoost")
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Erro na explicabilidade XGB: {e}")

# ================================================
# 3. EXPLICABILIDADE COM LIME (COMPARAÇÃO)
# ================================================

def explain_with_lime():
    print("\n" + "="*50)
    print("EXPLICABILIDADE COM LIME (COMPARAÇÃO)")
    print("="*50)

    # Criar explainer LIME para texto
    explainer_lime = LimeTextExplainer(class_names=['Negativo', 'Positivo'])

    # Função de predição para LIME (usando Logistic Regression)
    def predict_proba_lr(texts):
        # Transformar textos usando o mesmo pipeline
        texts_tfidf = vectorizer_tfidf.transform(texts)
        return best_lr_model.predict_proba(texts_tfidf)

    # Explicar algumas amostras
    print("\n--- LIME Explanations (Logistic Regression) ---")

    for i in range(min(3, len(sample_texts))):  # Explicar apenas 3 amostras
        try:
            # Gerar explicação
            exp = explainer_lime.explain_instance(
                sample_texts[i],
                predict_proba_lr,
                num_features=10,
                num_samples=1000
            )

            # Mostrar explicação
            print(f"\nAmostra {i+1} (Label Real: {sample_labels[i]}):")
            print("Texto:", sample_texts[i][:200] + "..." if len(sample_texts[i]) > 200 else sample_texts[i])
            print("\nPalavras mais importantes:")
            for word, importance in exp.as_list():
                print(f"  {word}: {importance:.4f}")

            # Visualização HTML (opcional)
            exp.save_to_file(f'lime_explanation_sample_{i+1}.html')

        except Exception as e:
            print(f"Erro na explicação LIME para amostra {i+1}: {e}")

# ================================================
# 4. ANÁLISE COMPARATIVA DE FEATURES IMPORTANTES
# ================================================

def analyze_important_features():
    print("\n" + "="*50)
    print("ANÁLISE DE FEATURES IMPORTANTES")
    print("="*50)

    # 4.1 Features importantes do Logistic Regression
    print("\n--- Top Features - Logistic Regression ---")
    feature_names = vectorizer_tfidf.get_feature_names_out()
    coefficients = best_lr_model.coef_[0]

    # Top features positivas e negativas
    top_positive = np.argsort(coefficients)[-15:][::-1]
    top_negative = np.argsort(coefficients)[:15]

    print("Top 15 palavras para classe POSITIVA:")
    for idx in top_positive:
        print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

    print("\nTop 15 palavras para classe NEGATIVA:")
    for idx in top_negative:
        print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

    # 4.2 Feature importance do Random Forest
    print("\n--- Feature Importance - Random Forest ---")
    rf_importance = best_rf_model.feature_importances_
    top_rf_features = np.argsort(rf_importance)[-15:][::-1]

    print("Top 15 dimensões mais importantes (Word2Vec):")
    for i, idx in enumerate(top_rf_features):
        print(f"  Dimensão {idx}: {rf_importance[idx]:.4f}")

    # 4.3 Feature importance do XGBoost
    print("\n--- Feature Importance - XGBoost ---")
    xgb_importance = best_xgb_model.feature_importances_
    top_xgb_features = np.argsort(xgb_importance)[-15:][::-1]

    print("Top 15 dimensões mais importantes (GloVe):")
    for i, idx in enumerate(top_xgb_features):
        print(f"  Dimensão {idx}: {xgb_importance[idx]:.4f}")

# ================================================
# 5. VISUALIZAÇÕES AVANÇADAS
# ================================================

def create_advanced_visualizations():
    print("\n" + "="*50)
    print("VISUALIZAÇÕES AVANÇADAS")
    print("="*50)

    # 5.1 WordCloud das palavras mais importantes
    try:
        feature_names = vectorizer_tfidf.get_feature_names_out()
        coefficients = best_lr_model.coef_[0]

        # Criar dicionário para WordCloud
        word_importance = {}
        top_indices = np.argsort(np.abs(coefficients))[-100:]  # Top 100 palavras

        for idx in top_indices:
            word_importance[feature_names[idx]] = abs(coefficients[idx])

        # Gerar WordCloud
        plt.figure(figsize=(15, 8))

        # WordCloud para palavras positivas
        plt.subplot(1, 2, 1)
        positive_words = {word: coef for word, coef in word_importance.items()
                         if coefficients[np.where(feature_names == word)[0][0]] > 0}
        if positive_words:
            wc_pos = WordCloud(width=600, height=400, background_color='white').generate_from_frequencies(positive_words)
            plt.imshow(wc_pos, interpolation='bilinear')
            plt.title('Palavras Importantes - Sentimento POSITIVO')
            plt.axis('off')

        # WordCloud para palavras negativas
        plt.subplot(1, 2, 2)
        negative_words = {word: coef for word, coef in word_importance.items()
                         if coefficients[np.where(feature_names == word)[0][0]] < 0}
        if negative_words:
            wc_neg = WordCloud(width=600, height=400, background_color='white').generate_from_frequencies(negative_words)
            plt.imshow(wc_neg, interpolation='bilinear')
            plt.title('Palavras Importantes - Sentimento NEGATIVO')
            plt.axis('off')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Erro na criação do WordCloud: {e}")

    # 5.2 Comparação de importância entre modelos
    try:
        plt.figure(figsize=(12, 8))

        # Normalizar importâncias para comparação
        lr_importance_norm = np.abs(coefficients) / np.max(np.abs(coefficients))
        rf_importance_norm = rf_importance / np.max(rf_importance)
        xgb_importance_norm = xgb_importance / np.max(xgb_importance)

        # Plotar distribuições
        plt.hist(lr_importance_norm, bins=50, alpha=0.7, label='Logistic Regression', density=True)
        plt.hist(rf_importance_norm, bins=50, alpha=0.7, label='Random Forest', density=True)
        plt.hist(xgb_importance_norm, bins=50, alpha=0.7, label='XGBoost', density=True)

        plt.xlabel('Importância Normalizada')
        plt.ylabel('Densidade')
        plt.title('Distribuição de Importância das Features por Modelo')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    except Exception as e:
        print(f"Erro na comparação de importâncias: {e}")

# ================================================
# 6. EXECUTAR ANÁLISES
# ================================================

def run_explainability_analysis():
    """Executar toda a análise de explicabilidade"""

    print("Iniciando análise de explicabilidade...")

    # Executar SHAP
    explain_with_shap()

    # Executar LIME (comparação)
    explain_with_lime()

    # Analisar features importantes
    analyze_important_features()

    # Criar visualizações avançadas
    create_advanced_visualizations()

    print("\n" + "="*60)
    print("RESUMO DA ANÁLISE DE EXPLICABILIDADE")
    print("="*60)
    print("""
    ✅ SHAP Analysis:
       - Fornece explicações baseadas em valores Shapley
       - Mostra contribuição de cada feature para predições individuais
       - Permite comparação entre diferentes modelos

    ✅ LIME Analysis:
       - Oferece explicações locais interpretáveis
       - Útil para entender predições específicas
       - Funciona bem com dados de texto

    ✅ Feature Importance:
       - Identifica palavras/dimensões mais relevantes
       - Compara importância entre diferentes representações
       - Ajuda na interpretação do modelo

    💡 Recomendação: Use SHAP para análise geral e LIME para casos específicos
    """)

# Executar a análise completa
if __name__ == "__main__":
    run_explainability_analysis()