In [1]:
import pandas as pd
import os
import re
import json
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# =====================================================
# 1) Leitura do dataset
# =====================================================
print("Diretório atual:", os.getcwd())
print("Arquivos na pasta datasets:", os.listdir("../datasets"))

try:
    df = pd.read_csv("../datasets/phishing.csv", encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv("../datasets/phishing.csv", encoding="ISO-8859-1")

print("Dimensão inicial do dataset:", df.shape)

# =====================================================
# 2) Prepara rótulos (y) e textos (corpus)
# =====================================================
# Converte 'Email Type' em dummies
df = pd.get_dummies(df, columns=['Email Type'])
# Rótulo: phishing = 1, seguro = 0
y = df['Email Type_Phishing Email']

# Corpus: texto dos e-mails, com limpeza
corpus = df['Email Text'].astype(str).apply(
    lambda x: re.sub(r'[^a-zA-Z0-9áéíóúãõâêôçÁÉÍÓÚÃÕÂÊÔÇ\s]', '', x.lower())
)

# =====================================================
# 3) Bag of Words (CountVectorizer)
# =====================================================
count_vectorizer = CountVectorizer(
    lowercase=True,
    max_features=5000,  # limita vocabulário
    min_df=5,           # ignora palavras raras
    max_df=0.8          # ignora palavras muito comuns
)
X_count = count_vectorizer.fit_transform(corpus)
print("Formato CountVectorizer:", X_count.shape)

# =====================================================
# 4) Validação cruzada com RandomForest
# =====================================================
modelo_rf = RandomForestClassifier(n_estimators=100, random_state=42)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(modelo_rf, X_count, y, cv=kfold, scoring='accuracy', n_jobs=-1)

print(f"\n Bag of Words (CountVectorizer) + RandomForest")
print(f"Acurácia média: {scores.mean():.4f}")
print(f"Desvio padrão: {scores.std():.4f}")
print(f"Scores individuais: {scores}")

# =====================================================
# 5) Salvar representação e vocabulário
# =====================================================
scipy.sparse.save_npz("../datasets/phishing_bow_count.npz", X_count)

with open("../datasets/phishing_bow_vocab.json", "w", encoding="utf-8") as f:
    json.dump(count_vectorizer.get_feature_names_out().tolist(), f, ensure_ascii=False, indent=2)

print("\n Pipeline concluído! Representações salvas em '../datasets/'")


Diretório atual: C:\Users\Bruno\OneDrive\TCC\PythonProject\1.Transformacao_Dataset
Arquivos na pasta datasets: ['DDoS.csv', 'phishing.csv', 'phishing_bow_count.npz', 'phishing_bow_vocab.json', 'phishing_tfidf.npz', 'phishing_tfidf_vocab.json', 'phishing_transformed.csv', 'ransomware.csv']
Dimensão inicial do dataset: (18650, 3)
Formato CountVectorizer: (18650, 5000)

 Bag of Words (CountVectorizer) + RandomForest
Acurácia média: 0.9614
Desvio padrão: 0.0022
Scores individuais: [0.96032172 0.96380697 0.95898123 0.95951743 0.96434316]

 Pipeline concluído! Representações salvas em '../datasets/'
