In [1]:
import pandas as pd
import os
import re
import scipy.sparse
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Show the working directory and the contents of the datasets folder, so a
# wrong launch directory is obvious before any file is read.
print("Diretório atual:", os.getcwd())
print("Arquivos na pasta datasets:", os.listdir("../datasets"))

# Read the CSV as UTF-8 first; retry as ISO-8859-1 if the file is not valid UTF-8.
try:
    df = pd.read_csv("../datasets/phishing.csv", encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv("../datasets/phishing.csv", encoding="ISO-8859-1")

# Basic text cleaning.
# Everything that is not an ASCII letter, a digit, one of the listed accented
# Portuguese letters, or whitespace is removed. The pattern is compiled once
# instead of being re-parsed for every email.
_disallowed_chars = re.compile(r'[^a-zA-Z0-9áéíóúãõâêôçÁÉÍÓÚÃÕÂÊÔÇ\s]')

corpus = (
    df['Email Text']
    # Missing emails become empty documents. Without this, astype(str) would
    # turn NaN into the literal text "nan", which the vectorizers would then
    # count as a real word. Rows are kept, so the corpus stays aligned with df.
    .fillna('')
    .astype(str)
    .map(lambda text: _disallowed_chars.sub('', text.lower()))
)

# =====================================================
# Bag of Words (CountVectorizer) and TF-IDF features
# =====================================================
# Settings shared by both vectorizers, declared once so the two feature sets
# cannot silently drift apart.
_vectorizer_params = dict(
    lowercase=True,
    stop_words=None,     # can be switched to 'english' or a Portuguese stop-word list
    max_features=5000,   # limits the vocabulary size
    min_df=5,            # ignores rare words
    max_df=0.8,          # ignores very common words
)


def _fit_and_save(vectorizer, texts, label, matrix_path, vocab_path):
    """Fit ``vectorizer`` on ``texts``, report the matrix shape and persist the result.

    Args:
        vectorizer: An unfitted vectorizer exposing ``fit_transform`` and
            ``get_feature_names_out``.
        texts: The documents to vectorize.
        label: Text printed in front of the resulting matrix shape.
        matrix_path: Destination of the sparse matrix (``.npz``).
        vocab_path: Destination of the feature names, written as a JSON list
            in the order returned by ``get_feature_names_out()``.

    Returns:
        The sparse matrix returned by ``vectorizer.fit_transform(texts)``.
    """
    matrix = vectorizer.fit_transform(texts)
    print(label, matrix.shape)

    # Persist the sparse matrix.
    scipy.sparse.save_npz(matrix_path, matrix)

    # Persist the vocabulary; ensure_ascii=False keeps accented terms readable.
    with open(vocab_path, "w", encoding="utf-8") as f:
        json.dump(vectorizer.get_feature_names_out().tolist(), f, ensure_ascii=False, indent=2)

    return matrix


# Raw term counts.
count_vectorizer = CountVectorizer(**_vectorizer_params)
X_count = _fit_and_save(
    count_vectorizer,
    corpus,
    "Formato CountVectorizer:",
    "../datasets/phishing_bow_count.npz",
    "../datasets/phishing_bow_vocab.json",
)

# TF-IDF weights, built with the same settings.
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_params)
X_tfidf = _fit_and_save(
    tfidf_vectorizer,
    corpus,
    "Formato TF-IDF:",
    "../datasets/phishing_tfidf.npz",
    "../datasets/phishing_tfidf_vocab.json",
)

print("✅ Bag of Words e TF-IDF gerados e salvos com sucesso!")


Diretório atual: C:\Users\Bruno\OneDrive\TCC\PythonProject\1.Transformacao_Dataset
Arquivos na pasta datasets: ['DDoS.csv', 'phishing.csv', 'phishing_transformed.csv', 'ransomware.csv']
Formato CountVectorizer: (18650, 5000)
Formato TF-IDF: (18650, 5000)
✅ Bag of Words e TF-IDF gerados e salvos com sucesso!


In [2]:
import scipy.sparse
import json

# Reload the Bag-of-Words artifacts: the sparse count matrix first, then the
# vocabulary stored next to it as JSON.
bow_matrix_path = "../datasets/phishing_bow_count.npz"
bow_vocab_path = "../datasets/phishing_bow_vocab.json"
X_count = scipy.sparse.load_npz(bow_matrix_path)
with open(bow_vocab_path, "r", encoding="utf-8") as vocab_file:
    vocab_bow = json.load(vocab_file)

# Reload the TF-IDF artifacts in the same order: matrix, then vocabulary.
tfidf_matrix_path = "../datasets/phishing_tfidf.npz"
tfidf_vocab_path = "../datasets/phishing_tfidf_vocab.json"
X_tfidf = scipy.sparse.load_npz(tfidf_matrix_path)
with open(tfidf_vocab_path, "r", encoding="utf-8") as vocab_file:
    vocab_tfidf = json.load(vocab_file)

# Report the shape of each reloaded matrix.
for label, matrix in (("BoW:", X_count), ("TF-IDF:", X_tfidf)):
    print(label, matrix.shape)


BoW: (18650, 5000)
TF-IDF: (18650, 5000)
