# Importação das bibliotecas

In [78]:
from pathlib import Path
import kagglehub
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import numpy as np
from bs4 import MarkupResemblesLocatorWarning
import warnings

# Importação do dataset mãe

In [79]:
# Fetch the Kaggle dataset; the returned location is used below as the
# directory that holds the CSV files.
path = Path(kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset"))

# The three CSV files that make up the working dataset.
path_phishing0, path_phishing1, path_phishing2 = (
    path / csv_name
    for csv_name in ("Nigerian_Fraud.csv", "CEAS_08.csv", "Nazario.csv")
)

# Read each file into its own frame.
df_0, df_1, df_2 = (
    pd.read_csv(csv_path)
    for csv_path in (path_phishing0, path_phishing1, path_phishing2)
)

# Stack the three frames into one and renumber the rows from 0.
df = pd.concat([df_0, df_1, df_2], ignore_index=True)

In [80]:
# Summarise the combined frame: column dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44051 entries, 0 to 44050
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    43720 non-null  object
 1   receiver  42169 non-null  object
 2   date      43568 non-null  object
 3   subject   43980 non-null  object
 4   body      44051 non-null  object
 5   urls      44051 non-null  int64 
 6   label     44051 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.4+ MB


In [81]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,sender,receiver,date,subject,body,urls,label
0,MR. JAMES NGOLA. <james_ngola2002@maktoob.com>,webmaster@aclweb.org,"Thu, 31 Oct 2002 02:38:20 +0000",URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-2...,0,1
1,Mr. Ben Suleman <bensul2004nng@spinfinder.com>,R@M,"Thu, 31 Oct 2002 05:10:00 -0000",URGENT ASSISTANCE /RELATIONSHIP (P),"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",0,1
2,PRINCE OBONG ELEME <obong_715@epatra.com>,webmaster@aclweb.org,"Thu, 31 Oct 2002 22:17:55 +0100",GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,0,1
3,PRINCE OBONG ELEME <obong_715@epatra.com>,webmaster@aclweb.org,"Thu, 31 Oct 2002 22:44:20 -0000",GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,0,1
4,Maryam Abacha <m_abacha03@www.com>,R@M,"Fri, 01 Nov 2002 01:45:04 +0100",I Need Your Assistance.,"Dear sir, \n \nIt is with a heart full of hope...",0,1


In [82]:
# Print the (rows, columns) shape and the column names of the combined frame.
print(df.shape)
print(df.columns)

(44051, 7)
Index(['sender', 'receiver', 'date', 'subject', 'body', 'urls', 'label'], dtype='object')


---

# Tratamento do dataset

In [83]:
# Helpers that cap how many rows may share the same value in a column,
# discarding the surplus rows.

def limit_occurrences(df, n, column):
    """Keep at most ``n`` rows for each distinct value of ``column``.

    The first ``n`` rows of every group are kept in their original order and
    the result is given a fresh 0..N-1 index.

    Args:
        df: DataFrame to filter. It is not modified; a new frame is returned.
        n: Maximum number of rows to keep per distinct value.
        column: Name of the column whose values define the groups.

    Returns:
        A new DataFrame without the surplus rows.

    NOTE(review): rows whose value in ``column`` is missing are handled
    according to pandas' groupby defaults -- confirm if that matters here.
    """
    return df.groupby(column).head(n).reset_index(drop=True)


def limit_repeats_per_value(df, n, columns):
    """Apply the ``n``-rows-per-value cap to several columns, one after another.

    The cap is applied sequentially: the frame produced for one column is the
    input for the next, so the order of ``columns`` can affect which rows
    survive. A progress message is printed for every column processed.

    Args:
        df: DataFrame to filter. It is not modified; a new frame is returned.
        n: Maximum number of rows to keep per distinct value.
        columns: Iterable of column names, or a single column name.

    Returns:
        A new DataFrame with the cap applied to every requested column.
    """
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column name.
    if isinstance(columns, str):
        columns = [columns]

    for column in columns:
        print(f"Limitando a {n} amostras por valor da coluna '{column}'")
        # Reuse the single-column helper instead of duplicating its logic.
        df = limit_occurrences(df, n, column)
    return df

In [84]:
# Rename the target column from "label" to "phishing", which reads more
# intuitively.
df = df.rename(columns={'label': 'phishing'})

In [85]:
# Replace the full date with just its HH:MM:SS part (do phishing e-mails
# have typical sending hours? Worth investigating).
# The date column is removed from the frame and the extracted time is placed
# as the first column.
time_of_day = df.pop('date').str.extract(r'(\d{2}:\d{2}:\d{2})', expand=False)
df.insert(0, 'time', time_of_day)

In [86]:
# Discard the receiver and sender columns.
df = df.drop(columns=['receiver', 'sender'])
print("Colunas removidas: receiver, sender\n")

Colunas removidas: receiver, sender



In [87]:
# Cap the number of rows sharing the same body, then the same subject,
# at 5 each; surplus rows are discarded.
df = limit_repeats_per_value(df, n=5, columns=['body', 'subject'])

Limitando a 5 amostras por valor da coluna 'body'
Limitando a 5 amostras por valor da coluna 'subject'


In [88]:
antes = df.shape[0]

# Treat empty or whitespace-only strings as missing, then drop every row
# that has at least one missing value.
df = df.replace(r'^\s*$', np.nan, regex=True).dropna(how='any')

depois = df.shape[0]

print(f"\nLinhas com dados faltantes (removidas): {antes - depois}")



Linhas com dados faltantes (removidas): 474


In [89]:
# Report how many rows are labelled phishing (1) and how many are clean (0).
print(f"\n{(df['phishing'] == 1).sum()} casos de phishing")
print(f"{(df['phishing'] == 0).sum()} casos limpos\n")



14170 casos de phishing
13552 casos limpos



In [90]:
# Preview the first rows after the column clean-up and missing-value removal.
df.head()

Unnamed: 0,time,subject,body,urls,phishing
0,02:38:20,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-2...,0,1
1,05:10:00,URGENT ASSISTANCE /RELATIONSHIP (P),"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",0,1
2,22:17:55,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,0,1
3,22:44:20,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,0,1
4,01:45:04,I Need Your Assistance.,"Dear sir, \n \nIt is with a heart full of hope...",0,1


In [91]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,urls,phishing
count,27722.0,27722.0
mean,0.62243,0.511146
std,0.484788,0.499885
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0


## Tratamento das strings

In [92]:
# Silence BeautifulSoup's MarkupResemblesLocatorWarning.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [93]:
# Text normalisation helper
def limpar_texto(texto):
    """Normalise a raw text field into lowercase, punctuation-free words.

    Steps, in order: strip HTML markup, lowercase, turn every ASCII
    punctuation character into a space, and collapse whitespace so the words
    are separated by single spaces with no leading or trailing space.

    Args:
        texto: The raw text, or a missing value (None / NaN).

    Returns:
        The cleaned string; an empty string when ``texto`` is missing.
    """
    if pd.isnull(texto):
        return ""

    # Keep only the text content of any HTML markup.
    plain_text = BeautifulSoup(texto, "html.parser").get_text()

    # Map each punctuation character to a single space.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    without_punctuation = plain_text.lower().translate(punctuation_to_space)

    # Splitting on whitespace and re-joining collapses runs of whitespace and
    # trims both ends in one step.
    return " ".join(without_punctuation.split())

In [94]:
# Build the cleaned body from the original text; the raw "body" column is
# left untouched.
df['BodyClear'] = (
    df['body']
    # Remove e-mail addresses (non-space characters around an "@").
    .str.replace(r'\S+@\S+', '', regex=True)
    # Remove links. The dot after "www" is escaped so that only a literal
    # "www." prefix is treated as a link; unescaped, it matched "www" followed
    # by any character and could delete ordinary text.
    .str.replace(r'http\S+|www\.\S+', '', regex=True)
    # Remove phone-number-like runs: an optional "+", a digit, then at least
    # seven more digits, dashes or spaces.
    .str.replace(r'\+?\d[\d\- ]{7,}', '', regex=True)
    # Apply the general text clean-up.
    .apply(limpar_texto)
)

# The subject only gets the general text clean-up.
df['SubjectClear'] = df['subject'].apply(limpar_texto)

In [95]:
# Parse the HH:MM:SS strings and keep only the hour; values that fail to
# parse become missing (errors='coerce').
parsed_time = pd.to_datetime(df['time'], format='%H:%M:%S', errors='coerce')
df['time'] = parsed_time.dt.hour

In [96]:
# Cria a feature do momento do envio do e-mail

def sending_period(hour):
    """Map an hour of the day to a named part of the day.

    Args:
        hour: Hour value, or a missing value (None / NaN).

    Returns:
        'undefined' for a missing hour, 'dawn' below 6, 'morning' below 12,
        'afternoon' below 18, and 'evening' otherwise.
    """
    if pd.isnull(hour):
        return 'undefined'

    # (exclusive upper bound, label) pairs, checked in ascending order.
    for upper_bound, label in ((6, 'dawn'), (12, 'morning'), (18, 'afternoon')):
        if hour < upper_bound:
            return label
    return 'evening'

# Derive the part-of-day label for every row from its hour value.
df['sendingPeriod'] = df['time'].apply(sending_period)

In [97]:
# Reorder the columns so each derived column sits next to its source:
# sendingPeriod after time, SubjectClear after subject, BodyClear after body.
# The moves are applied in this order because each target position assumes
# the previous move has already happened.
for position, column in ((1, 'sendingPeriod'), (3, 'SubjectClear'), (5, 'BodyClear')):
    df.insert(position, column, df.pop(column))

In [98]:
# Shuffle the rows of the DataFrame (fixed random_state so the order is
# reproducible) and renumber the index from 0.

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [99]:
antes = df.shape[0]

# The text clean-up can leave fields empty, so repeat the check: treat empty
# or whitespace-only strings as missing, then drop every row that has at
# least one missing value.
df = df.replace(r'^\s*$', np.nan, regex=True).dropna(how='any')

depois = df.shape[0]

print(f"\nLinhas com dados faltantes (removidas): {antes - depois}")


Linhas com dados faltantes (removidas): 21


In [100]:
# Check the class distribution of the 'phishing' column (roughly balanced).
df['phishing'].value_counts()

phishing
1    14153
0    13548
Name: count, dtype: int64

In [101]:
# Preview the first rows of the final, shuffled frame with the derived columns.
df.head()

Unnamed: 0,time,sendingPeriod,subject,SubjectClear,body,BodyClear,urls,phishing
0,7,morning,[ie-rant] British Police Chief Calls For Legal...,ie rant british police chief calls for legalis...,P45 at the ready for this chap...\n\nhttp://ne...,p45 at the ready for this chap north wales pol...,1,0
1,23,evening,from Reyes Greene,from reyes greene,\n\n\n\n\n\n\nBu up yi yq ng M wvz edic pg ine...,bu up yi yq ng m wvz edic pg ine on tp line vi...,1,1
2,12,afternoon,Turning a small knob into a huge wand!,turning a small knob into a huge wand,Problems everywhere?\n\nGood business & he zu ...,problems everywhere good business he zu alth h...,0,1
3,23,evening,"[UAI] ICAPS-08 Call for Papers, Tutorial Propo...",uai icaps 08 call for papers tutorial proposal...,\nhttp://icaps08.icaps-conference.org/\n\nTuto...,tutorial workshop proposal deadlines approachi...,1,0
4,16,afternoon,[UAI] CFP: SAT 2005,uai cfp sat 2005,With apologies for multiple copies:\n\n ...,with apologies for multiple copies call for pa...,1,0


In [102]:
# Summary statistics of the numeric columns, now including the hour in 'time'.
df.describe()

Unnamed: 0,time,urls,phishing
count,27701.0,27701.0,27701.0
mean,11.85802,0.62283,0.51092
std,7.028433,0.484687,0.49989
min,0.0,0.0,0.0
25%,6.0,0.0,0.0
50%,12.0,1.0,1.0
75%,18.0,1.0,1.0
max,23.0,1.0,1.0


# Exportação do arquivo csv

In [103]:
# Write the final frame to a UTF-8 CSV in the working directory, without the
# row index.
df.to_csv("phishing_dataset_CIS.csv", index=False, encoding="utf-8")