# 01.01 - PROYECTO PHISHING EMAILS ML

# Download data


In [72]:
#Dependencias
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, classification_report, precision_recall_curve, roc_auc_score, roc_curve, average_precision_score)
import joblib
import matplotlib.pyplot as plt
%matplotlib inline


## Visualización del dataset


In [73]:
import pandas as pd

df = pd.read_csv("../Dataset/phishing_legit_dataset_KD_10000.csv", encoding="utf-8")
df.head()

Unnamed: 0,text,label,phishing_type,severity,confidence
0,Subject: Office maintenance\n\nThanks for your...,0,legitimate,low,0.95
1,"Hello, your profile has been locked. Use the s...",1,credential_harvesting,high,0.89
2,"Hi there, congratulations! You are the winner ...",1,financial_scam,medium,0.69
3,"Attention, this is the fraud prevention accoun...",1,authority_scam,high,0.91
4,"Notice, your profile has been restricted. Use ...",1,credential_harvesting,high,0.8


In [74]:
from IPython.display import display

info_df = pd.DataFrame({
    "Tipo": df.dtypes,
    "Valores no nulos": df.count(),
    "Valores nulos": df.isnull().sum(),
    "Porcentaje nulos": (df.isnull().sum() / len(df) * 100).round(2)
})

display(info_df)

Unnamed: 0,Tipo,Valores no nulos,Valores nulos,Porcentaje nulos
text,object,10000,0,0.0
label,int64,10000,0,0.0
phishing_type,object,10000,0,0.0
severity,object,10000,0,0.0
confidence,float64,10000,0,0.0


## Estadísticas descriptivas


In [75]:
df.describe(include='all')

Unnamed: 0,text,label,phishing_type,severity,confidence
count,10000,10000.0,10000,10000,10000.0
unique,9956,,11,3,
top,Subject: Budget planning\n\nThanks for attendi...,,legitimate,high,
freq,2,,4000,4488,
mean,,0.6,,,0.898855
std,,0.489922,,,0.08343
min,,0.0,,,0.65
25%,,0.0,,,0.84
50%,,1.0,,,0.92
75%,,1.0,,,0.97


## Estadísticas de columnas númericas


In [76]:
display(df.describe(include=['number']))

Unnamed: 0,label,confidence
count,10000.0,10000.0
mean,0.6,0.898855
std,0.489922,0.08343
min,0.0,0.65
25%,0.0,0.84
50%,1.0,0.92
75%,1.0,0.97
max,1.0,1.0


## Estadísticas de columnas categóricas


In [77]:
display(df.describe(include=['object']))

Unnamed: 0,text,phishing_type,severity
count,10000,10000,10000
unique,9956,11,3
top,Subject: Budget planning\n\nThanks for attendi...,legitimate,high
freq,2,4000,4488


# Limpieza del dataset

Algunos correos tienen al final del texto una firma que incluye frases como:
- Security Team
- IT Support
- Helpdesk
- Technical Support
  
Esto puede generar **sesgo**: el modelo podría aprender que esas palabras indican phishing,
porque aparecen con frecuencia en los correos maliciosos de entrenamiento.

Por eso:
1. Buscaremos esos patrones en los correos.
2. Crearemos una variable booleana (`has_signature_pattern`) que indique si el correo contiene alguno.
3. Analizaremos si los correos con esas firmas están correlacionados con la etiqueta *phishing = 1*.


In [78]:
import re

#Lista de patrones comunes en las firmas usando expresiones regulaes
SIGNATURE_PATTERNS = [
    r'\bsecurity team\b',
    r'\bit team\b',
    r'\bhelpdesk\b',
    r'\bsupport team\b',
    r'\btechnical support\b',
    r'\bsystems? admin\b',
    r'\bsysadmin\b',
    r'\bit department\b'
]

#Compilamos todos en una sola expresión regular
signature_regex = re.compile("|".join(SIGNATURE_PATTERNS), flags=re.IGNORECASE)

#Función para detectar si un texto tiene una "firma" con esos patrones
def detect_signature(text):
    """
    Analiza las últimas líneas del correo (donde suelen estar las firmas)
    y devuelve True si detecta alguna frase sospechosa como 'Security Team' o 'IT Support'.
    """
    if not isinstance(text, str):
        return False
    # Tomamos las últimas 10 líneas del mensaje
    tail = "\n".join(text.splitlines()[-10:])
    return bool(signature_regex.search(tail))


In [79]:
#Aplicamos la función a la columna de texto del correo
df['has_signature_pattern'] = df['text'].fillna('').apply(detect_signature)

#Contamos cuántos correos tienen firma detectada
print("Correos con firmas detectadas:", df['has_signature_pattern'].sum())
print("Porcentaje:", df['has_signature_pattern'].mean() * 100, "%")


Correos con firmas detectadas: 1316
Porcentaje: 13.16 %


In [80]:
#Tabla indicando la proporción de phishing según la presencia de firma
cross = pd.crosstab(df['has_signature_pattern'], df['label'], normalize='index')
print("Proporción de correos phishing según presencia de firma:")
display(cross)

#También mostramos conteos absolutos
print("Conteos absolutos:")
display(pd.crosstab(df['has_signature_pattern'], df['label']))


Proporción de correos phishing según presencia de firma:


label,0,1
has_signature_pattern,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.460617,0.539383
True,0.0,1.0


Conteos absolutos:


label,0,1
has_signature_pattern,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4000,4684
True,0,1316


In [81]:
def remove_last_three_lines(text):
    """
    Elimina siempre las dos últimas líneas de un texto.
    Se usa para limpiar posibles firmas o despedidas en todos los correos.
    """
    if not isinstance(text, str):
        return ""
    lines = text.splitlines()
    
    if len(lines) > 2:
        cleaned = "\n".join(lines[:-3]) #borramos las últimas tres líneas, considerando la línea de espacio entre contenido y firma.
    else:
        cleaned = text 
    
    return cleaned.strip()


In [82]:
df['text_clean'] = df['text'].fillna('').apply(remove_last_three_lines)

**Texto original, mostrando solo las firmas**


In [85]:
from IPython.display import display, HTML

pd.set_option('display.max_colwidth', None)

sample = df[['text']].sample(3, random_state=42).reset_index(drop=True)

sample_html = sample.copy()
sample_html['text'] = sample_html['text'].str.replace('\n', '<br>')

display(HTML(sample_html.to_html(escape=False)))


Unnamed: 0,text
0,"Dear customer, you have a new notification. Please click here to view the update. For your convenience, a link and attachment are included. Do not reply to this auto-generated message. Keywords: classified, spread, sensitive, no-reply IT Desk, Alex Martinez"
1,"Subject: Documentation review Thanks for attending the workshop. Here are links to references and the slide deck. Feel free to add notes before we meet. Best, Avery Brown"
2,"Dear customer, this is an 48 hours notice regarding your account. Your access will be limited today. To avoid interruption, respond asap. This offer is time sensitive and ends soon. Follow the instructions now. Keywords: expires soon act now limited time asap Security Team, Alex Lee"


**Texto limpio, mostrando las últimas 5 líneas del contenido**


In [89]:
from IPython.display import display, HTML

pd.set_option('display.max_colwidth', None)

rows = []
for i, row in df.sample(3, random_state=42).iterrows():
    last_lines = "\n".join(row['text_clean'].splitlines()[-3:])
    rows.append({
        'from': row.get('entry', ''),
        'ultimas_3_lineas_limpias': last_lines
    })

tabla = pd.DataFrame(rows).reset_index(drop=True)

tabla_html = tabla.copy()
tabla_html['ultimas_3_lineas_limpias'] = tabla_html['ultimas_3_lineas_limpias'].str.replace('\n', '<br>')

display(HTML(tabla_html.to_html(escape=False)))


Unnamed: 0,from,ultimas_3_lineas_limpias
0,,"Dear customer, you have a new notification. Please click here to view the update. For your convenience, a link and attachment are included. Do not reply to this auto-generated message. Keywords: classified, spread, sensitive, no-reply"
1,,Thanks for attending the workshop. Here are links to references and the slide deck. Feel free to add notes before we meet.
2,,"Dear customer, this is an 48 hours notice regarding your account. Your access will be limited today. To avoid interruption, respond asap. This offer is time sensitive and ends soon. Follow the instructions now. Keywords: expires soon act now limited time asap"
