# 01 - Exploration: Phishing Email Dataset

In this notebook we:
- load the raw phishing email dataset
- inspect its structure (rows, columns, dtypes)
- look at the label distribution (phishing vs. legitimate)
- count missing values per column and compute simple statistics (email body length)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Location of the raw CSV. The path is relative, so it is resolved against the
# kernel's current working directory.
DATA_PATH = Path("../data/raw/CEAS_08.csv")

# Read the whole CSV into a DataFrame. low_memory=False makes pandas parse the
# file in one pass instead of in internal chunks, which avoids mixed-dtype
# inference within a column (see the pandas read_csv documentation).
df = pd.read_csv(DATA_PATH, low_memory=False)
# Last expression of the cell: displays (number of rows, number of columns)
df.shape

In [None]:
# Preview the top of the frame; the row count is spelled out so it matches
# the description instead of relying on the default
df.head(n=5)

In [None]:
# Overview of columns and data types: df.info() prints each column's name,
# non-null count and dtype, plus the frame's memory usage
df.info()

In [None]:
# Column names as a plain Python list (easy to copy into later cells)
df.columns.to_list()

In [None]:
# Tally the missing entries in every column ...
missing_per_column = df.isna().sum()
# ... and list the columns with the most gaps first
missing_per_column.sort_values(ascending=False)

In [None]:
# Class balance of the target column.
# dropna=False keeps rows whose label is missing as their own entry, so
# unlabeled emails are not silently left out of the distribution.
label_counts = df['label'].value_counts(dropna=False)

# Show the absolute counts next to each class's share of the dataset, which
# makes any class imbalance easy to read off.
pd.DataFrame({
    'count': label_counts,
    'share': label_counts / label_counts.sum(),
})

In [None]:
# Bar chart of the class balance. The pandas plotting call returns the
# matplotlib Axes it drew on, so the labels are set on that Axes directly
# instead of going through the implicit pyplot "current axes".
ax = df['label'].value_counts().plot.bar()
ax.set(
    title="Label Distribution",
    xlabel="Label (0 = legitimate, 1 = phishing)",
    ylabel="Number of Emails",
)
plt.show()

In [None]:
# Derive a per-email character count from the message body and store it as a
# new column so that later cells can reuse it.
body_text = df['body']
df['body_length'] = body_text.str.len()

# Summary statistics of the new column
df['body_length'].describe()

In [None]:
# Histogram of body lengths in 50 bins; the title and x-axis label are set on
# the Axes object returned by the pandas plotting call.
ax = df['body_length'].plot.hist(bins=50)
ax.set_title("Distribution of Email Body Length")
ax.set_xlabel("Characters")
plt.show()