In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import re

df_base = pd.read_csv("../data/phishing_email.csv")
df_1 = pd.read_csv("../data/Enron.csv")
df_2 = pd.read_csv("../data/Ling.csv")

def normalize_subject_body(df):
    df = df[["subject", "body", "label"]].copy()
    df["text"] = (
        df["subject"].fillna("") + " " +
        df["body"].fillna("")
    )
    df = df[["text", "label"]]
    df["text"] = df["text"].astype(str)
    df["label"] = df["label"].astype(int)
    return df

df_1 = normalize_subject_body(df_1)
df_2 = normalize_subject_body(df_2)

df_base = df_base.rename(columns={
    "text_combined": "text"
})

df_base = df_base[["text","label"]]
df_base["text"] = df_base["text"].astype(str)
df_base["label"] = df_base["label"].astype(int)

df_all = pd.concat(
    [df_base, df_1, df_2],
    ignore_index=True
)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|[\w\.-]+@[\w\.-]+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = " ".join([w for w in text.split() if len(w) > 2])
    text = re.sub(r"\s+", " ", text).strip()
    return text

df_all["text"] = df_all["text"].apply(clean_text)

In [74]:
X = df_all["text"]
y = df_all["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [75]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',
        max_features=10000,
        min_df=5,
        max_df=0.9
    )),
    ('clf', LogisticRegression(solver="liblinear"))
])

In [76]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99     11558
           1       0.98      0.99      0.99     11465

    accuracy                           0.99     23023
   macro avg       0.99      0.99      0.99     23023
weighted avg       0.99      0.99      0.99     23023



In [77]:
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[11366   192]
 [  132 11333]]


In [None]:
import joblib

joblib.dump(pipeline, "../model/pipeline.joblib")