In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import joblib

df = pd.read_csv("../data/phishing_email.csv")
df['label'].value_counts()
df['text_combined'] = df['text_combined'].astype(str)
df = df.dropna(subset=['text_combined', 'label'])

df.shape

(82486, 2)

In [28]:
X = df['text_combined']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [29]:
pipeline_v2 = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',
        max_features=7000,
        min_df=5,
        max_df=0.9
    )),
    ('clf', LogisticRegression(max_iter=1000))
])

In [30]:
pipeline_v2.fit(X_train,y_train)
y_pred_v2 = pipeline_v2.predict(X_test)
print(classification_report(y_test, y_pred_v2))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7919
           1       0.98      0.99      0.98      8579

    accuracy                           0.98     16498
   macro avg       0.98      0.98      0.98     16498
weighted avg       0.98      0.98      0.98     16498



In [31]:
vectorizer = pipeline_v2.named_steps['tfidf']
model = pipeline_v2.named_steps['clf']

model.coef_.shape
feature_names = vectorizer.get_feature_names_out()

coef_df = pd.DataFrame({
    "word": feature_names,
    "weight": model.coef_[0]
})

In [32]:
results_df = pd.DataFrame({
    "text": X_test,
    "true_label": y_test,
    "pred_label": y_pred_v2
})

fp = results_df[
    (results_df['true_label'] == 1) &
    (results_df['pred_label'] == 0)
]

fn = results_df[
    (results_df["true_label"] == 1) &
    (results_df["pred_label"] == 0)
]

len(fp), len(fn)

(120, 120)

In [33]:
sample_text = X_test.iloc[41]
sample_true = y_test.iloc[1]
sample_pred = pipeline_v2.predict([sample_text])[0]

print("True label:", sample_true)
print("Predicted:", sample_pred)

True label: 0
Predicted: 1


In [34]:
sample_vector = vectorizer.transform([sample_text])

weights = model.coef_[0]
sample_array = sample_vector.toarray()[0]
contributions = sample_array * weights

contrib_df = pd.DataFrame({
    'word': feature_names,
    'contribution': contributions
})

contrib_df = contrib_df[contrib_df['contribution'] != 0]
contrib_df.sort_values(by='contribution', ascending=False, inplace=True)

In [36]:
joblib.dump(pipeline_v2, "../model/pipeline.joblib")

['../model/pipeline.joblib']