In [1]:
#importing the libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [2]:
#location of the csv file that holds the email text and its label
data_path = 'datasets/phishing/archive/phishing_email.csv'
#read the csv, drop rows with missing values, drop exact duplicate rows,
#then renumber the index from 0 so it has no gaps
data = (
    pd.read_csv(data_path)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
#preview the first rows of the cleaned frame
print(data.head())
#report (rows, columns) after cleaning
print(data.shape)
#lowercase every entry of the text column (done after the preview above)
data['text_combined'] = data['text_combined'].apply(lambda email_text: email_text.lower())



                                       text_combined  label
0  hpl nom may 25 2001 see attached file hplno 52...      0
1  nom actual vols 24 th forwarded sabrae zajac h...      0
2  enron actuals march 30 april 1 201 estimated a...      0
3  hpl nom may 30 2001 see attached file hplno 53...      0
4  hpl nom june 1 2001 see attached file hplno 60...      0
(82078, 2)


In [3]:
# metric helper used at the end of this cell
from sklearn.metrics import classification_report

# single hold-out split: 80% of the rows for training, 20% for testing (fixed seed);
# no k-fold cross validation is performed in this cell
X_train, X_test, y_train, y_test = train_test_split(
    data['text_combined'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# learn the vocabulary from the training text only and encode that text in one step
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
# encode the testing text with the vocabulary learned from the training text
X_test_count = count_vectorizer.transform(X_test)

# logistic regression with default hyperparameters, fitted on the training counts
log_reg = LogisticRegression()
log_reg.fit(X_train_count, y_train)

# predict the held-out labels and print per-class precision / recall / f1
y_pred = log_reg.predict(X_test_count)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7910
           1       0.99      0.99      0.99      8506

    accuracy                           0.99     16416
   macro avg       0.99      0.99      0.99     16416
weighted avg       0.99      0.99      0.99     16416



In [None]:
#persist the fitted model and the fitted count vectorizer to separate pickle files
import pickle

#save the model; the with-block flushes and closes the file even if pickling fails
filename = 'phishing_email_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(log_reg, model_file)
#save the count vectorizer; it is needed to encode new text the same way at load time
filename = 'count_vectorizer.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)
#NOTE: pickle files can execute code when loaded; only unpickle these from a trusted location