In [11]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Download stopwords if not already installed
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
# Load phishing emails
phishing_df = pd.read_csv("./dataset/phishing_email.csv")

# Balance amount of samples in two classes. 
phishing_df = phishing_df.sample(frac=0.4, random_state=42).reset_index(drop=True) 

# Load legitimate (ham) emails
enron_df = pd.read_csv("./dataset/Enron.csv")
# Check data structure
print(phishing_df.head())
print(enron_df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                       text_combined  label
0  endangered languages workshop foundation endan...      0
1  claretta claretta_bordersfusemailcom cialis wo...      1
2  roger upole schkeramsncom kyle rickey wrote im...      0
3  barclays customer service testlightworldcojp d...      1
4  gmm 09 nov 2001 please find attached global ma...      0
                                      subject  \
0                   hpl nom for may 25 , 2001   
1            re : nom / actual vols for 24 th   
2  enron actuals for march 30 - april 1 , 201   
3                   hpl nom for may 30 , 2001   
4                   hpl nom for june 1 , 2001   

                                                body  label  
0  ( see attached file : hplno 525 . xls )\r\n- h...      0  
1  - - - - - - - - - - - - - - - - - - - - - - fo...      0  
2  estimated actuals\r\nmarch 30 , 2001\r\nno flo...      0  
3  ( see attached file : hplno 530 . xls )\r\n- h...      0  
4  ( see attached file : hplno 601 

In [12]:
def clean_text(text):
    if isinstance(text, str):
        # Remove HTML tags
        text = re.sub(r"<.*?>", "", text)
        # Remove special characters, digits
        text = re.sub(r"[^a-zA-Z\s]", "", text)
        # Convert to lowercase
        text = text.lower()
        # Remove stopwords
        text = " ".join(word for word in text.split() if word not in stop_words)
        return text
    return ""

phishing_df["clean_text"] = phishing_df["text_combined"].apply(clean_text)
enron_df["clean_text"] = enron_df["body"].apply(clean_text)

In [13]:
phishing_df["label"] = 1  # Phishing
enron_df["label"] = 0  # Legitimate

phish_df = phishing_df[["clean_text","label"]]
legit_df = enron_df[["clean_text","label"]]
# Merge datasets
df = pd.concat([phish_df, legit_df], ignore_index=True)

# Shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())

print(df.tail())

# Check label distribution
print(df["label"].value_counts())

                                          clean_text  label
0  nan mrs lisa williams recommended following pa...      1
1  save prescription needs variety drugs reduced ...      0
2  letter daniel kabila investment offer dear app...      0
3  agree nominated totals eileen ponton david avi...      0
4  email administratoradosashleyindonesiacom emai...      1
                                              clean_text  label
62756  spur site offer natural male enhancement formu...      0
62757  dear vince delighted agreed take part energy p...      0
62758  celebrate texas excellence ex students associa...      1
62759  cnn alerts roxannakousebeetukanujatmx cnn aler...      1
62760  automatically generated delivery status notifi...      0
label
1    32994
0    29767
Name: count, dtype: int64


In [14]:
# Vectorize the words, keep only the top 5000.
vectorizer = TfidfVectorizer(max_features=5000)  
X = vectorizer.fit_transform(df["clean_text"]) 
print(X.shape,X[:2])

(62761, 5000) <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 155 stored elements and shape (2, 5000)>
  Coords	Values
  (0, 2963)	0.088585311705786
  (0, 2937)	0.16311008014946057
  (0, 2597)	0.16171642509399095
  (0, 4905)	0.2257999949946537
  (0, 3686)	0.0847953812764875
  (0, 1746)	0.049413012036330334
  (0, 3188)	0.125003725332856
  (0, 165)	0.062349633414917636
  (0, 4861)	0.06119788746580329
  (0, 3366)	0.030637537929715953
  (0, 761)	0.0487498165725552
  (0, 2590)	0.0581464979618392
  (0, 4790)	0.05776463638766167
  (0, 1102)	0.05254372802894687
  (0, 1805)	0.06986487722092984
  (0, 2744)	0.18894186263596932
  (0, 4939)	0.07201051909358007
  (0, 392)	0.1228657158591921
  (0, 2638)	0.06761040264069916
  (0, 3020)	0.1780344480548367
  (0, 4979)	0.10647202831370114
  (0, 1207)	0.14833836183058
  (0, 4978)	0.05182197516811045
  (0, 4923)	0.051374831780430855
  (0, 711)	0.1679699241954574
  :	:
  (1, 3495)	0.09132296071125191
  (1, 4493)	0.06721694498069199
  (1, 4938)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, df["label"], test_size=0.2, random_state=42)

def train(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

model_SVC = train(SVC(), X_train, y_train)
y_pred_SCV = model_SVC.predict(X_test)
accuracy_SVC = accuracy_score(y_test, y_pred_SCV)

model_logistic = train(LogisticRegression(), X_train, y_train)
y_pred_logistic = model_logistic.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)

In [16]:
print(f'The accuracy of logistic regression model is {accuracy_logistic}.', 
      '\n', f'The accuracy of SVC model is {accuracy_SVC}.')

The accuracy of logistic regression model is 0.791444276268621. 
 The accuracy of SVC model is 0.7857882577869832.
