In [111]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data

In [112]:
# Load the raw dataset. The CSV may carry a stray positional-index column named
# "Unnamed: 0"; drop it when present so the frame holds only the two columns the
# rest of the notebook uses. errors='ignore' keeps this cell re-runnable against
# a file that no longer contains that column.
df = pd.read_csv('Phishing_Email.csv').drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\r\nHello I am your hot lil horny toy.\r\n ...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [113]:
# Write the frame back to the same CSV it was loaded from, omitting the index.
# NOTE(review): this overwrites the input file in place — confirm that is
# intended rather than writing to a separate output path.
df.to_csv('Phishing_Email.csv', index=False)

In [114]:
# Display the column labels of the loaded frame.
df.columns

Index(['Email Text', 'Email Type'], dtype='object')

In [115]:
# Print the frame summary: row count, per-column non-null counts, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18650 entries, 0 to 18649
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Email Text  18634 non-null  object
 1   Email Type  18650 non-null  object
dtypes: object(2)
memory usage: 291.5+ KB


In [116]:
# Keep only the first MAX_ROWS emails (presumably to bound preprocessing and
# training time — TODO confirm). The explicit copy makes the subset an
# independent frame, so later clean-up steps do not operate on a slice of the
# full dataset and do not raise SettingWithCopyWarning.
MAX_ROWS = 10_000
df = df.head(MAX_ROWS).copy()

In [117]:
# Count missing values per column.
df.isna().sum()

Email Text    13
Email Type     0
dtype: int64

In [118]:
# Select the 'Email Text' entries that are missing, using one label-based lookup.
empty_values = df.loc[df['Email Text'].isna(), 'Email Text']
empty_values

31      NaN
387     NaN
1883    NaN
2049    NaN
2451    NaN
2972    NaN
3627    NaN
3806    NaN
5763    NaN
6299    NaN
6821    NaN
8594    NaN
9999    NaN
Name: Email Text, dtype: object

In [None]:
# Clean the n/a values: remove rows that have no email body.
# Reassigning (instead of inplace=True) avoids mutating a frame that was derived
# from a slice, and keeps the cell safe to re-run.
df = df.dropna(subset=['Email Text'])
# Show a preview rather than the whole frame.
df.head()

In [120]:
# Check, per column, whether any missing value remains.
df.isna().any()

Email Text    False
Email Type    False
dtype: bool

## Dealing with text data

### Tokenization, stop-word removal, and stemming

In [121]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [122]:
import spacy
from spacy.lang.en import stop_words

In [123]:
# Import the stop-word set under its own name. Binding the set to the name of the
# imported `stop_words` module would make this cell fail on a second run, because
# the name would then refer to a set that has no STOP_WORDS attribute.
from spacy.lang.en.stop_words import STOP_WORDS

# Raw email bodies to preprocess.
sentences = df['Email Text']

# English pipeline; raise the per-document character limit to 20,000,000.
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 20_000_000

# Set of English stop words used to filter lemmas during preprocessing.
stop_words = STOP_WORDS

In [124]:
from nltk.stem import PorterStemmer

In [125]:
# Porter stemmer instance applied to each lemma during preprocessing.
stemmer = PorterStemmer()

In [126]:
# Characters the Keras tokenizer strips before splitting on whitespace.
# The backslash is written as an explicit escape ('\\'): the same characters as
# before, without relying on the invalid escape sequence '\]'.
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

# num_words=100 restricts texts_to_sequences to the most frequent words only.
tokenizer = Tokenizer(num_words=100, filters=TOKEN_FILTERS, lower=True)

# Clean every email: lemmatize with spaCy, drop stop-word lemmas, stem the rest,
# and re-join into one whitespace-separated string per email.
tokenized_sentences = []
for sentence in sentences:
    # Slicing truncates only when the text exceeds the pipeline's maximum length.
    doc = nlp(sentence[:nlp.max_length])
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stop_words]
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    tokenized_sentences.append(" ".join(stemmed_tokens))

# Build the vocabulary from the cleaned emails.
tokenizer.fit_on_texts(tokenized_sentences)
word_index = tokenizer.word_index

In [None]:
# Preview the first 20 vocabulary entries instead of dumping the whole mapping.
dict(list(word_index.items())[:20])

## Model

In [128]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [129]:
# Encode each cleaned email as a list of vocabulary indices.
sequences = tokenizer.texts_to_sequences(tokenized_sentences)

# The longest encoded email sets the common length every sequence is padded to.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [130]:
# Features: the padded index matrix. Labels: the 'Email Type' column, kept as a
# one-column frame.
X, Y = padded_sequences, df[['Email Type']]

In [132]:
from sklearn.model_selection import train_test_split

In [133]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [134]:
# Define Model: embedding -> LSTM -> single sigmoid unit for binary output.
model = keras.Sequential([
    keras.layers.Embedding(
        input_dim=len(word_index) + 1,
        output_dim=100,
        input_length=max_sequence_length,
    ),
    keras.layers.LSTM(units=64),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

In [135]:
# Binary cross-entropy loss with the Adam optimizer; report accuracy while training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [140]:
def encode_labels(labels):
    """Map email-type labels to integers: 'Safe Email' -> 0, anything else -> 1.

    Input that is already integer-typed is returned unchanged. This makes the
    cell safe to run more than once: without the guard, a second run would
    compare integers with the string 'Safe Email' and turn every label into 1.

    Parameters:
        labels: array-like of label strings, or of already-encoded integers.

    Returns:
        numpy array with the same shape as `labels`, holding 0/1 integers.
    """
    values = np.asarray(labels)
    if values.dtype.kind in 'iu':
        return values
    return np.where(values == 'Safe Email', 0, 1)


Y_train = encode_labels(Y_train)
Y_test = encode_labels(Y_test)

In [142]:
# Train the model
# Train the model: a single pass over the training set in mini-batches of 32.
model.fit(x=X_train, y=Y_train, batch_size=32, epochs=1)



<keras.callbacks.History at 0x2ded2b0f710>

In [144]:
# Run the trained model on the held-out features; the final layer is a single
# sigmoid unit, so each prediction is one value per test row.
predictions = model.predict(X_test)



In [147]:
from sklearn.metrics import accuracy_score

# Turn sigmoid outputs into class labels with an explicit 0.5 threshold:
# below 0.5 -> 0 (safe), otherwise -> 1 (phishing). This is the same rule the
# custom-prediction cell applies; round() would instead send an output of
# exactly 0.5 to class 0.
rounded_predictions = (predictions >= 0.5).astype(int)
accuracy = accuracy_score(Y_test.astype(int), rounded_predictions)

In [148]:
# Report the test-set accuracy.
print(f"Accuracy: {accuracy}")


Accuracy: 0.8818818818818819


In [149]:
# Persist the trained network.
model.save("model.h5")

# Also persist the fitted tokenizer: the saved model only accepts index
# sequences, so it cannot score new text without the same vocabulary.
with open("tokenizer.json", "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

## Custom Predict

In [152]:
# Classify one hand-written email with the trained model.
custom_text = "Leo Messi,\n\nThank you for taking the time to provide feedback on your subscription experience.\n\nBest Regards,\nThe Barcelona Football Team" # Safe Email
# custom_text = "You won 1000000 million dollars!!!" # Fake email

# Apply the same cleaning used for the training emails: lemmatize, drop
# stop-word lemmas, then stem. The spaCy pipeline loaded earlier in the notebook
# is reused rather than loaded a second time.
doc = nlp(custom_text)
tokens = [token.lemma_ for token in doc if token.lemma_ not in stop_words]
stemmed_tokens = [stemmer.stem(token) for token in tokens]
tokenized_text = " ".join(stemmed_tokens)

# Encode under a dedicated name so the training `sequences` list is not overwritten.
custom_sequences = tokenizer.texts_to_sequences([tokenized_text])

# Pad to the length the model was built with.
padded_sequence = pad_sequences(custom_sequences, maxlen=max_sequence_length)

# model.predict returns one row per input with a single sigmoid output; extract
# it as a plain float so the threshold test works on a scalar, not an array.
prediction = model.predict(padded_sequence)
phishing_probability = float(prediction[0][0])
prediction_class = "Safe" if phishing_probability < 0.5 else "Phishing"
print(prediction_class)

Safe
