# Safe or Phishing Email?

## Connecting to Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read by path later in the notebook.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## NLTK settings

In [None]:
import nltk
# Fetch the corpora the cleaning step relies on: the stop-word lists and the
# WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Importing libraries

In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
from sklearn.svm import SVC

## Dataset loading

In [None]:
# Load the CSV, discard rows with missing values, then remove the index column
# that was saved into the file as 'Unnamed: 0'.
data = (
    pd.read_csv('/content/drive/MyDrive/.../Phishing_Email.csv')
    .dropna()
    .drop(columns=['Unnamed: 0'])
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18634 entries, 0 to 18649
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Email Text  18634 non-null  object
 1   Email Type  18634 non-null  object
dtypes: object(2)
memory usage: 436.7+ KB


In [None]:
# Preview the first rows of the raw email text and their labels.
data.head()

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


## Data cleaning

In [None]:
# Built once when this cell runs. Previously stopwords.words('english') was
# re-evaluated for every token (re-reading the corpus and scanning a list each
# time) and a new lemmatizer was constructed for every email.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def text_preprocessing(text):
  """Normalise one raw email body into a space-separated string of lemmas.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case the result, split on whitespace, drop English stop words,
  lemmatize each remaining token (lemmatizer defaults), and join the tokens
  back together with single spaces.

  Args:
    text: Raw email text as a string.

  Returns:
    The cleaned text; an empty string when no token survives the filtering.
  """
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(
      _LEMMATIZER.lemmatize(token) for token in tokens if token not in _STOP_WORDS
  )

In [None]:
# Replace each raw email body with its cleaned form (the raw text is overwritten).
data['Email Text'] = data['Email Text'].apply(text_preprocessing)

In [None]:
# Encode the target as integers: 0 = safe email, 1 = phishing email.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on data['Email Type']: that chained in-place call
# operates on a temporary Series and leaves `data` unchanged under pandas
# Copy-on-Write. astype(int) fails loudly if an unexpected label is present,
# and the cell stays a no-op when re-run on already-encoded labels.
data['Email Type'] = (
    data['Email Type']
    .replace({'Safe Email': 0, 'Phishing Email': 1})
    .astype(int)
)

In [None]:
# Inspect the cleaned email text alongside the integer-encoded labels.
data

Unnamed: 0,Email Text,Email Type
0,disc uniformitarianism sex lang dick hudson ob...,0
1,side galicismos galicismo spanish term name im...,0
2,equistar deal ticket still available assist ro...,0
3,hello hot lil horny toy one dream open minded ...,1
4,software incredibly low price lower drapery se...,1
...,...,...
18645,date lonely housewife always wanted date lonel...,1
18646,request submitted access request anita dupont ...,0
18647,important prc mtg hi dorn john discovered rece...,0
18648,press clipping letter californian utility plea...,0


## Train - validation - test split

In [None]:
# Features are the cleaned email bodies; targets are the encoded labels.
X, y = data['Email Text'], data['Email Type']

# First split: 80% for training, 20% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=17)
# Second split: the held-out part is divided 80/20 into validation and test.
X_validation, X_test, y_validation, y_test = train_test_split(X_holdout, y_holdout, test_size=0.2, random_state=17)

for split_label, split_features in (
    ('Training Data : ', X_train),
    ('Validation Data : ', X_validation),
    ('Test Data : ', X_test),
):
  print(split_label, split_features.shape)

Training Data :  (14907,)
Validation Data :  (2981,)
Test Data :  (746,)


## Feature Extraction - TFIDF - Training

In [None]:
from sklearn.neural_network import MLPClassifier

# Bag-of-words counts -> TF-IDF weighting -> multi-layer perceptron classifier.
# random_state fixes the MLP's weight initialisation and batch shuffling so a
# fresh run of the notebook reproduces the same model and metrics; 17 matches
# the seed used for the train/validation/test split.
pipeline = Pipeline([ ('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MLPClassifier(max_iter=100, random_state=17)),
                    ])

email_clf = pipeline.fit(X_train, y_train)

## Validation-set performance

In [None]:
# Score the fitted pipeline on the validation split.
y_pred = email_clf.predict(X_validation)

accuracy = accuracy_score(y_validation, y_pred)
# Precision, recall and F1 all use the default binary averaging, i.e. they
# describe the positive class (label 1 = phishing). F1 was previously computed
# with average='weighted', so it was not the harmonic mean of the precision
# and recall printed next to it.
precision = precision_score(y_validation, y_pred)
recall = recall_score(y_validation, y_pred)
f1 = f1_score(y_validation, y_pred)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("f1: ", f1)

Accuracy:  0.9792016101979202
Precision:  0.9653505237711523
Recall:  0.9843878389482333
f1:  0.9792319209469044


## Test-set performance

In [None]:
# Final check on the held-out test split, which was not used for fitting.
y_pred = email_clf.predict(X_test)

# Precision and recall use the default binary averaging (positive class = 1).
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
  print(metric_label, metric_value)

Accuracy: 0.9731903485254692
Precision: 0.9562043795620438
Recall: 0.9703703703703703
