In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
import re
import contractions
import random
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support as score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import warnings
import gensim.downloader as api
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
# Seed every random number generator the notebook relies on.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Fetch the NLTK resources used by the stop-word and lemmatization helpers,
# then build the English stop-word set that removestopwords() reads.
for _nltk_resource in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_nltk_resource)
stopWords = set(stopwords.words('english'))

In [None]:
# Load the labelled e-mail corpus; later cells read its 'Text' and 'Class' columns.
dataset = pd.read_csv('./Dataset/fraud_email_.csv')
dataset.shape

In [None]:
# Class balance: (number of rows with Class == 1, number of rows with Class == 0).
len(dataset[dataset['Class'] == 1]), len(dataset[dataset['Class'] == 0])

In [None]:
# Display the raw frame for a quick visual check before any cleaning.
dataset

## Pre-processing data

In [None]:
def remove_contractions(text):
    """Expand contractions token by token.

    The text is split on whitespace, every token is passed through
    ``contractions.fix`` and the results are re-joined with single spaces.
    """
    return ' '.join(contractions.fix(token) for token in text.split())

def dataCleaning(text):
    """Normalize one raw e-mail body.

    Steps: lower-case, expand contractions, surround '.', ',' and ':' with
    spaces so they become separate tokens, then collapse all whitespace runs
    to a single space and trim the ends.

    Args:
        text: The raw body. Non-string values are converted with ``str``.

    Returns:
        The normalized string; ``''`` when ``text`` is ``None`` or NaN.
    """
    # Missing bodies arrive as None / float NaN. Passing them through str()
    # would turn them into the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = remove_contractions(str(text).lower())
    # One pass is equivalent to the three separate substitutions: the
    # replacement text never introduces another '.', ',' or ':'.
    text = re.sub(r'([.,:])', r' \1 ', text)
    return re.sub(r'\s+', ' ', text).strip()

def removestopwords(text):
    """Drop every space-separated token found in the global ``stopWords`` set."""
    return ' '.join(token for token in text.split(' ') if token not in stopWords)

def performlemmatization(text):
    """Lemmatize each space-separated token with NLTK's WordNetLemmatizer."""
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in text.split(' '))

def add_space_for_punctuations(text):
    """Insert a space between a word and the punctuation mark that follows it.

    Only the characters '.', ',', '!' and '?' are handled, and only when they
    come directly after one or more word characters.
    """
    return re.sub(r'(\w+)([.,!?])', r'\1 \2', text)

def remove_tags(html):
    """Return the text of an HTML document with <style>/<script> content removed.

    The remaining text fragments are stripped and joined with single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")
    for node in soup.find_all(['style', 'script']):
        node.decompose()
    return ' '.join(soup.stripped_strings)

In [None]:
def pre_process_data(data, column_name):
    """Add a cleaned-text column ``email_body_clean`` to ``data``.

    Pipeline: ``dataCleaning`` -> ``removestopwords`` -> replace every
    character outside ``[a-zA-Z0-9]`` with a space -> collapse runs of spaces
    -> strip leading/trailing spaces. Lemmatization is not applied.

    Args:
        data: DataFrame holding the raw text. It is modified in place.
        column_name: Name of the column containing the raw e-mail body.

    Returns:
        The same ``data`` object, with ``email_body_clean`` added/overwritten.
    """
    cleaned = data[column_name].apply(dataCleaning).apply(removestopwords)
    cleaned = cleaned.astype(str).str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
    # Strip the ends: punctuation at the start or end of a body is replaced by
    # a space above, and a dangling space becomes an empty token for any
    # consumer that tokenizes with split(' ').
    cleaned = cleaned.str.replace(r' +', ' ', regex=True).str.strip()
    data["email_body_clean"] = cleaned
    return data

In [None]:
# Adds the 'email_body_clean' column, derived from the raw 'Text' column.
dataset = pre_process_data(dataset, "Text")
dataset.head()

### Run models using TF-IDF and count features

In [None]:
# Split the cleaned text BEFORE fitting any vectorizer. Fitting on the whole
# corpus would let the vocabulary and IDF weights see the test documents,
# which leaks test information into the features.
train_text, test_text, y_train, y_test = train_test_split(
    dataset["email_body_clean"], dataset['Class'].to_numpy(),
    test_size=0.2, random_state=SEED)

# TF-IDF features: vocabulary and IDF weights are learned from the training split only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)

# Raw term counts on the same split, so y_train / y_test apply to both feature sets.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(train_text)
X_test_count = count_vectorizer.transform(test_text)


In [None]:
def print_scores(y_test, y_predtest, print_matrix = True):
    """Print classification metrics for binary labels 0/1.

    Output, in order:
      1. macro precision, macro recall, macro F1 and accuracy on one line;
      2. precision and recall for class 1;
      3. precision and recall for class 0;
      4. optionally, a confusion-matrix plot.

    Args:
        y_test: Ground-truth labels (0 or 1).
        y_predtest: Predicted labels (0 or 1).
        print_matrix: When true, also plot the confusion matrix.
    """
    precision = precision_score(y_test, y_predtest, average="macro")
    recall = recall_score(y_test, y_predtest, average="macro")
    f1 = f1_score(y_test, y_predtest, average="macro")
    accuracy = accuracy_score(y_test, y_predtest)
    print(precision, recall, f1, accuracy)

    # Fixing labels=[0, 1] guarantees a 2x2 matrix even when only one class
    # occurs in the data; without it the matrix can be 1x1 and indexing fails.
    confusion_mat = confusion_matrix(y_test, y_predtest, labels=[0, 1])
    tn, fp, fn, tp = confusion_mat.ravel()

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as nan rather than raising.
        return numerator / denominator if denominator else float('nan')

    print("precision for 1 - ", _ratio(tp, tp + fp), "Recall for 1 - ", _ratio(tp, tp + fn))
    # Class-0 metrics come from the same matrix with the roles of the classes
    # swapped: true negatives play the part of "true positives for class 0".
    print("precision for 0 - ", _ratio(tn, tn + fn), "Recall for 0 - ", _ratio(tn, tn + fp))

    if print_matrix:
        disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=[0, 1])
        disp.plot()
        plt.show()


In [None]:
# (positives in test, test size, positives in train, train size)
sum(y_test), len(y_test), sum(y_train), len(y_train)

In [None]:
# Shapes of the TF-IDF train and test matrices.
X_train_tfidf.shape, X_test_tfidf.shape

In [None]:
# Linear SVM on TF-IDF features. random_state pins the solver's RNG so
# re-running this cell reproduces the same fit.
SVM_tfidf = LinearSVC(C=1.0, random_state=SEED)
SVM_tfidf.fit(X_train_tfidf, y_train)
ypredtest_svm_tfidf = SVM_tfidf.predict(X_test_tfidf)

# Linear SVM on raw term counts.
SVM_count = LinearSVC(C=1.0, random_state=SEED)
SVM_count.fit(X_train_count, y_train)
ypredtest_svm_count = SVM_count.predict(X_test_count)

# Perceptron on raw term counts (named after the features it is trained on).
perceptron_count = Perceptron()
perceptron_count.fit(X_train_count, y_train)
ypredtest_perceptron = perceptron_count.predict(X_test_count)

In [None]:
# Report the held-out scores of each classical model under its own heading.
for _title, _predictions in (
    ("SVM with TFIDF:", ypredtest_svm_tfidf),
    ("SVM with count:", ypredtest_svm_count),
    ("Perceptron with count:", ypredtest_perceptron),
):
    print(_title)
    print_scores(y_test, _predictions)

In [None]:
########## Test the trained models on an external dataset (IWSPA-AP) ##########

In [None]:
# External evaluation set; later cells read its 'email_body' and 'is_phishing' columns.
iwspa_dataset = pd.read_csv('./Dataset/IWSPA-AP-Parsed.csv')
iwspa_dataset.head()

In [None]:
# Apply the same cleaning pipeline used for the training corpus.
iwspa_dataset = pre_process_data(iwspa_dataset, "email_body")

In [None]:
# Inspect the cleaned external data.
iwspa_dataset.head()

In [None]:
# transform (not fit_transform): reuse the vectorizer fitted earlier so the feature space matches SVM_tfidf.
tfidf_features_iwspa = tfidf_vectorizer.transform(iwspa_dataset["email_body_clean"])

In [None]:
# Ground-truth labels of the external set and the TF-IDF SVM's predictions on it.
testLabels = iwspa_dataset['is_phishing'].to_numpy()
ypred_iwspa_svm_tfidf = SVM_tfidf.predict(tfidf_features_iwspa)
ypred_iwspa_svm_tfidf.shape

In [None]:
# Scores of the TF-IDF SVM on the external IWSPA data.
print("SVM with TFIDF Test:")
print_scores(testLabels, ypred_iwspa_svm_tfidf)

In [None]:
# del X_train_tfidf, X_test_tfidf, X_train_count, X_test_count, tfidf_vectorizer
# del tfidf_features, count_vectorizer, count_features

## Using Word2Vec features

In [None]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# The feature builders below look words up in this global `wv`.
wv = api.load('word2vec-google-news-300')

In [None]:
def get_word2vec_features(data, word_vectors=None, embedding_dim=300):
    """Build one averaged word-embedding vector per cleaned e-mail.

    Each text in ``data['email_body_clean']`` is tokenized on whitespace and
    its token vectors are averaged. Tokens missing from the embedding table
    contribute a zero vector to the average. A text with no tokens yields an
    all-zero vector.

    Args:
        data: DataFrame with an ``email_body_clean`` column of strings.
        word_vectors: Mapping-like object supporting ``word in obj`` and
            ``obj[word]``. Defaults to the notebook-level ``wv``.
        embedding_dim: Length of each embedding vector.

    Returns:
        Array with one row per e-mail and ``embedding_dim`` columns.
    """
    vectors = wv if word_vectors is None else word_vectors
    zero_vector = np.zeros(embedding_dim)

    features = []
    for sentence in data['email_body_clean']:
        # split() without an argument drops the empty strings that split(' ')
        # returns for leading/trailing/double spaces. Those empty tokens used
        # to be averaged in as zero vectors and shrink the result.
        tokens = sentence.split()
        if not tokens:
            features.append(zero_vector)
            continue
        token_vectors = [vectors[token] if token in vectors else zero_vector
                         for token in tokens]
        features.append(np.mean(token_vectors, axis=0))
    return np.array(features)

In [None]:
# Averaged Word2Vec features for the training corpus.
train_word2vec_features = get_word2vec_features(dataset)

In [None]:
# Same test_size and random_state as the TF-IDF split.
# Note: this rebinds y_train / y_test and introduces the generic names X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(train_word2vec_features, dataset['Class'].to_numpy(), 
                                                    test_size=0.2, random_state=SEED)
# Optional: free the full feature matrix once the split exists.
# del train_word2vec_features

In [None]:
# Linear SVM on averaged Word2Vec features, scored on the held-out split.
# random_state pins the solver's RNG so re-running this cell reproduces the fit.
print("SVM with Word2Vec Train (train/val/test) :")
svm_model = LinearSVC(C=1.0, random_state=SEED)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print_scores(y_test, y_pred_svm)

In [None]:
# Evaluate the Word2Vec SVM on the external IWSPA data: build the same
# averaged-embedding features, predict, and print the scores.
print("SVM with Word2Vec Test:")
test_features_iwspa = get_word2vec_features(iwspa_dataset)
testLabels = iwspa_dataset['is_phishing'].to_numpy()
ypred_iwspa_svm = svm_model.predict(test_features_iwspa)
print_scores(testLabels, ypred_iwspa_svm)

## GRU Model

In [None]:
def predict(model, dataloader):
    """Return the predicted class index for every sample in ``dataloader``.

    The model is run in evaluation mode with gradient tracking disabled, and
    its previous train/eval mode is restored afterwards.

    Args:
        model: ``nn.Module`` mapping a batch to per-class scores along dim 1.
        dataloader: Iterable of ``(data, target)`` batches; targets are ignored.

    Returns:
        List with one predicted class index per sample, in iteration order.
    """
    was_training = model.training
    model.eval()
    prediction_list = []
    try:
        # Inference only: no_grad avoids building autograd graphs per batch.
        with torch.no_grad():
            for data, _ in dataloader:
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                prediction_list.extend(predicted.cpu().numpy())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return prediction_list

In [None]:
# Token counts per cleaned e-mail, computed once and reused for both statistics.
_token_counts = [len(tokens) for tokens in dataset['email_body_clean'].str.split()]
print("Max sentence length :", max(_token_counts))
print("Average sentence length :", sum(_token_counts) / len(_token_counts))

In [None]:
class SimpleGRU(nn.Module):
    """Single-layer GRU followed by a linear classification head.

    Args:
        input_size: Size of each input vector (embedding dimension).
        hidden_size: Size of the GRU hidden state.
        num_classes: Number of output scores per sample.

    The defaults (300, 100, 2) reproduce the originally hard-coded sizes.
    ``forward`` returns raw scores (no softmax).
    """

    def __init__(self, input_size=300, hidden_size=100, num_classes=2):
        super().__init__()
        # Attribute names are kept as-is: they are the key prefixes of the
        # state_dict that the training loop saves to disk.
        self.GRU = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a (batch, time, input_size) tensor to (batch, num_classes) scores."""
        out, _ = self.GRU(x)
        # Classify from the GRU output at the last time step.
        # NOTE(review): get_rnn_features pads short e-mails with zero vectors
        # at the END, so for those the last step is a padding step.
        return self.fc(out[:, -1, :])
    
# Instantiate the GRU classifier and print its layer summary.
simple_gru_model = SimpleGRU()
print(simple_gru_model)

In [None]:
class CustomDataset(Dataset):
    """Index-aligned (feature, label) pairs served as tensors.

    Args:
        features: Indexable collection of numeric feature arrays.
        labels: Indexable collection of integer class labels, same length.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore the extra labels.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, got "
                f"{len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return sample ``index`` as a (float32 tensor, int64 tensor) pair."""
        feature = torch.tensor(self.features[index], dtype=torch.float32)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        return feature, label

def get_rnn_features(data, max_len=100, embedding_dim=300, word_vectors=None):
    """Build fixed-length embedding sequences for a recurrent model.

    For every text in ``data['email_body_clean']`` the first ``max_len``
    whitespace-separated tokens are mapped to their embedding vectors.
    Tokens missing from the embedding table, and positions past the end of a
    short text, are left as zero vectors.

    Args:
        data: DataFrame with an ``email_body_clean`` column of strings.
        max_len: Number of time steps kept per e-mail.
        embedding_dim: Length of each embedding vector.
        word_vectors: Mapping-like object supporting ``word in obj`` and
            ``obj[word]``. Defaults to the notebook-level ``wv``.

    Returns:
        float32 array of shape ``(len(data), max_len, embedding_dim)``.
    """
    vectors = wv if word_vectors is None else word_vectors
    sentences = data['email_body_clean']

    # Preallocate in float32 (CustomDataset casts to float32 anyway). Growing
    # nested Python lists of float64 vectors needs several times the memory.
    features = np.zeros((len(sentences), max_len, embedding_dim), dtype=np.float32)
    for row, sentence in enumerate(sentences):
        # split() without an argument drops empty tokens, which would
        # otherwise take up time steps as all-zero vectors.
        for position, word in enumerate(sentence.split()[:max_len]):
            if word in vectors:
                features[row, position] = vectors[word]
    return features

In [None]:
# Sequence features for the GRU, split with the same test_size and random_state
# as the earlier splits. This rebinds X_train / X_test / y_train / y_test.
train_rnn_features = get_rnn_features(dataset)
X_train, X_test, y_train, y_test = train_test_split(train_rnn_features, dataset['Class'].to_numpy(), 
                                                    test_size=0.2, random_state=SEED)
# Wrap both splits as torch Datasets for the DataLoaders built below.
train_data = CustomDataset(features=X_train, labels=np.array(y_train))
test_data=CustomDataset(features=X_test, labels=np.array(y_test))

In [None]:
# del train_rnn_features, X_train, X_test

In [None]:
# Loader configuration.
num_workers = 0     # worker processes used by each DataLoader
batch_size = 20     # samples per batch
valid_size = 0.2    # fraction of the training set held out for validation

# Shuffle the training indices once and carve off the validation share.
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Train and validation loaders draw disjoint index subsets from the same
# dataset; the test loader walks its own dataset in order.
_loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
train_loader = DataLoader(train_data, sampler=train_sampler, **_loader_kwargs)
valid_loader = DataLoader(train_data, sampler=valid_sampler, **_loader_kwargs)
test_loader = DataLoader(test_data, **_loader_kwargs)

In [None]:
# Sequence features, labels and an in-order DataLoader for the external IWSPA data.
test_rnn_features = get_rnn_features(iwspa_dataset)
testLabels = iwspa_dataset['is_phishing'].to_numpy()
test_iwspa_loader = torch.utils.data.DataLoader(CustomDataset(features=test_rnn_features, labels=testLabels), 
                                                batch_size=batch_size, num_workers=num_workers)

In [None]:
def train_model(model, criterion, optimizer, n_epochs, train_loader, valid_loader, 
                test_loader, test_labels, new_test_loader, new_test_labels,
                checkpoint_path='model.pt', log_batches=False):
    """Train ``model`` and keep the weights with the lowest validation loss.

    After every epoch the function prints scores on ``test_loader`` and on
    ``new_test_loader`` plus the average train/validation loss. Whenever the
    validation loss does not get worse, the weights are snapshotted and
    written to ``checkpoint_path``.

    Args:
        model: ``nn.Module`` to train (updated in place).
        criterion: Loss taking ``(output, target)``.
        optimizer: Optimizer over ``model``'s parameters.
        n_epochs: Number of passes over ``train_loader``.
        train_loader: Batches used for gradient updates.
        valid_loader: Batches used for model selection.
        test_loader, test_labels: Held-out data and its labels, for reporting.
        new_test_loader, new_test_labels: External data and labels, for reporting.
        checkpoint_path: Where the best ``state_dict`` is saved.
        log_batches: Print a line after every training batch (off by default;
            it floods the cell output).

    Returns:
        ``model`` with the best validation weights loaded, or ``None`` if no
        epoch produced a usable validation loss.
    """
    best_state = None
    # float('inf') instead of np.Inf, which NumPy 2 removed.
    valid_loss_min = float('inf')

    for epoch in range(n_epochs):
        train_loss, train_seen = 0.0, 0
        valid_loss, valid_seen = 0.0, 0

        model.train()
        print("Starting epoch", epoch+1)
        for batch_no, (data, target) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()*data.size(0)
            train_seen += data.size(0)
            if log_batches:
                print("Finished batch", batch_no)

        # Validation and reporting need no gradients.
        model.eval()
        with torch.no_grad():
            for data, target in valid_loader:
                output = model(data)
                loss = criterion(output, target)
                valid_loss += loss.item()*data.size(0)
                valid_seen += data.size(0)

            ypred_test = predict(model, test_loader)
            ypred_new_test = predict(model, new_test_loader)

        # Average over the samples actually iterated. len(loader.dataset) is
        # the size of the whole dataset, which is wrong when a sampler
        # restricts the loader to a subset.
        train_loss = train_loss/max(train_seen, 1)
        valid_loss = valid_loss/max(valid_seen, 1)

        # These scores come from test_loader, not from the validation split.
        print("Scores on held-out test data :")
        print_scores(test_labels, ypred_test, print_matrix = False)

        print("Accuracy scores on iwspa data :")
        print_scores(new_test_labels, ypred_new_test, print_matrix = False)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch+1, 
            train_loss,
            valid_loss,
            ))

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            # Snapshot by value: keeping a reference to `model` would track
            # later epochs and end up holding the LAST weights, not the best.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save(best_state, checkpoint_path)
            valid_loss_min = valid_loss

    if best_state is None:
        return None
    model.load_state_dict(best_state)
    return model

In [None]:
# Training setup: cross-entropy loss on the two-class scores, Adam with its default settings.
N_EPOCHS = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_gru_model.parameters())

# Train the GRU; scores on the held-out split and on the IWSPA data are printed after every epoch.
best_model = train_model(simple_gru_model, criterion, optimizer, N_EPOCHS, train_loader, 
                         valid_loader, test_loader, y_test, test_iwspa_loader, testLabels)

In [None]:
# model.load_state_dict(torch.load('model.pt'))