<a href="https://colab.research.google.com/github/dibsdibsdibs/ISDA/blob/main/deep_PHISHYing_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Prerequisites

The following code imports the libraries and modules needed for data analysis, visualization, training, and testing of machine learning models.

In [1]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import collections
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from IPython.display import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import warnings
import seaborn as sns
warnings.filterwarnings("ignore")
%matplotlib inline
!pip install scikeras
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
nltk.download('stopwords')

stemmer=PorterStemmer()

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Import the dataset from GitHub and read it using pandas.

In [2]:
# Location of the raw CSV in the project's GitHub repository.
msg_url = 'https://raw.githubusercontent.com/dibsdibsdibs/ISDA/main/dataset.csv'

# The file is decoded as latin-1 rather than with the pandas default encoding.
msg_dataset = pd.read_csv(filepath_or_buffer=msg_url, encoding='latin-1')

# Preview the first ten rows.
msg_dataset.head(10)

Unnamed: 0,ADDRESS,MESSAGE,CLASSIFICATION,CLASS,SMISH
0,+6391***88335,Loan?PMme,Spam,0,0
1,TNT,You're trying to access sites not included in ...,Ham,0,0
2,TNT,"Para tuloy-tuloy ang saya with the tropa, umiw...",Ham,0,0
3,TNT,LF: FREE Harry Styles Love On Tour concert tic...,Ham,0,0
4,Eddy,nahihilo na ako punyeta,Ham,0,0
5,+6391***45143,"JUAN C., Join WinPlus 2x Win Tournament! Win 4...",Spam,1,1
6,+6390***48409,"JUAN C., Join JPC: 100% up to 2k Welcome Bonus...",Spam,1,1
7,+6392***01008,Discover Twitter! Go to m.twittercom/CNN for m...,Spam,1,1
8,+6399***34987,Lucky Wheel's Mystery Bonus? http://phgaming.s...,Spam,1,1
9,+6390***56711,Banco De Oro You're having issues with account...,Spam,1,1


The dataset includes over 3000 messages; almost half of them are spam messages and the rest are ham messages. The SMISH column identifies whether a message is smishing (represented by 1) or just a spam message with no malicious intent (represented by 0).

# Feature Engineering

## Text Preprocessing with CountVectorizer and Stop Words

The code below imports CountVectorizer and the built-in English stop-word list from sklearn.feature_extraction.text, then preprocesses the messages: non-letter characters are replaced with spaces, the text is lowercased, stop words from both English and Tagalog are removed, and the remaining words are stemmed with PorterStemmer. Stop words are the most frequently used words in a language; there is no single, universally accepted list of them, so the Tagalog list used here is defined by hand.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Hand-written list of Tagalog stop words (all lowercase). It is merged with
# scikit-learn's English stop words below and used to filter tokens before stemming.
tagalog_stop_words = [
    'akin', 'aking', 'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano', 'anumang',
    'apat', 'at', 'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit', 'bawat', 'bilang',
    'dahil', 'dalawa', 'dapat', 'din', 'dito', 'doon', 'gagawin', 'gayunman', 'ginagawa',
    'ginawa', 'ginawang', 'gumawa', 'gusto', 'habang', 'hanggang', 'hindi', 'huwag',
    'iba', 'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan', 'inyong',
    'isa', 'isang', 'itaas', 'ito', 'iyo', 'iyon', 'iyong', 'ka', 'kahit', 'kailangan',
    'kailanman', 'kami', 'kanila', 'kanilang', 'kanino', 'kanya', 'kanyang', 'kapag',
    'kapwa', 'karamihan', 'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang',
    'kumuha', 'kung', 'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari', 'maaaring',
    'maging', 'mahusay', 'makita', 'marami', 'marapat', 'masyado', 'may', 'mayroon', 'mga',
    'minsan', 'mismo', 'mula', 'muli', 'na', 'nabanggit', 'naging', 'nagkaroon', 'nais',
    'nakita', 'namin', 'napaka', 'narito', 'nasaan', 'ng', 'ngayon', 'ni', 'nila', 'nilang',
    'nito', 'niya', 'niyang', 'noon', 'o', 'pa', 'paano', 'pababa', 'paggawa', 'pagitan',
    'pagkakaroon', 'pagkatapos', 'palabas', 'pamamagitan', 'panahon', 'pangalawa', 'para',
    'paraan', 'pareho', 'pataas', 'pero', 'pumunta', 'pumupunta', 'sa', 'saan', 'sabi',
    'sabihin', 'sarili', 'sila', 'sino', 'siya', 'tatlo', 'tayo', 'tulad', 'tungkol', 'una',
    'walang', 'ba', 'eh', 'kasi', 'lang', 'mo', 'naman', 'opo', 'po', 'si', 'talaga', 'yung'
]

# Combine English and Tagalog stop words.
# The combined collection stays a list so later cells that use it keep working.
all_stop_words = list(ENGLISH_STOP_WORDS) + tagalog_stop_words

# Immutable set copy used only for membership tests: checking every token
# against the list would scan it linearly for each word of each message.
stop_word_lookup = frozenset(all_stop_words)


def clean_message(text, stop_words=stop_word_lookup):
    """Normalize one raw message into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space
         (digits, punctuation and accented letters are dropped),
      2. lowercase the text and split it on whitespace,
      3. discard tokens found in ``stop_words``,
      4. stem the remaining tokens with the notebook-level PorterStemmer.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str
        Lowercase words to remove; defaults to the combined
        English + Tagalog stop-word set.

    Returns
    -------
    str
        The cleaned message; empty if no token survives the filtering.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)


# One cleaned string per row of the MESSAGE column, in the original row order.
corpus = [clean_message(message) for message in msg_dataset["MESSAGE"]]

In [4]:
# Store the character count of each raw (unprocessed) message in a new column.
msg_dataset["LENGTH"] = msg_dataset["MESSAGE"].map(len)

# Preview the first ten rows, now including LENGTH.
msg_dataset.head(10)

Unnamed: 0,ADDRESS,MESSAGE,CLASSIFICATION,CLASS,SMISH,LENGTH
0,+6391***88335,Loan?PMme,Spam,0,0,9
1,TNT,You're trying to access sites not included in ...,Ham,0,0,92
2,TNT,"Para tuloy-tuloy ang saya with the tropa, umiw...",Ham,0,0,202
3,TNT,LF: FREE Harry Styles Love On Tour concert tic...,Ham,0,0,288
4,Eddy,nahihilo na ako punyeta,Ham,0,0,23
5,+6391***45143,"JUAN C., Join WinPlus 2x Win Tournament! Win 4...",Spam,1,1,130
6,+6390***48409,"JUAN C., Join JPC: 100% up to 2k Welcome Bonus...",Spam,1,1,96
7,+6392***01008,Discover Twitter! Go to m.twittercom/CNN for m...,Spam,1,1,117
8,+6399***34987,Lucky Wheel's Mystery Bonus? http://phgaming.s...,Spam,1,1,93
9,+6390***56711,Banco De Oro You're having issues with account...,Spam,1,1,100


# Spam vs Ham Classification

## Long Short Term Memory

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
nltk.download('stopwords')

stemmer=PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Rebuild the list of cleaned messages from the raw MESSAGE column.
# NOTE(review): this repeats the preprocessing already done in the
# Feature Engineering section and yields the same corpus; confirm whether
# this cell is still needed.

# Set copy of the stop words: membership tests against the original list
# would scan it linearly for every token.
stop_word_set = set(all_stop_words)

corpus = []
for raw_text in msg_dataset["MESSAGE"]:
    # Keep ASCII letters only; every other character becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)

    # Lowercase, then split on whitespace into individual words.
    tokens = letters_only.lower().split()

    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [stemmer.stem(token) for token in tokens if token not in stop_word_set]

    # Store the cleaned message as a single space-separated string.
    corpus.append(" ".join(stems))

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.optimizers import Adam

In [8]:
# (Re)compute the per-message character count from the raw MESSAGE text.
msg_dataset["LENGTH"] = [len(text) for text in msg_dataset["MESSAGE"]]

# Show the first ten rows of the frame.
msg_dataset.head(10)

Unnamed: 0,ADDRESS,MESSAGE,CLASSIFICATION,CLASS,SMISH,LENGTH
0,+6391***88335,Loan?PMme,Spam,0,0,9
1,TNT,You're trying to access sites not included in ...,Ham,0,0,92
2,TNT,"Para tuloy-tuloy ang saya with the tropa, umiw...",Ham,0,0,202
3,TNT,LF: FREE Harry Styles Love On Tour concert tic...,Ham,0,0,288
4,Eddy,nahihilo na ako punyeta,Ham,0,0,23
5,+6391***45143,"JUAN C., Join WinPlus 2x Win Tournament! Win 4...",Spam,1,1,130
6,+6390***48409,"JUAN C., Join JPC: 100% up to 2k Welcome Bonus...",Spam,1,1,96
7,+6392***01008,Discover Twitter! Go to m.twittercom/CNN for m...,Spam,1,1,117
8,+6399***34987,Lucky Wheel's Mystery Bonus? http://phgaming.s...,Spam,1,1,93
9,+6390***56711,Banco De Oro You're having issues with account...,Spam,1,1,100


### Grid Search

In [None]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam

In [None]:
# Local import: one_hot() cannot be given a different hash function, so the
# underlying hashing_trick() helper from the same Keras module is used instead.
from tensorflow.keras.preprocessing.text import hashing_trick

# Size of the hashing space the words are mapped into.
vocab_size=10000

# Encode each cleaned message as a list of integer word ids.
# one_hot() hashes words with Python's built-in hash(), which is salted per
# process, so the ids (and every result built on them) changed with each kernel
# restart. md5 gives the same ids on every run. As with one_hot(), different
# words may still collide on the same id.
oneHot_doc=[hashing_trick(document, n=vocab_size, hash_function="md5") for document in corpus]

In [None]:
# Fixed number of token ids kept per message.
sentence_len = 200

# Bring every encoded message to exactly sentence_len ids;
# padding="pre" puts the filler values in front of the real tokens.
embedded_doc = pad_sequences(sequences=oneHot_doc, maxlen=sentence_len, padding="pre")

In [None]:
# One row per message, one column per padded token position.
extract_features = pd.DataFrame(embedded_doc)

# Binary label column taken from the original dataset.
target = msg_dataset["CLASS"]

# Place features and label side by side (rows are aligned on the index).
msg_dataset_final = pd.concat((extract_features, target), axis="columns")

In [None]:
# Separate the label column from the token-id feature columns.
y = msg_dataset_final["CLASS"]
X = msg_dataset_final.drop(columns=["CLASS"])

# Hold out 20% of the rows as the test set, then 20% of the remainder as the
# validation set (about 64% / 16% / 20% overall). The seed is fixed so the
# partition is reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

In [None]:
def lstm_model(vocab_size=10000, feature_num=100, sentence_len=200, mem_cells=128, learning_rate=0.001, dropout_rate=0.2, optimizer='Adam'):
  """Build and compile an Embedding -> LSTM -> Dense(sigmoid) binary classifier.

  Parameters
  ----------
  vocab_size : int
      ``input_dim`` of the Embedding layer (number of distinct token ids).
  feature_num : int
      ``output_dim`` of the Embedding layer (embedding vector size).
  sentence_len : int
      ``input_length`` of the Embedding layer (tokens per message).
  mem_cells : int
      Number of LSTM units.
  learning_rate : float
      Learning rate handed to the optimizer.
  dropout_rate : float
      ``dropout`` value of the LSTM layer.
  optimizer : str or callable
      Either the name of a class in ``tensorflow.keras.optimizers``
      (e.g. ``'Adam'``) or a callable/class accepting ``learning_rate=``.

  Returns
  -------
  A compiled Keras ``Sequential`` model (binary cross-entropy loss,
  accuracy metric).

  Raises
  ------
  ValueError
      If ``optimizer`` is a string that does not name an optimizer class.
  """
  if isinstance(optimizer, str):
    # The default is a name, and a string cannot be called: resolve it to the
    # optimizer class first. Imported locally so the function does not depend
    # on which optimizer classes the notebook happened to import.
    from tensorflow.keras import optimizers as keras_optimizers
    optimizer_factory = getattr(keras_optimizers, optimizer, None)
    if not isinstance(optimizer_factory, type):
      raise ValueError(f"Unsupported optimizer: {optimizer}")
  else:
    # Already a class/callable such as Adam: use it as given.
    optimizer_factory = optimizer

  model=Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=feature_num, input_length=sentence_len))
  model.add(LSTM(units=mem_cells, dropout=dropout_rate))
  # Single sigmoid unit: the output is the probability of the positive class.
  model.add(Dense(units=1, activation="sigmoid"))
  model.compile(optimizer=optimizer_factory(learning_rate=learning_rate), loss="binary_crossentropy", metrics=["accuracy"])
  return model

In [None]:
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around an Embedding -> LSTM -> Dense(sigmoid) model.

    Parameters
    ----------
    vocab_size : int
        ``input_dim`` of the Embedding layer.
    feature_num : int
        ``output_dim`` of the Embedding layer.
    sentence_len : int
        ``input_length`` of the Embedding layer; inputs are brought to this
        width before they reach the network (see ``_conform_width``).
    mem_cells : int
        Number of LSTM units.
    learning_rate : float
        Learning rate of the optimizer.
    dropout_rate : float
        ``dropout`` value of the LSTM layer.
    optimizer : str
        Name of the optimizer; only the names listed in ``_build_model``
        are supported.

    Attributes
    ----------
    model : compiled Keras model
        Replaced by a freshly built model on every call to ``fit``.
    """

    def __init__(self, vocab_size=10000, feature_num=100, sentence_len=200, mem_cells=128, learning_rate=0.001, dropout_rate=0.2, optimizer='Adam'):
        self.vocab_size = vocab_size
        self.feature_num = feature_num
        self.sentence_len = sentence_len
        self.mem_cells = mem_cells
        self.learning_rate = learning_rate
        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        # Built eagerly so `.model` exists right after construction, as before.
        # fit() rebuilds it, so this instance is never the one that gets trained.
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile a new model from the current hyper-parameters.

        Raises
        ------
        ValueError
            If ``self.optimizer`` is not one of the supported names.
        """
        model = Sequential()
        model.add(Embedding(input_dim=self.vocab_size, output_dim=self.feature_num, input_length=self.sentence_len))
        model.add(LSTM(units=self.mem_cells, dropout=self.dropout_rate))
        model.add(Dense(units=1, activation="sigmoid"))

        # Map optimizer string to optimizer class
        optimizer_classes = {'Adam': Adam}  # You can add more optimizers as needed
        optimizer_class = optimizer_classes.get(self.optimizer, None)

        if optimizer_class is None:
            raise ValueError(f"Unsupported optimizer: {self.optimizer}")

        optimizer_instance = optimizer_class(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer_instance, loss="binary_crossentropy", metrics=["accuracy"])
        return model

    def _conform_width(self, X):
        """Return X as an array with exactly ``self.sentence_len`` columns.

        Wider inputs keep their last ``sentence_len`` columns; narrower inputs
        are left-padded with zeros.
        NOTE(review): this assumes X is a 2-D (samples, tokens) table that was
        padded at the front, as done with ``padding="pre"`` earlier in the
        notebook - confirm before reusing with post-padded data.
        """
        X = np.asarray(X)
        width = X.shape[1]
        if width > self.sentence_len:
            return X[:, -self.sentence_len:]
        if width < self.sentence_len:
            missing = self.sentence_len - width
            return np.pad(X, ((0, 0), (missing, 0)), mode="constant")
        return X

    def fit(self, X, y):
        """Train a freshly built model on (X, y) for 10 epochs and return self.

        The model is rebuilt here because scikit-learn changes hyper-parameters
        through ``set_params`` after construction (this is what GridSearchCV
        does). A model built only in ``__init__`` would ignore those values,
        and repeated fits would keep training the same weights.
        15% of the given data is held out by Keras as validation data.
        """
        self.model = self._build_model()
        self.model.fit(self._conform_width(X), y, epochs=10, validation_split=0.15, verbose=1)
        return self

    def predict(self, X):
        """Return 0/1 labels by thresholding the predicted probabilities at 0.5.

        The result keeps the shape of the Keras output (one column per sample).
        """
        y_pred_proba = self.model.predict(self._conform_width(X))
        y_pred = (y_pred_proba > 0.5).astype(int)
        return y_pred

    def score(self, X, y):
        """Return ``(accuracy, precision, recall, confusion_matrix)`` for X against y.

        NOTE(review): scikit-learn expects ``score`` to return a single number.
        The tuple is kept for backward compatibility, so pass an explicit
        ``scoring=`` to model-selection tools instead of relying on this method.
        """
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), confusion_matrix(y, y_pred)


In [None]:
# Candidate values for each hyper-parameter of KerasClassifierWrapper.
# 1 * 3 * 3 * 3 * 3 * 2 = 162 combinations are evaluated.
param_grid = dict(
    optimizer=['Adam'],
    feature_num=[50, 100, 200],
    mem_cells=[64, 128, 256],
    learning_rate=[0.001, 0.01, 0.0001],
    dropout_rate=[0.2, 0.5, 0.8],
    sentence_len=[150, 200],
)

# Base estimator with default hyper-parameters.
keras_model = KerasClassifierWrapper()

# Exhaustive search with 2-fold cross-validation, ranked by accuracy.
grid = GridSearchCV(keras_model, param_grid, scoring='accuracy', cv=2)
grid_result = grid.fit(X_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [None]:
# SciKeras estimator that calls lstm_model() to build the network for every fit.
model = KerasClassifier(model=lstm_model, epochs=10, batch_size=10, verbose=0)

# Other optimizers to try: SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam.
# The optimizer class itself is passed because lstm_model() calls the value
# as optimizer(learning_rate=...).
optimizer = [Adam]
feature_num=[50, 100, 200]
mem_cells=[64, 128, 256]
# The training data was padded once to a fixed width, so only that width
# matches the model's declared input length; searching other values would
# pair the model with inputs of the wrong width.
sentence_len=[X_train.shape[1]]
# These two lists were referenced below but never defined (NameError).
# The values mirror the grid used for the hand-written wrapper above.
learning_rate=[0.001, 0.01, 0.0001]
dropout_rate=[0.2, 0.5, 0.8]

# The "model__" prefix routes each value to the matching lstm_model() argument.
param_grid = dict(
    model__optimizer=optimizer,
    model__feature_num=feature_num,
    model__sentence_len=sentence_len,
    model__mem_cells=mem_cells,
    model__learning_rate=learning_rate,
    model__dropout_rate=dropout_rate
    )

# error_score is a GridSearchCV constructor option. Passed to fit() it would be
# forwarded to the estimator as a fit parameter instead of controlling how
# failed fits are handled.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, error_score='raise')
grid_result = grid.fit(X_train, y_train)

# Report the best configuration, then the mean and std of the test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Candidate mini-batch sizes and epoch counts for a follow-up search.
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]

# Grid keyed by parameter name; the values are the candidate lists above.
param_grid = {"batch_size": batch_size, "epochs": epochs}