In [2]:
import pathlib
import pandas as pd
import random

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
RAW_DATA_DIR = DATASET_DIR / "raw-data"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
PHISHING_DATASET_PATH = EXPORT_DIR / "phishing-dataset.csv"

METADATA_EXPORT_PATH = EXPORT_DIR / 'phishing-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'phishing-tokenizer.json'

In [3]:
df = pd.read_csv(PHISHING_DATASET_PATH)
df.text = df.text.astype(str)

In [4]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,ham,Even my brother is not like to speak with me. ...


In [5]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [6]:
labels[2350], texts[2350]

('phishing',
 'aw-confirm@ebay.com Power Seller \r\nDear eBay member,\r\n\r\nCongratulations! Your recent selling activity entitles you to Silver status in the eBay PowerSeller Program. Your membership comes with some great benefits and services:\r\n\r\nSee the PowerSeller icon next to your User ID \r\n\r\nFree seller phone support, Monday-Friday, 6am-6pm PST.\r\n\r\nGet exclusive offerings on the PowerSeller portal--check back often for updates!\r\n\r\nNetwork on the exclusive PowerSeller Discussion Board.\r\n\r\nDownload free business templates for PowerSeller business cards and letterhead.\r\n\r\nBe sure to sign up today--it\'s FREE! Visit www.ebay.com/powerseller and click "Member Sign In."\r\nAgain, congratulations and best wishes for your continued success!\r\nSincerely,\r\neBay PowerSeller Team\r\n\r\neBay sent this communication to you because of your outstanding feedback, high sales, and good account standing. If you would not like to be invited to join the PowerSeller program

In [7]:
label_legend = {"ham": 0, "phishing": 1}
label_legend_inverted = {f"{v}": k for k,v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'phishing'}

In [8]:
labels_as_int = [label_legend[x] for x in labels]
labels_as_int[2350]

1

In [9]:
random_idx = random.randint(0, len(labels))

assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [11]:
MAX_NUM_WORDS = 450

In [12]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences

[[140, 286, 193, 11, 307, 107, 224, 349],
 [338, 84],
 [84, 101, 84, 188],
 [21, 317, 122, 2, 122, 111],
 [53, 14, 18, 103, 2, 31, 85, 273, 85, 103],
 [36, 4, 258, 86, 83, 36, 4, 12, 40, 2, 4],
 [198, 28, 390, 311, 6, 21, 317, 277, 2, 62, 9],
 [83,
  12,
  3,
  230,
  2,
  60,
  5,
  12,
  9,
  21,
  21,
  168,
  4,
  55,
  12,
  6,
  22,
  53,
  5,
  29,
  83,
  6,
  16,
  48,
  40],
 [21, 29, 16, 325, 19, 31, 22],
 [198, 111],
 [84, 336, 109, 130, 232, 221, 21, 393, 122, 172, 286, 21],
 [25, 3, 219, 84, 3, 219, 208, 266],
 [14, 23, 109, 5, 232, 221],
 [386, 2, 12, 130, 193],
 [101, 387, 188, 151, 14],
 [21, 53, 188, 21, 140, 429, 84, 378],
 [100, 219, 21, 74, 142, 31, 5],
 [191, 2, 16, 198, 18, 9, 14, 122, 198, 151, 21],
 [4, 351, 101],
 [393, 5, 3, 26, 5, 41, 393, 5, 172, 16, 26, 5, 4, 56, 5, 53, 445],
 [198, 358, 3, 96, 5, 192, 25],
 [321, 21, 336, 23, 175, 291, 34, 103],
 [364, 18, 40, 23, 352, 5, 18, 340, 62, 85, 30, 23, 436, 277, 2, 31, 61],
 [122,
  349,
  11,
  48,
  130,
  6,

In [28]:
word_index = tokenizer.word_index

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
MAX_SEQUENCE_LENGTH = 1000

In [31]:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [32]:
X

array([[  0,   0,   0, ..., 107, 224, 349],
       [  0,   0,   0, ...,   0, 338,  84],
       [  0,   0,   0, ..., 101,  84, 188],
       ...,
       [  0,   0,   0, ...,  70,   4,   8],
       [  0,   0,   0, ..., 440,  77,  71],
       [  0,   0,   0, ...,  33,  62, 123]])

In [18]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [19]:
labels_as_int_array = np.asarray(labels_as_int)

In [20]:
labels_as_int_array

array([0, 0, 0, ..., 1, 1, 1])

In [21]:
y = to_categorical(labels_as_int_array)

In [22]:
y

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [23]:
!pip install scikit-learn



You should consider upgrading via the 'C:\Users\Bachi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [24]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [34]:
import pickle

In [35]:
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQUENCE_LENGTH,
    "legend": label_legend,
    "legend_inverted": label_legend_inverted,
    "tokenizer": tokenizer
}

4422166

In [36]:
with open(METADATA_EXPORT_PATH, 'wb') as f:
    pickle.dump(training_data, f)

In [None]:
data = {}

with open(TRAINING_DATA_PATH, 'rb') as f:
    data = pickle.load(f)