In [1]:
import pathlib
import pandas as pd
import random

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
RAW_DATA_DIR = DATASET_DIR / "raw-data"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
PHISHING_DATASET_PATH = EXPORT_DIR / "phishing-dataset.csv"

METADATA_EXPORT_PATH = EXPORT_DIR / 'phishing-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'phishing-tokenizer.json'

In [2]:
df = pd.read_csv(PHISHING_DATASET_PATH)
df.text = df.text.astype(str)

In [3]:
df.head()

Unnamed: 0,label,text
0,phishing,onlineservice@nafcu.org Account Notification:Y...
1,phishing,"mis@bsch.es Banca Personal Estimados clientes,..."
2,phishing,technical_department-id-708ziy@bbandt.com BB&T...
3,phishing,customer_service@paypal.com Notification of Li...
4,phishing,customer_service@paypal.com Notification of Li...


In [4]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [5]:
labels[2430], texts[2430]

('ham',
 'DMarket\xa0<newsletter@dmarket.com> Unsubscribe To: elvisdurmishi@yahoo.com Tue, Feb 7 at 9:03 AM😎 GET A CHANCE TO WIN: ST Deagle | Light Rail (FN) - Check The Price 👀 ST M4A4 | Desolate Space (WW) - Check The Price 👀 AWP | Fever Dream (FN) - Check The Price 👀 ST USP-S | Ticket to Hell (FN) - Check The Price 👀 Souvenir USP-S | Royal Blue FT - Check The Price 👀 Left: 6 Days ⌛WANNA GET IT FOR FREE?Submit as many entries as you can! The more entries, the more chances to win! 🔥 PARTICIPATE IN THE GIVEAWAY 🔥 If you like giveaways from DMarket, tell your friends about them and compete for free skins with your besties. Also, look out for more as new giveaways are on their way.If you have any questions, please do not hesitate to contact us at support@dmarket.com. We are also available 24/7 via our social media channels.© 2023 DMarket. All rights reserved. You are receiving this email because you opted in at our website.Unsubscribe from this list')

In [6]:
label_legend = {"ham": 0, "phishing": 1}
label_legend_inverted = {f"{v}": k for k,v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'phishing'}

In [7]:
labels_as_int = [label_legend[x] for x in labels]
labels_as_int[2430]

0

In [8]:
random_idx = random.randint(0, len(labels))

assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
MAX_NUM_WORDS = 450

In [11]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences

[[7,
  93,
  2,
  49,
  7,
  214,
  69,
  43,
  34,
  3,
  432,
  6,
  28,
  55,
  1,
  20,
  161,
  15,
  24,
  422,
  1,
  3,
  343,
  30,
  10,
  55,
  12,
  362,
  3,
  50,
  6,
  39,
  9,
  28,
  15,
  30,
  60,
  4,
  10,
  192,
  1,
  73,
  2,
  28,
  234,
  304,
  40,
  177,
  6,
  7,
  144,
  15,
  347,
  4,
  1,
  44,
  2,
  317,
  15,
  24,
  1,
  20,
  19,
  39,
  341,
  45,
  149,
  30,
  80,
  281,
  20,
  187,
  49,
  97,
  12,
  8,
  4,
  21,
  27,
  268,
  1,
  16,
  9,
  18,
  152,
  6,
  44,
  2,
  7,
  36,
  3,
  74,
  108,
  33,
  68,
  152,
  39,
  152,
  4,
  12,
  2,
  227,
  1,
  8,
  271,
  330,
  330,
  330,
  330,
  330,
  15,
  24,
  389,
  1,
  47,
  4,
  4,
  288,
  7,
  230,
  29,
  1,
  158,
  15,
  24,
  114,
  12,
  4,
  193,
  55,
  38,
  22,
  29,
  55,
  4,
  52,
  252,
  20,
  49,
  47,
  22,
  55,
  56,
  55,
  56,
  29,
  53,
  35,
  131,
  45,
  103,
  107],
 [178, 202, 424, 202, 132, 432, 189, 16],
 [337,
  242,
  88,
  11,
  412,
  306,
  19,

In [12]:
word_index = tokenizer.word_index

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
MAX_SEQUENCE_LENGTH = 1000

In [15]:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
X

array([[  0,   0,   0, ...,  45, 103, 107],
       [  0,   0,   0, ..., 432, 189,  16],
       [  0,   0,   0, ...,  45, 103, 107],
       ...,
       [  0,   0,   0, ..., 100,  87, 105],
       [  0,   0,   0, ...,   8,  22, 146],
       [  0,   0,   0, ...,  87, 346, 105]])

In [17]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [18]:
labels_as_int_array = np.asarray(labels_as_int)

In [19]:
labels_as_int_array

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
y = to_categorical(labels_as_int_array)

In [21]:
y

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [22]:
!pip install scikit-learn



You should consider upgrading via the 'C:\Users\Bachi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
import pickle

In [26]:
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQUENCE_LENGTH,
    "legend": label_legend,
    "legend_inverted": label_legend_inverted,
    "tokenizer": tokenizer
}

In [27]:
with open(METADATA_EXPORT_PATH, 'wb') as f:
    pickle.dump(training_data, f)

In [29]:
data = {}

with open(TRAINING_DATA_PATH, 'rb') as f:
    data = pickle.load(f)

NameError: name 'TRAINING_DATA_PATH' is not defined