In [30]:
import pathlib
import pandas as pd
import random

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
RAW_DATA_DIR = DATASET_DIR / "raw-data"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
PHISHING_DATASET_PATH = EXPORT_DIR / "phishing-dataset.csv"

METADATA_EXPORT_PATH = EXPORT_DIR / 'phishing-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'phishing-tokenizer.json'

In [31]:
df = pd.read_csv(PHISHING_DATASET_PATH)
df.text = df.text.astype(str)

In [32]:
df.head()

Unnamed: 0,label,text
0,phishing,onlineservice@nafcu.org Account Notification:Y...
1,phishing,"mis@bsch.es Banca Personal Estimados clientes,..."
2,phishing,technical_department-id-708ziy@bbandt.com BB&T...
3,phishing,customer_service@paypal.com Notification of Li...
4,phishing,customer_service@paypal.com Notification of Li...


In [33]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [34]:
labels[2350], texts[2350]

('ham',
 'Buy Me a Coffee\xa0<hello@buymeacoffee.com> Unsubscribe To: elvisdurmishi@yahoo.com Thu, Mar 2 at 2:58 PMWelcome to Buy Me a Coffee’s monthly newsletter for our creators.\xa0BTW, if you are a YouTuber and are interested in sharing your experience of using BMC on your channel, we would love to send you lots of coffee. Fill up this form and we will get in touch 😃\xa0TLDR; on what we shipped in February 👇Commissions Shop 2.0 (rebranded from Extras) and lots of small fixes & improvements \u200a Introducing CommissionsCommissions, one of the most requested features by our creators, is now live for everyone. You can now offer unique and customised services directly on your page, ranging from art commissions to freelance services. Simply set the price, terms, and deadline, and receive payment in advance for your work.Create your commission \u200a Shop 2.0 (rebranded from Extras)Selling products on BMC is going to be the best in class experience for creators. We are very close to shi

In [35]:
label_legend = {"ham": 0, "phishing": 1}
label_legend_inverted = {f"{v}": k for k,v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'phishing'}

In [36]:
labels_as_int = [label_legend[x] for x in labels]
labels_as_int[2350]

0

In [37]:
random_idx = random.randint(0, len(labels))

assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [39]:
MAX_NUM_WORDS = 450

In [40]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences

[[7,
  92,
  2,
  49,
  7,
  218,
  69,
  43,
  34,
  3,
  437,
  6,
  28,
  53,
  1,
  20,
  160,
  15,
  24,
  423,
  1,
  3,
  344,
  31,
  10,
  53,
  12,
  364,
  3,
  50,
  6,
  39,
  9,
  28,
  15,
  31,
  60,
  4,
  10,
  197,
  1,
  74,
  2,
  28,
  234,
  305,
  40,
  180,
  6,
  7,
  147,
  15,
  347,
  4,
  1,
  44,
  2,
  313,
  15,
  24,
  1,
  20,
  18,
  39,
  345,
  45,
  151,
  31,
  79,
  287,
  20,
  183,
  49,
  100,
  12,
  8,
  4,
  21,
  26,
  271,
  1,
  16,
  9,
  19,
  152,
  6,
  44,
  2,
  7,
  36,
  3,
  73,
  108,
  33,
  68,
  152,
  39,
  152,
  4,
  12,
  2,
  228,
  1,
  8,
  272,
  332,
  332,
  332,
  332,
  332,
  15,
  24,
  391,
  1,
  46,
  4,
  4,
  282,
  7,
  233,
  29,
  1,
  157,
  15,
  24,
  114,
  12,
  4,
  200,
  53,
  38,
  22,
  29,
  53,
  4,
  52,
  254,
  20,
  49,
  46,
  22,
  53,
  54,
  53,
  54,
  29,
  433,
  56,
  35,
  132,
  45,
  103,
  106],
 [181, 203, 428, 203, 129, 437, 184, 16],
 [339,
  246,
  88,
  11,
  414,
  30

In [41]:
word_index = tokenizer.word_index

In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
MAX_SEQUENCE_LENGTH = 1000

In [15]:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
X

array([[  0,   0,   0, ...,  45, 103, 107],
       [  0,   0,   0, ..., 432, 189,  16],
       [  0,   0,   0, ...,  45, 103, 107],
       ...,
       [  0,   0,   0, ..., 100,  87, 105],
       [  0,   0,   0, ...,   8,  22, 146],
       [  0,   0,   0, ...,  87, 346, 105]])

In [17]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [18]:
labels_as_int_array = np.asarray(labels_as_int)

In [19]:
labels_as_int_array

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
y = to_categorical(labels_as_int_array)

In [21]:
y

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [22]:
!pip install scikit-learn



You should consider upgrading via the 'C:\Users\Bachi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
import pickle

In [26]:
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQUENCE_LENGTH,
    "legend": label_legend,
    "legend_inverted": label_legend_inverted,
    "tokenizer": tokenizer
}

In [27]:
with open(METADATA_EXPORT_PATH, 'wb') as f:
    pickle.dump(training_data, f)

In [29]:
data = {}

with open(TRAINING_DATA_PATH, 'rb') as f:
    data = pickle.load(f)

NameError: name 'TRAINING_DATA_PATH' is not defined