In [2]:
import pathlib
import pandas as pd
import random

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
RAW_DATA_DIR = DATASET_DIR / "raw-data"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
PHISHING_DATASET_PATH = EXPORT_DIR / "phishing-dataset.csv"

METADATA_EXPORT_PATH = EXPORT_DIR / 'phishing-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'phishing-tokenizer.json'

In [3]:
df = pd.read_csv(PHISHING_DATASET_PATH)
df.text = df.text.astype(str)

In [4]:
df.head()

Unnamed: 0,label,text
0,phishing,onlineservice@nafcu.org Account Notification:Y...
1,phishing,"mis@bsch.es Banca Personal Estimados clientes,..."
2,phishing,technical_department-id-708ziy@bbandt.com BB&T...
3,phishing,customer_service@paypal.com Notification of Li...
4,phishing,customer_service@paypal.com Notification of Li...


In [5]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [6]:
labels[2350], texts[2350]

('ham',
 "rosehannah@yahoo.com For real when u getting on yo? I only need 2 more tickets and one more jacket and I'm done. I already used all my multis.")

In [7]:
label_legend = {"ham": 0, "phishing": 1}
label_legend_inverted = {f"{v}": k for k,v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'phishing'}

In [8]:
labels_as_int = [label_legend[x] for x in labels]
labels_as_int[2350]

0

In [9]:
random_idx = random.randint(0, len(labels))

assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

ModuleNotFoundError: No module named 'tensorflow.keras'

In [11]:
MAX_NUM_WORDS = 1200

In [12]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences

NameError: name 'Tokenizer' is not defined

In [13]:
word_index = tokenizer.word_index

NameError: name 'tokenizer' is not defined

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow.keras'

In [15]:
MAX_SEQUENCE_LENGTH = 1200

In [15]:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
X

array([[  0,   0,   0, ...,  45, 103, 107],
       [  0,   0,   0, ..., 432, 189,  16],
       [  0,   0,   0, ...,  45, 103, 107],
       ...,
       [  0,   0,   0, ..., 100,  87, 105],
       [  0,   0,   0, ...,   8,  22, 146],
       [  0,   0,   0, ...,  87, 346, 105]])

In [17]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [18]:
labels_as_int_array = np.asarray(labels_as_int)

In [19]:
labels_as_int_array

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
y = to_categorical(labels_as_int_array)

In [21]:
y

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [22]:
!pip install scikit-learn



You should consider upgrading via the 'C:\Users\Bachi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

In [25]:
import pickle

In [26]:
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQUENCE_LENGTH,
    "legend": label_legend,
    "legend_inverted": label_legend_inverted,
    "tokenizer": tokenizer
}

In [27]:
with open(METADATA_EXPORT_PATH, 'wb') as f:
    pickle.dump(training_data, f)

In [29]:
data = {}

with open(TRAINING_DATA_PATH, 'rb') as f:
    data = pickle.load(f)

NameError: name 'TRAINING_DATA_PATH' is not defined