In [1]:
import random
import pathlib
import pandas as pd

# Directory layout: BASE_DIR is the parent of the current working directory,
# and every dataset-related folder hangs off BASE_DIR / "datasets".
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("datasets")
ZIPS_DIR = DATASET_DIR.joinpath("zips")
EXPORTS_DIR = DATASET_DIR.joinpath("exports")

# Create both working folders (and any missing parents) up front; this is a
# no-op when they already exist.
for _directory in (ZIPS_DIR, EXPORTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# Archive locations inside the zips folder.
SPAM_SMS_ZIPS_PATH = ZIPS_DIR.joinpath("sms-spam.zip")
SPAM_YOUTUBE_ZIPS_PATH = ZIPS_DIR.joinpath("youtube-spam.zip")

# Files read from / written to the exports folder by the cells below.
SPAM_DATASET_PATH = EXPORTS_DIR.joinpath("spam-dataset.csv")
METADATA_EXPORT_PATH = EXPORTS_DIR.joinpath("spam-metadata.pkl")
TOKENIZER_EXPORT_PATH = EXPORTS_DIR.joinpath("spam-tokenizer.json")

In [2]:
# Load the combined spam dataset from the exports folder.
df = pd.read_csv(SPAM_DATASET_PATH)

# Preview the last five rows to confirm the file loaded as expected.
df.tail(n=5)

Unnamed: 0,label,text,source
12694,ham,This song means so much to me thank you soooo...,youtube-spam
12695,ham,&lt;3﻿,youtube-spam
12696,spam,"KATY PERRY, I AM THE ""DÉCIO CABELO"", ""DECIO HA...",youtube-spam
12697,ham,Honestly speaking except taylor swift and adel...,youtube-spam
12698,ham,who is going to reach the billion first : katy...,youtube-spam


In [3]:
# Pull the two columns used for training out of the frame as plain Python
# lists; both lists keep the DataFrame's row order.
labels, text = (df[column].tolist() for column in ('label', 'text'))

In [4]:
# Spot-check one arbitrary row: show its label next to its raw text.
(labels[120], text[120])

('ham',
 'Subject: txu noms . for 3 / 14 / 01\r\n( see attached file : hplno 314 . xls )\r\n- hplno 314 . xls')

In [5]:
# Class name -> integer id used for training.
label_legend = dict(ham=0, spam=1)

# Reverse lookup keyed by the id rendered as a string ('0', '1').
label_legend_inverted = {str(class_id): class_name for class_name, class_id in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [6]:
# Translate every textual label into its integer id via label_legend;
# an unknown label raises KeyError.
labels_as_int = [label_legend[label_name] for label_name in labels]

In [7]:
# Sanity check: a randomly chosen position must hold the same text and label
# in the extracted lists as in the DataFrame, and the integer encoding must
# round-trip back to the original label.
#
# random.randrange excludes its upper bound, so the index is always valid.
# random.randint(0, len(labels)) includes it and could return len(labels),
# which raises IndexError on the lookups below.
random_idx = random.randrange(len(labels))
sampled_row = df.iloc[random_idx]

assert text[random_idx] == sampled_row.text

assert labels[random_idx] == sampled_row.label

assert label_legend_inverted[str(labels_as_int[random_idx])] == sampled_row.label

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

2022-05-12 11:31:40.540490: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/mesa-diverted/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/mesa:/usr/lib/x86_64-linux-gnu/dri:/usr/lib/x86_64-linux-gnu/gallium-pipe
2022-05-12 11:31:40.540552: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words` below.
# NOTE(review): per the Keras docs only the most frequent `num_words - 1`
# words are kept when texts are converted to sequences — confirm 280 is the
# intended vocabulary size and not a copy of the sequence length.
MAX_NUM_WORDS = 280

In [10]:
# Build the tokenizer with the vocabulary cap, fit it on the full text corpus,
# then encode every text as a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

In [11]:
# Keep a handle on the vocabulary learned by fit_on_texts.
# Presumably a word -> integer id mapping, per the Keras Tokenizer docs.
word_index = tokenizer.word_index

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Sequence length passed to pad_sequences as `maxlen` below.
MAX_SEQ_LENGTH = 280

In [14]:
# Bring every encoded text to MAX_SEQ_LENGTH so the samples form one array.
# NOTE(review): relies on the Keras defaults for padding/truncating side
# (documented as 'pre' for both) — confirm that is what the model expects.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

In [15]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [16]:
# Convert the list of integer class ids into a NumPy array for to_categorical.
labels_as_int_array = np.asarray(labels_as_int)

In [17]:
# One-hot encode the integer class ids.
# Presumably yields one column per class (ham, spam) — TODO confirm the shape.
y = to_categorical(labels_as_int_array)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
import pickle

In [21]:
# Bundle the train/test split together with the preprocessing settings,
# the label mappings and the fitted tokenizer so it can be exported as one object.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
    tokenizer=tokenizer,
)

# Also export the tokenizer on its own as JSON. write_text returns the number
# of characters written, which becomes the cell's displayed output.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

5922675

In [22]:
# Serialize the whole training bundle to the metadata pickle file.
with METADATA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)