In [None]:
import random
import pathlib
import pandas as pd

# --- Filesystem layout ---
# All paths hang off "<parent of the current working directory>/datasets".
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("datasets")
ZIPS_DIR = DATASET_DIR.joinpath("zips")
EXPORTS_DIR = DATASET_DIR.joinpath("exports")

# Make sure both working folders exist; re-running this cell is harmless
# because exist_ok=True tolerates directories that are already there.
ZIPS_DIR.mkdir(parents=True, exist_ok=True)
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Archive locations (not referenced again in the cells shown here).
SPAM_SMS_ZIPS_PATH = ZIPS_DIR.joinpath("sms-spam.zip")
SPAM_YOUTUBE_ZIPS_PATH = ZIPS_DIR.joinpath("youtube-spam.zip")

# Files this notebook reads (dataset CSV) or writes (metadata, tokenizer).
SPAM_DATASET_PATH = EXPORTS_DIR.joinpath("spam-dataset.csv")
METADATA_EXPORT_PATH = EXPORTS_DIR.joinpath("spam-metadata.pkl")
TOKENIZER_EXPORT_PATH = EXPORTS_DIR.joinpath("spam-tokenizer.json")

: 

In [4]:
# Read the exported spam dataset CSV into a DataFrame and preview the
# first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer=SPAM_DATASET_PATH)
df.head(n=5)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [5]:
# Pull the two columns used downstream out of the DataFrame as plain
# Python lists, in row order: string labels and the message text.
labels, text = (df[column].tolist() for column in ("label", "text"))

In [6]:
# Display the label and the message stored at position 120 of the two lists.
labels[120], text[120]

('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [14]:
# Forward mapping: string label -> integer class id.
label_legend = dict(ham=0, spam=1)

# Reverse mapping, keyed by the class id rendered as a string
# (e.g. '0' -> 'ham') so lookups go through str(class_id).
label_legend_inverted = {
    str(class_id): label_name for label_name, class_id in label_legend.items()
}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [15]:
# Encode every string label as its integer class id via label_legend.
labels_as_int = [label_legend[label] for label in labels]

# Round-trip check on sample 120: decoding its integer id through the
# inverted legend should give back the original string label. This line was
# commented out even though the cell's recorded output comes from it; it is
# restored so a fresh run reproduces that output.
label_legend_inverted[str(labels_as_int[120])]

'spam'

In [24]:
# Spot-check, at one randomly chosen position, that the extracted lists and
# the integer encoding still agree with the corresponding DataFrame row.
#
# random.randrange(n) draws from 0..n-1. The previous random.randint(0, n)
# is inclusive of n, which is one past the last valid index and made this
# cell fail intermittently with IndexError.
random_idx = random.randrange(len(labels))
sampled_row = df.iloc[random_idx]

assert text[random_idx] == sampled_row.text

assert labels[random_idx] == sampled_row.label

assert label_legend_inverted[str(labels_as_int[random_idx])] == sampled_row.label

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Value handed to the Keras Tokenizer as `num_words`; also stored in the
# exported metadata under "max_words".
MAX_NUM_WORDS = 280

In [None]:
# Create a tokenizer configured with num_words=MAX_NUM_WORDS, fit it on the
# full list of messages, then encode each message as a list of integer
# word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts=text)
sequences = tokenizer.texts_to_sequences(texts=text)

In [None]:
# Keep a handle on the fitted tokenizer's word index.
# NOTE(review): not referenced again in the cells visible here.
word_index = tokenizer.word_index

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Length passed to pad_sequences as `maxlen`; also stored in the exported
# metadata under "max_seq_length".
MAX_SEQ_LENGTH = 300

In [None]:
# Bring every encoded message to exactly MAX_SEQ_LENGTH entries.
# padding/truncating spell out the Keras defaults ("pre"): shorter sequences
# are padded at the front and longer ones are cut from the front.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LENGTH,
    padding="pre",
    truncating="pre",
)

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [None]:
# Turn the Python list of integer class ids into a NumPy array, the form
# handed to to_categorical in the next step.
labels_as_int_array = np.asarray(labels_as_int)

In [None]:
# Convert the integer class ids into Keras' categorical (one-hot) target
# matrix; this is the `y` that gets split into train/test below.
y = to_categorical(labels_as_int_array)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and targets together: 33% of the samples are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
import pickle

In [None]:
# Bundle the train/test arrays with the preprocessing settings and label
# legends so they can be exported together as one object.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
)

# Serialise the fitted tokenizer to JSON and write it to its export path.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

In [None]:
# Write the bundled training data to disk as a binary pickle file.
with METADATA_EXPORT_PATH.open("wb") as metadata_file:
    pickle.dump(training_data, metadata_file)