In [None]:
import pathlib
import pandas as pd
import random

# Directory layout, anchored at the parent of the current working directory.
BASE_DIR = pathlib.Path().resolve().parent
DATABASE_DIR = BASE_DIR.joinpath('datasets')
EXPORT_DIR = DATABASE_DIR.joinpath('exports')
# Create the export directory and any missing parents; an existing directory is fine.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Input CSV read by the loading cell below.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath('spam-dataset.csv')

# Artifacts written by the export cells at the end of the notebook.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath('spam-metadata.pkl')
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath('spam-tokenizer.json')

In [None]:
# Load the spam dataset CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=SPAM_DATASET_PATH)
df.head(n=5)

In [None]:
# Extract the two columns used downstream as plain Python lists.
labels = df.loc[:, 'label'].tolist()
texts = df.loc[:, 'text'].tolist()

In [None]:
# Label name -> integer class id.
label_legend = {"ham": 0, "spam": 1}
# Reverse lookup; the keys are the class ids rendered as strings ("0", "1").
label_legend_inverted = {
    str(class_id): name for name, class_id in label_legend.items()
}

In [None]:
# Show the inverted legend (string class id -> label name) as the cell output.
label_legend_inverted

In [None]:
# Encode each label through the legend; a label missing from the legend raises KeyError.
label_as_int = list(map(label_legend.__getitem__, labels))

In [None]:
# Spot-check one randomly chosen row: the extracted lists and the integer
# encoding must still line up with the DataFrame they were derived from.
#
# randrange(n) draws from 0..n-1. The previous randint(0, n) is inclusive on
# both ends, so it could return n (one past the last valid index) and make
# this cell fail intermittently with IndexError.
random_idx = random.randrange(len(labels))

assert texts[random_idx] == df.iloc[random_idx].text
assert label_legend_inverted[str(label_as_int[random_idx])] == df.iloc[random_idx].label

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Passed as `num_words` when the Tokenizer is constructed; also stored in the
# exported metadata under "max_words".
MAX_NUM_WORDS: int = 280

In [None]:
# Build a Keras Tokenizer limited by MAX_NUM_WORDS, fit it on the full text
# corpus, then convert the same texts to sequences of word indices.
# The order matters: the tokenizer must be fitted before texts are converted.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Display the encoded sequences as the cell output.
sequences

In [None]:
# Show the fitted tokenizer's word -> index mapping as the cell output.
tokenizer.word_index

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Passed as `maxlen` to pad_sequences; also stored in the exported metadata
# under "max_sequence".
MAX_SEQUENCE_LENGTH: int = 300

In [None]:
# Bring every encoded sequence to length MAX_SEQUENCE_LENGTH; the result is
# the feature array `x` that is split into train/test sets later.
# NOTE(review): padding/truncating sides are left at the Keras defaults —
# confirm they match what the downstream model expects.
x= pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)
# Display the padded array as the cell output.
x

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical


In [None]:
# Convert the list of integer class ids into a NumPy array and display it.
labels_as_int_array = np.asarray(label_as_int)
labels_as_int_array

In [None]:
# Expand the integer class ids with Keras `to_categorical`; the result is the
# target array `y` that is split into train/test sets later.
# NOTE(review): presumably this yields one column per class (ham, spam) —
# confirm against the displayed output.
y=to_categorical(labels_as_int_array)
# Display the encoded targets as the cell output.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import pickle

In [None]:
# Split features and targets with test_size=0.333; random_state is fixed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.333,
    random_state=42,
)

In [None]:
# Bundle the split arrays, the preprocessing settings, both label legends and
# the fitted tokenizer into a single dict; a later cell pickles it.
training_data = {
    # train/test arrays
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    # preprocessing settings
    "max_words": MAX_NUM_WORDS,
    "max_sequence": MAX_SEQUENCE_LENGTH,
    # label lookups in both directions
    "legend": label_legend,
    "labels_legend_inverted": label_legend_inverted,
    # fitted tokenizer object
    "tokenizer": tokenizer,
}

# Additionally write the tokenizer on its own as a JSON file.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

In [None]:
# Serialize the training bundle to the metadata pickle file.
with METADATA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)

In [None]:
# Read the metadata pickle back in to confirm the export can be loaded.
#
# The path is METADATA_EXPORT_PATH, the file the previous cell wrote. The
# earlier name TRAINING_DATA_PATH is not defined anywhere in this notebook
# and raised NameError on a fresh kernel.
#
# `data` is assigned only on success, so a failed load cannot leave behind an
# empty placeholder dict that looks like a valid result.
#
# NOTE: pickle.load can execute arbitrary code contained in the file. Only
# load pickles produced by this notebook or another trusted source.
with open(METADATA_EXPORT_PATH, 'rb') as f:
    data = pickle.load(f)