## Preamble

In [None]:
import json
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Input: CSV with the labeled messages; it is read with pandas and must
# provide a 'label' and a 'text' column.
datasetInputFile = '../training/dataset/spam-dataset.csv'
# Output: pickle file that receives the prepared arrays, the legends and the
# fitted tokenizer at the end of this notebook.
trainingDumpFile = '../training/prepared_dataset/spam_training_data.pickle'

## Reading and transforming the input

#### Reading the input file and preparing legend info

In [None]:
# Load the labeled dataset and pull both columns out as plain Python lists
# (labels first, then texts).
df = pd.read_csv(datasetInputFile)
labels, texts = (df[column].tolist() for column in ('label', 'text'))

# Class name -> integer code, plus the reverse lookup. The reverse lookup is
# keyed by the code rendered as a *string* ('0', '1'), not by the integer.
labelLegend = {'ham': 0, 'spam': 1}
labelLegendInverted = {'%i' % code: name for name, code in labelLegend.items()}

# Integer-encode every row's label; a label missing from the legend raises KeyError.
labelsAsInt = [labelLegend[name] for name in labels]

**Look at:** the contents of `texts`,
`labelLegend`,
`labelLegendInverted`,
`labels`,
`labelsAsInt`

In [None]:
## Uncomment any one of the following and press Shift+Enter to print the variable
# texts
# labelLegend
# labelLegendInverted
# labels
# labelsAsInt

#### Tokenization of texts

In [None]:
# Vocabulary cap handed to the Keras tokenizer as `num_words`; it limits which
# word indices can appear in the encoded sequences (word_index itself still
# lists every word seen).
MAX_NUM_WORDS = 280

# Learn the vocabulary from the corpus, then turn each text into a list of
# integer word indices.
# NOTE(review): the tokenizer is fitted on *all* texts, i.e. also on the rows
# that end up in the test split further down -- confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

**Look at:** `tokenizer.word_index`, `inverseWordIndex`, `sequences` and how they play together:

In [None]:
# Demonstration helper only (it is not part of the dumped data): maps each
# integer index back to the word it stands for.
wordIndex = tokenizer.word_index
inverseWordIndex = dict(zip(wordIndex.values(), wordIndex.keys()))

## Uncomment any one of the following and press Shift+Enter to print the variable
# tokenizer.word_index
# inverseWordIndex
# sequences
# [[inverseWordIndex[i] for i in seq] for seq in sequences]
# texts

#### Padding of sequences

In [None]:
# Every encoded text is brought to exactly this many entries.
MAX_SEQ_LENGTH = 300

# 'pre' is the pad_sequences default for both options; it is spelled out here
# so it is obvious that shorter sequences are padded at the front and longer
# ones lose their leading entries.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LENGTH,
    padding='pre',
    truncating='pre',
)

**Look at:** `sequences`, `X` and compare their shape and contents:

In [None]:
## Uncomment any one of the following and press Shift+Enter to print the variable
# [len(s) for s in sequences]
# len(sequences)
# X.shape
# type(X)
# X

#### Switch to categorical form for labels

In [None]:
# Integer labels as a 1-D numpy array, one entry per input row.
labelsAsIntArray = np.asarray(labelsAsInt)

# One-hot encode the labels. The width is pinned to the legend instead of
# being inferred from the data: without `num_classes`, to_categorical sizes
# the output from the largest label actually present, so a dataset that
# happens to contain only 'ham' rows would yield a single column and no
# longer line up with labelLegend. For data containing every class the result
# is identical to the inferred one.
y = to_categorical(
    labelsAsIntArray,
    num_classes=max(labelLegend.values()) + 1,
)

**Look at:** `labelsAsIntArray`, `y` and how they relate to `labels` and `labelLegend`:

In [None]:
## Uncomment any one of the following and press Shift+Enter to print the variable
# labelsAsIntArray
# labelsAsIntArray.shape
# y.shape
# y
# labels
# labelLegend

## Splitting the labeled dataset and saving everything to file

#### Splitting dataset (train/test)

In [None]:
# Hold out 33% of the rows as the test set; the fixed random_state keeps the
# shuffle (and therefore the split) identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

**Look at:** the shapes of the four resulting 2D NumPy arrays:

In [None]:
## Uncomment any one of the following and press Shift+Enter to print the variable
# X_train.shape
# X_test.shape
# y_train.shape
# y_test.shape

In [None]:
# Bundle everything produced above into a single dict: the four split arrays,
# the two preprocessing limits, both label legends and the fitted tokenizer.
trainingData = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'max_words': MAX_NUM_WORDS,
    'max_seq_length': MAX_SEQ_LENGTH,
    'label_legend': labelLegend,
    'label_legend_inverted': labelLegendInverted,
    'tokenizer': tokenizer,
}

# Create the target directory on demand: open(..., 'wb') does not create
# missing parent directories, so on a fresh checkout the dump would otherwise
# fail with FileNotFoundError. The guard skips the call when the path has no
# directory component (os.makedirs('') would raise).
dumpDirectory = os.path.dirname(trainingDumpFile)
if dumpDirectory:
    os.makedirs(dumpDirectory, exist_ok=True)

# Binary mode is required by pickle.
with open(trainingDumpFile, 'wb') as f:
    pickle.dump(trainingData, f)