In [72]:
import pathlib
import pandas as pd

BASE_DIR = pathlib.Path().resolve().parent 
DATASET_DIR = BASE_DIR / "datasets"
ZIPS_DIR = DATASET_DIR / 'zips'
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

SPAM_SMS_ZIP_PATH = ZIPS_DIR / "spam-dataset.zip"
SPAM_YOUTUBE_ZIP_PATH = ZIPS_DIR / "youtube-spam-dataset.zip"

EXPORT_DIR = DATASET_DIR / 'export'
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"


METADATA_EXPORT_PATH = EXPORT_DIR / 'spam-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'spam_tokenizer.json'

In [73]:
df = pd.read_csv(SPAM_DATASET_PATH)


In [74]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [75]:
BASE_DIR

WindowsPath('C:/Users/natsc/Desktop/machine_learning/AI_as_an_API')

In [76]:
labels[120], texts[120]

('ham', 'wow')

In [77]:
label_legend = {"ham" : 0,"spam" : 1}

In [78]:
labels_as_int = [label_legend[x] for x in labels]
label_legend_inverted = {f"{v}": k for k, v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [79]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [80]:
MAX_NUM_WORDS = 280
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences

[[97, 131],
 [1, 20, 17, 15],
 [1, 20, 17, 15],
 [117, 117, 42, 12, 4, 98, 132, 12, 8, 12, 15],
 [29, 16, 68, 14],
 [2, 68, 58, 186, 17, 187, 15],
 [1],
 [146, 108, 17, 171, 8, 29],
 [172, 15],
 [1, 49, 29, 15],
 [29, 55, 55, 21, 20, 7, 17, 15],
 [236, 34],
 [1, 87, 13, 37, 237, 7, 17, 47, 187, 73, 79, 15],
 [20, 131],
 [],
 [69, 1, 173, 7, 17, 1, 237, 147, 13, 61, 238, 2, 15],
 [174, 7, 22, 31, 34, 118],
 [1, 20, 7, 17, 34, 188, 212, 90, 21, 13, 15],
 [1, 20, 7, 17, 32, 4],
 [55, 68],
 [148, 149, 1, 119, 20, 7, 21],
 [],
 [10, 9, 7, 19, 52, 15],
 [175, 239, 8, 62, 13, 61, 14, 3],
 [236,
  44,
  34,
  7,
  17,
  16,
  34,
  172,
  12,
  49,
  66,
  189,
  38,
  31,
  1,
  120,
  1,
  80,
  49,
  12,
  81,
  117,
  117,
  117,
  15],
 [97, 20],
 [119, 174, 22, 15],
 [29, 16, 6, 240],
 [1, 7, 17],
 [],
 [45, 44, 174, 7, 22, 15],
 [241,
  2,
  14,
  27,
  4,
  22,
  27,
  91,
  189,
  2,
  19,
  74,
  2,
  176,
  133,
  148,
  19,
  2,
  22,
  3,
  8,
  14,
  242,
  40,
  7,
  16,
  19,
 

In [81]:
len(tokenizer.word_index)

1441

In [82]:
from tensorflow.keras.utils import pad_sequences

MAX_SEQ_LENGTH = 300

x = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

In [83]:
x

array([[  0,   0,   0, ...,   0,  97, 131],
       [  0,   0,   0, ...,  20,  17,  15],
       [  0,   0,   0, ...,  20,  17,  15],
       ...,
       [  0,   0,   0, ...,   0,   0, 157],
       [  0,   0,   0, ...,  77,  44,  34],
       [  0,   0,   0, ...,  16,   2,  68]])

In [84]:
from tensorflow.keras.utils import to_categorical
import numpy as np



In [85]:
labels_as_int_array = np.asarray(labels_as_int)
y = to_categorical(labels_as_int_array)

In [86]:
[0,0,1]
['ham', 'ham', 'spam']

['ham', 'spam']

[[1,0], [1,0],]

[[1, 0], [1, 0]]

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.33,random_state=42)

In [89]:
import pickle

In [90]:
training_data = {
    'X_train' : X_train,
    'X_test' : X_test,
    'y_train' : y_train,
    'y_test' : y_test,
    'max_words' : MAX_NUM_WORDS,
    'max_seq_legnth' : MAX_SEQ_LENGTH,
    'legend' : label_legend,
    'legend_inverted' : label_legend_inverted,
    
}   

In [91]:
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)



121501

In [92]:
with open(METADATA_EXPORT_PATH, 'wb') as f:
    pickle.dump(training_data, f)
