In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
# Tiny toy corpus used to demonstrate what TextVectorization produces.
sentences = ["I like eggs and ham.", "I love chocolate and bunnies.", "I hate onions."]

In [None]:
# Upper bound on the vocabulary size, passed to the layer as max_tokens.
MAX_VOCAB_SIZE = 20_000

# Text -> integer-sequence layer; its vocabulary is still empty until adapt() is called.
vectorization_layer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
)

In [None]:
# Build the layer's vocabulary from the toy corpus.
vectorization_layer.adapt(data=sentences)

In [None]:
# Calling the adapted layer turns each sentence into a row of integer token ids.
# All rows share one length; shorter sentences are filled with 0 at the end.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]], shape=(3, 5), dtype=int64)


In [None]:
# Show the learned vocabulary, including the special entries at indices 0 and 1
# ('' and '[UNK]').
vectorization_layer.get_vocabulary(include_special_tokens=True)

['',
 '[UNK]',
 np.str_('i'),
 np.str_('and'),
 np.str_('onions'),
 np.str_('love'),
 np.str_('like'),
 np.str_('hate'),
 np.str_('ham'),
 np.str_('eggs'),
 np.str_('chocolate'),
 np.str_('bunnies')]

In [None]:
# How do we get the word-to-index mapping?
# A token's position in the vocabulary list is its integer id, so invert the
# enumeration: token -> position.
word2idx = {
    token: index
    for index, token in enumerate(vectorization_layer.get_vocabulary())
}
print(word2idx)

{'': 0, '[UNK]': 1, np.str_('i'): 2, np.str_('and'): 3, np.str_('onions'): 4, np.str_('love'): 5, np.str_('like'): 6, np.str_('hate'): 7, np.str_('ham'): 8, np.str_('eggs'): 9, np.str_('chocolate'): 10, np.str_('bunnies'): 11}


In [None]:
# Truncation: with output_sequence_length=3 every output row has exactly 3
# token ids, so longer sentences are cut off after their third token.
vectorization_layer_truncated = TextVectorization(max_tokens=MAX_VOCAB_SIZE, output_sequence_length=3)

# Learn the vocabulary from the same toy corpus as before.
vectorization_layer_truncated.adapt(data=sentences)

sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2  6  9]
 [ 2  5 10]
 [ 2  7  4]], shape=(3, 3), dtype=int64)


# RNN: spam classification with an LSTM

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, Embedding, TextVectorization
from tensorflow.keras.models import Model

In [None]:
!wget -nc https://lazyprogrammer.me/course_files/spam.csv

File ‘spam.csv’ already there; not retrieving.



In [None]:
# Load the raw CSV, decoding it as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Keep only the label and message columns and give them descriptive names.
#
# This cell is written to be safe to re-run:
#   * errors="ignore" lets drop() succeed when the "Unnamed" columns are
#     already gone (a plain drop() raises KeyError the second time);
#   * rename() maps by name, so it is a no-op once the columns are renamed and
#     keeps working after extra columns (e.g. 'b_labels') have been added,
#     whereas assigning a 2-element list to df.columns would raise a length
#     mismatch in that case.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
df = df.rename(columns={"v1": "labels", "v2": "data"})

In [None]:
# Check the frame again now that only the 'labels' and 'data' columns remain.
df.head(n=5)

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Create binary labels: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})

# Series.map() yields NaN for any value that is not a key of the dict, which
# would otherwise flow silently into training as a NaN target. Fail fast here.
assert df['b_labels'].notna().all(), "unexpected value in df['labels']; expected only 'ham' or 'spam'"

# Plain NumPy array of targets, used for the train/test split.
Y = df['b_labels'].values

In [None]:
# Hold out 33% of the messages for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore comparable metrics) on every run.
df_train, df_test, Ytrain, Ytest = train_test_split(
    df['data'], Y, test_size=0.33, random_state=42
)

In [None]:
# Create tf.data datasets whose elements are (message text, binary label) pairs.
train_ds = tf.data.Dataset.from_tensor_slices(
    tensors=(df_train.values, Ytrain),
)
test_ds = tf.data.Dataset.from_tensor_slices(
    tensors=(df_test.values, Ytest),
)

In [None]:
# Convert sentences to sequences.
MAX_FEATURES = 20_000  # vocabulary cap, passed to the layer as max_tokens
vectorization = TextVectorization(
    max_tokens=MAX_FEATURES,
)

# Learn the vocabulary from the training texts only: the map() drops the label
# from each (text, label) element before the data reaches adapt().
vectorization.adapt(data=train_ds.map(lambda text, _label: text))

In [None]:
# Shuffle (training data only) and batch the datasets.
# Note: both names are rebound to their batched versions, so running this cell
# a second time without recreating the datasets would batch them again.
train_ds = (
    train_ds
    .shuffle(buffer_size=10000)
    .batch(batch_size=32)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
test_ds = (
    test_ds
    .batch(batch_size=32)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Vocabulary size (special tokens included); used as the Embedding input dimension.
V = vectorization.vocabulary_size()

In [None]:
# create the model
# Pipeline: raw string -> token ids -> embeddings -> LSTM -> max over time -> sigmoid

D = 20  # embedding dimension (output size of the Embedding layer below)
M = 15  # number of LSTM units

# The model takes raw strings as input; tokenization happens inside the model.
i = Input(shape=(1,), dtype=tf.string)
# Reuse the TextVectorization layer that was adapted on the training texts.
x = vectorization(i)
# Map each of the V token ids to a D-dimensional vector.
x = Embedding(V, D)(x)
# return_sequences=True keeps the LSTM output at every time step so that the
# pooling layer below has a time axis to reduce over.
x = LSTM(M, return_sequences=True)(x)
# Take the maximum of each LSTM feature across all time steps.
x = GlobalMaxPooling1D()(x)
# Single sigmoid unit: a value in (0, 1) for the label encoded as 1.
x = Dense(1, activation="sigmoid")(x)

model = Model(i, x)

In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 10 epochs, evaluating on the held-out set after each one.
# r keeps the History object returned by fit().
r = model.fit(x=train_ds, validation_data=test_ds, epochs=10)

Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.8185 - loss: 0.5900 - val_accuracy: 0.8700 - val_loss: 0.3278
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9046 - loss: 0.2427 - val_accuracy: 0.9761 - val_loss: 0.1264
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9883 - loss: 0.0883 - val_accuracy: 0.9723 - val_loss: 0.0984
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9920 - loss: 0.0531 - val_accuracy: 0.9772 - val_loss: 0.0796
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9934 - loss: 0.0416 - val_accuracy: 0.9723 - val_loss: 0.0958
Epoch 6/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9969 - loss: 0.0219 - val_accuracy: 0.9657 - val_loss: 0.1141
Epoch 7/10
[1m117/117[0m