In [241]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, plot_model

In [242]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [243]:
spam.shape

(5572, 2)

In [244]:
encoder = LabelEncoder()
y = encoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [245]:
mesages = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mesages, y, test_size=0.3)

In [246]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [247]:
print(X_train)

[[52, 433, 549, 573, 20, 85, 129], [1, 202, 213, 5, 110, 158, 21, 97, 65, 4, 157], [98, 610, 105, 18, 303, 15, 679, 347, 15, 8, 790, 550, 99], [1, 98, 49, 55, 2, 33, 2, 63, 53, 15, 39, 23, 67, 59, 166, 734, 54, 39, 98, 10], [68, 22, 4, 57], [3, 31, 45, 2, 1, 411, 680], [203, 13, 12, 15, 4, 434, 50, 681, 102, 106, 3, 27, 33, 285, 178, 24, 65, 4, 434, 8, 4], [92, 115, 17, 10, 58, 1, 100, 49, 682, 12, 367, 3, 100, 71, 2, 145, 26, 34, 3, 16, 90, 237, 367, 32, 683, 105, 10, 13, 12, 89], [202, 106, 15, 648, 36, 380, 219, 523, 15, 11, 611], [1, 49, 18, 11, 244, 151, 261, 18], [16, 4, 56, 396], [55, 684, 115, 105, 10], [262, 56, 112, 735, 4, 649, 26, 187, 116], [1, 860, 472, 412, 1, 18, 304, 7, 494, 18, 58, 413, 40, 574, 97, 17, 5, 26, 65, 4, 473, 2, 164], [791, 29, 305, 435, 15, 861, 575, 26, 227, 48, 454, 20, 736, 737, 278, 76, 495, 2, 738, 141, 937], [103, 103, 28, 181, 286, 46, 31, 3, 5, 739], [195, 38, 22, 290, 2, 209, 6, 576, 278, 312, 18, 3, 16, 196, 4, 612, 153, 227, 17, 50, 685, 291, 

In [248]:
X_train = pad_sequences(X_train, padding="post", maxlen=500)
X_test = pad_sequences(X_test, padding="post", maxlen=500)

In [249]:
print(len(token.word_index))


7528


In [250]:
model = Sequential()
model.add(Embedding(input_dim=len(token.word_index), output_dim=50, embeddings_initializer="uniform"))
model.add(Flatten())
model.add(Dense(units=10, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test,y_test))

Epoch 1/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8710 - loss: 0.1315 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 2/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.8707 - loss: 0.1293 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 3/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8688 - loss: 0.1312 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 4/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8619 - loss: 0.1379 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 5/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8628 - loss: 0.1372 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 6/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8578 - loss: 0.1422 - val_accuracy: 0.8684 - val_loss: 0.1316
Epoch 7/20
[1m390/390

<keras.src.callbacks.history.History at 0x7c60442e4d90>

In [251]:
model.summary()

In [252]:
loss, accuracy = model.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8691 - loss: 0.1309
Loss:  0.13157886266708374
Accuracy:  0.8684210777282715


In [253]:
prediction = model.predict(X_test)
print(prediction)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[[5.9135374e-07]
 [3.7432417e-07]
 [4.4268958e-07]
 ...
 [6.8639565e-07]
 [4.0743481e-07]
 [4.0676511e-07]]


In [254]:
pred = (prediction > 0.5)
print(pred)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [255]:
cm = confusion_matrix(y_test, pred)
print(cm)

[[1452    0]
 [ 220    0]]
