In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
# Load dataset
#sms = pd.read_csv('data/spam.csv', encoding='latin-1')[['v1', 'v2']]
url = "https://raw.githubusercontent.com/geekysudh/cnn-spam-classifier/main/data/spam.csv"
sms = pd.read_csv(url, encoding='latin-1')[['v1', 'v2']]
sms.columns = ['label', 'text']
sms['label'] = sms['label'].map({'ham': 0, 'spam': 1})

In [7]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(sms.text)
sequences = tokenizer.texts_to_sequences(sms.text)
padded = pad_sequences(sequences, maxlen=100, padding='post')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(padded, sms.label, test_size=0.2, random_state=42)

In [9]:
# CNN Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [10]:
# Training
model.fit(X_train, y_train, epochs=5, validation_split=0.1, batch_size=32)

Epoch 1/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.8442 - loss: 0.4151 - val_accuracy: 0.9731 - val_loss: 0.0897
Epoch 2/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.9904 - loss: 0.0357 - val_accuracy: 0.9753 - val_loss: 0.0912
Epoch 3/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9993 - loss: 0.0039 - val_accuracy: 0.9753 - val_loss: 0.0958
Epoch 4/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 1.0000 - loss: 9.1295e-04 - val_accuracy: 0.9753 - val_loss: 0.1167
Epoch 5/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 1.0000 - loss: 2.3300e-04 - val_accuracy: 0.9776 - val_loss: 0.1255


<keras.src.callbacks.history.History at 0x7ad3cea3fd50>

In [11]:
# Evaluation
y_pred = (model.predict(X_test) > 0.5).astype('int32')
print(classification_report(y_test, y_pred))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.90      0.94       150

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [13]:
# Save model
model.save('../models/cnn_spam.keras')

In [14]:
# Download it to your local machine
from google.colab import files
files.download('../models/cnn_spam.keras')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Save it back to repo