In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [6]:
data = pd.read_csv("spam.csv", encoding="cp1252")  # Common on Windows
data = data.rename(columns={"v1": "category", "v2": "message"})
data['Spam']=data['category'].apply(lambda x:1 if x=='spam' else 0)
X_train,X_test,y_train,y_test=train_test_split(data.message,data.Spam,test_size=0.25)

In [27]:
emails=[
    'Sounds great! Are you home now?',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
]

# Multinomial NB

In [9]:
from sklearn.pipeline import Pipeline
clfmulti=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])
clfmulti.fit(X_train,y_train)
clfmulti.score(X_test,y_test)

0.9820531227566404

# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

clflog = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('logreg', LogisticRegression(max_iter=1000))
])
clflog.fit(X_train,y_train)
clflog.score(X_test,y_test)

0.9798994974874372

In [12]:
from sklearn.svm import SVC

clfsvm = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svm', SVC(kernel='linear'))
])
clfsvm.fit(X_train,y_train)
clfsvm.score(X_test,y_test)

0.9798994974874372

In [13]:
from sklearn.ensemble import RandomForestClassifier

clfrf= Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100))
])
clfrf.fit(X_train,y_train)
clfrf.score(X_test,y_test)

0.9763101220387652

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

clflog2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('logreg', LogisticRegression(max_iter=1000))
])
clflog2.fit(X_train,y_train)
clflog2.score(X_test,y_test)

0.9633883704235463

Deep Learning Models

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
max_words = 5000  # Vocabulary size
max_len = 100  # Max sequence length

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data['message'])
X = tokenizer.texts_to_sequences(data['message'])
X = pad_sequences(X, maxlen=max_len)

# Convert labels to binary
y = (data['category'] == 'spam').astype(int)

# LSTM Model
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5




[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.8820 - loss: 0.2964 - val_accuracy: 0.9865 - val_loss: 0.0525
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9910 - loss: 0.0323 - val_accuracy: 0.9874 - val_loss: 0.0427
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9954 - loss: 0.0158 - val_accuracy: 0.9865 - val_loss: 0.0430
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9957 - loss: 0.0245 - val_accuracy: 0.9874 - val_loss: 0.0439
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9983 - loss: 0.0067 - val_accuracy: 0.9865 - val_loss: 0.0448


<keras.src.callbacks.history.History at 0x286603243a0>

In [16]:
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.8940 - loss: 0.2961 - val_accuracy: 0.9839 - val_loss: 0.0544
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - accuracy: 0.9872 - loss: 0.0468 - val_accuracy: 0.9794 - val_loss: 0.0580
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 0.9936 - loss: 0.0228 - val_accuracy: 0.9901 - val_loss: 0.0372
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 0.9978 - loss: 0.0068 - val_accuracy: 0.9874 - val_loss: 0.0407
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 0.9967 - loss: 0.0118 - val_accuracy: 0.9874 - val_loss: 0.0572


<keras.src.callbacks.history.History at 0x286655647f0>