<a href="https://colab.research.google.com/github/divya374r1/Neural-Network-and-Deep-Learning/blob/main/Spam_Detection(with_dataset_Tensorflow).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load dataset with correct encoding
df = pd.read_csv("spam.csv", encoding="latin1")

# Rename columns if dataset has extra columns (common in Kaggle file)
df = df.rename(columns={"v1": "label", "v2": "text"})

# Keep only required columns
df = df[['text', 'label']]

# Convert labels
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Split dataset
X = df['text'].values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
max_length = 100
X_train_padded = pad_sequences(train_seq, maxlen=max_length, padding="post")
X_test_padded = pad_sequences(test_seq, maxlen=max_length, padding="post")

# Build TensorFlow model
model = Sequential([
    Embedding(5000, 32, input_length=max_length),
    LSTM(64),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate model
loss, acc = model.evaluate(X_test_padded, y_test)
print("Accuracy:", acc)

# Predict a new message
new_message = ["Congratulations! You won free tickets, click now!"]
new_seq = tokenizer.texts_to_sequences(new_message)
new_pad = pad_sequences(new_seq, maxlen=max_length, padding="post")

pred = model.predict(new_pad)[0][0]

print("Output =", pred, "| Spam" if pred > 0.5 else "| Not Spam")


Epoch 1/5




[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.8375 - loss: 0.4611 - val_accuracy: 0.8621 - val_loss: 0.4017
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.8715 - loss: 0.3858 - val_accuracy: 0.8621 - val_loss: 0.4018
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 0.8688 - loss: 0.3898 - val_accuracy: 0.8621 - val_loss: 0.4032
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.8717 - loss: 0.3862 - val_accuracy: 0.8621 - val_loss: 0.4016
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 73ms/step - accuracy: 0.8665 - loss: 0.3966 - val_accuracy: 0.8621 - val_loss: 0.4017
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8760 - loss: 0.3752
Accuracy: 0.865470826625824
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m