TASK 4 - SPAM SMS DETECTION

Build an AI model that can classify SMS messages as spam or
legitimate. Use techniques like TF-IDF or word embeddings with
classifiers like Naive Bayes, Logistic Regression, or Support Vector
Machines to identify spam messages.

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the SMS dataset from a local CSV file, decoding it as ISO-8859-1.
DATA_PATH = 'C:/Users/Dharsana/Downloads/archive (6)/spam.csv'
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# In this CSV the message text is taken from column 'v2' and the class
# label from column 'v1'.
X, y = df['v2'], df['v1']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features with max_features=5000. The vectorizer is fitted on the
# training messages only and then reused, unchanged, on the test messages.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the three classifiers on the same training features. Each estimator's
# fit() returns the estimator itself, so construction and fitting are chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
lr_classifier = LogisticRegression().fit(X_train_tfidf, y_train)
svm_classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict labels for the held-out test messages with every model.
nb_predictions = nb_classifier.predict(X_test_tfidf)
lr_predictions = lr_classifier.predict(X_test_tfidf)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluate the models
def evaluate_model(predictions, true_labels, model_name):
    """Print accuracy, confusion matrix and classification report for a model.

    Args:
        predictions: Labels predicted by the model.
        true_labels: Reference labels the predictions are scored against.
        model_name: Name shown in the heading line of the printed summary.

    Returns:
        None. The summary is written to standard output.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    acc = accuracy_score(true_labels, predictions)
    matrix = confusion_matrix(true_labels, predictions)
    report = classification_report(true_labels, predictions)

    summary_lines = (
        f'{model_name} Model Evaluation:',
        f'Accuracy: {acc:.2f}',
        f'Confusion Matrix:\n{matrix}',
        # The single space after the newline is part of the established output.
        f'Classification Report:\n {report}',
    )
    print(*summary_lines, sep='\n')

# Print the evaluation summary for each classifier, scored against y_test,
# in the order Naive Bayes, Logistic Regression, SVM.
for model_predictions, model_label in (
    (nb_predictions, 'Naive Bayes'),
    (lr_predictions, 'Logistic Regression'),
    (svm_predictions, 'Support Vector Machine'),
):
    evaluate_model(model_predictions, y_test, model_label)


Naive Bayes Model Evaluation:
Accuracy: 0.97
Confusion Matrix:
[[965   0]
 [ 37 113]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Logistic Regression Model Evaluation:
Accuracy: 0.97
Confusion Matrix:
[[964   1]
 [ 35 115]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Support Vector Machine Model Evaluation:
Accuracy: 0.98
Confusion Matrix:
[[963   2]
 [ 17 133]]
Classification Report