In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Load the SMS Spam Collection dataset.
# The CSV sits in a different place depending on where the notebook runs, so
# each known location is tried in order (the local Windows copy first, then
# the Kaggle input directory) instead of editing the path by hand.
DATA_PATHS = (
    r"C:\Users\Kunjal Thorat\Desktop\SMS Detection\spam.csv",
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
)

for data_path in DATA_PATHS:
    try:
        raw_data = pd.read_csv(data_path, encoding='latin-1')
    except FileNotFoundError:
        # Not available at this location; fall through to the next candidate
        continue
    break
else:
    # Every candidate failed: report all the places that were searched
    raise FileNotFoundError(
        "spam.csv not found in any known location: " + ", ".join(DATA_PATHS)
    )

# Keep only the label column (v1) and the text column (v2) and give them
# descriptive names. rename() returns a new frame, so raw_data is left untouched.
data = raw_data[['v1', 'v2']].rename(columns={'v1': 'Class', 'v2': 'Message'})

In [4]:
# Preview the first rows of the loaded frame.
# NOTE: `data` is still the full dataset here; no train/test split has been made yet.
print("Training Data - First 5 rows:")
data.head(n=5)

Training Data - First 5 rows:


Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Data Preprocessing: normalise every message to lower case
data['Message'] = data['Message'].str.lower()

# Features are the message texts, targets are the class labels
X, y = data['Message'], data['Class']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Section 1: Naive Bayes
# Build TF-IDF features: the vectorizer is fitted on the training messages
# only and then applied unchanged to the test messages.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier and predict the held-out labels
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Score the predictions against the true test labels
accuracy_nb = accuracy_score(y_test, y_pred_nb)
confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)
report_nb = classification_report(y_test, y_pred_nb)

# Report everything in one call, one item per line
print(
    "Naive Bayes Model:",
    f'Accuracy: {accuracy_nb * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_nb,
    'Classification Report:',
    report_nb,
    sep="\n",
)

Naive Bayes Model:
Accuracy: 95.98%
Confusion Matrix:
[[1202    0]
 [  56  135]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [13]:
# Translate raw model predictions into 'spam' / 'not spam' tags
def classify_messages(model, X_tfidf):
    """Tag every row of ``X_tfidf`` as 'spam' or 'not spam'.

    Args:
        model: Object exposing a ``predict`` method (the notebook passes its
            fitted classifiers).
        X_tfidf: Features handed straight to ``model.predict``.

    Returns:
        list of str: 'spam' where the predicted label equals 'spam',
        'not spam' for every other predicted label.
    """
    tags = []
    for predicted in model.predict(X_tfidf):
        if predicted == 'spam':
            tags.append('spam')
        else:
            tags.append('not spam')
    return tags

# Tag every test message using the Naive Bayes predictions
classified_nb = classify_messages(nb_model, X_test_tfidf)

# Show the first 10 test messages next to their Naive Bayes tag,
# leaving a blank line after each entry
print("\nClassified Messages (First 10 for each model):")
for idx in range(10):
    verdict = classified_nb[idx]
    message = X_test.iloc[idx]
    print(f"Naive Bayes: {verdict} - {message}", end="\n\n")


Classified Messages (First 10 for each model):
Naive Bayes: not spam - funny fact nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife natural disasters just happens

Naive Bayes: not spam - i sent my scores to sophas and i had to do secondary application for a few schools. i think if you are thinking of applying, do a research on cost also. contact joke ogunrinde, her school is one me the less expensive ones

Naive Bayes: not spam - we know someone who you know that fancies you. call 09058097218 to find out who. pobox 6, ls15hb 150p

Naive Bayes: not spam - only if you promise your getting out as soon as you can. and you'll text me in the morning to let me know you made it in ok.

Naive Bayes: spam - congratulations ur awarded either å£500 of cd gift vouchers & free entry 2 our å£100 weekly draw txt music to 87066 tncs www.ldew.com1win150ppmx3age16

Naive Bayes: not spam - i'll text carlos and let you know, hang on

Naive Bayes

In [11]:
# Count how many test messages Naive Bayes tagged as spam
spam_count_nb = sum(1 for tag in classified_nb if tag == 'spam')
print(f"Naive Bayes: {spam_count_nb} spam messages")

Naive Bayes: 135 spam messages


In [12]:
# Section 2: Logistic Regression
# Reuses the TF-IDF matrices (X_train_tfidf / X_test_tfidf) built in Section 1.

# Fit the classifier (iteration cap raised to 1000) and predict the held-out labels
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# Score the predictions against the true test labels
accuracy_lr = accuracy_score(y_test, y_pred_lr)
confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)

# Report everything in one call, one item per line
print(
    "\nLogistic Regression Model:",
    f'Accuracy: {accuracy_lr * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_lr,
    'Classification Report:',
    report_lr,
    sep="\n",
)



Logistic Regression Model:
Accuracy: 96.34%
Confusion Matrix:
[[1201    1]
 [  50  141]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       0.99      0.74      0.85       191

    accuracy                           0.96      1393
   macro avg       0.98      0.87      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [14]:
# Tag every test message using the Logistic Regression predictions
classified_lr = classify_messages(lr_model, X_test_tfidf)

# Show the first 10 test messages next to their tag, with a blank line after each
for idx in range(10):
    verdict = classified_lr[idx]
    message = X_test.iloc[idx]
    print(f"Logistic Regression: {verdict} - {message}", end="\n\n")

Logistic Regression: not spam - funny fact nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife natural disasters just happens

Logistic Regression: not spam - i sent my scores to sophas and i had to do secondary application for a few schools. i think if you are thinking of applying, do a research on cost also. contact joke ogunrinde, her school is one me the less expensive ones

Logistic Regression: not spam - we know someone who you know that fancies you. call 09058097218 to find out who. pobox 6, ls15hb 150p

Logistic Regression: not spam - only if you promise your getting out as soon as you can. and you'll text me in the morning to let me know you made it in ok.

Logistic Regression: spam - congratulations ur awarded either å£500 of cd gift vouchers & free entry 2 our å£100 weekly draw txt music to 87066 tncs www.ldew.com1win150ppmx3age16

Logistic Regression: not spam - i'll text carlos and let you know, hang on

Logistic Re

In [16]:
# Count how many test messages Logistic Regression tagged as spam
spam_count_lr = sum(1 for tag in classified_lr if tag == 'spam')
print(f"Logistic Regression: {spam_count_lr} spam messages")

Logistic Regression: 142 spam messages


In [17]:
# Section 3: Support Vector Machines (SVM)
# Reuses the TF-IDF matrices (X_train_tfidf / X_test_tfidf) built in Section 1.

# Fit an SVC with its default settings and predict the held-out labels
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

# Score the predictions against the true test labels
accuracy_svm = accuracy_score(y_test, y_pred_svm)
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)

# Report everything in one call, one item per line
print(
    "\nSupport Vector Machines (SVM) Model:",
    f'Accuracy: {accuracy_svm * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_svm,
    'Classification Report:',
    report_svm,
    sep="\n",
)


Support Vector Machines (SVM) Model:
Accuracy: 97.92%
Confusion Matrix:
[[1202    0]
 [  29  162]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1202
        spam       1.00      0.85      0.92       191

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [18]:
# Tag every test message using the SVM predictions
classified_svm = classify_messages(svm_model, X_test_tfidf)

# Show the first 10 test messages next to their tag, with a blank line after each
for idx in range(10):
    verdict = classified_svm[idx]
    message = X_test.iloc[idx]
    print(f"SVM: {verdict} - {message}", end="\n\n")

SVM: not spam - funny fact nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife natural disasters just happens

SVM: not spam - i sent my scores to sophas and i had to do secondary application for a few schools. i think if you are thinking of applying, do a research on cost also. contact joke ogunrinde, her school is one me the less expensive ones

SVM: spam - we know someone who you know that fancies you. call 09058097218 to find out who. pobox 6, ls15hb 150p

SVM: not spam - only if you promise your getting out as soon as you can. and you'll text me in the morning to let me know you made it in ok.

SVM: spam - congratulations ur awarded either å£500 of cd gift vouchers & free entry 2 our å£100 weekly draw txt music to 87066 tncs www.ldew.com1win150ppmx3age16

SVM: not spam - i'll text carlos and let you know, hang on

SVM: not spam - k.i did't see you.:)k:)where are you now?

SVM: not spam - no message..no responce..what happen

In [19]:
# Count how many test messages the SVM tagged as spam
spam_count_svm = sum(1 for tag in classified_svm if tag == 'spam')

print(f"SVM: {spam_count_svm} spam messages")

SVM: 162 spam messages
