<a href="https://colab.research.google.com/github/engineereliab076/SPAM_DETECTION_USING_ML/blob/main/01_SPAM_DETECTION_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#IMPORT THE DEPENDENCIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import pickle

DATA COLLECTION AND PREPROCESSING

In [None]:
# Load the dataset (change the path if necessary)
file_path = "/content/SMSSpamCollection"

# Read the file (it's tab-separated)
mail_data = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])

# Display first few rows
mail_data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#print first 5 rows
mail_data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
mail_data.shape

(5572, 2)

In [None]:
mail_data.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
#replace null values with a null string
#mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

LABEL ENCODING

In [None]:
#SPAM EMAIL AS 1, AND HAM MAIL AS 0

mail_data.loc[mail_data['label'] == 'spam', 'label',] = 1
mail_data.loc[mail_data['label'] == 'ham', 'label',] = 0


In [None]:
mail_data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
print(mail_data['label'].value_counts())

label
0    4825
1     747
Name: count, dtype: int64


In [None]:
#imbalanced data---oversample
from sklearn.utils import resample

# Split data
ham_data = mail_data[mail_data['label'] == 0]
spam_data = mail_data[mail_data['label'] == 1]

spam_oversampled = resample(spam_data, replace=True, n_samples=len(ham_data), random_state=42)
mail_data_balanced = pd.concat([ham_data, spam_oversampled]).sample(frac=1, random_state=42)

print(mail_data_balanced['label'].value_counts())  # Now balanced!



label
0    4825
1    4825
Name: count, dtype: int64


In [None]:
#separataing the texts and labels

x = mail_data_balanced['message']

y = mail_data_balanced['label']

In [None]:
print(x)

4365              So what about you. What do you remember
4685    My life Means a lot to me, Not because I love ...
4812    E admin building there? I might b slightly ear...
3948                 Sorry, went to bed early, nightnight
5449    Latest News! Police station toilet stolen, cop...
                              ...                        
5497    SMS SERVICES. for your inclusive text credits,...
1780    Loan for any purpose £500 - £75,000. Homeowner...
4377    If you don't, your prize will go to another cu...
1014    Just buy a pizza. Meat lovers or supreme. U ge...
4918    This is the 2nd time we have tried 2 contact u...
Name: message, Length: 9650, dtype: object


In [None]:
print(y)

4365    0
4685    0
4812    0
3948    0
5449    1
       ..
5497    1
1780    1
4377    1
1014    0
4918    1
Name: label, Length: 9650, dtype: object


TRAIN TEST SPLIT

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
print(x_train.shape, x_test.shape)

(7720,) (1930,)


FEATURE EXTRACTION

In [None]:
#TRANSFORM THE TEXT DATA TO FEATURE VECTORSTHAT CAN BE USED AS INPUT TO THE LOGISTIC REGRESSION

feature_extraction = TfidfVectorizer(min_df=1, lowercase=True)

x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

#convert y_train and y_test as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [None]:
print(x_train_features)

  (0, 5004)	0.21639039418965486
  (0, 1617)	0.3861298511021601
  (0, 7164)	0.265044528374306
  (0, 5155)	0.2904808653431019
  (0, 3696)	0.26341710185364814
  (0, 2047)	0.2908417965413917
  (0, 5224)	0.2311792885327263
  (0, 3279)	0.2102162910132164
  (0, 4873)	0.2213055190321867
  (0, 6798)	0.37788476838219226
  (0, 3131)	0.45394481679055965
  (1, 7536)	0.19006123255103358
  (1, 1477)	0.28988537554075156
  (1, 1274)	0.17303419956087415
  (1, 605)	0.32631234012082416
  (1, 7197)	0.13573563083630638
  (1, 5461)	0.1927432335762193
  (1, 3878)	0.30504286993365853
  (1, 1811)	0.15346687087937247
  (1, 1741)	0.0895619241399662
  (1, 136)	0.30504286993365853
  (1, 1964)	0.1360727181229329
  (1, 2824)	0.17509593899746123
  (1, 832)	0.30504286993365853
  (1, 5463)	0.20868119433127066
  :	:
  (7719, 3324)	0.13852512741215606
  (7719, 3143)	0.10725726089158007
  (7719, 5051)	0.09711980701390971
  (7719, 2129)	0.1943675711410635
  (7719, 6748)	0.24688629961906178
  (7719, 7478)	0.11437737184859038

In [None]:
print(x_test_features)

  (0, 796)	0.5173408469147439
  (0, 1770)	0.21272996891606338
  (0, 1887)	0.1429236436272745
  (0, 2420)	0.18853941215267642
  (0, 2487)	0.25867042345737196
  (0, 3303)	0.20755008429069582
  (0, 3380)	0.1898590679565677
  (0, 3677)	0.20015761856708947
  (0, 4341)	0.3162532254817115
  (0, 4741)	0.1557941427271965
  (0, 4967)	0.12318594369756866
  (0, 5149)	0.19420962544894904
  (0, 5404)	0.12829454730028964
  (0, 6223)	0.14049885497582076
  (0, 6250)	0.11759814223516517
  (0, 6271)	0.13591569149718716
  (0, 6283)	0.18214149067057875
  (0, 6284)	0.25867042345737196
  (0, 6748)	0.11203238486393022
  (0, 7045)	0.109307023525567
  (0, 7074)	0.08004419142613699
  (0, 7197)	0.1688784301821946
  (0, 7478)	0.10380462392962031
  (0, 7562)	0.1493277473905602
  (0, 7996)	0.0822490901825249
  :	:
  (1929, 421)	0.2238004578363877
  (1929, 477)	0.24748210260344145
  (1929, 777)	0.2613349767475673
  (1929, 1579)	0.2613349767475673
  (1929, 1866)	0.25237119894547155
  (1929, 1944)	0.20011881306933396
 

In [None]:
y_train

Unnamed: 0,label
2986,0
2791,1
5084,0
4139,0
2770,1
...,...
4768,1
856,1
4237,1
1407,1


In [None]:
y_test

Unnamed: 0,label
4534,1
3978,1
1358,0
4206,1
2730,1
...,...
1041,0
1659,1
1299,0
4332,0


MODELL TRAINING

In [None]:
lr_model = LogisticRegression()

NameError: name 'LogisticRegression' is not defined

In [None]:
#training the model

lr_model.fit(x_train_features, y_train)

MODEL EVALUATION

In [None]:
#predict on training data

prediction_on_training_data = lr_model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [None]:
print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9924870466321244


In [None]:
#prediction on test data

#predict on training data

prediction_on_test_data = lr_model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [None]:
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9906735751295337


BUILD A PREDICTIVE SYSTEM

In [None]:
message = """Subject: 🎉 You’ve Been Selected – Claim Your Special Gift Now!

Body:
Hi [Your Name],

We wanted to personally reach out and thank you for your continued support. As a valued customer, you are eligible for an exclusive loyalty reward! 🎁

There’s nothing you need to buy, just click the link below to confirm your shipping details, and we’ll send your surprise gift straight to your door.

👉 YourCompany.com/reward

Hurry, this offer is only available for the next 24 hours! If you have any questions, feel free to reply to this email.

Best regards,
[YourCompany] Customer Support Team"""
input_mail = [message]

#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making prediction

prediction = lr_model.predict(input_data_features)
print(prediction)

if (prediction[0]==1):
  print('spam MAIL')
else:
  print('ham MAIL')

[1]
spam MAIL


In [None]:
print(mail_data_balanced['label'].value_counts())


label
0    4825
1    4825
Name: count, dtype: int64


My model wasn't predicting correct spam and ham mails , until when i added back the stop words......removing the stop words was a mistake.

NAIVE BAYES CLASSIFIER

In [None]:
nb = MultinomialNB(alpha=1.0)

nb.fit(x_train_features, y_train)

In [None]:
#predict on training data

prediction_on_training_data = nb.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9887305699481865


In [None]:
#predict on test data

prediction_on_test_data = nb.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9854922279792746


In [None]:
input_mail = [message]

#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making prediction

prediction = nb.predict(input_data_features)
print(prediction)

if (prediction[0]==1):
  print('spam MAIL')
else:
  print('ham MAIL')

[1]
spam MAIL


SUPPORT VECTOR MACHINE

In [None]:
svm = SVC(kernel='linear', C=1.0, random_state=42)

svm.fit(x_train_features, y_train)

In [None]:
#predict on training data

prediction_on_training_data = svm.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9997409326424871


In [None]:
#predict on test data

prediction_on_test_data = svm.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9974093264248705


In [None]:

input_mail = [message]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making prediction

prediction = svm.predict(input_data_features)
print(prediction)

if (prediction[0]==1):
  print('spam MAIL')
else:
  print('ham MAIL')

[1]
spam MAIL
