<a href="https://colab.research.google.com/github/chanu0073/Spam-Mail-Detection-ML-Project/blob/main/Spam_Mail_Detection_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the required dependencies

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the NLTK stop-word corpus into the local nltk_data directory.
# NOTE(review): the visible cells never read nltk's stop words (the TF-IDF step
# passes stop_words='english' to scikit-learn instead) — confirm this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the labelled mail dataset. The path points at the Colab working directory;
# adjust it when running the notebook elsewhere.
mail_data_path = '/content/mail_data.csv'
mails_data = pd.read_csv(mail_data_path)

In [None]:
# Preview the first rows to check the column layout (Category, Message).
mails_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (number of rows, number of columns) of the loaded dataset.
mails_data.shape

(5572, 2)

In [None]:
# Class balance: how many messages carry each Category label.
mails_data['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Count missing values per column before any cleaning is applied.
mails_data.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Replace any missing values with empty strings so the text vectorizer never receives NaN.
mails_data = mails_data.fillna('')

Label Encoding

In [23]:
# Encode the target column in place: spam -> 0, ham -> 1.
spam_rows = mails_data['Category'] == 'spam'
ham_rows = mails_data['Category'] == 'ham'
mails_data.loc[spam_rows, 'Category'] = 0
mails_data.loc[ham_rows, 'Category'] = 1

In [24]:
# Preview again to confirm Category now holds the numeric codes (spam = 0, ham = 1).
mails_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Separating the features and target

In [27]:
# Message text is the model input (X); the encoded category is the target (Y).
X, Y = mails_data['Message'], mails_data['Category']

In [28]:
# Show the features followed by the target, one Series per block of output.
print(X, Y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Feature extraction: converting the message text into TF-IDF vectors

In [29]:
# Turn each message into a TF-IDF weighted vector: text is lower-cased and
# scikit-learn's built-in English stop-word list is removed.
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split performed later in the notebook, so its vocabulary and IDF statistics
# include the test rows — consider fitting on the training split only.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_features = vectorizer.fit_transform(X)

In [30]:
# Print the sparse feature matrix: each output line is a (row, column) position
# followed by the weight stored there.
print(X_features)

  (0, 4244)	0.3509747937708837
  (0, 5775)	0.27451666535585145
  (0, 2278)	0.27180581777101714
  (0, 1281)	0.2625176995308806
  (0, 1715)	0.29650492406235857
  (0, 3551)	0.19387866945820548
  (0, 8281)	0.2374071580094415
  (0, 4370)	0.29650492406235857
  (0, 1713)	0.3350433781715565
  (0, 2003)	0.29650492406235857
  (0, 3511)	0.16453831818791095
  (0, 1061)	0.3509747937708837
  (0, 8079)	0.19610332236431888
  (1, 5373)	0.2718944069420321
  (1, 4406)	0.4083258549263009
  (1, 4212)	0.5236804332035243
  (1, 8187)	0.43162957585464123
  (1, 5399)	0.5466243141314314
  (2, 3276)	0.11676028650249681
  (2, 2885)	0.3644022596021207
  (2, 8239)	0.19287984407221892
  (2, 2119)	0.19686982823560253
  (2, 8199)	0.14953315491852773
  (2, 3014)	0.47550942852592687
  (2, 2337)	0.20418515380343544
  :	:
  (5567, 2777)	0.23210746089026935
  (5567, 307)	0.24294734175129457
  (5567, 707)	0.2498172309216542
  (5567, 5836)	0.2799050574582319
  (5567, 165)	0.33657880027391784
  (5567, 5293)	0.33657880027391784

In [38]:
# The encoded labels are still stored with object dtype; cast them to integers for the classifier.
Y = Y.astype(int)

Splitting the data for training and testing

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam) — consider
# passing stratify=Y so both splits keep the same class ratio.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_features,
    Y,
    test_size=0.2,
    random_state=3,
)

In [40]:
# Sanity-check the split sizes: (rows, features) for train, then for test.
print(X_train.shape, X_test.shape, sep='\n')

(4457, 8440)
(1115, 8440)


Training the model

In [41]:
# Fit a logistic-regression classifier, using its default settings, on the training split.
model = LogisticRegression()
model.fit(X_train,Y_train)

In [44]:
# Score the model on the same rows it was trained on.
training_data_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred). Accuracy gives the same value
# either way, but the documented order keeps the cell correct if an asymmetric
# metric (precision, recall, ...) is swapped in later.
training_data_accuracy = accuracy_score(Y_train, training_data_prediction)

Accuracy score of training data

In [45]:
# Fraction of training rows the model classified correctly.
print(training_data_accuracy)

0.9670181736594121


In [46]:
# Score the model on the held-out rows it did not see during training.
test_data_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy gives the same value
# either way, but the documented order keeps the cell correct if an asymmetric
# metric (precision, recall, ...) is swapped in later.
test_data_accuracy = accuracy_score(Y_test, test_data_prediction)

Accuracy score of test data

In [47]:
# Fraction of held-out test rows the model classified correctly.
print(test_data_accuracy)

0.9524663677130045


Making a predictive system

In [57]:
# Classify a single held-out message (row 4 of the test matrix) and report the verdict.
X_new = X_test[4]
prediction = model.predict(X_new)
print(prediction)

# predict returns a one-element array for a single row, so compare its scalar entry.
# Labels were encoded as spam = 0, ham = 1.
if prediction[0] == 1:
  print('Ham mail')
else:
  print('Spam mail')

[1]
The Mail is Not spam


In [63]:
input_mail = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

# convert the text to feature vectors with the already-fitted vectorizer
# (transform, not fit_transform, so the columns match what the model was trained on)
input_mail_features = vectorizer.transform(input_mail)

prediction = model.predict(input_mail_features)
# predict returns a one-element array for a single mail, so compare its scalar entry.
# Labels were encoded as spam = 0, ham = 1.
if prediction[0] == 1:
  print('Ham mail')
else:
  print('Spam mail')

Spam mail
