Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Data Preprocessing

In [None]:
#load the dataset to pandas Data Frame
raw_mail_data=pd.read_csv('spam.csv', encoding = "ISO-8859-1")
#replace null values with null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [None]:
mail_data.shape

(5572, 5)

In [None]:
mail_data.head()

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
mail_data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [None]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
#Label spam mail as 1;Non spam mail(ham) as 0

In [None]:
mail_data['Category'] = mail_data['Category'].map( {'spam': 1, 'ham': 0})

In [None]:
mail_data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [None]:
#Seperate the data as text and label.
#X -->Text
#Y --->Label
X=mail_data['Message']
Y=mail_data['Category']

In [None]:
Y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64

In [None]:
print(X)
print('\n   --------------------------------------              \n')
print(Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

   --------------------------------------              

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


Train Test Split

In [None]:
#Splitting the data as train data and test data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,test_size=0.2,random_state=3,stratify=Y)

Feature Extraction


In [None]:
#Transform the text data to feature vectors that can be used as input to the SVM model using TfidVectorizer
#Convert the text to lower case letters
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


In [None]:
#Convert Y_train and Y_test values as integers
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

Training the model ---->Support Vector Machine

In [None]:
#Training the support vector machine model with training data
model=LinearSVC()
model.fit(X_train_features,Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

Evaluation of the model

In [None]:
#Prediction on training data
prediction_on_training_data=model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [None]:
print("Accuracy on training data :",accuracy_on_training_data)

Accuracy on training data : 1.0


In [None]:
#Prediction on test data
prediction_on_testing_data=model.predict(X_test_features)
accuracy_on_testing_data=accuracy_score(Y_test,prediction_on_testing_data)

In [None]:
print("Accuracy on testing data :",accuracy_on_testing_data)

Accuracy on testing data : 0.9874439461883409


In [None]:
#Prediction on new mail
input_mail =["ehronline web address change this message is intended for ehronline users only . due to a..."]
#Convert text to feature vectors
input_mail_features = feature_extraction.transform(input_mail)

In [None]:
#Making prediction
prediction=model.predict(input_mail_features)
print(prediction)
if prediction[0]==1:
  print("Spam Mail")
else :
  print("Ham Mail")

[0]
Ham Mail
