# Email Spam Detection

In [1]:
# Importing the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the raw CSV into a DataFrame; the file is decoded as ISO-8859-1
data = pd.read_csv(
    "spam emails.csv",
    encoding="ISO-8859-1",
)
# Show the loaded frame as the cell output
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Count the missing entries in every column
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [4]:
# Remove only the spurious "Unnamed: N" columns (almost entirely null, per the
# missing-value counts shown earlier). A blanket dropna(axis=1) would also
# delete the label or text column if it ever contained a single missing value.
unnamed_columns = [col for col in data.columns if str(col).startswith("Unnamed")]

# Then discard any row that is still missing its label or its text, so later
# steps never receive NaN in place of a string.
data = data.drop(columns=unnamed_columns).dropna(axis=0)
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Preview the first five rows
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Per-column summary statistics of the frame
column_summary = data.describe()
column_summary

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# (number of rows, number of columns)
(len(data.index), len(data.columns))

(5572, 2)

In [8]:
# Re-count missing entries per column after the cleanup step
data.isna().sum()

v1    0
v2    0
dtype: int64

In [9]:
# Give the two remaining columns descriptive names
column_names = {'v1': 'label', 'v2': 'email'}
data = data.rename(columns=column_names)
data

Unnamed: 0,label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
# Rename the 'ham' class to 'non-spam'; 'spam' entries are left as they are
data['label'] = data['label'].replace(to_replace='ham', value='non-spam')

In [11]:
# Preview the first five rows after relabelling
data.head(n=5)

Unnamed: 0,label,email
0,non-spam,"Go until jurong point, crazy.. Available only ..."
1,non-spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,non-spam,U dun say so early hor... U c already then say...
4,non-spam,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Encode the text labels as integers in place: spam -> 0, non-spam -> 1
for label_text, label_code in (('spam', 0), ('non-spam', 1)):
    data.loc[data['label'] == label_text, 'label'] = label_code

In [13]:
# Preview the first ten rows with the encoded labels
data.head(n=10)

Unnamed: 0,label,email
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [14]:
# The labels were already encoded earlier in the notebook (0 = spam, 1 = non-spam),
# so repeating the conversion here would change nothing. Verify the encoding
# instead, so an unexpected label value fails loudly before training.
unexpected_labels = set(data['label'].unique()) - {0, 1}
assert not unexpected_labels, f"Unencoded label values remain: {unexpected_labels}"

In [15]:
# Separate the model input (text column) from the target (encoded label column)
x, y = data['email'], data['label']

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:

# Cast both label series to integers before they are given to the classifier
y_train, y_test = y_train.astype(int), y_test.astype(int)

# TF-IDF feature extraction: lowercase the text and ignore English stop words
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text
x_train_feature = tfidf.fit_transform(x_train)
x_test_feature = tfidf.transform(x_test)

In [18]:
# Create a support vector classifier with the library's default settings
svm = SVC()

# Train it on the TF-IDF features of the training split
svm.fit(X=x_train_feature, y=y_train)

In [19]:
# Predict labels for the held-out test features
y_pred = svm.predict(x_test_feature)

# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9766816143497757


In [20]:
# Printing the classification report
# Label encoding used in this notebook: 0 = spam, 1 = non-spam. Passing the
# class names makes the rows readable instead of showing bare 0/1.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["spam", "non-spam"],
)

# Print the report on its own lines: printing it after a prefix on the same
# line shifts the header row out of alignment with the columns below it.
print("Classification report:")
print(report)

Classification report:                precision    recall  f1-score   support

           0       0.99      0.83      0.91       150
           1       0.97      1.00      0.99       965

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Predicting spam and non-spam message
# Label encoding used by the trained model: 0 = spam, 1 = non-spam.

# Read one message from the user; surrounding whitespace carries no signal
mail_message = [input("Enter a message: ").strip()]

if not mail_message[0]:
    # Nothing was typed, so there is no text to score; skip the prediction
    # rather than report a label for an empty message.
    print("No message entered - nothing to classify.")
else:
    # Reuse the vectorizer fitted on the training data so the features line up
    # with what the classifier was trained on.
    mail_feature = tfidf.transform(mail_message)
    prediction = svm.predict(mail_feature)
    print(prediction)

    # Use the same class name as the rest of the notebook ('non-spam')
    if prediction[0] == 1:
        print("non-spam")
    else:
        print("spam")