# Email Spam Detection Using Machine Learning

This notebook trains a Logistic Regression classifier on TF-IDF text features to detect spam messages, using the SMS Spam Collection dataset.

## 1. Import the libraries

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Import the dataset

In [81]:
 # Read the SMS Spam Collection CSV from the Kaggle input directory.
 # encoding='latin' is passed explicitly; presumably the file is not valid UTF-8 (TODO confirm).
 df = pd.read_csv(
     '/kaggle/input/sms-spam-collection-dataset/spam.csv',
     encoding='latin',
 )


In [82]:
# Preview the first ten rows of the freshly loaded frame.
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


## 3. Cleaning the data

In [83]:
# Count the missing values in each column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [84]:
# Remove the three "Unnamed" columns, which the null count shows hold
# thousands of missing values each.
# drop() returns a new frame, and errors='ignore' skips columns that are
# already gone, so this cell can be re-run without raising KeyError
# (three bare `del` statements would fail on the second run).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [85]:
# Preview the first ten rows again to confirm which columns remain.
df.head(n=10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [86]:
# Give the two remaining columns descriptive names, modifying the frame in place:
# v1 holds the class label and v2 holds the message text (named "spam" here).
df.rename(
    columns={"v1": "label", "v2": "spam"},
    inplace=True,
)

In [87]:
# List the distinct class labels present in the data.
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [88]:
#One hot encoding
df['label'] = df.label.map({'ham':0, 'spam':1})

In [89]:
# Inspect the first five rows after encoding the labels.
df.head(n=5)

Unnamed: 0,label,spam
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
# Re-check for missing values now that the frame has been cleaned.
df.isna().sum()

label    0
spam     0
dtype: int64

## 4. Training the model

In [91]:
# Separate the message text (model input) from the encoded label (target).
x, y = df['spam'], df['label']

In [92]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [93]:
# Feature extraction
# Turn the raw message text into TF-IDF feature vectors that can be used as
# input to the logistic regression model.
from sklearn.feature_extraction.text import TfidfVectorizer

# `lowercase` must be the boolean True, not the string 'True'. A string only
# works by accident (any non-empty string is truthy, so even 'False' would
# enable lowercasing), and newer scikit-learn releases validate parameter
# types and reject it.
fe = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [94]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
x_train_feature = fe.fit_transform(x_train)
x_test_feature = fe.transform(x_test)

# Cast the train and test labels to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [95]:
# Fit a logistic regression classifier (default settings) on the TF-IDF
# training features. The fit call is left as the last expression so the
# fitted estimator is shown as the cell output.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train_feature, y_train)

LogisticRegression()

In [96]:
# Predict labels for the training features themselves (in-sample predictions).
y_pred = lr.predict(x_train_feature)
print(y_pred)

[0 0 0 ... 0 0 0]


## 5. Testing the Model

In [97]:
# Predict labels for the held-out test features.
y_predict = lr.predict(x_test_feature)
print(y_predict)

[0 0 0 ... 0 0 0]


## 6. Calculate the accuracy of the model

In [98]:
# Fraction of test messages whose predicted label matches the true label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)

In [99]:
# Show the test accuracy as a percentage.
accuracy * 100

95.60538116591928