# Email Spam Detection

### Importing a CSV file which contains both spam and ham emails

In [1]:
import pandas as pd              #import the pandas library under its conventional alias, pd

In [2]:
# Load the labelled messages from spam.csv (path is relative to the working directory).
data = pd.read_csv('spam.csv')
data.head(5)                     # preview the first five rows

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Feature Engineering or Data processing

In [3]:
data.isnull().sum()   # number of missing (NaN) entries in each column

Category    0
Message     0
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder #the Category column holds categorical labels, so we need to encode them as integers.

In [6]:
# Encode the text labels in Category as integers and store them in a new 'Spam' column.
# LabelEncoder numbers the classes in sorted order, so ham -> 0 and spam -> 1.
le = LabelEncoder()
data['Spam'] = le.fit(data['Category']).transform(data['Category'])
data.head(5)

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# The numeric 'Spam' column now carries the label, so the text 'Category' column can go.
new = data.drop(columns=['Category'])
new.head(5)

Unnamed: 0,Message,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


### Splitting the dataset into train and test

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# test_size sets the held-out fraction; random_state fixes the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    new['Message'],
    new['Spam'],
    test_size=0.2,
    random_state=10,
)

In [11]:
x_train.shape[0], x_test.shape[0]    # number of messages in the train and test sets (80% / 20%)

(4457, 1115)

### Forming a pipeline to avoid repetition of code

In [17]:
from sklearn.pipeline import Pipeline                        #chains the vectorizer and the classifier into a single estimator
from sklearn.feature_extraction.text import CountVectorizer  #converts each string in the Message column into a vector of token counts
from sklearn.naive_bayes import MultinomialNB                #the algorithm we are going to use: Multinomial Naive Bayes

In [20]:
# Two-step pipeline: raw text -> token-count vectors -> Multinomial Naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

### Training and evaluating the model through pipeline

In [21]:
pipe.fit(X=x_train, y=y_train)     # fit the vectorizer and the classifier on the training split

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [22]:
pipe.predict(X=x_test)          # predicted labels for the held-out test messages

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
pipe.score(X=x_test, y=y_test)     # mean accuracy of the model on the test split

0.9829596412556054

In [26]:
# Hand-written sample messages: the first one reads like spam,
# while the other two read like ordinary (ham) messages.
test_email = [
    "Grab your 50% discount, by clicking the link below",
    "Iam busy right now, call you later",
    "This is the last date to pay your electricity bill",
]

In [27]:
pipe.predict(X=test_email)  # classify the sample emails (1 = spam, 0 = ham)

array([1, 0, 0])