# Spam Filter

In [1]:
#Import Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB



In [2]:
# Import data: read the labelled messages from a CSV in the working directory.
# NOTE(review): the relative path assumes "spam.csv" sits beside the notebook — confirm.
spam_df = pd.read_csv("spam.csv")

In [3]:
# Preview the loaded frame; later cells read its "Category" and "Message" columns.
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Grouping alone is lazy: this displays only the GroupBy object's repr, not any data.
spam_df.groupby('Category')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000216FFA9DF40>

In [5]:
# Summarise the messages per category (count, unique, top, freq).
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# We have 4825 ham (normal) messages, 4516 of them unique; the most frequent
# one is "Sorry, I'll call later" (30 occurrences).

In [7]:
# we create a new column 'spam'

In [8]:
# Turn spam/ham into numerical data in a new 'spam' column: 1 for spam, 0 otherwise.
# A vectorized comparison replaces the row-by-row apply/lambda; the explicit
# int64 cast keeps the column dtype the same on every platform.
spam_df['spam'] = spam_df['Category'].eq('spam').astype('int64')

In [9]:
# Re-display the frame to check the new 'spam' column against 'Category'.
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [10]:
# Create train/test split: hold out 25% of the messages for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same vocabulary, model and score further down.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.Message, spam_df.spam, test_size=0.25, random_state=42
)

In [11]:
# Inspect the training messages (the out-of-order index shows the rows were shuffled).
x_train

4282    Wn u r hurt by d prsn who s close 2 u, do figh...
458     I hope you that's the result of being consiste...
5474    Where's mummy's boy ? Is he being good or bad ...
5165                      Ü still got lessons?  Ü in sch?
5226        Prabha..i'm soryda..realy..frm heart i'm sory
                              ...                        
5519    Can you pls send me that company name. In saib...
4452    And that is the problem. You walk around in "j...
1685                       Do you want bold 2 or bb torch
2319                            On the way to office da..
1518    Our brand new mobile music service is now live...
Name: Message, Length: 4179, dtype: object

In [12]:
# Summary of the training text: count, unique, top and freq.
x_train.describe()

count                       4179
unique                      3903
top       Sorry, I'll call later
freq                          23
Name: Message, dtype: object

In [13]:
# Find word counts and store them as a sparse document-term matrix.
# The vectorizer is fitted on the training messages only; the same fitted `cv`
# is reused with transform() for every later input.
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [14]:
# Show the matrix summary (shape, dtype, number of stored elements).
x_train_count

<4179x7326 sparse matrix of type '<class 'numpy.int64'>'
	with 55438 stored elements in Compressed Sparse Row format>

In [15]:
# Peek at the dense form of the counts.
# NOTE(review): toarray() materialises the full 4179 x 7326 int64 matrix
# (about 245 MB) only to print a truncated view; slicing a few rows first would be cheaper.
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and training are chained;
# the trailing bare name keeps the estimator repr as the cell output.
model = MultinomialNB().fit(x_train_count, y_train)
model

MultinomialNB()

In [17]:
# Pre-test (ham): an everyday message that should be classified as not spam.
email_ham = ['hey wanna meet up for the game ?']
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# columns match the training matrix.
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

0 ==> Not spam 

In [18]:
# Pre-test (spam): a short message that should be classified as spam.
email_spam = ["reward money click "]
# Encode with the vocabulary fitted on the training set before predicting.
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

1 ==> Spam

In [19]:
# One more ad-hoc check on a short message.
email = ["baseball tickets later"]
# Encode with the vocabulary fitted on the training set before predicting.
email_count = cv.transform(email)
model.predict(email_count)

array([0], dtype=int64)

In [25]:
# Test model: encode the held-out messages with the training vocabulary and
# report the classifier's mean accuracy on them.
# NOTE(review): roughly 87% of the messages are ham (4825 vs 747 spam), so
# accuracy alone is a weak signal — consider precision/recall for the spam class.
x_test_count = cv.transform(x_test)
model.score(x_test_count,y_test)

0.9813352476669059

### Testing with a real email 

In [22]:
# Full text of a promotional course email, used below as a longer, realistic input.
email_text = """
You might have been here before...
I know I have.

Going through tutorial after tutorial on YouTube.

Scrolling through PDFs that you found on some sketchy site.

You just want to get started with Machine LEARNING!!

WHY DOES THIS HAVE TO BE SO HARD

Well...

Imagine a land where you can smash-through-a-tech-course and learn everything you need to get started with the holy trinity.

Data Science
Machine Learning
And....you guessed it. Deep Learning!

Well, it's just a dream. 

Wake up.

....seriously?

Nah, nah I kid. I coded up that exact course. 

Originally parts of it were released on my YouTube channel but I realised I could provide better support and guidance through a proper course site. So when you need help or things don't a go quite so right (*read this in Mario's voice*)...I can help. 

So whattayouwaitingfor!

Here's how to get started learning ASAP:

1. Click the button below,  you'll be taken to the Python course at Courses from Nick. 

GET THE COURSE FREE!
2. Use the discount code FREEPYTHON for 100% OFF.

3. Start coding!

✌️

Nick

P.s. You'll learn all the basics and then some. How to work with Jupyter, Python, Numpy, Pandas and a bunch more!

P.p.s. I sliced and diced up the course so it's focused on what you need. No random stuff you don't!
"""

In [23]:
# Wrap the single email in a list: the vectorizer is given a collection of
# documents, one entry per message.
mail = [email_text]

In [24]:
# Encode the email with the training vocabulary and classify it (1 = spam, 0 = not spam).
mail_count = cv.transform(mail)
model.predict(mail_count)

array([0], dtype=int64)

### Result: the model predicts 0 — not spam
