# Spam Classifier using Naive Bayes

### Import Libraries & Read File

In [1]:
import numpy as np
import pandas as pd

In [2]:
file = 'spam.csv'
import chardet

# Sniff the encoding from the first 100 kB of raw bytes. The guess is kept in
# its own variable; previously it was stored in `df`, which therefore held the
# chardet result for a moment before being overwritten by the DataFrame.
with open(file, 'rb') as rawdata:
    encoding_guess = chardet.detect(rawdata.read(100000))

# The file is read with a fixed ISO-8859-1 encoding; the chardet guess above
# is informational only and is not fed into read_csv.
df = pd.read_csv(file, encoding='ISO-8859-1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [3]:
# Drop the unused trailing columns (empty in the rows previewed above).
# Reassigning instead of using `inplace=True`, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [4]:
# Show the frame after the column drop (the rendered output elides the middle rows).
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Natural Language Preprocessing

In [5]:
# NLP Libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [6]:
# One shared WordNet lemmatizer instance, reused by the preprocessing code below.
lemmatizer = WordNetLemmatizer()

In [7]:
# Tokenization & Lemmatizing
# Build the stop-word set once; the original rebuilt it for every single token,
# which made this cell needlessly slow.
_stop_words = set(stopwords.words('english'))


def _lemmatize_without_stopwords(text):
    """Tokenize `text`, drop English stop words, lemmatize the rest, re-join with spaces."""
    # NOTE(review): the stop-word test is an exact, case-sensitive match (as in
    # the original). Presumably capitalised stop words therefore survive this
    # step -- confirm against the NLTK list if that matters.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in _stop_words
    )


# `.map` returns a new Series on the same index, so `df['v2']` stays untouched
# and label lookups such as `sents[0]` keep working.
sents = df['v2'].map(_lemmatize_without_stopwords)
sents[0]

'Go jurong point , crazy .. Available bugis n great world la e buffet ... Cine got amore wat ...'

In [8]:
# Keep letters only, lower-case everything and collapse runs of whitespace
import re

_non_letters = re.compile('[^a-zA-Z]')


def _letters_only(text):
    """Replace every non-letter with a space, lower-case, and squeeze whitespace."""
    return ' '.join(_non_letters.sub(' ', text).lower().split())


corpus = [_letters_only(sents[idx]) for idx in range(len(sents))]
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [9]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is capped at 6000 features (the most frequent terms).
cv = CountVectorizer(max_features=6000)
_bow_counts = cv.fit_transform(corpus)  # document-term count matrix
X = _bow_counts.toarray()               # dense array used by the cells below

In [10]:
# (number of messages, number of vocabulary features) of the bag-of-words matrix
X.shape

(5572, 6000)

In [11]:
# Binary target: 1 = spam, 0 = ham, kept as a one-column frame named 'spam'.
# Comparing against the label explicitly (instead of get_dummies + drop_first)
# does not depend on the sort order of the categories, and it always yields
# integer 0/1 -- recent pandas versions make get_dummies return booleans,
# which would turn the predictions further down into True/False.
y = (df['v1'] == 'spam').astype(int).to_frame('spam')
y.head(4)

Unnamed: 0,spam
0,0
1,0
2,1
3,0


### Naive Bayes Model

In [12]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, constructed with no arguments (library defaults).
model = MultinomialNB()

In [14]:
# `y_train` is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning ("A column-vector y was passed when a 1d
# array was expected"). `fit` returns the estimator, so the cell still shows it.
model.fit(X_train, np.ravel(y_train))

  return f(*args, **kwargs)


MultinomialNB()

In [15]:
# Score of the fitted classifier on the held-out test split.
model.score(X_test, y_test)

0.9777458722182341

In [16]:
# Predicted labels for the test split; fed into the confusion matrix below.
y_predicted = model.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix
# Arguments are passed as (true labels, predicted labels).
cm = confusion_matrix(y_test, y_predicted)

In [18]:
# Per scikit-learn's convention rows are true classes and columns are
# predicted classes, in label order (0 = ham, 1 = spam).
cm

array([[1182,   20],
       [  11,  180]], dtype=int64)

In [19]:
# save the model to disk
import pickle

# The context manager flushes and closes the file handle, which the original
# bare `open(...)` call never closed.
# NOTE(review): only the classifier is stored; the fitted CountVectorizer `cv`
# is not written here, so 'model.pkl' alone cannot encode new text.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [20]:
# load the model from disk
# SECURITY: unpickling can execute arbitrary code embedded in the file; only
# load a 'model.pkl' that this notebook (or another trusted source) produced.
# The context manager closes the handle that the original left open.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [21]:
import numpy as np
new_input1 = "Hi Deepak! Can you call me today, it is urgent"
new_input2 = "Hello! You have chance to win Rs.300 voucher."


def preprocess_for_new_input(sents, verbose=True):
    """Encode one raw message as a single-row bag-of-words DataFrame.

    The columns are the fitted vocabulary of the notebook-level ``cv``
    vectorizer, in the vectorizer's own column order, so the row lines up
    with the matrix the model was trained on.

    Parameters
    ----------
    sents : str
        Raw message text.
    verbose : bool, default True
        Print every token that is found in the vocabulary (what the original
        implementation always did).

    Returns
    -------
    pandas.DataFrame
        One row with ``len(cv.vocabulary_)`` columns; each cell holds how many
        times that term occurs in the message.
    """
    # term -> column index, taken straight from the fitted vectorizer. This
    # replaces three fragile pieces of the original:
    #   * reading 'columns.csv', which is only written further down the
    #     notebook and therefore does not exist on a fresh run,
    #   * the hard-coded width of 6000,
    #   * `cv.get_feature_names()`, which newer scikit-learn releases removed
    #     and which was re-evaluated (a full list scan) for every token.
    vocabulary = cv.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)
    counts = np.zeros((1, len(feature_names)))

    # NOTE(review): this lower-cases *before* stop-word removal, whereas the
    # training cells filter stop words on the original casing and lower-case
    # afterwards. The order is kept as it was; align the two pipelines if the
    # mismatch turns out to matter.
    sents = re.sub('[^a-zA-Z]', ' ', sents).lower()
    stop_words = set(stopwords.words('english'))  # built once, not per token
    _words = [
        lemmatizer.lemmatize(_word)
        for _word in nltk.word_tokenize(sents)
        if _word not in stop_words
    ]

    for _word in _words:
        if _word in vocabulary:
            if verbose:
                print(_word)
            # Count occurrences, as the training matrix holds counts; the
            # original wrote a flat 1 regardless of repetitions.
            counts[0, vocabulary[_word]] += 1
    return pd.DataFrame(counts, columns=feature_names)


new_input_x1 = preprocess_for_new_input(new_input1)
new_input_x2 = preprocess_for_new_input(new_input2)

hi
deepak
call
today
urgent
hello
chance
win
voucher


In [22]:
# The classifier was fitted on a plain ndarray, so it is given plain arrays
# here as well; passing the DataFrames (which carry column names) makes newer
# scikit-learn releases warn about unexpected feature names.
# 0 - Ham
# 1 - Spam
for encoded_input in (new_input_x1, new_input_x2):
    print(loaded_model.predict(encoded_input.to_numpy()))

[0]
[1]


In [23]:
# Write a header-only CSV (no data rows, no index column) that lists the
# column names of the encoded input frame.
pd.DataFrame(columns = new_input_x1.columns).to_csv('columns.csv',index=False)

In [24]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.
