# Spam classifier using Multinomial Naive Bayes

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [2]:
def dataFrameFromDirectory(path, classification):
    df=pd.DataFrame({'message': [], 'class': []})
    for filename in os.listdir(path):
        f=open(os.path.join(path,filename),'r')
        message=f.read()
        f.close()
        message=unicode(message.split('\n\n',1)[1],errors='ignore')
        df=df.append({'message': message, 'class': classification},ignore_index=True)
    return df
data = dataFrameFromDirectory('./emails/spam', 'spam')
data = data.append(dataFrameFromDirectory('./emails/ham', 'ham'))

In [3]:
data.head()

Unnamed: 0,class,message
0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
1,spam,"<HTML>\n<PRE>\nDear Valued Member,\n\nOfferClu..."
2,spam,<$B;v6H<T(B>\n$B;aL>(B:Vip-mail\n$BFMA3$N...
3,spam,<HTML><HEAD>\n<META http-equiv=3DContent-Type ...
4,spam,"\nWET, HORNY AND DIRTY GRANDMAS!\n\nRemember h..."


In [4]:
data.groupby('class').count()

Unnamed: 0_level_0,message
class,Unnamed: 1_level_1
ham,2500
spam,500


In [5]:
train, test = train_test_split(data,test_size=0.2,stratify=data['class'])

In [6]:
train.groupby('class').count()

Unnamed: 0_level_0,message
class,Unnamed: 1_level_1
ham,2000
spam,400


In [7]:
test.groupby('class').count()

Unnamed: 0_level_0,message
class,Unnamed: 1_level_1
ham,500
spam,100


In [9]:
vectorizer = CountVectorizer()
train_counts=vectorizer.fit_transform(train['message'].values)
classifier = MultinomialNB()
classifier.fit(train_counts, train['class'].values)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
train_predictions=classifier.predict(train_counts)
confusion_matrix(train['class'].values,train_predictions)

array([[1997,    3],
       [  57,  343]])

In [11]:
test_counts=vectorizer.transform(test['message'].values)
test_predictions=classifier.predict(test_counts)
confusion_matrix(test['class'].values,test_predictions)

array([[498,   2],
       [ 27,  73]])

In [14]:
class baggerMultinomialNB():
    def __init__(self,data,test_size=0.2,num_bags=1,class_name='class',classes=['ham','spam']):
        self.num_bags=num_bags
        self.class_name=class_name
        self.classes=classes
        self.train_data,self.test_data=train_test_split(data,test_size=test_size,stratify=data[self.class_name])
    def train(self):
        self.vectorizer = CountVectorizer()
        self.vectorizer.fit(self.train_data['message'].values)
        train_grouped=self.train_data.groupby([self.class_name])
        train_lg=train_grouped.get_group(self.classes[0])
        train_sm=train_grouped.get_group(self.classes[1])
        train_lg_bag=np.random.randint(self.num_bags,size=len(train_lg))
        train_lg.loc[:,'bag']=pd.Series(train_lg_bag,index=train_lg.index)
        train_lg_bagged=train_lg.groupby(['bag'])
        self.classifiers=[]
        for bag in train_lg_bagged.groups:
            train_bag=train_lg_bagged.get_group(bag).drop(['bag'],axis=1).append(train_sm)
            train_counts=self.vectorizer.transform(train_bag['message'].values)
            classifier = MultinomialNB()
            classifier.fit(train_counts, train_bag['class'].values)
            self.classifiers.append(classifier)
    def predict(self,individuals):
        individuals_counts=self.vectorizer.transform(individuals)
        votes=[]
        for classifier in self.classifiers:
            votes.append(classifier.predict(individuals_counts))
        return pd.DataFrame(votes).apply(lambda col: col.value_counts().idxmax())

In [15]:
baggedFilter=baggerMultinomialNB(data,num_bags=1)
baggedFilter.train()
baggedFilter.predict(['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


0    spam
1     ham
dtype: object

In [17]:
train_predictions=baggedFilter.predict(train['message'].values)
confusion_matrix(train['class'].values,train_predictions)

array([[1997,    3],
       [  65,  335]])

In [19]:
test_predictions=baggedFilter.predict(test['message'].values)
confusion_matrix(test['class'].values,test_predictions)

array([[498,   2],
       [ 15,  85]])

In [20]:
baggedFilter=baggerMultinomialNB(data,num_bags=5)
baggedFilter.train()
baggedFilter.predict(['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"])

0    spam
1     ham
dtype: object

In [21]:
train_predictions=baggedFilter.predict(train['message'].values)
confusion_matrix(train['class'].values,train_predictions)

array([[1994,    6],
       [  10,  390]])

In [22]:
test_predictions=baggedFilter.predict(test['message'].values)
confusion_matrix(test['class'].values,test_predictions)

array([[497,   3],
       [  5,  95]])