In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
print(df.describe)

<bound method NDFrame.describe of       Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\r\nthis deal is t...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\r\nthe transport...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...   
5168        2933   ham  Subject: calpine daily gas nomination\r\n>\r\n...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\r\ndea...   

      label_num  
0             0  
1    

In [3]:
df = df.drop(["Unnamed: 0", "label_num"], axis=1)
print(df.describe)

<bound method NDFrame.describe of      label                                               text
0      ham  Subject: enron methanol ; meter # : 988291\r\n...
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...
3     spam  Subject: photoshop , windows , office . cheap ...
4      ham  Subject: re : indian springs\r\nthis deal is t...
...    ...                                                ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...
5169   ham  Subject: industrial worksheets for august 2000...
5170  spam  Subject: important online banking alert\r\ndea...

[5171 rows x 2 columns]>


In [4]:
df.columns = ['label', 'text']
print(df.describe)

<bound method NDFrame.describe of      label                                               text
0      ham  Subject: enron methanol ; meter # : 988291\r\n...
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...
3     spam  Subject: photoshop , windows , office . cheap ...
4      ham  Subject: re : indian springs\r\nthis deal is t...
...    ...                                                ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...
5169   ham  Subject: industrial worksheets for august 2000...
5170  spam  Subject: important online banking alert\r\ndea...

[5171 rows x 2 columns]>


In [5]:
df['b_labels'] = df['label'].map({'ham': 0, 'spam': 1})
print(df.head())

  label                                               text  b_labels
0   ham  Subject: enron methanol ; meter # : 988291\r\n...         0
1   ham  Subject: hpl nom for january 9 , 2001\r\n( see...         0
2   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...         0
3  spam  Subject: photoshop , windows , office . cheap ...         1
4   ham  Subject: re : indian springs\r\nthis deal is t...         0


In [6]:
y = df['b_labels'].values
x_train, x_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33)
x_train = x_train.tolist()
x_test = x_test.tolist()

In [7]:
x_test

['Subject: reviews\r\nheather will be calling each of up to set up a time to discuss your review\r\nfor 2000 .\r\nplease bring with you two items that you plan on working on during the first\r\nsix months of 2001 .\r\nthese should be items that improve the process around your area or brings\r\nadditional\r\nincome to enron .\r\nthanks\r\nbob',
 "Subject: hl & p month to date flow\r\njanet . i ' m back from vacation ! i have updated the flow volumes for hl & p",
 'Subject: immediate contract payment .\r\nimmediate contract payment . our ref : cbn / ird / cbx / 021 / 05\r\nattn :\r\nduring the auditing and closing of all financial records of the central bank of nigeria ( cbn ) it was discovered from the records of outstanding foreign contractors due for payment with the federal government of nigeria in the year 2005 that your name and company is next on the list of those who will received their fund .\r\ni wish to officially notify you that your payment is being processed and will be rel

In [8]:
print(y_train)

[0 0 0 ... 0 0 0]


## 1- Trying tokenization

In [9]:
messages = x_train
word_set = set()
for message in messages:
    words = ''.join([c if c.isalpha() else ' ' for c in message]).split()
    for word in words:
        word_set.add(word.lower())
        
print(word_set)



In [10]:
word_list = list(word_set)
word_list = np.array(word_list)
print(len(word_list))

36759


In [11]:
# make a dictionary of words and their spam counts and ham counts
word_dict = {}
for word in word_list:
    word_dict[word] = (0,0)

for i in range(len(messages)):
        message = messages[i]
        words = ''.join([c if c.isalpha() else ' ' for c in message]).split()
        for word in words:
            word = word.lower()
            word_dict[word] = (word_dict[word][0] + (y_train[i] == 1), word_dict[word][1] + (y_train[i] == 0 ))
print(word_dict)



In [12]:
word_dict = {word: (word_dict[word][0] / len(messages),
                                 word_dict[word][1] / len(messages))
                                for word in word_dict}
word_dict

{'mambo': (0.0008660508083140878, 0.0),
 'musician': (0.0002886836027713626, 0.0002886836027713626),
 'courtroom': (0.0011547344110854503, 0.0),
 'cactus': (0.0002886836027713626, 0.0),
 'forecasting': (0.0, 0.0008660508083140878),
 'holmelin': (0.0, 0.0002886836027713626),
 'paratroop': (0.0002886836027713626, 0.0),
 'est': (0.00894919168591224, 0.003464203233256351),
 'jdoe': (0.0, 0.0002886836027713626),
 'intszetz': (0.0002886836027713626, 0.0),
 'reaching': (0.0002886836027713626, 0.0008660508083140878),
 'mont': (0.0005773672055427252, 0.0),
 'cleburne': (0.0, 0.02569284064665127),
 'fiie': (0.0002886836027713626, 0.0),
 'tablet': (0.0011547344110854503, 0.0),
 'fogy': (0.0002886836027713626, 0.0),
 'pressing': (0.0, 0.0005773672055427252),
 'honestly': (0.0, 0.0005773672055427252),
 'bluet': (0.0002886836027713626, 0.0),
 'presents': (0.0008660508083140878, 0.0002886836027713626),
 'hdvest': (0.0, 0.0002886836027713626),
 'juanita': (0.0008660508083140878, 0.0002886836027713626)

## 2- Writing the Naive Bayes Spam Classifier 

In [13]:
class NaiveBayesSpamClassifier:
    def fit(self, x_train, y_train):
        self.messages = x_train
        self.labels = y_train
        
        word_set = set()
        for message in self.messages:
            words = ''.join([c if c.isalpha() else ' ' for c in message]).split()
            for word in words:
                word_set.add(word.lower())
                
        word_list = list(word_set)
        word_list = np.array(word_list)
        print(len(word_list))
                
        # make a dictionary of words and their spam counts and ham counts
        word_dict = {}
        for word in word_list:
            word_dict[word] = (0,0)

        for i in range(len(self.messages)):
            message = self.messages[i]
            words = ''.join([c if c.isalpha() else ' ' for c in message]).split()
            for word in words:
                word = word.lower()
                word_dict[word] = (word_dict[word][0] + (y_train[i] == 1),
                                   word_dict[word][1] + (y_train[i] == 0 ))
        #print(word_dict)
        
        self.word_dict = word_dict
        #find the normalized word_dict
        self.spam_count = sum(self.labels)
        self.ham_count = len(self.labels) - self.spam_count
        self.word_dict = {word: (self.word_dict[word][0] / self.spam_count,
                                 self.word_dict[word][1] / self.ham_count)
                                for word in self.word_dict}
        self.p_spam = self.spam_count / len(self.labels)
        self.p_ham = 1 - self.p_spam
    def predict(self, message):
        p_spam_message = self.p_spam
        p_ham_message = self.p_ham
        words = ''.join([c if c.isalpha() else ' ' for c in message]).split()
        for word in words:
            if word not in self.word_dict:
                continue
            else : 
                p_spam_message *= (self.word_dict[word][0] ) 
                p_ham_message *= (self.word_dict[word][1] )
        return p_spam_message > p_ham_message
    def accuracy(self, X , y): 
        
        tp, tn, fp, fn = self.get_tp_tn_fp_fn( X , y)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        return accuracy
    def precision(self, X , y): 
        tp, tn, fp, fn = self.get_tp_tn_fp_fn( X , y)
        precision = tp / (tp + fp)
        return precision
    def recall(self, X , y): 
        tp, tn, fp, fn = self.get_tp_tn_fp_fn( X , y)
        recall = tp / (tp + fn)
        return recall
    def f1_score(self, X,y):
        precision = self.precision( X , y)
        recall = self.recall(X,y)
        f1 = 2 * precision * recall / (precision + recall)
        return f1
        
    def get_tp_tn_fp_fn(self, X , y):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for i in range(len(X)):
            prediction = self.predict(X[i])
            if prediction == 1 : #spam
                if y[i] == 1:
                    tp+=1
                else:
                    fp +=1
            else :
                if y[i] == 0:
                    tn+=1
                else:
                    fn +=1
            
        return tp, tn, fp, fn
    
                

In [14]:
# Instantiate and train the Naive Bayes Classifier
nb = NaiveBayesSpamClassifier()
nb.fit(x_train, y_train)


36759


In [15]:
nb.word_dict


{'mambo': (0.003006012024048096, 0.0),
 'musician': (0.001002004008016032, 0.00040551500405515005),
 'courtroom': (0.004008016032064128, 0.0),
 'cactus': (0.001002004008016032, 0.0),
 'forecasting': (0.0, 0.0012165450121654502),
 'holmelin': (0.0, 0.00040551500405515005),
 'paratroop': (0.001002004008016032, 0.0),
 'est': (0.031062124248496994, 0.004866180048661801),
 'jdoe': (0.0, 0.00040551500405515005),
 'intszetz': (0.001002004008016032, 0.0),
 'reaching': (0.001002004008016032, 0.0012165450121654502),
 'mont': (0.002004008016032064, 0.0),
 'cleburne': (0.0, 0.036090835360908353),
 'fiie': (0.001002004008016032, 0.0),
 'tablet': (0.004008016032064128, 0.0),
 'fogy': (0.001002004008016032, 0.0),
 'pressing': (0.0, 0.0008110300081103001),
 'honestly': (0.0, 0.0008110300081103001),
 'bluet': (0.001002004008016032, 0.0),
 'presents': (0.003006012024048096, 0.00040551500405515005),
 'hdvest': (0.0, 0.00040551500405515005),
 'juanita': (0.003006012024048096, 0.00040551500405515005),
 'be

## 3- Testing the classifier using the train and test data

In [16]:
# Calculate training and test accuracy
print("train accuracy:", nb.accuracy(x_train, y_train))
print("train precision:", nb.precision(x_train, y_train))
print("train recall:", nb.recall(x_train, y_train))
print("train f1 score:", nb.f1_score(x_train, y_train))
print("test accuracy:", nb.accuracy(x_test, y_test))
print("test precision:", nb.precision(x_test, y_test))
print("test recall:", nb.recall(x_test, y_test))
print("test f1 score:", nb.f1_score(x_test, y_test))

train accuracy: 0.9416859122401847
train precision: 0.9925742574257426
train recall: 0.8036072144288577
train f1 score: 0.8881506090808416
test accuracy: 0.8342120679554774
test precision: 0.9541666666666667
test recall: 0.45708582834331335
test f1 score: 0.6180836707152497
