#  Spam classification with Naive Bayes

## Importing Libraries

In [42]:
import numpy as np
from sklearn import feature_extraction, model_selection, naive_bayes, metrics
import os, re, nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [94]:
class Email:
    def __init__(self, content, label):
        self.content = content
        self.label = label
        self.preprocessing()

    def preprocessing(self):
        self.content = self.content.lower()
        for word in stopwords.words('english'):
            self.content = re.sub(' ' + word + ' ',  ' ', self.content)
        self.content = re.sub(r'\n', ' ', self.content)
        self.content = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', 'MailID', self.content)
        self.content = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'Links', self.content)
        self.content = re.sub(r'\d+(\.\d+)?', 'numbers', self.content)
        self.content = re.sub(r"[^a-zA-Z0-9]+", " ", self.content)
        self.content = re.sub(r'\s+', ' ', self.content)
        self.content = self.content.strip()

    @staticmethod
    def read():
        if not os.path.exists('messages.csv'):
            return []
        df = pd.read_csv('messages.csv').dropna()
        return list(map(lambda data: Email(data[0], data[1]), zip(df['subject'] + df['message'], df['label'])))

    @staticmethod
    def readTrainTestSplit():
        data = Email.read()
        df = pd.DataFrame()
        df['message'] = [email.content for email in data]
        vector=TfidfVectorizer()
        
        df['label'] = [email.label for email in data]
        return train_test_split(vector.fit_transform(df['message']), df['label'], test_size=0.2, random_state=225, stratify=df['label'])
X_train, X_test, Y_train, Y_test = Email.readTrainTestSplit()        


In [77]:
class Email:
    def __init__(self, content, isSpam):
        self.content = content
        self.isSpam = int(isSpam) == 1
        self.preprocessing()

    def preprocessing(self):
        self.content = self.content.lower()
        print(self.content)
        for word in stopwords.words('english'):
            self.content = re.sub(' ' + word + ' ',  ' ', self.content)
        self.content = re.sub(r'\n', ' ', self.content)
        self.content = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', 'MailID', self.content)
        self.content = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'Links', self.content)
        self.content = re.sub(r'\d+(\.\d+)?', 'numbers', self.content)
        self.content = re.sub(r"[^a-zA-Z0-9]+", " ", self.content)
        self.content = re.sub(r'\s+', ' ', self.content)
        self.content = self.content.strip()

    @staticmethod
    def read():
        if not os.path.exists('./data/messages.csv'):
            return []
        df = pd.read_csv('./data/messages.csv').dropna()
        return list(map(lambda data: Email(data[0], data[1]), zip(df['subject'] + df['message'], df['label'])))
data = Email.read()

In [None]:
print(data)

In [108]:
f = feature_extraction.text.CountVectorizer()
X = [dataset.content for dataset in data]
X = f.fit_transform(X)
y = [dataset.label for dataset in data]


In [112]:
print(X[0])

  (0, 25695)	1
  (0, 39159)	1
  (0, 2687)	5
  (0, 25028)	4
  (0, 42363)	6
  (0, 7732)	1
  (0, 28501)	1
  (0, 35032)	13
  (0, 7731)	3
  (0, 52768)	2
  (0, 32063)	1
  (0, 25748)	1
  (0, 53312)	1
  (0, 9608)	1
  (0, 23523)	1
  (0, 24214)	2
  (0, 49016)	2
  (0, 44559)	2
  (0, 33731)	2
  (0, 52395)	2
  (0, 45970)	4
  (0, 29483)	2
  (0, 29627)	2
  (0, 45028)	1
  (0, 47029)	5
  :	:
  (0, 45014)	1
  (0, 9488)	1
  (0, 42545)	1
  (0, 15198)	1
  (0, 37926)	1
  (0, 37292)	1
  (0, 40730)	1
  (0, 10131)	1
  (0, 49597)	1
  (0, 35028)	1
  (0, 33111)	1
  (0, 25469)	1
  (0, 29816)	1
  (0, 28320)	1
  (0, 30412)	1
  (0, 21582)	1
  (0, 33231)	1
  (0, 26550)	1
  (0, 49804)	1
  (0, 49566)	1
  (0, 17119)	1
  (0, 14984)	1
  (0, 25661)	1
  (0, 35482)	1
  (0, 45298)	1


In [109]:
#split training/test set
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
print([np.shape(X_train), np.shape(X_test)])
print(X_test.shape)

[(2264, 56283), (567, 56283)]
(567, 56283)


### Multinomial naive bayes classifier

We train different bayes models changing the regularization parameter $\alpha$. <p>

In [111]:
# Chọn alpha
list_alpha = np.arange(1/100000, 20, 0.1)
score_train = np.zeros(len(list_alpha))
score_test = np.zeros(len(list_alpha))
recall_test = np.zeros(len(list_alpha))
precision_test= np.zeros(len(list_alpha))
count = 0
for alpha in list_alpha:
    bayes = naive_bayes.MultinomialNB(alpha=alpha)
    bayes.fit(X_train, y_train)
    score_train[count] = bayes.score(X_train, y_train)
    score_test[count]= bayes.score(X_test, y_test)
    recall_test[count] = metrics.recall_score(y_test, bayes.predict(X_test))
    precision_test[count] = metrics.precision_score(y_test, bayes.predict(X_test))
    count = count + 1 

Let's see the first 10 learning models and their metrics!

In [106]:
matrix = np.matrix(np.c_[list_alpha, score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data = matrix, columns = 
             ['alpha', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'])
models.head(n=10)

Unnamed: 0,alpha,Train Accuracy,Test Accuracy,Test Recall,Test Precision
0,1e-05,1.0,0.998236,0.989362,1.0
1,0.10001,0.999117,0.992945,0.978723,0.978723
2,0.20001,0.996466,0.987654,0.93617,0.988764
3,0.30001,0.991608,0.977072,0.861702,1.0
4,0.40001,0.97659,0.952381,0.712766,1.0
5,0.50001,0.957155,0.932981,0.595745,1.0
6,0.60001,0.940813,0.91358,0.478723,1.0
7,0.70001,0.920936,0.895944,0.37234,1.0
8,0.80001,0.904594,0.881834,0.287234,1.0
9,0.90001,0.890459,0.880071,0.276596,1.0


I select the model with the most test precision

In [107]:
best_index = models['Test Precision'].idxmax()
models.iloc[best_index, :]

alpha             0.000010
Train Accuracy    1.000000
Test Accuracy     0.998236
Test Recall       0.989362
Test Precision    1.000000
Name: 0, dtype: float64

**My best model does not produce any false positive, which is our goal.** <p>
Let's see if there is more than one model with 100% precision !

In [100]:
models[models['Test Precision']==1].head(n=10)

Unnamed: 0,alpha,Train Accuracy,Test Accuracy,Test Recall,Test Precision
0,1e-05,1.0,0.998236,0.989362,1.0
3,0.30001,0.991608,0.977072,0.861702,1.0
4,0.40001,0.97659,0.952381,0.712766,1.0
5,0.50001,0.957155,0.932981,0.595745,1.0
6,0.60001,0.940813,0.91358,0.478723,1.0
7,0.70001,0.920936,0.895944,0.37234,1.0
8,0.80001,0.904594,0.881834,0.287234,1.0
9,0.90001,0.890459,0.880071,0.276596,1.0
10,1.00001,0.878534,0.865961,0.191489,1.0
11,1.10001,0.870141,0.858907,0.148936,1.0


Between these models with the highest possible precision, we are going to select which has more test accuracy.

In [101]:
best_index = models[models['Test Precision']==1]['Test Accuracy'].idxmax()
bayes = naive_bayes.MultinomialNB(alpha=list_alpha[best_index])
bayes.fit(X_train, Y_train)
models.iloc[best_index, :]

alpha             0.000010
Train Accuracy    1.000000
Test Accuracy     0.998236
Test Recall       0.989362
Test Precision    1.000000
Name: 0, dtype: float64

#### Confusion matrix with naive bayes classifier

In [103]:
m_confusion_test = metrics.confusion_matrix(Y_test, bayes.predict(X_test))
pd.DataFrame(data = m_confusion_test, columns = ['Predicted 0', 'Predicted 1'],
            index = ['Actual 0', 'Actual 1'])
#print(X_test.shape)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,444,26
Actual 1,92,5
