In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single shared WordNet lemmatizer instance; extract_features() uses it to
# normalise each token before building feature names.
lemmatizer = WordNetLemmatizer()

### Loading data

In [2]:
import os

# Location of the spam/ham e-mail CSV. The default is the original author's
# local path; set the SPAM_HAM_CSV environment variable to point at the file
# on any other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'SPAM_HAM_CSV',
    'C:/Users/eric the cool/Desktop/9665/project/spam_ham_dataset.csv',
)

# Read the file once, keep only the message body and its numeric label, and
# expose the label under the shorter column name 'label'. Built as a single
# chained expression so re-running the cell always starts from the raw file.
data = (
    pd.read_csv(DATA_PATH, encoding='latin-1')
    [['text', 'label_num']]
    .rename(columns={'label_num': 'label'})
)

In [3]:
# Preview the first rows to confirm only the 'text' and 'label' columns remain.
data.head()

Unnamed: 0,text,label
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


### Train test splitting

In [4]:
from sklearn.model_selection import train_test_split

# Seed Python's built-in random module; the split itself is pinned separately
# through random_state below.
random.seed(10)

# Hold out 20% of the rows for testing, then renumber both partitions from 0
# so later cells can address rows by integer position.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=0)
)

### Preprocessing email text for training and test set

In [5]:
# Links of the form "http://host.com", "https://host.com" or "www.host.com":
# a scheme/"www" prefix, a ":" or ".", then non-space characters up to a
# literal ".com". The dot before "com" is escaped so the match can no longer
# run across a space and swallow the start of a following word.
_URL_PATTERN = re.compile(r"(?:https?|www)[:.]\S+\.com")

# Any run of characters that is not a lowercase ASCII letter (punctuation,
# digits, underscores, whitespace, non-ASCII symbols).
_NON_LETTER_RUN = re.compile(r"[^a-z]+")


def preprocessing(text):
    """Normalise one raw e-mail into lowercase, letter-only, space-separated text.

    Steps:
      1. Lowercase the text.
      2. Replace ".com" links (http/https/www) with a space.
      3. Replace every run of non-letter characters with a single space.
         This is equivalent to the former chain of "strip non-alphanumerics",
         "strip non-word characters", "strip digits" and "collapse whitespace"
         substitutions, done in one pass.

    Parameters
    ----------
    text : str
        Raw e-mail text. Non-string values (e.g. None or the float NaN that
        pandas uses for a missing cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned text. A leading or trailing single space may remain when
        the input starts or ends with non-letter characters.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_PATTERN.sub(" ", text)
    text = _NON_LETTER_RUN.sub(" ", text)

    return text

In [6]:
# Clean every training e-mail, in row order (rows are labelled 0..n-1 after
# the split cell's reset_index), and store the result next to the raw text.
processed_text_train = [
    preprocessing(train["text"][row]) for row in range(len(train))
]
train['processed_text'] = processed_text_train

In [7]:
# Apply the same cleaning to every held-out e-mail, in row order, and store
# the result next to the raw text.
processed_text_test = [
    preprocessing(test['text'][row]) for row in range(len(test))
]
test['processed_text'] = processed_text_test

### Feature extraction

Use the entire set of unique tokens as features: every (lemmatized) token that appears in an email becomes a feature set to `True`.

In [8]:
def extract_features(post):
    """Build the bag-of-words feature dict for one preprocessed e-mail.

    The text is tokenised with nltk.word_tokenize and each token is passed
    through the shared lemmatizer. Every resulting lemma becomes a key of the
    form 'contains(<lemma>)' mapped to True; repeated lemmas collapse into a
    single key.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(post))
    return {'contains({})'.format(lemma): True for lemma in lemmas}

Creating the training and test feature sets.

In [9]:
def _to_featureset(frame):
    """Return [(feature_dict, label), ...] for every row of `frame`.

    Reads the 'processed_text' and 'label' columns and looks rows up by the
    integer labels 0..n-1.
    """
    texts, labels = frame['processed_text'], frame['label']
    return [(extract_features(texts[row]), labels[row]) for row in range(len(texts))]


featureset_train = _to_featureset(train)

featureset_test = _to_featureset(test)

Training Naive Bayes classifier.

In [10]:
# Fit NLTK's Naive Bayes classifier on the (feature_dict, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(featureset_train)

In [11]:
# Accuracy on the training pairs themselves; the classifier was fitted on
# these examples, so this is an optimistic figure.
print(nltk.classify.accuracy(classifier, featureset_train))

0.9497098646034816


Classifying the test set and printing score.

In [12]:
# Accuracy on the held-out test pairs, which were not used for training.
print(nltk.classify.accuracy(classifier, featureset_test))

0.9410628019323671


Make predictions on the test set and use them to compute the confusion matrix.

In [56]:
# One predicted label per test e-mail, in the same order as featureset_test;
# the true label stored in each pair is ignored here.
predictions = [
    classifier.classify(features) for features, _true_label in featureset_test
]

In [57]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, so the
# off-diagonal cells count the misclassified e-mails. The optional arguments
# (labels, sample_weight, normalize) are left at their None defaults.
confusion_matrix(y_true=test['label'], y_pred=predictions)

array([[677,  55],
       [  6, 297]], dtype=int64)

In [58]:
from sklearn.metrics import precision_recall_fscore_support

# average='binary' reports the scores for the positive class only (label 1);
# the result is the tuple (precision, recall, F-score, support), with support
# returned as None in this mode.
precision_recall_fscore_support(
    y_true=test['label'],
    y_pred=predictions,
    average='binary',
)


(0.84375, 0.9801980198019802, 0.9068702290076335, None)

### Precision = 0.8438
### Recall = 0.9802
### F-score = 0.9069