### Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, created once and reused by preprocessing().
lemmatizer = WordNetLemmatizer()

### Loading in Data

In [2]:
import os
from sklearn.model_selection import train_test_split

# Dataset location. The original machine-specific path is kept as the default
# so existing runs are unaffected; set the SPAM_HAM_CSV environment variable to
# point at the file on another machine instead of editing the notebook.
DATA_PATH = os.environ.get(
    'SPAM_HAM_CSV',
    'C:/Users/eric the cool/Desktop/9665/project/spam_ham_dataset.csv',
)
TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 0       # fixes the train/test partition

# Keep only the message body and its numeric label, renamed to `label`.
data = (
    pd.read_csv(DATA_PATH, encoding='latin-1')[['text', 'label_num']]
    .rename(columns={'label_num': 'label'})
)

# Seeds Python's `random` module only; the split below is made reproducible by
# its own `random_state` argument, not by this call.
random.seed(10)

train, test = train_test_split(data, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
# Restore a 0..n-1 index so later cells can address rows by position.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### Preprocessing

In [3]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocessing(text, stop_words=None, lemmatize=None):
    """Normalise one raw message into a space-separated string of lemmas.

    Steps, in order: lowercase; blank out ``http``/``https``/``www`` URLs that
    end in ``.com``; turn every run of non-letters (digits, punctuation,
    underscores, whitespace, non-ASCII) into a single space; drop stop words;
    lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : container of str, optional
        Tokens to discard (anything supporting ``in``). Defaults to NLTK's
        English stop-word list, which is loaded once and cached.
    lemmatize : callable, optional
        Maps one token to its lemma. Defaults to the module-level
        ``lemmatizer.lemmatize``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    text = text.lower()
    # Raw string avoids invalid-escape warnings; the dot before "com" is
    # escaped so it matches a literal '.' rather than any character.
    text = re.sub(r"(http|https|www)(:|\.)\S+\.com", " ", text)
    # After lowercasing, only a-z can survive the original chain of
    # substitutions, so a single pass over non-letters yields the same tokens.
    text = re.sub(r"[^a-z]+", " ", text)

    # Stop words are filtered before lemmatizing, so the stop-word test sees
    # the surface form of each token.
    return " ".join(lemmatize(t) for t in text.split() if t not in stop_words)

In [4]:
# Number of rows in the training split.
len(train)

4136

In [7]:
# Clean every training message and store the result next to the raw text.
processed_text_trn = [preprocessing(message) for message in train['text']]
train['processed_text'] = processed_text_trn

# Same treatment for the held-out messages.
processed_text_tst = [preprocessing(message) for message in test['text']]
test['processed_text'] = processed_text_tst

In [12]:
# The label column was renamed to lowercase `label` when the data was loaded,
# and column lookup is case-sensitive, so 'Label' would raise KeyError here.
X_train = train['processed_text']
Y_train = train['label']

X_test = test['processed_text']
Y_test = test['label']

# Sanity check: features and labels must have the same length.
print(X_train.shape)
print(Y_train.shape)

(4136,)
(4136,)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary limits passed to the vectorizer: minimum document frequency for a
# term to be kept, and the cap on how many terms are kept overall.
MIN_DOC_FREQ = 10
MAX_VOCAB_SIZE = 1000
text_vec = TfidfVectorizer(max_features=MAX_VOCAB_SIZE, min_df=MIN_DOC_FREQ)

In [18]:
# The vectorizer has to learn its vocabulary before feature names can be read;
# no visible cell fits it, so on a fresh kernel this line raised NotFittedError.
# NOTE(review): assumes the vocabulary is meant to come from the training text
# only (which also keeps test-set words out of the features) - confirm.
text_vec.fit(train['processed_text'])

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older releases.
_get_names = getattr(text_vec, 'get_feature_names_out', None) or text_vec.get_feature_names
# Materialise as a plain list, matching what the old API returned.
feature_names = list(_get_names())

In [20]:
def extract_features(post, vocabulary=None):
    """Build an NLTK-style boolean feature dict for one document.

    Parameters
    ----------
    post : str
        Text to tokenize with ``nltk.word_tokenize``.
    vocabulary : iterable of str, optional
        Words that are allowed to become features. Defaults to the
        module-level ``feature_names``.

    Returns
    -------
    dict
        ``{'contains(<word>)': True}`` for every token of ``post`` found in
        the vocabulary. Words that do not occur are simply absent.
    """
    if vocabulary is None:
        vocabulary = feature_names
    # Membership in a list scans the whole vocabulary for every token; build a
    # set once per call so each lookup is constant-time.
    if isinstance(vocabulary, (set, frozenset)):
        known_words = vocabulary
    else:
        known_words = set(vocabulary)

    return {
        'contains({})'.format(word): True
        for word in nltk.word_tokenize(post)
        if word in known_words
    }

In [21]:
def _build_featureset(frame):
    """Pair each row's feature dict with its label, in row order."""
    texts = frame['processed_text']
    labels = frame['label']
    return [(extract_features(texts[row]), labels[row]) for row in range(len(texts))]


featureset_train = _build_featureset(train)

featureset_test = _build_featureset(test)

### Decision Tree

In [23]:
# Fit NLTK's decision tree on the (feature dict, label) pairs built from the training split.
classifier = nltk.DecisionTreeClassifier.train(featureset_train)

In [24]:
# Decision-tree accuracy on the data it was trained on.
dt_train_accuracy = nltk.classify.accuracy(classifier, featureset_train)
print(dt_train_accuracy)

0.9291586073500967


In [25]:
# Decision-tree accuracy on the held-out split.
dt_test_accuracy = nltk.classify.accuracy(classifier, featureset_test)
print(dt_test_accuracy)

0.8908212560386474


In [27]:
from sklearn.metrics import confusion_matrix

# Decision-tree prediction for every held-out document, in test-set order.
predictions = [classifier.classify(features) for features, _label in featureset_test]

# labels, sample_weight and normalize are left at their defaults (None),
# which is what the explicit arguments spelled out before.
confusion_matrix(test['label'], predictions)

array([[717,  15],
       [ 98, 205]], dtype=int64)

In [28]:
from sklearn.metrics import precision_recall_fscore_support

# Precision, recall and F-score of the current `predictions`, scored in
# binary mode against the held-out labels.
dt_scores = precision_recall_fscore_support(test['label'], predictions, average='binary')
dt_scores


(0.9318181818181818, 0.6765676567656765, 0.7839388145315488, None)

### Precision = 0.9318
### Recall = 0.6766
### F-score = 0.7839

### Maximum Entropy

In [30]:
# Upper bound on training iterations for the maximum-entropy model.
MAXENT_MAX_ITER = 30
ME_classifier = nltk.MaxentClassifier.train(featureset_train, max_iter=MAXENT_MAX_ITER)

  ==> Training (30 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.289
             2          -0.31677        0.781
             3          -0.24900        0.907
             4          -0.21770        0.932
             5          -0.19894        0.939
             6          -0.18602        0.942
             7          -0.17634        0.946
             8          -0.16868        0.955
             9          -0.16237        0.957
            10          -0.15703        0.958
            11          -0.15242        0.959
            12          -0.14836        0.959
            13          -0.14474        0.961
            14          -0.14147        0.962
            15          -0.13850        0.963
            16          -0.13577        0.963
            17          -0.13326        0.964
            18          -0.13093        0.965
            19          -0.12876        0.966
  

In [31]:
# Maximum-entropy accuracy on the data it was trained on.
me_train_accuracy = nltk.classify.accuracy(ME_classifier, featureset_train)
print(me_train_accuracy)

0.9717117988394585


In [32]:
# Maximum-entropy accuracy on the held-out split.
me_test_accuracy = nltk.classify.accuracy(ME_classifier, featureset_test)
print(me_test_accuracy)

0.9603864734299516


In [33]:
from sklearn.metrics import confusion_matrix

# Maximum-entropy prediction for every held-out document, in test-set order.
# This rebinds `predictions`, replacing the decision-tree results.
predictions = [ME_classifier.classify(features) for features, _label in featureset_test]

# labels, sample_weight and normalize are left at their defaults (None),
# which is what the explicit arguments spelled out before.
confusion_matrix(test['label'], predictions)

array([[705,  27],
       [ 14, 289]], dtype=int64)

In [34]:
from sklearn.metrics import precision_recall_fscore_support

# Precision, recall and F-score of the current `predictions`, scored in
# binary mode against the held-out labels.
me_scores = precision_recall_fscore_support(test['label'], predictions, average='binary')
me_scores

(0.9145569620253164, 0.9537953795379538, 0.9337641357027463, None)

### Precision = 0.9146
### Recall = 0.9538
### F-score = 0.9338