# Assignment 3 Classification
## Fabian Hansch Mauritzson
## April 25, 2021

In [1]:
import matplotlib.pyplot as plt
import re
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import gensim

In [42]:
data = pd.read_csv("Spam Email.csv", usecols=["CATEGORY", "MESSAGE"])

In [43]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [44]:
# apply all above methods to the column ''
data['MESSAGE'] = data['MESSAGE'].apply(remove_non_alphabets)
data['MESSAGE'] = data['MESSAGE'].apply(tokenize)
data['MESSAGE'] = data['MESSAGE'].apply(stem)
data['MESSAGE'] = data['MESSAGE'].apply(leammtizer)
data['MESSAGE'] = data['MESSAGE'].apply(lambda x: ' '.join(x))
data.head()

Unnamed: 0,CATEGORY,MESSAGE
0,1,dear homeown interest rate are at their lowest...
1,1,attent thi is a must for all comput user new s...
2,1,thi is a multi part messag in mime format next...
3,1,import inform the new domain name are final av...
4,1,thi is the bottom line If you can give away CD...


In [5]:
#nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fabia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

## Split the data into training and testing sets

In [45]:
train_message, test_message, train_category, test_category = train_test_split(data["MESSAGE"], data["CATEGORY"], test_size=0.3)

## Frequency feature representation

In [46]:
freq_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
freq_train_features = freq_vectorizer.fit_transform(train_message)
freq_test_features = freq_vectorizer.transform(test_message)

In [47]:
print(freq_train_features)

  (0, 72457)	1
  (0, 22404)	1
  (0, 89868)	12
  (0, 52529)	1
  (0, 75207)	16
  (0, 42558)	1
  (0, 1006)	1
  (0, 22397)	1
  (0, 65531)	1
  (0, 79081)	7
  (0, 9976)	19
  (0, 37045)	6
  (0, 56512)	4
  (0, 56529)	2
  (0, 70201)	1
  (0, 11334)	12
  (0, 67109)	3
  (0, 74295)	16
  (0, 44073)	1
  (0, 80540)	1
  (0, 55730)	7
  (0, 79082)	17
  (0, 69132)	13
  (0, 13768)	1
  (0, 83733)	1
  :	:
  (4056, 59120)	2
  (4056, 5212)	2
  (4056, 30004)	1
  (4056, 60425)	8
  (4056, 6728)	8
  (4056, 59736)	1
  (4056, 9303)	1
  (4056, 39276)	2
  (4056, 52676)	2
  (4056, 1671)	1
  (4056, 27233)	1
  (4056, 43979)	1
  (4056, 8913)	1
  (4056, 65626)	8
  (4056, 59122)	5
  (4056, 86181)	1
  (4056, 88322)	1
  (4056, 66413)	1
  (4056, 9052)	1
  (4056, 28991)	1
  (4056, 29421)	1
  (4056, 84564)	1
  (4056, 78313)	1
  (4056, 52686)	8
  (4056, 34084)	8


## Binary feature representation

In [48]:
bin_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1), binary = True)
bin_train_features = bin_vectorizer.fit_transform(train_message)
bin_test_features = bin_vectorizer.transform(test_message)

In [49]:
print(bin_train_features)

  (0, 72457)	1
  (0, 22404)	1
  (0, 89868)	1
  (0, 52529)	1
  (0, 75207)	1
  (0, 42558)	1
  (0, 1006)	1
  (0, 22397)	1
  (0, 65531)	1
  (0, 79081)	1
  (0, 9976)	1
  (0, 37045)	1
  (0, 56512)	1
  (0, 56529)	1
  (0, 70201)	1
  (0, 11334)	1
  (0, 67109)	1
  (0, 74295)	1
  (0, 44073)	1
  (0, 80540)	1
  (0, 55730)	1
  (0, 79082)	1
  (0, 69132)	1
  (0, 13768)	1
  (0, 83733)	1
  :	:
  (4056, 59120)	1
  (4056, 5212)	1
  (4056, 30004)	1
  (4056, 60425)	1
  (4056, 6728)	1
  (4056, 59736)	1
  (4056, 9303)	1
  (4056, 39276)	1
  (4056, 52676)	1
  (4056, 1671)	1
  (4056, 27233)	1
  (4056, 43979)	1
  (4056, 8913)	1
  (4056, 65626)	1
  (4056, 59122)	1
  (4056, 86181)	1
  (4056, 88322)	1
  (4056, 66413)	1
  (4056, 9052)	1
  (4056, 28991)	1
  (4056, 29421)	1
  (4056, 84564)	1
  (4056, 78313)	1
  (4056, 52686)	1
  (4056, 34084)	1


## Tf-idf feature representation

In [50]:
tfidf_vectorizer=TfidfVectorizer(min_df=1, norm='l2', smooth_idf=True, use_idf=True, ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_message)  
tfidf_test_features = tfidf_vectorizer.transform(test_message)   

In [51]:
print(tfidf_train_features)

  (0, 74276)	0.017045493721165563
  (0, 13097)	0.025856167847622086
  (0, 52638)	0.011573476466212014
  (0, 9674)	0.026331569081591885
  (0, 36239)	0.03640525610661432
  (0, 47574)	0.01698243999361684
  (0, 32344)	0.015167372829069619
  (0, 65687)	0.06989131620605618
  (0, 30850)	0.01228783092834959
  (0, 12800)	0.013282137615452771
  (0, 30682)	0.024924748890498793
  (0, 2203)	0.019612383004138808
  (0, 52718)	0.01869116149108296
  (0, 29309)	0.021359180021985398
  (0, 36509)	0.03749684622074196
  (0, 82795)	0.01293079765571175
  (0, 48677)	0.013347449851699476
  (0, 34177)	0.01006593885831953
  (0, 4477)	0.02644290404204884
  (0, 7994)	0.022298396805292456
  (0, 4497)	0.023014660668629304
  (0, 70092)	0.02316955288292143
  (0, 3840)	0.023361238441353047
  (0, 51325)	0.01232422090508157
  (0, 5595)	0.024276869529756984
  :	:
  (4056, 24911)	0.0447111178773528
  (4056, 4852)	0.00824185552042759
  (4056, 84124)	0.010852403902551093
  (4056, 84417)	0.009749057207029404
  (4056, 57809)	0.

## Function for train/test/evaluate

In [52]:
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    
    print(metrics.classification_report(test_labels,predictions))
    return predictions, get_metrics(true_labels=test_labels, predicted_labels=predictions)

## Import classifiers

In [86]:
from sklearn.naive_bayes import MultinomialNB # import naive bayes
from sklearn.tree import DecisionTreeClassifier # import Decision Tree
from sklearn.ensemble import RandomForestClassifier # import random forest
from sklearn.metrics import confusion_matrix

## Frequency Evaluations
### Naive Bayes

In [54]:
mnb = MultinomialNB()
mnb_freq_predictions, mnb_freq_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=freq_train_features,
                                           train_labels=train_category,
                                           test_features=freq_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.90      0.99      0.95      1177
           1       0.99      0.78      0.87       562

    accuracy                           0.92      1739
   macro avg       0.95      0.89      0.91      1739
weighted avg       0.93      0.92      0.92      1739



In [85]:
confusion_matrix(test_category, mnb_freq_predictions)

array([[1171,    6],
       [ 125,  437]], dtype=int64)

### Decision Tree

In [55]:
dt = DecisionTreeClassifier()
dt_freq_predictions, dt_freq_metrics = train_predict_evaluate_model(classifier=dt,
                                           train_features=freq_train_features,
                                           train_labels=train_category,
                                           test_features=freq_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1177
           1       0.92      0.92      0.92       562

    accuracy                           0.95      1739
   macro avg       0.94      0.94      0.94      1739
weighted avg       0.95      0.95      0.95      1739



In [84]:
confusion_matrix(test_category, dt_freq_predictions)

array([[1132,   45],
       [  43,  519]], dtype=int64)

### Random Forest

In [57]:
rf = RandomForestClassifier(criterion="entropy")
rf_freq_predictions, rf_freq_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=freq_train_features,
                                           train_labels=train_category,
                                           test_features=freq_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1177
           1       0.99      0.93      0.96       562

    accuracy                           0.98      1739
   macro avg       0.98      0.96      0.97      1739
weighted avg       0.98      0.98      0.98      1739



In [83]:
confusion_matrix(test_category, rf_freq_predictions)

array([[1173,    4],
       [  39,  523]], dtype=int64)

## Binary Evaluation
### Naive Bayes

In [58]:
mnb = MultinomialNB()
mnb_bin_predictions, mnb_bin_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=bin_train_features,
                                           train_labels=train_category,
                                           test_features=bin_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1177
           1       0.99      0.79      0.88       562

    accuracy                           0.93      1739
   macro avg       0.95      0.90      0.92      1739
weighted avg       0.94      0.93      0.93      1739



In [82]:
confusion_matrix(test_category, mnb_bin_predictions)

array([[1173,    4],
       [ 116,  446]], dtype=int64)

### Decision Tree

In [59]:
dt = DecisionTreeClassifier()
dt_bin_predictions, dt_bin_metrics = train_predict_evaluate_model(classifier=dt,
                                           train_features=bin_train_features,
                                           train_labels=train_category,
                                           test_features=bin_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1177
           1       0.94      0.93      0.93       562

    accuracy                           0.96      1739
   macro avg       0.95      0.95      0.95      1739
weighted avg       0.96      0.96      0.96      1739



In [81]:
confusion_matrix(test_category, dt_bin_predictions)

array([[1142,   35],
       [  41,  521]], dtype=int64)

### Random Forest

In [60]:
rf = RandomForestClassifier(criterion="entropy")
rf_bin_predictions, rf_bin_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=bin_train_features,
                                           train_labels=train_category,
                                           test_features=bin_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1177
           1       0.99      0.93      0.96       562

    accuracy                           0.97      1739
   macro avg       0.98      0.96      0.97      1739
weighted avg       0.97      0.97      0.97      1739



In [80]:
confusion_matrix(test_category, rf_bin_predictions)

array([[1174,    3],
       [  42,  520]], dtype=int64)

## TF-IDF Evaluation
### Naive Bayes

In [61]:
mnb = MultinomialNB()
mnb_tf_predictions, mnb_tf_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=tfidf_train_features,
                                           train_labels=train_category,
                                           test_features=tfidf_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1177
           1       0.99      0.62      0.76       562

    accuracy                           0.87      1739
   macro avg       0.92      0.81      0.84      1739
weighted avg       0.89      0.87      0.86      1739



In [79]:
confusion_matrix(test_category, mnb_tf_predictions)

array([[1173,    4],
       [ 216,  346]], dtype=int64)

### Decision Tree

In [62]:
dt = DecisionTreeClassifier()
dt_tf_predictions, dt_tf_metrics = train_predict_evaluate_model(classifier=dt,
                                           train_features=tfidf_train_features,
                                           train_labels=train_category,
                                           test_features=tfidf_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1177
           1       0.91      0.92      0.92       562

    accuracy                           0.95      1739
   macro avg       0.94      0.94      0.94      1739
weighted avg       0.95      0.95      0.95      1739



In [78]:
confusion_matrix(test_category, dt_tf_predictions)

array([[1126,   51],
       [  43,  519]], dtype=int64)

### Random Forest

In [63]:
rf = RandomForestClassifier(criterion="entropy")
rf_tf_predictions, rf_tf_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=tfidf_train_features,
                                           train_labels=train_category,
                                           test_features=tfidf_test_features,
                                           test_labels=test_category)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1177
           1       0.99      0.90      0.95       562

    accuracy                           0.97      1739
   macro avg       0.97      0.95      0.96      1739
weighted avg       0.97      0.97      0.97      1739



In [77]:
confusion_matrix(test_category, rf_tf_predictions)

array([[1173,    4],
       [  55,  507]], dtype=int64)