# Assignment 3 | Text Classification | Chris Tarzian

In [1]:
import matplotlib.pyplot as plt
import re
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import gensim

In [2]:
emaildata = pd.read_csv("Spam Email.csv")

In [3]:
emaildf = pd.read_csv("Spam Email.csv", usecols=["CATEGORY", "MESSAGE"])

In [4]:
emaildf

Unnamed: 0,CATEGORY,MESSAGE
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,1,This is a multi-part message in MIME format.\n...
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,1,This is the bottom line. If you can GIVE AWAY...
...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver..."
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\..."
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w..."


In [5]:
emaildf['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [6]:
emaildf.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CATEGORY  5796 non-null   int64 
 1   MESSAGE   5796 non-null   object
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


### Text Preprocessing

In [7]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]


lemmatizer = WordNetLemmatizer()
lemmtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [8]:
print('Processing : [=', end='')
emaildf['MESSAGE'] = emaildf['MESSAGE'].apply(remove_non_alphabets)
print('=', end='')
emaildf['MESSAGE'] = emaildf['MESSAGE'].apply(tokenize)
print('=', end='')
emaildf['MESSAGE'] = emaildf['MESSAGE'].apply(stem)
print('=', end='')
emaildf['MESSAGE'] = emaildf['MESSAGE'].apply(lemmtizer)
print('=', end='')
emaildf['MESSAGE'] = emaildf['MESSAGE'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
emaildf.head()

Processing : [=====] : Completed

Unnamed: 0,CATEGORY,MESSAGE
0,1,dear homeown interest rate are at their lowest...
1,1,attent thi is a must for all comput user new s...
2,1,thi is a multi part messag in mime format next...
3,1,import inform the new domain name are final av...
4,1,thi is the bottom line if you can give away cd...


### Split Train-Test Splits

In [9]:
train_corpus, test_corpus, train_labels, test_labels = train_test_split(emaildf["MESSAGE"], emaildf["CATEGORY"], test_size=0.3)

In [10]:
print("Train Corpus Length:", len(train_corpus))
print("Test Corpus Length:", len(test_corpus))

Train Corpus Length: 4057
Test Corpus Length: 1739


## Question 1

### 1. Feature Representation/Extraction

#### Bag-of-Words

In [11]:
bag_vectorizer=CountVectorizer(min_df=30, max_df=0.90,max_features=5000,ngram_range=(1,1))
bag_train_features = bag_vectorizer.fit_transform(train_corpus)
bag_test_features = bag_vectorizer.transform(test_corpus)

In [12]:
print(bag_train_features.shape)
print(bag_test_features.shape)

(4057, 2458)
(1739, 2458)


In [13]:
count_array = bag_train_features.toarray()
bagdf = pd.DataFrame(data=count_array,columns = bag_vectorizer.get_feature_names())
print(bagdf.shape)
bagdf.head(10)

(4057, 2458)




Unnamed: 0,aa,ab,abil,abl,about,abov,absolut,abus,ac,accept,...,your,yourself,yr,yw,zero,zip,zm,zt,zw,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1,...,8,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


#### Bag-of-Words (Binary)

In [14]:
bag2_vectorizer=CountVectorizer(min_df=30, max_df=0.90,max_features=5000,ngram_range=(1,1), binary=True)
bag2_train_features = bag2_vectorizer.fit_transform(train_corpus)
bag2_test_features = bag2_vectorizer.transform(test_corpus)

In [15]:
print(bag2_train_features.shape)
print(bag2_test_features.shape)

(4057, 2458)
(1739, 2458)


In [16]:
count2_array = bag2_train_features.toarray()
bag2df = pd.DataFrame(data=count2_array,columns = bag2_vectorizer.get_feature_names())
print(bag2df.shape)
bag2df.head(10)

(4057, 2458)




Unnamed: 0,aa,ab,abil,abl,about,abov,absolut,abus,ac,accept,...,your,yourself,yr,yw,zero,zip,zm,zt,zw,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


#### TF-IDF

In [17]:
tfidf_vectorizer=TfidfVectorizer(min_df=30, 
                                 max_df=0.90,
                                 max_features=5000,
                                 norm='l2',
                                 smooth_idf=True,
#                                  th_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus) 

In [18]:
count_array1 = tfidf_test_features.toarray()
tfidf_df = pd.DataFrame(data=count_array1,columns = tfidf_vectorizer.get_feature_names())
print(tfidf_df.shape)
tfidf_df.head(10)

(1739, 2458)




Unnamed: 0,aa,ab,abil,abl,about,abov,absolut,abus,ac,accept,...,your,yourself,yr,yw,zero,zip,zm,zt,zw,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.042243,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.009711,0.0,0.0,0.0,0.0,0.0,...,0.023097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.049335,0.0,0.025285,0.0,0.0,0.0,0.0,0.0,...,0.020046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.016446,0.0,0.0,0.0,0.0,0.0,...,0.052151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034419,0.0,0.107124,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.110259,0.061117,0.0,0.0,0.0,0.0,...,0.029137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.007911,0.013202,0.016217,0.0,0.0,0.0,0.0,0.0,...,0.025714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Word2Vec

In [19]:
import numpy as np
np.zeros((5))
from sklearn.preprocessing import MinMaxScaler

In [20]:
#before we use wor2vec, we want to do some pre-processing here
# tokenize documents for word2vec

tokenized_train = [nltk.word_tokenize(text)
                   for text in train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in test_corpus]  

# build word2vec model
#####make sure you change 'size' to 'vector_size',otherwise it will return error#####
wv_model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=200,                          #set the size or dimension for the word vectors 
                               window=60,                        #specify the length of the window of words taken as context
                               min_count=10)                   #ignores all words with total frequency lower than 10
#Define functions to create average word vectors
def average_word_vectors(words, model, vocabulary, num_features):
#Function to average all of the word vectors in a given text
#Pre-initialize an empty numpy array
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
            # Loop over each word in the email and, if it is in the model's
            # vocaublary, add its feature vector to the total 
#     print(' first feature_vector:',feature_vector)
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            ######add .wv after model#####
            feature_vector = np.add(feature_vector, model.wv[word])
#             print(' second feature_vector:',feature_vector)
    if nwords:
        # Divide the result by the number of words to get the average
        feature_vector = np.divide(feature_vector, nwords)
#         print(' third feature_vector:',feature_vector)
    return feature_vector 
   
#####change index2word attribute to index_to_key#####
def averaged_word_vectorizer(corpus, model, num_features):
    # index_to_key is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    vocabulary = set(model.wv.index_to_key)
#     print('vocabulary:',len(vocabulary))
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
#     print('last:',features)
    return np.array(features)

# averaged word vector features from word2vec
# Try using MinMaxScaler() to preprocess the data before sending the data to the model. 
# This normalizes it to the range 0 to 1 thus removing the negative numbers
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=wv_model,
                                                 num_features=200)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=wv_model,
                                                num_features=200) 

# run this code
scaler = MinMaxScaler()
avg_wv_train_features_scaler = scaler.fit_transform(avg_wv_train_features)
avg_wv_test_features_scaler = scaler.transform(avg_wv_test_features)

In [21]:
pd.DataFrame(avg_wv_train_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.020830,-0.090916,-0.077960,-0.542719,-0.138394,0.210625,0.132870,-0.533056,-0.191217,-0.276929,...,0.525298,-0.266736,0.220522,0.114090,-0.252882,0.081325,1.106789,-0.111848,-0.458899,0.692955
1,0.283822,0.456046,2.317006,-0.939648,-0.371450,0.091812,0.140681,0.471788,-0.871544,-0.494691,...,0.169485,0.085155,0.387532,0.090188,-0.860073,0.791203,1.093724,-0.245292,-0.531343,1.263717
2,-0.205058,0.347238,0.180745,-0.538718,-0.225323,0.283931,0.057406,-0.390775,-0.601143,-0.245837,...,0.171305,0.008426,0.125545,0.276020,-0.309416,0.243303,0.798983,0.034007,-0.331922,0.382235
3,-1.351485,0.308319,1.373512,-0.341780,-0.258717,0.305222,-0.143745,-0.465761,0.118020,0.035320,...,-0.163164,-0.464435,-0.467549,0.099568,-0.861572,0.310051,0.711336,-0.210656,-0.851300,0.436488
4,-0.631383,-1.192439,-0.631882,0.334837,0.494901,-1.572959,0.813340,1.563377,0.501319,-0.349377,...,0.693904,-1.157617,0.839379,0.541779,1.042608,0.073151,-0.651280,0.148200,0.615355,-0.656976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4052,-0.239798,-1.002438,-0.273028,0.136320,-0.415526,-0.744722,0.476911,-0.050268,1.085227,-0.063664,...,-0.085857,-0.552328,0.450121,0.150869,-0.210383,0.005619,-0.393316,0.110278,0.442176,0.812852
4053,-0.525101,-0.102502,-0.023737,0.597827,-0.058280,0.080674,-0.424179,-0.449790,0.095824,0.103770,...,0.293145,-0.362391,-0.342751,-0.399505,-0.618386,0.046709,0.716169,-0.071496,-0.324410,0.400345
4054,-0.199933,-0.338362,-0.512755,0.242397,-0.106683,0.198068,0.294813,-0.154822,-0.020483,-0.073104,...,0.379680,-0.559997,-0.131903,-0.168135,-0.252855,-0.245767,1.019932,-0.266154,-0.058042,0.568838
4055,-0.341309,0.516579,0.784490,-0.483459,-0.152356,-0.579861,0.514813,-0.443969,0.183419,-0.289063,...,-0.724324,0.368919,0.100150,0.736764,-0.574608,-0.591079,0.740399,-0.184911,-0.665805,0.331005


### Define Evaluation Function

In [22]:
def get_metrics(true_labels, predicted_labels):
    
    print ('Accuracy:', np.round(                                                    
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels),
                        2))

### Define an Easy-to-use Function for Train/Test/Evaluate

In [23]:
def train_predict_evaluate_model(title, classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features)
    confusion_matrix = metrics.confusion_matrix(test_labels,predictions, labels = [0,1])
    cost_matrix = np.array([[0,100],[5,0]])
    cost = np.multiply(confusion_matrix,cost_matrix).sum()
    # evaluate model prediction performance   
    print(metrics.classification_report(test_labels,predictions))
    #print()
    #metrics1 = get_metrics(true_labels=test_labels, predicted_labels=predictions)
    #print(metrics1)
    #print()
    print("Confusion Matrix:", title)
    print(confusion_matrix)
    print()
    print("Total",title, "Email Cost:", '${:,.2f}'.format(cost))
    return predictions, metrics.accuracy_score(test_labels,predictions), metrics.precision_score(test_labels,predictions), metrics.recall_score(test_labels,predictions), cost

### 2. Classifiers

In [24]:
from sklearn.naive_bayes import MultinomialNB # import naive bayes
from sklearn.tree import DecisionTreeClassifier # import Decision Tree
from sklearn.ensemble import RandomForestClassifier # import random forest

#### Train/Test on Bag-of-Words Features

##### 1. Naive Bayes

In [25]:
# assign naive bayes function to an object
mnb = MultinomialNB()
title1 = "bag_mnb"

# predict and evaluate naive bayes
mnb_bag_predictions, mnb_bag_accuracy, mnb_bag_precision, mnb_bag_recall, mnb_bag_cost= train_predict_evaluate_model(title=title1, classifier=mnb,
                                           train_features=bag_train_features,
                                           train_labels=train_labels,
                                           test_features=bag_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1152
           1       0.99      0.75      0.85       587

    accuracy                           0.91      1739
   macro avg       0.94      0.87      0.90      1739
weighted avg       0.92      0.91      0.91      1739

Confusion Matrix: bag_mnb
[[1147    5]
 [ 147  440]]

Total bag_mnb Email Cost: $1,235.00


##### 2. Decision Tree

In [26]:
# assign decision tree function to an object
dt = DecisionTreeClassifier()
title2 = "bag_dt"

# predict and evaluate decision tree
dt_bag_predictions, dt_bag_accuracy, dt_bag_precision, dt_bag_recall, dt_bag_cost = train_predict_evaluate_model(title=title2,classifier=dt,
                                                               train_features=bag_train_features,
                                                               train_labels=train_labels,
                                                               test_features=bag_test_features,
                                                               test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1152
           1       0.93      0.93      0.93       587

    accuracy                           0.95      1739
   macro avg       0.95      0.95      0.95      1739
weighted avg       0.95      0.95      0.95      1739

Confusion Matrix: bag_dt
[[1110   42]
 [  39  548]]

Total bag_dt Email Cost: $4,395.00


##### 3. Random Forest

In [27]:
# assign random forest function to an object
rf = RandomForestClassifier(criterion="entropy")
title3 = "bag_rf"

# predict and evaluate random forest
rf_bag_predictions, rf_bag_accuracy, rf_bag_precision, rf_bag_recall, rf_bag_cost = train_predict_evaluate_model(title=title3, classifier=rf,
                                           train_features=bag_train_features,
                                           train_labels=train_labels,
                                           test_features=bag_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1152
           1       0.99      0.95      0.97       587

    accuracy                           0.98      1739
   macro avg       0.98      0.97      0.98      1739
weighted avg       0.98      0.98      0.98      1739

Confusion Matrix: bag_rf
[[1149    3]
 [  32  555]]

Total bag_rf Email Cost: $460.00


#### Train/Test on Bag-of-Words (Binary) Features

##### 1b. Naive Bayes

In [28]:
# assign naive bayes function to an object
mnb = MultinomialNB()
title10 = "bag2_mnb"

# predict and evaluate naive bayes
mnb_bag2_predictions, mnb_bag2_accuracy, mnb_bag2_precision, mnb_bag2_recall, mnb_bag2_cost= train_predict_evaluate_model(title=title10, classifier=mnb,
                                           train_features=bag2_train_features,
                                           train_labels=train_labels,
                                           test_features=bag2_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1152
           1       0.99      0.94      0.96       587

    accuracy                           0.97      1739
   macro avg       0.98      0.97      0.97      1739
weighted avg       0.98      0.97      0.97      1739

Confusion Matrix: bag2_mnb
[[1145    7]
 [  37  550]]

Total bag2_mnb Email Cost: $885.00


##### 2b. Decision Tree

In [29]:
# assign decision tree function to an object
dt = DecisionTreeClassifier()
title11 = "bag2_dt"

# predict and evaluate decision tree
dt_bag2_predictions, dt_bag2_accuracy, dt_bag2_precision, dt_bag2_recall, dt_bag2_cost = train_predict_evaluate_model(title=title11,classifier=dt,
                                                               train_features=bag2_train_features,
                                                               train_labels=train_labels,
                                                               test_features=bag2_test_features,
                                                               test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1152
           1       0.92      0.93      0.92       587

    accuracy                           0.95      1739
   macro avg       0.94      0.94      0.94      1739
weighted avg       0.95      0.95      0.95      1739

Confusion Matrix: bag2_dt
[[1105   47]
 [  43  544]]

Total bag2_dt Email Cost: $4,915.00


##### 3b. Random Forest

In [30]:
# assign random forest function to an object
rf = RandomForestClassifier(criterion="entropy")
title12 = "bag2_rf"

# predict and evaluate random forest
rf_bag2_predictions, rf_bag2_accuracy, rf_bag2_precision, rf_bag2_recall, rf_bag2_cost = train_predict_evaluate_model(title=title12, classifier=rf,
                                           train_features=bag2_train_features,
                                           train_labels=train_labels,
                                           test_features=bag2_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1152
           1       0.99      0.95      0.97       587

    accuracy                           0.98      1739
   macro avg       0.98      0.97      0.98      1739
weighted avg       0.98      0.98      0.98      1739

Confusion Matrix: bag2_rf
[[1148    4]
 [  30  557]]

Total bag2_rf Email Cost: $550.00


#### Train/Test on TF-IDF Features

##### 4. Naive Bayes

In [31]:
# predict and evaluate naive bayes
title4 = "tfidf_mnb"

mnb_tfidf_predictions, mnb_tfidf_accuracy, mnb_tfidf_precision, mnb_tfidf_recall, mnb_tfidf_cost = train_predict_evaluate_model(title=title4, classifier=mnb,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1152
           1       0.99      0.89      0.94       587

    accuracy                           0.96      1739
   macro avg       0.97      0.94      0.95      1739
weighted avg       0.96      0.96      0.96      1739

Confusion Matrix: tfidf_mnb
[[1146    6]
 [  65  522]]

Total tfidf_mnb Email Cost: $925.00


##### 5. Decision Tree

In [32]:
# predict and evaluate decision tree
title5 = "tfidf_dt"

dt_tfidf_predictions, dt_tfidf_accuracy, dt_tfidf_precision, dt_tfidf_recall, dt_tfidf_cost = train_predict_evaluate_model(title=title5, classifier=dt,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1152
           1       0.91      0.93      0.92       587

    accuracy                           0.95      1739
   macro avg       0.94      0.94      0.94      1739
weighted avg       0.95      0.95      0.95      1739

Confusion Matrix: tfidf_dt
[[1100   52]
 [  41  546]]

Total tfidf_dt Email Cost: $5,405.00


##### 6. Random Forest

In [33]:
# predict and evaluate random forest
title6 = "tfidf_rf"

rf_tfidf_predictions, rf_tfidf_accuracy,rf_tfidf_precision, rf_tfidf_recall, rf_tfidf_cost = train_predict_evaluate_model(title=title6, classifier=rf,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1152
           1       0.99      0.95      0.97       587

    accuracy                           0.98      1739
   macro avg       0.98      0.97      0.98      1739
weighted avg       0.98      0.98      0.98      1739

Confusion Matrix: tfidf_rf
[[1147    5]
 [  31  556]]

Total tfidf_rf Email Cost: $655.00


#### Train/Test on Word2Vec Features

##### 7. Naive Bayes

In [34]:
# predict and evaluate naive bayes
title7 = "avgwv_mnb"

mnb_avgwv_predictions, mnb_avgwv_accuracy, mnb_avgwv_precision, mnb_avgwv_recall, mnb_avgwv_cost = train_predict_evaluate_model(title=title7, classifier=mnb,
                                            train_features=avg_wv_train_features_scaler,
                                            train_labels=train_labels,
                                            test_features=avg_wv_test_features_scaler,
                                            test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1152
           1       0.99      0.64      0.77       587

    accuracy                           0.87      1739
   macro avg       0.92      0.82      0.84      1739
weighted avg       0.89      0.87      0.87      1739

Confusion Matrix: avgwv_mnb
[[1148    4]
 [ 214  373]]

Total avgwv_mnb Email Cost: $1,470.00


##### 8. Decision Tree

In [35]:
# predict and evaluate decision tree
title8 = "avgwv_dt"

dt_avgwv_predictions, dt_avgwv_accuracy, dt_avgwv_precision, dt_avgwv_recall, dt_avgwv_cost = train_predict_evaluate_model(title=title8, classifier=dt,
                                            train_features=avg_wv_train_features,
                                            train_labels=train_labels,
                                            test_features=avg_wv_test_features,
                                            test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1152
           1       0.96      0.95      0.95       587

    accuracy                           0.97      1739
   macro avg       0.97      0.96      0.97      1739
weighted avg       0.97      0.97      0.97      1739

Confusion Matrix: avgwv_dt
[[1131   21]
 [  32  555]]

Total avgwv_dt Email Cost: $2,260.00


##### 9. Random Forest

In [36]:
# predict and evaluate random forest
title9 = "avgwv_rf"

rf_avgwv_predictions, rf_avgwv_accuracy, rf_avgwv_precision, rf_avgwv_recall, rf_avgwv_cost = train_predict_evaluate_model(title=title9, classifier=rf,
                                            train_features=avg_wv_train_features,
                                            train_labels=train_labels,
                                            test_features=avg_wv_test_features,
                                            test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1152
           1       0.99      0.97      0.98       587

    accuracy                           0.99      1739
   macro avg       0.99      0.98      0.99      1739
weighted avg       0.99      0.99      0.99      1739

Confusion Matrix: avgwv_rf
[[1147    5]
 [  17  570]]

Total avgwv_rf Email Cost: $585.00


### Accuracy Matrix

In [37]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: lightgreen' if v else '' for v in is_max]
def highlight_mins(s):
    '''
    highlight the minimum in a Series green.
    '''
    is_min = s == s.min()
    return ['background-color: red' if v else '' for v in is_min]

# create a dictionary that stores all the accuracy information
accuracy_dict = {}
for m in ["mnb","dt","rf"]:
    accuracy_dict[m] = {}
    for f in ["bag","bag2","tfidf", "avgwv"]:
        exec('accuracy_dict["{}"]["{}"] = {}_{}_accuracy'.format(m, f, m, f))
        
#Accuracy Matrix
accuracy_matrix=pd.DataFrame(accuracy_dict).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bag":"Bag-of-Words", "bag2":"Bag-of-Words (Binary)",
                                          "tfidf":"TF-IDF", 
                                          "avgwv":"Word2Vec"})
accuracy_matrix = accuracy_matrix.applymap(lambda x: f"{x*100:.2f}%").style.set_caption("Accuracy Matrix").apply(highlight_max, subset = accuracy_matrix.columns[-1:]).apply(highlight_mins, subset = accuracy_matrix.columns[0])
accuracy_matrix

Unnamed: 0,Naive Bayes,Decision Tree,Random Forest
Bag-of-Words,91.26%,95.34%,97.99%
Bag-of-Words (Binary),97.47%,94.82%,98.04%
TF-IDF,95.92%,94.65%,97.93%
Word2Vec,87.46%,96.95%,98.73%


### Precision Matrix

In [38]:
# create a dictionary that stores all the accuracy information
precision_dict = {}
for m in ["mnb","dt","rf"]:
    precision_dict[m] = {}
    for f in ["bag","bag2","tfidf", "avgwv"]:
        exec('precision_dict["{}"]["{}"] = {}_{}_precision'.format(m, f, m, f))
        
#Accuracy Matrix
precision_matrix=pd.DataFrame(precision_dict).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bag":"Bag-of-Words","bag2":"Bag-of-Words (Binary)", 
                                          "tfidf":"TF-IDF", 
                                          "avgwv":"Word2Vec"})
precision_matrix = precision_matrix.applymap(lambda x: f"{x*100:.2f}%").style.set_caption("Precision Matrix").apply(highlight_max, subset = accuracy_matrix.columns[-1]).apply(highlight_mins, subset = accuracy_matrix.columns[1])
precision_matrix

Unnamed: 0,Naive Bayes,Decision Tree,Random Forest
Bag-of-Words,98.88%,92.88%,99.46%
Bag-of-Words (Binary),98.74%,92.05%,99.29%
TF-IDF,98.86%,91.30%,99.11%
Word2Vec,98.94%,96.35%,99.13%


### Recall Matrix

In [39]:
# create a dictionary that stores all the accuracy information
recall_dict = {}
for m in ["mnb","dt","rf"]:
    recall_dict[m] = {}
    for f in ["bag", "bag2","tfidf", "avgwv"]:
        exec('recall_dict["{}"]["{}"] = {}_{}_recall'.format(m, f, m, f))
        
#Accuracy Matrix
recall_matrix=pd.DataFrame(recall_dict).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bag":"Bag-of-Words", "bag2":"Bag-of-Words (Binary)",
                                          "tfidf":"TF-IDF", 
                                          "avgwv":"Word2Vec"})
recall_matrix = recall_matrix.applymap(lambda x: f"{x*100:.2f}%").style.set_caption("Recall Matrix").apply(highlight_max, subset = accuracy_matrix.columns[-1:]).apply(highlight_mins, subset = accuracy_matrix.columns[0])
recall_matrix

Unnamed: 0,Naive Bayes,Decision Tree,Random Forest
Bag-of-Words,74.96%,93.36%,94.55%
Bag-of-Words (Binary),93.70%,92.67%,94.89%
TF-IDF,88.93%,93.02%,94.72%
Word2Vec,63.54%,94.55%,97.10%


## Question 2

### Cost Matrix

In [40]:
def highlight_maxs(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: red' if v else '' for v in is_max]
def highlight_min(s):
    '''
    highlight the minimum in a Series green.
    '''
    is_min = s == s.min()
    return ['background-color: lightgreen' if v else '' for v in is_min]

Cost_dict = {}
for m in ["mnb","dt","rf"]:
    Cost_dict[m] = {}
    for f in ["bag","bag2","tfidf", "avgwv"]:
        exec('Cost_dict["{}"]["{}"] = {}_{}_cost'.format(m, f, m, f))
        
#Cost Matrix
cost_matrix=pd.DataFrame(Cost_dict).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bag":"Bag-of-Words", "bag2":"Bag-of-Words (Binary)",
                                          "tfidf":"TF-IDF", 
                                          "avgwv":"Word2Vec"})
cost_matrix = cost_matrix.applymap(lambda x: f"${x:.2f}").style.set_caption("Cost Matrix").apply(highlight_min, subset = accuracy_matrix.columns[-1:]).apply(highlight_maxs, subset = accuracy_matrix.columns[1])
cost_matrix

Unnamed: 0,Naive Bayes,Decision Tree,Random Forest
Bag-of-Words,$1235.00,$4395.00,$460.00
Bag-of-Words (Binary),$885.00,$4915.00,$550.00
TF-IDF,$925.00,$5405.00,$655.00
Word2Vec,$1470.00,$2260.00,$585.00
