In [1]:
import warnings
warnings.filterwarnings('ignore')

# 0. Import Data

In [2]:
import pandas as pd

In [3]:
train_data = pd.read_csv('part2-data/train.csv')

# 1. Data Preprocessing

In [4]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string

In [5]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

In [6]:
def preprocessing(df):
    
    text_list = [text.lower() for text in df['text']]
    stop_words = [item for item in ENGLISH_STOP_WORDS]
    punctuations = [item for item in string.punctuation]
    
    subject = [re.findall(f'subject: (.*)\r', text)[0] for text in text_list]
    df['subject'] = subject
    
    subject_word = [re.findall(f'subject: (.*)\r', text)[0].split(' ') for text in text_list]
    df['subject_word'] = subject_word
    
    lemmatizer = WordNetLemmatizer()
    subject_lemma = []
    subject_lemma_concat = []
    for item in subject_word:
        subject_tagged = pos_tag(item)
        subject_list = [
            lemmatizer.lemmatize(word, pos = get_wordnet_pos(pos)) for word, pos in subject_tagged 
            if not word in stop_words + punctuations
        ]
        subject_lemma.append(subject_list)
        subject_lemma_concat.append(' '.join(subject_list))
    df['subject_lemma'] = subject_lemma
    df['subject_lemma_concat'] = subject_lemma_concat
    
    content = [re.sub(f'subject: .*\r\n', '', text).replace('\r\n', ' ') for text in text_list]
    df['content'] = content
    
    content_word = [re.sub(f'subject: .*\r\n', '', text).replace('\r\n', ' ').split(' ') for text in text_list]
    df['content_word'] = content_word
    
    content_lemma = []
    content_lemma_concat = []
    for item in content_word:
        content_tagged = pos_tag(item)
        content_list = [
            lemmatizer.lemmatize(word, pos = get_wordnet_pos(pos)) for word, pos in content_tagged 
            if not word in stop_words + punctuations
        ]
        content_lemma.append(content_list)
        content_lemma_concat.append(' '.join(content_list))
    df['content_lemma'] = content_lemma
    df['content_lemma_concat'] = content_lemma_concat
    
    content_sent_raw = [re.sub(f'subject: .*\r\n', '', text).split('\r\n') for text in text_list]
    content_sent = []
    for item in content_sent_raw:
        word = []
        for sent in item:
            word.append(sent.split(' '))
        content_sent.append(word)
    df['content_sent'] = content_sent
    
    full_text = [subject_word[i] + content_word[i] for i in range(0 ,len(df))]
    df['full_text'] = full_text
    
    full_text_concat = [' '.join(df['full_text'][i]) for i in range(0 ,len(df))]
    df['full_text_concat'] = full_text_concat
    
    full_lemma = [subject_lemma[i] + content_lemma[i] for i in range(0 ,len(df))]
    df['full_lemma'] = full_lemma
    
    full_lemma_concat = [subject_lemma_concat[i] + ' ' + content_lemma_concat[i] for i in range(0 ,len(df))]
    df['full_lemma_concat'] = full_lemma_concat

In [7]:
preprocessing(train_data)

In [8]:
train_data.shape

(3000, 15)

In [9]:
train_data.head()

Unnamed: 0,text,label,subject,subject_word,subject_lemma,subject_lemma_concat,content,content_word,content_lemma,content_lemma_concat,content_sent,full_text,full_text_concat,full_lemma,full_lemma_concat
0,Subject: enron methanol ; meter # : 988291\r\n...,ham,enron methanol ; meter # : 988291,"[enron, methanol, ;, meter, #, :, 988291]","[enron, methanol, meter, 988291]",enron methanol meter 988291,this is a follow up to the note i gave you on ...,"[this, is, a, follow, up, to, the, note, i, ga...","[follow, note, give, monday, 4, 3, 00, prelimi...",follow note give monday 4 3 00 preliminary flo...,"[[this, is, a, follow, up, to, the, note, i, g...","[enron, methanol, ;, meter, #, :, 988291, this...",enron methanol ; meter # : 988291 this is a fo...,"[enron, methanol, meter, 988291, follow, note,...",enron methanol meter 988291 follow note give m...
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham,"hpl nom for january 9 , 2001","[hpl, nom, for, january, 9, ,, 2001]","[hpl, nom, january, 9, 2001]",hpl nom january 9 2001,( see attached file : hplnol 09 . xls ) - hpln...,"[(, see, attached, file, :, hplnol, 09, ., xls...","[attach, file, hplnol, 09, xl, hplnol, 09, xl]",attach file hplnol 09 xl hplnol 09 xl,"[[(, see, attached, file, :, hplnol, 09, ., xl...","[hpl, nom, for, january, 9, ,, 2001, (, see, a...","hpl nom for january 9 , 2001 ( see attached fi...","[hpl, nom, january, 9, 2001, attach, file, hpl...",hpl nom january 9 2001 attach file hplnol 09 x...
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham,neon retreat,"[neon, retreat]","[neon, retreat]",neon retreat,"ho ho ho , we ' re around to that most wonderf...","[ho, ho, ho, ,, we, ', re, around, to, that, m...","[ho, ho, ho, wonderful, time, year, neon, lead...",ho ho ho wonderful time year neon leader retre...,"[[ho, ho, ho, ,, we, ', re, around, to, that, ...","[neon, retreat, ho, ho, ho, ,, we, ', re, arou...","neon retreat ho ho ho , we ' re around to that...","[neon, retreat, ho, ho, ho, wonderful, time, y...",neon retreat ho ho ho wonderful time year neon...
3,"Subject: photoshop , windows , office . cheap ...",spam,"photoshop , windows , office . cheap . main tr...","[photoshop, ,, windows, ,, office, ., cheap, ....","[photoshop, window, office, cheap, main, trend...",photoshop window office cheap main trending,abasements darer prudently fortuitous undergon...,"[abasements, darer, prudently, fortuitous, und...","[abasement, darer, prudently, fortuitous, unde...",abasement darer prudently fortuitous undergone...,"[[abasements, darer, prudently, fortuitous, un...","[photoshop, ,, windows, ,, office, ., cheap, ....","photoshop , windows , office . cheap . main tr...","[photoshop, window, office, cheap, main, trend...",photoshop window office cheap main trending ab...
4,Subject: re : indian springs\r\nthis deal is t...,ham,re : indian springs,"[re, :, indian, springs]","[indian, spring]",indian spring,this deal is to book the teco pvr revenue . it...,"[this, deal, is, to, book, the, teco, pvr, rev...","[deal, book, teco, pvr, revenue, understanding...",deal book teco pvr revenue understanding teco ...,"[[this, deal, is, to, book, the, teco, pvr, re...","[re, :, indian, springs, this, deal, is, to, b...",re : indian springs this deal is to book the t...,"[indian, spring, deal, book, teco, pvr, revenu...",indian spring deal book teco pvr revenue under...


# 2. Feature Extraction

## 2-1. First Type of Feature: Length

In [10]:
def length_count(df):
    
    length = pd.DataFrame()
    
    subject_c_count = []
    subject_c_avg_w = []
    subject_w_count = []
    for item in df['subject_word']:
        count = 0
        for word in item:
            count += len(word)
        c_avg_w = count / len(item)
        subject_c_count.append(count)
        subject_c_avg_w.append(c_avg_w)
        subject_w_count.append(len(item))
    length['subject_c_count'] = subject_c_count
    length['subject_c_avg_w'] = subject_c_avg_w
    length['subject_w_count'] = subject_w_count

    content_c_count = []
    content_c_avg_w = []
    content_w_count = []
    for item in df['content_word']:
        count = 0
        for word in item:
            count += len(word)
        c_avg_w = count / len(item)
        content_c_count.append(count)
        content_c_avg_w.append(c_avg_w)
        content_w_count.append(len(item))
    length['content_c_count'] = content_c_count
    length['content_c_avg_w'] = content_c_avg_w
    length['content_w_count'] = content_w_count
        
    content_c_avg_s = []
    content_w_avg_s = []
    for i in range(0, len(df)):
        count = 0
        for word in df['content_word'][i]:
            count += len(word)
        count = count / len(df['content_sent'][i])
        content_c_avg_s.append(count)
        content_w_avg_s.append(len(df['content_word'][i]) / len(df['content_sent'][i]))
    length['content_c_avg_s'] = content_c_avg_s
    length['content_w_avg_s'] = content_w_avg_s
        
    content_s_count = [len(item) for item in df['content_sent']]
    length['content_s_count'] = content_s_count
    
    subject_pun_count = []
    subject_pun_w_pro = []
    for item in df['subject_word']:
        i = 0
        for word in item:
            if word in string.punctuation:
                i += 1
        subject_pun_count.append(i)
        subject_pun_w_pro.append(i / len(item))
    length['subject_pun_count'] = subject_pun_count
    length['subject_pun_w_pro'] = subject_pun_w_pro

    
    subject_pun_c_pro = []
    for i in range(0 ,len(df)):
        if not subject_c_count[i] == 0:
            subject_pun_c_pro.append(subject_pun_count[i] / subject_c_count[i])
        else:
            subject_pun_c_pro.append(0)
    length['subject_pun_c_pro'] = subject_pun_c_pro
            
    content_pun_count = []
    content_pun_w_pro = []
    for item in df['content_word']:
        i = 0
        for word in item:
            if word in string.punctuation:
                i += 1
        content_pun_count.append(i)
        content_pun_w_pro.append(i / len(item))
    length['content_pun_count'] = content_pun_count
    length['content_pun_w_pro'] = content_pun_w_pro
    
    content_pun_c_pro = []
    for i in range(0 ,len(df)):
        if not content_c_count[i] == 0:
            content_pun_c_pro.append(content_pun_count[i] / content_c_count[i])
        else:
            content_pun_c_pro.append(0)
    length['content_pun_c_pro'] = content_pun_c_pro
    
    return length

In [11]:
length_train = length_count(train_data)

In [12]:
length_train.shape

(3000, 15)

In [13]:
length_train.head()

Unnamed: 0,subject_c_count,subject_c_avg_w,subject_w_count,content_c_count,content_c_avg_w,content_w_count,content_c_avg_s,content_w_avg_s,content_s_count,subject_pun_count,subject_pun_w_pro,subject_pun_c_pro,content_pun_count,content_pun_w_pro,content_pun_c_pro
0,27,3.857143,7,221,3.745763,59,44.2,11.8,5,3,0.428571,0.111111,11,0.186441,0.049774
1,22,3.142857,7,43,2.866667,15,21.5,7.5,2,1,0.142857,0.045455,6,0.4,0.139535
2,11,5.5,2,1947,3.559415,547,216.333333,60.777778,9,0,0.0,0.0,81,0.14808,0.041602
3,43,4.3,10,305,8.026316,38,30.5,3.8,10,4,0.4,0.093023,1,0.026316,0.003279
4,16,4.0,4,239,3.676923,65,59.75,16.25,4,1,0.25,0.0625,5,0.076923,0.020921


### Select the Best Feature\*<br>
\* a failed try because it seems that the strong correlation between a feature and label does not necessarily mean that it is a good feature.

In [14]:
from scipy.stats import stats

In [15]:
# label_train = []
# for item in train_data['label']:
#     if item == 'ham':
#         label_train.append(0)
#     elif item == 'spam':
#         label_train.append(1)

In [16]:
# length_column = [column for column in length_train.columns]

# length_column_need = []
# for column in length_column:
#     corr, p = stats.pearsonr([item for item in length_train[column]], label_train)
#     if p >= 0.05:
#         length_column_need.append(column)
        
# length_column_need

## 2-2. Second Type of Feature: Bag of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
count_vect_subject = CountVectorizer()
count_vect_content = CountVectorizer()

In [19]:
def bag_of_words_train(df):
        
    subject_bow_train_counts = count_vect_subject.fit_transform(df['subject_lemma_concat'])
    transformer_subject = TfidfTransformer().fit(subject_bow_train_counts)
    subject_bow_train_tfidf = transformer_subject.transform(subject_bow_train_counts)
    subject_tfidf = pd.DataFrame.sparse.from_spmatrix(
        subject_bow_train_tfidf, columns = count_vect_subject.get_feature_names_out()
    )
    subject_tfidf = subject_tfidf.add_prefix('sj_')
    
    content_bow_train_counts = count_vect_content.fit_transform(df['content_lemma_concat'])
    transformer_content = TfidfTransformer().fit(content_bow_train_counts)
    content_bow_train_tfidf = transformer_content.transform(content_bow_train_counts)
    content_tfidf = pd.DataFrame.sparse.from_spmatrix(
        content_bow_train_tfidf, columns = count_vect_content.get_feature_names_out()
    )
    content_tfidf = content_tfidf.add_prefix('ct_')

    bagofwords = pd.concat([subject_tfidf, content_tfidf], axis = 1)
    
    return bagofwords

In [20]:
bag_of_words_train = bag_of_words_train(train_data)

In [21]:
bag_of_words_train.shape

(3000, 36378)

In [22]:
bag_of_words_train.head()

Unnamed: 0,sj_00,sj_000,sj_0004,sj_0067,sj_0071,sj_01,sj_01405,sj_02,sj_03,sj_0300,...,ct_zxzmcnbf,ct_zyban,ct_zyjvit,ct_zyl,ct_zynsdirnh,ct_zynve,ct_zzezrjok,ct_zzn,ct_zzso,ct_zzsyt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2-3. Third Type of Feature: Word Similarity

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics import jaccard_score

In [24]:
def spam_extract(df):
    spam = df.loc[df['label'] == 'spam']
    spam_subject_raw = ' '.join([text for text in spam['subject']])
    spam_content_raw = ' '.join([text for text in spam['content']])
    spam_subject_lemma = ' '.join([text for text in spam['subject_lemma_concat']])
    spam_content_lemma = ' '.join([text for text in spam['content_lemma_concat']])
    spam_full_raw = ' '.join([text for text in spam['full_text_concat']])
    spam_full_lemma = ' '.join([text for text in spam['full_lemma_concat']])
    return [spam_subject_raw, spam_content_raw, spam_subject_lemma, spam_content_lemma]

In [25]:
spam_text = spam_extract(train_data)

In [26]:
count_vect_sim_subject_raw = CountVectorizer()
count_vect_sim_content_raw = CountVectorizer()
count_vect_sim_subject_lemma = CountVectorizer()
count_vect_sim_content_lemma = CountVectorizer()
count_vect_sim_full_raw = CountVectorizer()
count_vect_sim_full_lemma = CountVectorizer()

In [27]:
def similarity_vector_train(df):
    
    subject_raw_vect = count_vect_sim_subject_raw.fit_transform([spam_text[0]] + [text for text in df['subject']])
    subject_lemma_vect = count_vect_sim_subject_lemma.fit_transform(
        [spam_text[2]] + [text for text in df['subject_lemma_concat']]
    )
    
    content_raw_vect = count_vect_sim_content_raw.fit_transform([spam_text[1]] + [text for text in df['content']])
    content_lemma_vect = count_vect_sim_content_lemma.fit_transform(
        [spam_text[3]] + [text for text in df['content_lemma_concat']]
    )
    
    full_raw_vect = count_vect_sim_full_raw.fit_transform([spam_text[1]] + [text for text in df['full_text_concat']])
    full_lemma_vect = count_vect_sim_full_lemma.fit_transform(
        [spam_text[3]] + [text for text in df['full_lemma_concat']]
    )
    
    return [subject_raw_vect, content_raw_vect, subject_lemma_vect, content_lemma_vect, full_raw_vect, full_lemma_vect]

In [28]:
similarity_vect_train = similarity_vector_train(train_data)

In [29]:
def compute_cos_similarity(vect):
    return [
        cosine_similarity(vect[0], vect[i])[0][0] 
        for i in range(1, vect.shape[0])
           ]

In [30]:
def similarity_df(similarity_vect):
    
    similarity = pd.DataFrame()
    
    similarity['subject_raw_sim'] = compute_cos_similarity(similarity_vect[0])
    similarity['content_raw_sim'] = compute_cos_similarity(similarity_vect[1])
    similarity['subject_lemma_sim'] = compute_cos_similarity(similarity_vect[2])
    similarity['content_lemma_sim'] = compute_cos_similarity(similarity_vect[3])
    similarity['full_raw_sim'] = compute_cos_similarity(similarity_vect[4])
    similarity['full_lemma_sim'] = compute_cos_similarity(similarity_vect[5])

    return similarity

In [31]:
similarity_train = similarity_df(similarity_vect_train).multiply(100)

In [32]:
similarity_train.shape

(3000, 6)

In [33]:
similarity_train.head()

Unnamed: 0,subject_raw_sim,content_raw_sim,subject_lemma_sim,content_lemma_sim,full_raw_sim,full_lemma_sim
0,0.0,39.1889,0.0,9.289697,37.788841,8.708504
1,10.853957,0.438871,1.261484,0.619363,4.276629,0.676846
2,0.0,81.179939,0.0,19.960519,81.019409,19.700789
3,7.254263,0.126803,14.763301,0.927616,0.657822,2.811889
4,13.762176,40.069839,0.0,12.60166,39.529842,12.399019


## 2-4. Concat Three Type of Features

In [34]:
def feature_concat(length, bag_of_words, similarity):
    return pd.concat([length, bag_of_words, similarity], axis = 1)

In [35]:
features_train = feature_concat(length_train, bag_of_words_train, similarity_train)

In [36]:
features_train.shape

(3000, 36399)

In [37]:
features_train.head()

Unnamed: 0,subject_c_count,subject_c_avg_w,subject_w_count,content_c_count,content_c_avg_w,content_w_count,content_c_avg_s,content_w_avg_s,content_s_count,subject_pun_count,...,ct_zzezrjok,ct_zzn,ct_zzso,ct_zzsyt,subject_raw_sim,content_raw_sim,subject_lemma_sim,content_lemma_sim,full_raw_sim,full_lemma_sim
0,27,3.857143,7,221,3.745763,59,44.2,11.8,5,3,...,0.0,0.0,0.0,0.0,0.0,39.1889,0.0,9.289697,37.788841,8.708504
1,22,3.142857,7,43,2.866667,15,21.5,7.5,2,1,...,0.0,0.0,0.0,0.0,10.853957,0.438871,1.261484,0.619363,4.276629,0.676846
2,11,5.5,2,1947,3.559415,547,216.333333,60.777778,9,0,...,0.0,0.0,0.0,0.0,0.0,81.179939,0.0,19.960519,81.019409,19.700789
3,43,4.3,10,305,8.026316,38,30.5,3.8,10,4,...,0.0,0.0,0.0,0.0,7.254263,0.126803,14.763301,0.927616,0.657822,2.811889
4,16,4.0,4,239,3.676923,65,59.75,16.25,4,1,...,0.0,0.0,0.0,0.0,13.762176,40.069839,0.0,12.60166,39.529842,12.399019


In [38]:
features_train_num = features_train.to_numpy()
length_train_num = length_train.to_numpy()
bag_of_words_train_num = bag_of_words_train.to_numpy()
similarity_train_num = similarity_train.to_numpy()

## 2-5. Feature Extraction of Test Data

In [39]:
def bag_of_words_test(df):
        
    subject_bow_train_counts = count_vect_subject.transform(df['subject_lemma_concat'])
    transformer_subject = TfidfTransformer().fit(subject_bow_train_counts)
    subject_bow_train_tfidf = transformer_subject.transform(subject_bow_train_counts)
    subject_tfidf = pd.DataFrame.sparse.from_spmatrix(
        subject_bow_train_tfidf, columns = count_vect_subject.get_feature_names_out()
    )
    subject_tfidf = subject_tfidf.add_prefix('sj_')
    
    content_bow_train_counts = count_vect_content.transform(df['content_lemma_concat'])
    transformer_content = TfidfTransformer().fit(content_bow_train_counts)
    content_bow_train_tfidf = transformer_content.transform(content_bow_train_counts)
    content_tfidf = pd.DataFrame.sparse.from_spmatrix(
        content_bow_train_tfidf, columns = count_vect_content.get_feature_names_out()
    )
    content_tfidf = content_tfidf.add_prefix('ct_')

    bagofwords = pd.concat([subject_tfidf, content_tfidf], axis = 1)
    
    return bagofwords

In [40]:
def similarity_vector_test(df):
    
    subject_raw_vect = count_vect_sim_subject_raw.transform([spam_text[0]] + [text for text in df['subject']])
    subject_lemma_vect = count_vect_sim_subject_lemma.transform(
        [spam_text[2]] + [text for text in df['subject_lemma_concat']]
    )
    
    content_raw_vect = count_vect_sim_content_raw.transform([spam_text[1]] + [text for text in df['content']])
    content_lemma_vect = count_vect_sim_content_lemma.transform(
        [spam_text[3]] + [text for text in df['content_lemma_concat']]
    )
    
    full_raw_vect = count_vect_sim_full_raw.transform([spam_text[1]] + [text for text in df['full_text_concat']])
    full_lemma_vect = count_vect_sim_full_lemma.transform(
        [spam_text[3]] + [text for text in df['full_lemma_concat']]
    )
    
    return [subject_raw_vect, content_raw_vect, subject_lemma_vect, content_lemma_vect, full_raw_vect, full_lemma_vect]

In [41]:
test_data = pd.read_csv('part2-data/test.csv')

In [42]:
preprocessing(test_data)

In [43]:
test_data.shape

(2171, 15)

In [44]:
test_data.head()

Unnamed: 0,text,label,subject,subject_word,subject_lemma,subject_lemma_concat,content,content_word,content_lemma,content_lemma_concat,content_sent,full_text,full_text_concat,full_lemma,full_lemma_concat
0,"Subject: enron / hpl actuals for november 20 ,...",ham,"enron / hpl actuals for november 20 , 2000","[enron, /, hpl, actuals, for, november, 20, ,,...","[enron, hpl, actuals, november, 20, 2000]",enron hpl actuals november 20 2000,teco tap 30 . 000 / enron ; 68 . 542 / hpl gas...,"[teco, tap, 30, ., 000, /, enron, ;, 68, ., 54...","[teco, tap, 30, 000, enron, 68, 542, hpl, gas,...",teco tap 30 000 enron 68 542 hpl gas daily,"[[teco, tap, 30, ., 000, /, enron, ;, 68, ., 5...","[enron, /, hpl, actuals, for, november, 20, ,,...","enron / hpl actuals for november 20 , 2000 tec...","[enron, hpl, actuals, november, 20, 2000, teco...",enron hpl actuals november 20 2000 teco tap 30...
1,Subject: defs purchase of teco pipeline\r\neff...,ham,defs purchase of teco pipeline,"[defs, purchase, of, teco, pipeline]","[defs, purchase, teco, pipeline]",defs purchase teco pipeline,"effective february 1 , 2001 duke energy field ...","[effective, february, 1, ,, 2001, duke, energy...","[effective, february, 1, 2001, duke, energy, f...",effective february 1 2001 duke energy field se...,"[[effective, february, 1, ,, 2001, duke, energ...","[defs, purchase, of, teco, pipeline, effective...",defs purchase of teco pipeline effective febru...,"[defs, purchase, teco, pipeline, effective, fe...",defs purchase teco pipeline effective february...
2,Subject: your son knows you watch girls finger...,spam,your son knows you watch girls fingering their...,"[your, son, knows, you, watch, girls, fingerin...","[son, know, watch, girl, finger, asshole]",son know watch girl finger asshole,remove campanile dittyutile portent blatarchfo...,"[remove, campanile, dittyutile, portent, blata...","[remove, campanile, dittyutile, portent, blata...",remove campanile dittyutile portent blatarchfo...,"[[remove], [campanile, dittyutile, portent, bl...","[your, son, knows, you, watch, girls, fingerin...",your son knows you watch girls fingering their...,"[son, know, watch, girl, finger, asshole, remo...",son know watch girl finger asshole remove camp...
3,Subject: eex corporation - meter # 5999\r\nsit...,ham,eex corporation - meter # 5999,"[eex, corporation, -, meter, #, 5999]","[eex, corporation, meter, 5999]",eex corporation meter 5999,sitara deal ticket # 314349 has been created a...,"[sitara, deal, ticket, #, 314349, has, been, c...","[sitara, deal, ticket, 314349, create, enter, ...",sitara deal ticket 314349 create enter july 20...,"[[sitara, deal, ticket, #, 314349, has, been, ...","[eex, corporation, -, meter, #, 5999, sitara, ...",eex corporation - meter # 5999 sitara deal tic...,"[eex, corporation, meter, 5999, sitara, deal, ...",eex corporation meter 5999 sitara deal ticket ...
4,"Subject: deal 93481\r\ndaren ,\r\ni ' m lookin...",ham,deal 93481,"[deal, 93481]","[deal, 93481]",deal 93481,"daren , i ' m looking into this deal . there i...","[daren, ,, i, ', m, looking, into, this, deal,...","[daren, m, look, deal, volume, deal, january, ...",daren m look deal volume deal january 00 deal ...,"[[daren, ,], [i, ', m, looking, into, this, de...","[deal, 93481, daren, ,, i, ', m, looking, into...","deal 93481 daren , i ' m looking into this dea...","[deal, 93481, daren, m, look, deal, volume, de...",deal 93481 daren m look deal volume deal janua...


In [45]:
length_test = length_count(test_data)
bag_of_words_test = bag_of_words_test(test_data)

similarity_vect_test = similarity_vector_test(test_data)
similarity_test = similarity_df(similarity_vect_test).multiply(100)

features_test = feature_concat(length_test, bag_of_words_test, similarity_test)

In [46]:
features_test.shape

(2171, 36399)

In [47]:
features_test.head()

Unnamed: 0,subject_c_count,subject_c_avg_w,subject_w_count,content_c_count,content_c_avg_w,content_w_count,content_c_avg_s,content_w_avg_s,content_s_count,subject_pun_count,...,ct_zzezrjok,ct_zzn,ct_zzso,ct_zzsyt,subject_raw_sim,content_raw_sim,subject_lemma_sim,content_lemma_sim,full_raw_sim,full_lemma_sim
0,34,3.777778,9,38,2.533333,15,38.0,15.0,1,2,...,0.0,0.0,0.0,0.0,9.173268,1.075011,1.029998,3.79165,4.934092,4.213159
1,26,5.2,5,1215,3.944805,308,41.896552,10.62069,29,0,...,0.0,0.0,0.0,0.0,7.365185,53.366387,0.0,12.793772,53.4764,12.539463
2,46,5.111111,9,725,9.666667,75,34.52381,3.571429,21,0,...,0.0,0.0,0.0,0.0,25.881719,0.454217,7.898142,2.568845,6.758467,4.253398
3,25,4.166667,6,95,3.275862,29,23.75,7.25,4,2,...,0.0,0.0,0.0,0.0,0.0,15.656883,0.0,4.133573,14.343612,4.167947
4,9,4.5,2,289,3.6125,80,41.285714,11.428571,7,0,...,0.0,0.0,0.0,0.0,0.306457,42.806998,1.189339,10.29354,41.429493,9.828983


In [48]:
features_test_num = features_test.to_numpy()
length_test_num = length_test.to_numpy()
bag_of_words_test_num = bag_of_words_test.to_numpy()
similarity_test_num = similarity_test.to_numpy()

# 3. Training & Testing

In [49]:
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from math import sqrt

In [50]:
features_name = [feature for feature in features_test.columns]
features_type = []
for feature in features_name:
    if feature in [column for column in length_test.columns]:
        features_type.append('length')
    elif feature in [column for column in similarity_test.columns]:
        features_type.append('similarity')
    else:
        features_type.append('bag of words')

## 3-1. Dummy Classifier

In [51]:
from sklearn.dummy import DummyClassifier

#### Cross-Validation (Train_Data)

In [52]:
dummy_clf = DummyClassifier(strategy = "most_frequent").fit(features_train_num, train_data['label'])

In [53]:
dummy_scores = cross_val_score(dummy_clf, features_train_num, train_data['label'], cv = 10)

In [54]:
np.mean(dummy_scores)

0.7240000000000001

In [55]:
1.96 * sqrt((np.mean(dummy_scores) * (1 - np.mean(dummy_scores))) / len(train_data))

0.015996289969864887

#### Prediction

In [56]:
y_pred_dummy = dummy_clf.predict(features_test_num)

In [57]:
accuracy_dummy = dummy_clf.score(features_test_num, test_data['label'])
precision_dummy = precision_score(test_data['label'], y_pred_dummy, average = 'macro')
recall_dummy = recall_score(test_data['label'], y_pred_dummy, pos_label = 'spam')
f1_dummy = f1_score(test_data['label'], y_pred_dummy, pos_label = 'spam')

In [58]:
accuracy_dummy, precision_dummy, recall_dummy, f1_dummy

(0.6909258406264395, 0.34546292031321973, 0.0, 0.0)

In [59]:
accuracy_dummy_interval = 1.96 * sqrt((accuracy_dummy * (1 - accuracy_dummy)) / len(test_data))

In [60]:
accuracy_dummy_interval

0.019438968937633017

## 3-2. Linear Classifier

### Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression

#### Cross-Validation (Train_Data)

In [62]:
LR_clf = LogisticRegression().fit(features_train_num, train_data['label'])

In [63]:
LR_scores = cross_val_score(LR_clf, features_train_num, train_data['label'], cv = 10)

In [64]:
np.mean(LR_scores)

0.857

In [65]:
1.96 * sqrt((np.mean(LR_scores) * (1 - np.mean(LR_scores))) / len(train_data))

0.012527196036357591

#### Prediction

In [66]:
y_pred_LR = LR_clf.predict(features_test_num)

In [67]:
accuracy_LR = LR_clf.score(features_test_num, test_data['label'])
precision_LR = precision_score(test_data['label'], y_pred_LR, average = 'macro')
recall_LR = recall_score(test_data['label'], y_pred_LR, pos_label = 'spam')
f1_LR = f1_score(test_data['label'], y_pred_LR, pos_label = 'spam')

In [68]:
accuracy_LR, precision_LR, recall_LR, f1_LR

(0.8420082911100876,
 0.8328891580860085,
 0.6318926974664679,
 0.7120067170445002)

In [69]:
accuracy_LR_interval = 1.96 * sqrt((accuracy_LR * (1 - accuracy_LR)) / len(test_data))

In [70]:
accuracy_LR_interval

0.015342689353946625

#### Contribution of Features

In [71]:
feature_importance_LR = pd.DataFrame()
feature_importance_LR['Feature'] = features_name
feature_importance_LR['Type'] = features_type
feature_importance_LR_sum = sum([np.abs(num) for num in LR_clf.coef_[0]])
feature_importance_LR['Importance'] = [np.abs(num) / feature_importance_LR_sum for num in LR_clf.coef_[0]]
feature_importance_LR.sort_values(by = 'Importance', ascending = False).head(10)

Unnamed: 0,Feature,Type,Importance
36395,subject_lemma_sim,similarity,0.108949
1,subject_c_avg_w,length,0.05258
7,content_w_avg_s,length,0.052439
4,content_c_avg_w,length,0.033101
36398,full_lemma_sim,similarity,0.0325
36396,content_lemma_sim,similarity,0.029338
9,subject_pun_count,length,0.019045
36394,content_raw_sim,similarity,0.018344
6,content_c_avg_s,length,0.013934
0,subject_c_count,length,0.012296


##### Length

In [72]:
LR_clf_length = LogisticRegression().fit(length_train_num, train_data['label'])
y_pred_LR_length = LR_clf_length.predict(length_test_num)
accuracy_LR_length = LR_clf_length.score(length_test_num, test_data['label'])
precision_LR_length = precision_score(test_data['label'], y_pred_LR_length, average = 'macro')
recall_LR_length = recall_score(test_data['label'], y_pred_LR_length, pos_label = 'spam')
f1_LR_length = f1_score(test_data['label'], y_pred_LR_length, pos_label = 'spam')
accuracy_LR_length, precision_LR_length, recall_LR_length, f1_LR_length

(0.7632427452786734,
 0.8260290148448044,
 0.26229508196721313,
 0.40646651270207856)

##### Bag of Words

In [73]:
LR_clf_bow = LogisticRegression().fit(bag_of_words_train_num, train_data['label'])
y_pred_LR_bow = LR_clf_bow.predict(bag_of_words_test_num)
accuracy_LR_bow = LR_clf_bow.score(bag_of_words_test_num, test_data['label'])
precision_LR_bow = precision_score(test_data['label'], y_pred_LR_bow, average = 'macro')
recall_LR_bow = recall_score(test_data['label'], y_pred_LR_bow, pos_label = 'spam')
f1_LR_bow = f1_score(test_data['label'], y_pred_LR_bow, pos_label = 'spam')
accuracy_LR_bow, precision_LR_bow, recall_LR_bow, f1_LR_bow

(0.9843390142791341,
 0.9820498428602421,
 0.9731743666169895,
 0.9746268656716417)

##### Similarity

In [74]:
LR_clf_sim = LogisticRegression().fit(similarity_train_num, train_data['label'])
y_pred_LR_sim = LR_clf_sim.predict(similarity_test_num)
accuracy_LR_sim = LR_clf_sim.score(similarity_test_num, test_data['label'])
precision_LR_sim = precision_score(test_data['label'], y_pred_LR_sim, average = 'macro')
recall_LR_sim = recall_score(test_data['label'], y_pred_LR_sim, pos_label = 'spam')
f1_LR_sim = f1_score(test_data['label'], y_pred_LR_sim, pos_label = 'spam')
accuracy_LR_sim, precision_LR_sim, recall_LR_sim, f1_LR_sim

(0.8198986642100414,
 0.8146889825302019,
 0.5499254843517138,
 0.6536758193091231)

## 3-3. Probabilistic Classifier

### Naïve Bayes

In [75]:
from sklearn.naive_bayes import MultinomialNB

#### Cross Validation (Train_Data)

In [76]:
NB_clf = MultinomialNB().fit(features_train_num, train_data['label'])

In [77]:
NB_scores = cross_val_score(NB_clf, features_train_num, train_data['label'], cv = 10)

In [78]:
np.mean(NB_scores)

0.851

In [79]:
1.96 * sqrt((np.mean(NB_scores) * (1 - np.mean(NB_scores))) / len(train_data))

0.012742462326149265

#### Predction

In [80]:
y_pred_NB = NB_clf.predict(features_test_num)

In [81]:
accuracy_NB = NB_clf.score(features_test_num, test_data['label'])
precision_NB = precision_score(test_data['label'], y_pred_NB, average = 'macro')
recall_NB = recall_score(test_data['label'], y_pred_NB, pos_label = 'spam')
f1_NB = f1_score(test_data['label'], y_pred_NB, pos_label = 'spam')

In [82]:
accuracy_NB, precision_NB, recall_NB, f1_NB

(0.8530631045601106,
 0.8526906044284859,
 0.6348733233979136,
 0.7275832621690864)

In [83]:
accuracy_NB_interval = 1.96 * sqrt((accuracy_NB * (1 - accuracy_NB)) / len(test_data))

In [84]:
accuracy_NB_interval

0.014892999292565825

#### Contribution of Features

In [85]:
log_prob_NB = NB_clf.feature_log_prob_
prob_NB = np.exp(log_prob_NB)
feature_importances_NB = np.mean(prob_NB, axis = 0)
feature_importances_NB_df = pd.DataFrame()
feature_importances_NB_df['Feature'] = features_name
feature_importances_NB_df['Type'] = features_type
feature_importances_NB_df['Importance'] = feature_importances_NB
feature_importances_NB_df.sort_values(by = 'Importance', ascending = False).head(10)

Unnamed: 0,Feature,Type,Importance
3,content_c_count,length,0.594845
5,content_w_count,length,0.162796
12,content_pun_count,length,0.042845
6,content_c_avg_s,length,0.03612
36397,full_raw_sim,similarity,0.025615
36394,content_raw_sim,similarity,0.025384
0,subject_c_count,length,0.020164
8,content_s_count,length,0.014405
7,content_w_avg_s,length,0.009848
36398,full_lemma_sim,similarity,0.008699


##### Length

In [86]:
NB_clf_length = MultinomialNB().fit(length_train_num, train_data['label'])
y_pred_NB_length = NB_clf_length.predict(length_test_num)
accuracy_NB_length = NB_clf_length.score(length_test_num, test_data['label'])
precision_NB_length = precision_score(test_data['label'], y_pred_NB_length, average = 'macro')
recall_NB_length = recall_score(test_data['label'], y_pred_NB_length, pos_label = 'spam')
f1_NB_length = f1_score(test_data['label'], y_pred_NB_length, pos_label = 'spam')
accuracy_NB_length, precision_NB_length, recall_NB_length, f1_NB_length

(0.5679410409949333,
 0.5633177593902216,
 0.5901639344262295,
 0.45780346820809253)

##### Bag of Words

In [87]:
NB_clf_bow = MultinomialNB().fit(bag_of_words_train_num, train_data['label'])
y_pred_NB_bow = NB_clf_bow.predict(bag_of_words_test_num)
accuracy_NB_bow = NB_clf_bow.score(bag_of_words_test_num, test_data['label'])
precision_NB_bow = precision_score(test_data['label'], y_pred_NB_bow, average = 'macro')
recall_NB_bow = recall_score(test_data['label'], y_pred_NB_bow, pos_label = 'spam')
f1_NB_bow = f1_score(test_data['label'], y_pred_NB_bow, pos_label = 'spam')
accuracy_NB_bow, precision_NB_bow, recall_NB_bow, f1_NB_bow

(0.9207738369415016,
 0.9478043912175649,
 0.7451564828614009,
 0.8532423208191126)

##### Similarity

In [88]:
NB_clf_sim = MultinomialNB().fit(similarity_train_num, train_data['label'])
y_pred_NB_sim = NB_clf_sim.predict(similarity_test_num)
accuracy_NB_sim = NB_clf_sim.score(similarity_test_num, test_data['label'])
precision_NB_sim = precision_score(test_data['label'], y_pred_NB_sim, average = 'macro')
recall_NB_sim = recall_score(test_data['label'], y_pred_NB_sim, pos_label = 'spam')
f1_NB_sim = f1_score(test_data['label'], y_pred_NB_sim, pos_label = 'spam')
accuracy_NB_sim, precision_NB_sim, recall_NB_sim, f1_NB_sim

(0.7010594196222939,
 0.6693778591843416,
 0.6616989567809239,
 0.5777488614183475)

## 3-4. Decision Tree

In [89]:
from sklearn.tree import DecisionTreeClassifier

#### Cross Validation (Train_Data)

In [90]:
DT_clf = DecisionTreeClassifier().fit(features_train_num, train_data['label'])

In [91]:
DT_scores = cross_val_score(DT_clf, features_train_num, train_data['label'], cv = 10)

In [92]:
np.mean(DT_scores)

0.9326666666666666

In [93]:
1.96 * sqrt((np.mean(DT_scores) * (1 - np.mean(DT_scores))) / len(train_data))

0.008967548394483714

#### Predction

In [94]:
y_pred_DT = DT_clf.predict(features_test_num)
accuracy_DT = DT_clf.score(features_test_num, test_data['label'])
precision_DT = precision_score(test_data['label'], y_pred_DT, average = 'macro')
recall_DT = recall_score(test_data['label'], y_pred_DT, pos_label = 'spam')
f1_DT = f1_score(test_data['label'], y_pred_DT, pos_label = 'spam')

In [95]:
accuracy_DT, precision_DT, recall_DT, f1_DT

(0.9253800092123445,
 0.9235122420861476,
 0.8315946348733234,
 0.8732394366197183)

In [141]:
accuracy_DT_interval = 1.96 * sqrt((accuracy_DT * (1 - accuracy_DT)) / len(train_data))

In [142]:
accuracy_DT_interval

0.009403360717751249

#### Contribution of Features

In [98]:
feature_importance_DT = pd.DataFrame()
feature_importance_DT['Feature'] = features_name
feature_importance_DT['Type'] = features_type
feature_importance_DT['Importance'] = [num for num in DT_clf.feature_importances_]
feature_importance_DT.sort_values(by = 'Importance', ascending = False).head(10)

Unnamed: 0,Feature,Type,Importance
36395,subject_lemma_sim,similarity,0.245599
16030,ct_enron,bag of words,0.12456
19792,ct_http,bag of words,0.077835
15,sj_00,bag of words,0.064877
1,subject_c_avg_w,length,0.055112
4,content_c_avg_w,length,0.042098
36397,full_raw_sim,similarity,0.031733
32856,ct_thanks,bag of words,0.022922
7,content_w_avg_s,length,0.019794
2028,sj_new,bag of words,0.018672


##### Length

In [99]:
NB_clf_length = MultinomialNB().fit(length_train_num, train_data['label'])
y_pred_NB_length = NB_clf_length.predict(length_test_num)
accuracy_NB_length = NB_clf_length.score(length_test_num, test_data['label'])
precision_NB_length = precision_score(test_data['label'], y_pred_NB_length, average = 'macro')
recall_NB_length = recall_score(test_data['label'], y_pred_NB_length, pos_label = 'spam')
f1_NB_length = f1_score(test_data['label'], y_pred_NB_length, pos_label = 'spam')
accuracy_NB_length, precision_NB_length, recall_NB_length, f1_NB_length

(0.5679410409949333,
 0.5633177593902216,
 0.5901639344262295,
 0.45780346820809253)

##### Bag of Words

In [100]:
NB_clf_bow = MultinomialNB().fit(bag_of_words_train_num, train_data['label'])
y_pred_NB_bow = NB_clf_bow.predict(bag_of_words_test_num)
accuracy_NB_bow = NB_clf_bow.score(bag_of_words_test_num, test_data['label'])
precision_NB_bow = precision_score(test_data['label'], y_pred_NB_bow, average = 'macro')
recall_NB_bow = recall_score(test_data['label'], y_pred_NB_bow, pos_label = 'spam')
f1_NB_bow = f1_score(test_data['label'], y_pred_NB_bow, pos_label = 'spam')
accuracy_NB_bow, precision_NB_bow, recall_NB_bow, f1_NB_bow

(0.9207738369415016,
 0.9478043912175649,
 0.7451564828614009,
 0.8532423208191126)

##### Similarity

In [101]:
NB_clf_sim = MultinomialNB().fit(similarity_train_num, train_data['label'])
y_pred_NB_sim = NB_clf_sim.predict(similarity_test_num)
accuracy_NB_sim = NB_clf_sim.score(similarity_test_num, test_data['label'])
precision_NB_sim = precision_score(test_data['label'], y_pred_NB_sim, average = 'macro')
recall_NB_sim = recall_score(test_data['label'], y_pred_NB_sim, pos_label = 'spam')
f1_NB_sim = f1_score(test_data['label'], y_pred_NB_sim, pos_label = 'spam')
accuracy_NB_sim, precision_NB_sim, recall_NB_sim, f1_NB_sim

(0.7010594196222939,
 0.6693778591843416,
 0.6616989567809239,
 0.5777488614183475)

## 3-5. Neural Network (Linear)

In [102]:
from sklearn.neural_network import MLPClassifier

#### Cross-Validation (Train_Data)

In [103]:
NN_clf = MLPClassifier(solver = 'adam', activation = 'logistic').fit(features_train_num, train_data['label'])

In [104]:
NN_scores = cross_val_score(NN_clf, features_train_num, train_data['label'], cv = 10)

In [105]:
np.mean(NN_scores)

0.9666666666666668

In [106]:
1.96 * sqrt((np.mean(NN_scores) * (1 - np.mean(NN_scores))) / len(train_data))

0.0064235215909678

#### Prediction

In [107]:
y_pred_NN = NN_clf.predict(features_test_num)

In [108]:
accuracy_NN = NN_clf.score(features_test_num, test_data['label'])
precision_NN = precision_score(test_data['label'], y_pred_NN, average = 'macro')
recall_NN = recall_score(test_data['label'], y_pred_NN, pos_label = 'spam')
f1_NN = f1_score(test_data['label'], y_pred_NN, pos_label = 'spam')

In [109]:
accuracy_NN, precision_NN, recall_NN, f1_NN

(0.9649930907415938, 0.9597156209129478, 0.940387481371088, 0.9431988041853514)

In [110]:
accuracy_NN_interval = 1.96 * sqrt((accuracy_NN * (1 - accuracy_NN)) / len(test_data))

In [111]:
accuracy_NN_interval

0.007731524669671372

#### Contribution of Features

In [112]:
feature_importance_NN = pd.DataFrame()
feature_NN_cal = [sum(np.abs(feature)) for feature in NN_clf.coefs_[0]]
feature_NN_cal_sum = sum(feature_NN_cal)
feature_NN_cal = [num / feature_NN_cal_sum for num in feature_NN_cal]
feature_importance_NN['Feature'] = features_name
feature_importance_NN['Type'] = features_type
feature_importance_NN['Importance'] = feature_NN_cal
feature_importance_NN.sort_values(by = 'Importance', ascending = False).head(10)

Unnamed: 0,Feature,Type,Importance
2028,sj_new,bag of words,0.000255
21773,ct_ken,bag of words,0.000235
15909,ct_employee,bag of words,0.000231
4028,ct_2004,bag of words,0.00023
28424,ct_question,bag of words,0.000223
16030,ct_enron,bag of words,0.000222
13710,ct_daren,bag of words,0.000216
22582,ct_let,bag of words,0.000215
27317,ct_pm,bag of words,0.000214
2220,sj_picture,bag of words,0.000214


##### Length

In [113]:
NN_clf_length = MLPClassifier(solver = 'adam', activation = 'logistic').fit(length_train_num, train_data['label'])
y_pred_NN_length = NN_clf_length.predict(length_test_num)
accuracy_NN_length = NN_clf_length.score(length_test_num, test_data['label'])
precision_NN_length = precision_score(test_data['label'], y_pred_NN_length, average = 'macro')
recall_NN_length = recall_score(test_data['label'], y_pred_NN_length, pos_label = 'spam')
f1_NN_length = f1_score(test_data['label'], y_pred_NN_length, pos_label = 'spam')
accuracy_NN_length, precision_NN_length, recall_NN_length, f1_NN_length

(0.8249654537079687,
 0.8144312387730974,
 0.5842026825633383,
 0.6735395189003437)

##### Bag of Words

In [114]:
NN_clf_bow = MLPClassifier(solver = 'adam', activation = 'logistic').fit(bag_of_words_train_num, train_data['label'])
y_pred_NN_bow = NN_clf_bow.predict(bag_of_words_test_num)
accuracy_NN_bow = NN_clf_bow.score(bag_of_words_test_num, test_data['label'])
precision_NN_bow = precision_score(test_data['label'], y_pred_NN_bow, average = 'macro')
recall_NN_bow = recall_score(test_data['label'], y_pred_NN_bow, pos_label = 'spam')
f1_NN_bow = f1_score(test_data['label'], y_pred_NN_bow, pos_label = 'spam')
accuracy_NN_bow, precision_NN_bow, recall_NN_bow, f1_NN_bow

(0.987563334868724, 0.9848643875287667, 0.9821162444113264, 0.9799256505576207)

##### Similarity

In [115]:
NN_clf_sim = MLPClassifier(solver = 'adam', activation = 'logistic').fit(similarity_train_num, train_data['label'])
y_pred_NN_sim = NN_clf_sim.predict(similarity_test_num)
accuracy_NN_sim = NN_clf_sim.score(similarity_test_num, test_data['label'])
precision_NN_sim = precision_score(test_data['label'], y_pred_NN_sim, average = 'macro')
recall_NN_sim = recall_score(test_data['label'], y_pred_NN_sim, pos_label = 'spam')
f1_NN_sim = f1_score(test_data['label'], y_pred_NN_sim, pos_label = 'spam')
accuracy_NN_sim, precision_NN_sim, recall_NN_sim, f1_NN_sim

(0.833256563795486, 0.8179673557024951, 0.6304023845007451, 0.7003311258278146)

## 3-6. kNN\*<br>
\* take a too long time to run (to find the best k) so I drop it.

In [116]:
# from sklearn.model_selection import GridSearchCV, KFold
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier()
# k_range = list(range(1, 11))
# param_grid = dict(n_neighbors = k_range)
# kf = KFold(n_splits = 10)
# grid = GridSearchCV(knn, param_grid = param_grid, cv = kf)
# grid.fit(features_train_num, train_data['label'])
# cvKNN = grid.best_estimator_
# y_pred_knn = cvKNN.predict(features_test_num)
# accuracy_knn = cvKNN.score(features_test_num, test_data['label'])
# precision_knn = precision_score(test_data['label'], y_pred_knn, average = 'macro')
# accuracy_knn, precision_knn

I end up do not choose kNN and random forest classifier because they all need a extra step to fine the best parameter (number of k and decision trees) to explore their best potential, but it will take a too long time to run it.

## 3-7. Predictions

In [134]:
predictions = pd.DataFrame()
model_name = ['Dummy', 'LR', 'NB', 'DT', 'NN']
predict_list = [y_pred_dummy, y_pred_LR, y_pred_NB, y_pred_DT, y_pred_NN]
predictions['Real'] = test_data['label']
for i in range(0, len(model_name)):
    predictions[model_name[i]] = predict_list[i]

In [140]:
predictions.head(10)

Unnamed: 0,Real,Dummy,LR,NB,DT,NN
0,ham,ham,ham,ham,ham,ham
1,ham,ham,ham,ham,ham,ham
2,spam,ham,spam,spam,spam,spam
3,ham,ham,ham,ham,ham,ham
4,ham,ham,ham,ham,ham,ham
5,ham,ham,ham,ham,ham,ham
6,spam,ham,ham,ham,spam,spam
7,ham,ham,ham,ham,ham,ham
8,spam,ham,ham,ham,spam,spam
9,ham,ham,ham,ham,ham,ham


# 4. Classifier Evaluation

## 4-1. Train_Data

In [117]:
from scipy.stats import ttest_rel

In [118]:
ttest_name = ['Dummy', 'LR', 'NB', 'DT', 'NN']
ttest_list = [dummy_scores, LR_scores, NB_scores, DT_scores, NN_scores]
ttest_mean = []
ttest_result = []

for i in range(0, len(ttest_list)):
    ttest_mean.append(np.mean(ttest_list[i]))
    t_result = []
    for j in range(0, len(ttest_list)):
        if not i == j:
            t, p = ttest_rel(ttest_list[i], ttest_list[j])
            if p < 0.05:
                if np.mean(ttest_list[i]) > np.mean(ttest_list[j]):
                    t_result.append(j)
    ttest_result.append(t_result)

ttest_df = pd.DataFrame()
for n in range(0, len(ttest_name)):
    ttest_df[ttest_name[n]] = [ttest_mean[n], ttest_result[n]]

In [119]:
t, p = ttest_rel(ttest_list[1], ttest_list[0])

In [120]:
p < 0.05

True

In [121]:
ttest_df

Unnamed: 0,Dummy,LR,NB,DT,NN
0,0.724,0.857,0.851,0.932667,0.966667
1,[],[0],[0],"[0, 1, 2]","[0, 1, 2, 3]"


## 4-2. Test_Data

In [122]:
from scipy.stats import ttest_1samp

In [123]:
t_accuracy, p_accuracy = ttest_1samp([accuracy_LR, accuracy_NB, accuracy_DT, accuracy_NN], accuracy_dummy)
p_accuracy

0.006024237487153212

In [124]:
t_precision, p_precision = ttest_1samp([precision_LR, precision_NB, precision_DT, precision_NN], precision_dummy)
p_precision

0.0003513895130514171

In [125]:
t_recall, p_recall = ttest_1samp([recall_LR, recall_NB, recall_DT, recall_NN], recall_dummy)
p_recall

0.002149965699384519

In [126]:
t_f1, p_f1 = ttest_1samp([f1_LR, f1_NB, f1_DT, f1_NN], f1_dummy)
p_f1

0.0007182723445649311

In [127]:
test_result = pd.DataFrame()
test_result['Dummy'] = [accuracy_dummy, precision_dummy, recall_dummy, f1_dummy]
test_result['LR'] = [accuracy_LR, precision_LR, recall_LR, f1_LR]
test_result['NB'] = [accuracy_NB, precision_NB, recall_NB, f1_NB]
test_result['DT'] = [accuracy_DT, precision_DT, recall_DT, f1_DT]
test_result['NN'] = [accuracy_NN, precision_NN, recall_NN, f1_NN]

In [128]:
test_result

Unnamed: 0,Dummy,LR,NB,DT,NN
0,0.690926,0.842008,0.853063,0.92538,0.964993
1,0.345463,0.832889,0.852691,0.923512,0.959716
2,0.0,0.631893,0.634873,0.831595,0.940387
3,0.0,0.712007,0.727583,0.873239,0.943199


## 4-3. Test v.s. Train

In [129]:
t_dummy, p_dummy = ttest_1samp(dummy_scores, accuracy_dummy)
p_dummy

7.226762225904206e-14

In [130]:
t_LR, p_LR = ttest_1samp(LR_scores, accuracy_LR)
p_LR

0.10886247257610704

In [131]:
t_NB, p_NB = ttest_1samp(NB_scores, accuracy_NB)
p_NB

0.7789478693202416

In [132]:
t_DT, p_DT = ttest_1samp(DT_scores, accuracy_DT)
p_DT

0.11534852178155862

In [133]:
t_NN, p_NN = ttest_1samp(NN_scores, accuracy_NN)
p_NN

0.5882717853350015