In [None]:
import os, re, glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#method for listing the mail files (spam/ham) stored in one folder
def get_files(directory):
    """Return the paths of the regular files located directly in ``directory``.

    Parameters
    ----------
    directory : str
        Folder to list.

    Returns
    -------
    list of str
        ``directory`` joined with each file name, ordered by file name.
        Sub-folders are left out.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed (raised by ``os.listdir``).
    """
    #os.listdir gives no ordering guarantee; sorting keeps the order of the
    #returned paths (and of anything built from them) the same on every run
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    #a sub-folder cannot be opened like a mail file, so only regular files are kept
    return [path for path in candidates if os.path.isfile(path)]

#method for loading the text of one file
def read_file(file):
    """Return the whole content of ``file`` as a string, decoded as Latin-1.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    str
        The decoded text of the file.
    """
    with open(file, mode="r", encoding="Latin-1") as handle:
        contents = handle.read()
    return contents

#folder that holds one sub-folder per mail category
DATA_ROOT = 'C:\\Users\\sarah\\DAT405 assignments\\Assignment 4'

#data frame with one row per ham mail (read from the HardHam folder)
hham_texts = list(map(read_file, get_files(DATA_ROOT + '\\HardHam')))
df_hham = pd.DataFrame(hham_texts)

#data frame with one row per spam mail (read from the Spam folder)
spam_texts = list(map(read_file, get_files(DATA_ROOT + '\\Spam')))
df_spam = pd.DataFrame(spam_texts)

In [None]:
#adding the class label to every mail: 0 for ham and 1 for spam
df_hham['labels'] = 0
df_spam['labels'] = 1

#giving both frames the same column names so that they stack cleanly
df_hham.columns = ['messages', 'labels']
df_spam.columns = ['messages', 'labels']

#merging together to one data frame for all emails, ham rows first.
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
#to stack frames, and ignore_index=True renumbers the rows from 0 as before.
all_emails = pd.concat([df_hham, df_spam], ignore_index=True)

In [None]:
#Removing headers: everything up to and including the first blank line ("\n\n")
#is treated as the header and dropped; a mail without a blank line is kept whole.
#Each row is split exactly once. Looking messages up by value across the whole
#frame instead is quadratic and can strip a mail twice when its full text equals
#the already-stripped body of another mail.
#NOTE: all_emails is overwritten, so re-running this cell strips one more
#paragraph from every mail.
all_emails = all_emails.assign(
    messages=[message.split("\n\n", 1)[-1] for message in all_emails.messages]
)

In [None]:
#patterns used by pre_processor, compiled once instead of on every call.
#They are raw strings so that \w and \W reach the regex engine untouched instead
#of being parsed as (invalid) string escape sequences.
_HTML_TAG = re.compile(r'<.*?>')
_LONG_WORD = re.compile(r'\w{25,}')
_NON_WORD = re.compile(r'\W')

#removing HTML tags, over-long tokens and punctuation
def pre_processor(messages):
    """Clean the text of one mail before it is tokenised.

    Steps, in order:
    1. lower-case the text;
    2. delete HTML tags (``<...>`` that open and close on the same line);
    3. delete runs of 25 or more word characters;
    4. replace every remaining non-word character with a single space.

    Parameters
    ----------
    messages : str
        Text of a single mail.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so consecutive
        punctuation characters produce consecutive spaces.
    """
    messages = messages.lower()
    messages = _HTML_TAG.sub('', messages)
    messages = _LONG_WORD.sub('', messages)
    messages = _NON_WORD.sub(' ', messages)

    return messages

In [None]:
#vectorizer that cleans every mail with pre_processor before counting words
count_vect = CountVectorizer(preprocessor=pre_processor)

#word counts per mail for the whole corpus, then summed over all mails
bag_of_words = count_vect.fit_transform(all_emails.messages)
sum_words = bag_of_words.sum(axis=0)

#pairing every vocabulary word with its total count, most frequent first
words_freq = [(term, sum_words[0, column]) for term, column in count_vect.vocabulary_.items()]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

#the 20 words at the top of the ranking
print(f'The 20 most common words and their freq. are: {words_freq[:20]}')

#the 20 words at the bottom of the ranking
print(f'The 20 least common words and their freq. are: {words_freq[-20:]}')

In [None]:
#Holding out 35% of the mails for testing; the fixed random_state makes the
#shuffle produce the same split on every run
X_train, X_test, y_train, y_test = train_test_split(
    all_emails.messages,
    all_emails.labels,
    test_size=0.35,
    random_state=42,
)

In [None]:
#turning the mails into word-count features with CountVectorizer

#min_df drops words that appear in too few mails and max_df drops words that
#appear in too many (both given as a share of the mails); English stop words
#are removed as well
count_vect = CountVectorizer(
    preprocessor=pre_processor,
    max_df=0.8,
    min_df=0.005,
    stop_words='english',
)

#the vocabulary is fitted on the training mails only and then applied
#unchanged to the test mails
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
print("X_train: ", X_train_counts.shape)
print("X_test: ", X_test_counts.shape)

#total count of every vocabulary word in the training set, most frequent first
sum_words = X_train_counts.sum(axis=0)
words_freq = [(term, sum_words[0, column]) for term, column in count_vect.vocabulary_.items()]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

#the 20 words at the top of the ranking
print(f'The 20 most common words in the training set and their freq. are: {words_freq[:20]}')

#the 20 words at the bottom of the ranking
print(f'The 20 least common words in the training set and their freq. are: {words_freq[-20:]}')

In [None]:
#re-weighting the raw counts with term frequency / inverse document frequency
tfidf_transformer = TfidfTransformer()

#the weights are fitted on the training counts only and then applied
#unchanged to the test counts
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

#the shapes should match those of the count matrices
print("X_train: ", X_train_tfidf.shape)
print("X_test: ", X_test_tfidf.shape)

In [None]:
#Multinomial Naive Bayes, trained on the tf-idf features of the training mails
clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=y_train,
)

In [None]:
#Evaluating the multinomial model on the held-out mails
predicted = clf.predict(X_test_tfidf)

#accuracy = share of test mails whose predicted label equals the true label
hits_n = predicted == y_test
acc_score_n = np.mean(hits_n)
print(f'The accuraccy score of Multinomial naive bayes is: {acc_score_n}')

In [None]:
#Bernoulli Naive Bayes; binarize=0.0 turns every feature into
#"word present" (weight above 0) or "word absent" before fitting
clfB = BernoulliNB(binarize=0.0)
clfB.fit(X=X_train_tfidf, y=y_train)

In [None]:
#Evaluating the Bernoulli model on the held-out mails
predicted = clfB.predict(X_test_tfidf)

#accuracy = share of test mails whose predicted label equals the true label
hits_b = predicted == y_test
acc_score_b = np.mean(hits_b)
print(f'The accuraccy score of Bernoulli naive bayes is: {acc_score_b}')