In [3]:
import os, csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [4]:
# convert all txt files into a single csv file
corpus = './corpus.csv'
test_corpus = './test_corpus.csv'
train_path = './train-mails'
test_path = './test-mails'
with open(corpus, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['File Name', 'Content', 'Spam or Valid'])

    for txt_filename in os.listdir(train_path):
        if txt_filename.endswith('.txt'):
            with open(os.path.join(train_path, txt_filename), 'r', encoding='utf-8') as txt_file:
                content = txt_file.read()
                spam_or_valid = 'spam' if txt_filename.startswith('spmsg') else 'valid'
                csv_writer.writerow([txt_filename, content, spam_or_valid])

with open(test_corpus, 'w', newline='', encoding='utf-8') as test_csv_file:
    csv_writer = csv.writer(test_csv_file)
    csv_writer.writerow(['File Name', 'Content', 'Spam or Valid'])

    for txt_filename in os.listdir(test_path):
        if txt_filename.endswith('.txt'):
            with open(os.path.join(test_path, txt_filename), 'r', encoding='utf-8') as txt_file:
                content = txt_file.read()
                spam_or_valid = 'spam' if txt_filename.startswith('spmsg') else 'valid'
                csv_writer.writerow([txt_filename, content, spam_or_valid])

In [5]:
# Create a dictionary of words (dropping all non-words like punctuation characters, single characters)
# choosing the 2000 most frequent words from training set
csv_file = pd.read_csv('./corpus.csv', encoding='utf-8')
test_csv_file = pd.read_csv('./test_corpus.csv', encoding='utf-8')
csv_file['Content'] = csv_file['Content'].str.replace(r'[^\w\s]+|(?<=\s)\S(?=\s)|\S(?<=\s)','',regex=True)
test_csv_file['Content'] = csv_file['Content'].str.replace(r'[^\w\s]+|(?<=\s)\S(?=\s)|\S(?<=\s)','',regex=True)
dictionary = {}
for i in range(len(csv_file['Content'])):
    for word in csv_file['Content'][i].split():
        if word not in dictionary:
            if len(word) > 1:
                dictionary[word] = 1
        elif word in dictionary:
            dictionary[word] += 1
    del dictionary['Subject']
    

top_2000_words = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)[:2000]
sortedDictionary = dict(top_2000_words)
#for word in top_2000_words:
#   print(word) 

In [7]:
# Transform each mail into a word count vector basing on the dictionary of words
#word_count_matrix = CountVectorizer(vocabulary=sortedDictionary.keys()).transform(csv_file['Content'])

X_train = []
y_train = []
X_test = []
y_test = []

for mail_index in range(len(csv_file['Content'])):
    X_train.append([])
    for key in sortedDictionary.keys():
        word_counter = 0
        for word in csv_file['Content'][mail_index].split():
            if word == key:
                word_counter += 1
        X_train[-1].append(word_counter)
    y_train.append(csv_file['Spam or Valid'][mail_index])

for mail_index in range(len(test_csv_file['Content'])):
    X_test.append([])
    for key in sortedDictionary.keys():
        word_counter = 0
        for word in test_csv_file['Content'][mail_index].split():
            if word == key:
                word_counter += 1
        X_test[-1].append(word_counter)
    y_test.append(csv_file['Spam or Valid'][mail_index])

print('Matrix dimensions:', len(X_train), 'x', len(X_train[0]))

Matrix dimensions: 702 x 2000


In [9]:
# Train a Bayes classifier (MultinomialNB from sklearn.naive_bayes)
# Check with the test set the classifier

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)
clf = MultinomialNB()
clf.fit(X_train, y_train)
print('Accuracy:', clf.score(X_test, y_test))
print(classification_report(y_test, clf.predict(X_test))    )

Accuracy: 0.9858156028368794
              precision    recall  f1-score   support

        spam       0.98      0.98      0.98        54
       valid       0.99      0.99      0.99        87

    accuracy                           0.99       141
   macro avg       0.98      0.98      0.98       141
weighted avg       0.99      0.99      0.99       141
