In [106]:
import numpy as np
import pandas as pd
import gzip
import re

#this is a naive Bayesian program for filtering spam emails based on the words found in the email's subject line.
#since the application of machine learning techniques to spam filtering has a long history, and given the unique
#circumstances of this semester, I opted to modify a publically available model to my own specifications 
#and train it on a dataset I created. relevent credits at bottom.

#data used here is a set of 340 emails from an old gmail account I used to use, trimmed down to just the
#subject line and a label -- 0 for regular email, 1 for spam

#retrieve data and randomize. should see some account messages, solicited promotions, and unsolicited spam
#we are attempting to distinguish the third category from the first two.
#dataset is ~72% regular email, 28% spam
all_data = pd.read_csv(r'C:\Users\danieltc\Downloads\comp 562\final\data.csv', encoding= 'unicode_escape', 
                      names=['subject', 'label'])
randomized_data= all_data.sample(frac=1, random_state=1)
print(randomized_data)
print(randomized_data['label'].value_counts(normalize=True))

#will be seperating data into 4 sets, fits well for data size of 340. Ensure that training and test sets
#are close to the same distribution of spam/not spam
training_test_index = 255
training_set = randomized_data[:training_test_index].reset_index(drop=True)
test_set = randomized_data[training_test_index:].reset_index(drop=True)
print(training_set.shape)
print(training_set['label'].value_counts(normalize=True))
print(test_set.shape)
print(test_set['label'].value_counts(normalize=True))

                                               subject  label
102  =?UTF-8?Q?=E2=9C=85_Name=2C_finish_setting_up_...      0
125  =?UTF-8?Q?=F0=9F=8E=89_We're_celebrating_278,2...      1
11   24Problem SolvingGadgets You Need [19435:ID OX...      1
248           The FREEdom is REAL, Secondary Username!      0
238                         New sign-in from Chrome OS      0
..                                                 ...    ...
203     New Champion: Inara, the Stone Warden is Here!      0
255  You Have (1) New T-Mobile Survey Reward Ready ...      1
72                       We're shelling out 125 points      0
235   'From: To: Reply-To: Message-Id: List-Unsubsc...      1
37       Last Chance: Save 25% off the Founder's Pack!      0

[340 rows x 2 columns]
0    0.720588
1    0.279412
Name: label, dtype: float64
(255, 2)
0    0.709804
1    0.290196
Name: label, dtype: float64
(85, 2)
0    0.752941
1    0.247059
Name: label, dtype: float64


In [107]:
#this filter will work by creating a list of words found in the dataset, then assigning frequencies to each word
#based on how likely they are to appear in emails marked as spam in the training set.
#punctuation not judged, so remove punc and lowercase everything.
training_set['subject'] = training_set['subject'].str.replace(
   '\W', ' '
training_set['subject'] = training_set['subject'].str.lower()
training_set.head(3)
print(training_set)

#create dictionary of all words used at least once in the dataset
training_set['subject'] = training_set['subject'].str.split()
dictionary = []
for subject_line in training_set['subject']:
   for word in subject_line:
      dictionary.append(word)

dictionary = list(set(dictionary))
print(dictionary)

SyntaxError: invalid syntax (<ipython-input-107-49872803c039>, line 6)

In [108]:
#add words from dataset to dictionary
word_counts_per_subject = {unique_word: [0] * len(training_set['subject']) for unique_word in dictionary}

for index, subject in enumerate(training_set['subject']):
   for word in subject:
      word_counts_per_subject[word][index] += 1

word_counts = pd.DataFrame(word_counts_per_subject)
word_counts.head()
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()
#a problem is already emerging -- the words used in some subject lines, particularly the spam emails,
#often aren't words at all -- they're long strings of characters unlikely to be found anywhere else
#in the dataset. i have taken note of this and will mention it in the report.

KeyError: '='

In [113]:
#separate regular (reg) and spam messages in the training set
spam_messages = training_set_clean[training_set_clean['label'] == 1]
reg_messages = training_set_clean[training_set_clean['label'] == 0]

#proportions of regular and spam messages
p_spam = len(spam_messages) / len(training_set_clean)
p_reg = len(reg_messages) / len(training_set_clean)

n_words_per_spam_message = spam_messages['subject'].apply(len)
n_spam = n_words_per_spam_message.sum()
n_words_per_reg_message = reg_messages['subject'].apply(len)
n_reg = n_words_per_reg_message.sum()
n_dictionary = len(dictionary)

#laplace smoothing -- prevents the unlikely occurrence that any word is determined to have 0 probability
#of being found in a spam or regular subject line.
alpha = 1

#set up parameters for regular and spam categories.
parameters_spam = {unique_word:0 for unique_word in dictionary}
parameters_reg = {unique_word:0 for unique_word in dictionary}

#calc params -- for a word w, pr and ps represent the relative likelihoods that w is found in a regular or 
#spam email, respectively. parameters determined for all words in the dictionary.
for word in dictionary:
   n_word_given_spam = spam_messages[word].sum()
   if type(n_word_given_spam) is np.int64:
       p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_dictionary)
       parameters_spam[word] = p_word_given_spam

       n_word_given_reg = reg_messages[word].sum()
       p_word_given_reg = (n_word_given_reg + alpha) / (n_reg + alpha*n_dictionary)
       parameters_reg[word] = p_word_given_reg
    
    
#quick check -- the parameter for a word likely to be found in spam emails, like "lawsuit" and "speechless",
#should be higher in the parameters_spam list than the parameters_reg list, while it should be the 
#other way around for words more likely to be in regular emails
print(parameters_spam)
print('''






''')
print(parameters_reg)

{'what': 0.0011682242990654205, 'select': 0.0011682242990654205, 'obamacare': 0.002336448598130841, 'march': 0.0011682242990654205, 'trying': 0.0035046728971962616, 'xp': 0.0011682242990654205, '94here_are_50_points': 0.0011682242990654205, 'yma': 0.005841121495327103, '4': 0.0011682242990654205, 'congratulations': 0.0011682242990654205, 'a9mon_from_alola': 0.0011682242990654205, 'haxfrq': 0.002336448598130841, '6': 0.002336448598130841, 'account': 0.004672897196261682, '8d': 0.0011682242990654205, 'february': 0.0011682242990654205, 'sick': 0.002336448598130841, 'worthy': 0.0011682242990654205, 'protect': 0.002336448598130841, 'party': 0.0011682242990654205, 'id_wmrl': 0.002336448598130841, '4pyliehpz2gtzml2zsegtmv3ihbvaw50cybhcmugd2fpdgluzy4': 0.002336448598130841, 'company': 0.0011682242990654205, 'purchase': 0.0011682242990654205, 'successful': 0.0011682242990654205, 'warfare': 0.0011682242990654205, '93': 0.0011682242990654205, 'happy': 0.0011682242990654205, 'q': 0.007009345794392

In [114]:
#classification function -- for each word in the subject line of a scrutinized email, this function adds that word's
#reg parameter and spam parameter to running reg and spam totals, respectively.
#at the end, a determination is made, based on which total is higher, whether to mark as reg or spam.
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_reg_given_message = p_reg

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_reg:
         p_reg_given_message *= parameters_reg[word]

#!!! subjective judgement. here I put my thumb on the scale to decrease the chance of a regular email being 
#incorrectly labeled as spam, with the tradeoff being that more spam emails will pass under the radar.
#I will elaborate why in the report.
   if p_reg_given_message >= 0.9*p_spam_given_message:
      return 0
   else:
      return 1

In [115]:
#accuracy check -- ideally I want an accuracy as high as possible, but anything over 0.5 means that this
#model is an improvement over random guessing. 
test_set['predicted'] = test_set['subject'].apply(classify_test_set)
test_set.head()

correct = 0
false_pos = 0
false_neg = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['label'] == row['predicted']:
      correct += 1
   elif row['predicted'] == 1:
      false_pos += 1
   else:
      false_neg += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)
print('False Positives:', false_pos)
print('False Negatives:', false_neg)

#project done individually.
#this model is adapted from the naive Bayes approach proposed by Alex Olteanu at 
#https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html

Correct: 72
Incorrect: 13
Accuracy: 0.8470588235294118
False Positives: 4
False Negatives: 9
