In [None]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
def make_dictionary(setup_dir, vocab_size=3000):
  """Build the vocabulary of the most frequent words in a directory of emails.

  Every regular file directly inside ``setup_dir`` is read line by line and
  split on whitespace. Tokens that are not purely alphabetic, or that are a
  single character long (e.g. "I", "a"), are discarded before ranking.

  Args:
    setup_dir: Directory holding the email files, one email per file.
    vocab_size: Maximum number of words to keep. Defaults to 3000, which is
      the number of columns extract_features allocates in its matrix.

  Returns:
    A list of ``(word, count)`` tuples ordered from most to least frequent,
    containing at most ``vocab_size`` entries. Words with equal counts keep
    the order in which they were first seen.
  """
  # os.listdir returns entries in arbitrary order, and most_common breaks ties
  # by first-seen order, so an unsorted listing could pick different words on
  # different machines. Sorting makes the vocabulary reproducible.
  # Sub-directories are skipped because open() cannot read them.
  emails = sorted(
      path
      for path in (os.path.join(setup_dir, name) for name in os.listdir(setup_dir))
      if os.path.isfile(path)
  )

  # Tally the words of all mails combined, one line at a time.
  word_counts = Counter()
  for mail in emails:
    with open(mail) as m:
      for line in m:
        word_counts.update(line.split())

  # Keep only alphabetic tokens longer than one character.
  vocabulary = Counter({
      word: occurrences
      for word, occurrences in word_counts.items()
      if word.isalpha() and len(word) > 1
  })

  # e.g. Counter('Hello').most_common(2) returns [('l', 2), ('H', 1)]:
  # 'H' wins over 'e' and 'o' because it was seen first.
  return vocabulary.most_common(vocab_size)

In [None]:
# Mount Google Drive at /content/drive so that the mail folders referenced
# later in the notebook (under /content/drive/MyDrive/...) can be read.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell will fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Turns a directory of mails into a word-count matrix plus spam/non-spam labels.
#It does not train anything itself; its output is what the classifier is fitted and evaluated on.
def extract_features(mail_dir, vocabulary=None, n_features=3000):
  """Build the word-count feature matrix and the label vector for a mail folder.

  Only the third line of each file (index 2) is read, because that is where
  the mail content is located in these files; the other lines are ignored.

  Args:
    mail_dir: Directory holding the mail files, one mail per file.
    vocabulary: Sequence of ``(word, count)`` tuples as returned by
      make_dictionary; the position of a word is its column in the matrix.
      Words are assumed to be unique. When omitted, the notebook-level
      ``dictionary`` variable is used, as the original implementation did.
    n_features: Number of columns in the matrix. Vocabulary entries beyond
      this position are ignored. Defaults to 3000.

  Returns:
    A tuple ``(features_matrix, training_labels)``:
      features_matrix: float array of shape (number of files, n_features);
        cell [doc, col] is how often vocabulary word ``col`` occurs in the
        content line of file ``doc``.
      training_labels: float array with one entry per file, 1 when the file
        name starts with "spmsg" (spam) and 0 otherwise.
    Rows follow the sorted order of the file paths.
  """
  if vocabulary is None:
    # Backward-compatible fallback to the module-level vocabulary.
    vocabulary = dictionary

  #Line index (0-based) that holds the mail content
  content_line = 2

  #Map each vocabulary word to its column once, instead of scanning the whole
  #vocabulary for every word of every mail
  word_to_column = {}
  for column, entry in enumerate(vocabulary):
    if column >= n_features:
      break
    #entry[0] is the word, entry[1] its overall number of occurrences
    word_to_column.setdefault(entry[0], column)

  #Sorted so row order is reproducible; sub-directories are skipped because open() cannot read them
  files = sorted(
      path
      for path in (os.path.join(mail_dir, name) for name in os.listdir(mail_dir))
      if os.path.isfile(path)
  )

  #One row per file, one column per vocabulary word
  features_matrix = np.zeros((len(files), n_features))
  #0 = non spam (default), 1 = spam
  training_labels = np.zeros(len(files))

  for doc_id, path in enumerate(files):
    with open(path) as mail:
      for line_number, line in enumerate(mail):
        if line_number == content_line:
          #Count every word of the content line once and store the counts of known words
          for word, occurrences in Counter(line.split()).items():
            column = word_to_column.get(word)
            if column is not None:
              features_matrix[doc_id, column] = occurrences
          #Nothing after the content line is used
          break
    #basename works with any path separator, unlike splitting on '/'
    if os.path.basename(path).startswith("spmsg"):
      training_labels[doc_id] = 1
  return features_matrix, training_labels
  


In [None]:
# Mail folders on the mounted Google Drive.
# The two paths were previously swapped (TRAINING_DIR pointed at 'test-mails'
# and TESTING_DIR at 'train-mails'), so the model was fitted on the test folder
# and scored on the training folder. Each name now points at its own folder.
# NOTE(review): any accuracy recorded before this change was presumably
# produced with the swapped folders — re-run the notebook to refresh it.
TRAINING_DIR = '/content/drive/MyDrive/MSBA_Colab_2020/ML_Algorithms/CA02/train-mails'
TESTING_DIR = '/content/drive/MyDrive/MSBA_Colab_2020/ML_Algorithms/CA02/test-mails'

In [None]:
# Build the vocabulary of most frequent words from the folder assigned to TRAINING_DIR.
# extract_features reads this module-level `dictionary`, so it must exist before the calls below.
dictionary = make_dictionary(TRAINING_DIR)

print ("reading and processing emails from TRAIN and TEST folders")
# Each call returns a (number of files x 3000) word-count matrix and a label
# vector in which 1 marks files whose name starts with "spmsg" and 0 all others.
features_matrix, labels = extract_features(TRAINING_DIR)
test_features_matrix, test_labels = extract_features(TESTING_DIR)




# Gaussian Naive Bayes treats every word-count column as a per-class Gaussian
# and combines those likelihoods with the class priors to decide whether a
# mail is spam (1) or not (0).
model = GaussianNB()

print ("Training Model using Gaussian Naibe Bayes algorithm .....")

#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
# The sentence on that page about being "called several times consecutively on
# different chunks of a dataset" describes partial_fit, which exists for
# incremental (out-of-core / online) training when the caller feeds the data
# in pieces. It is not used here: fit() below is called once on the whole
# training matrix.
#
# Fluctuation:
# Per the scikit-learn documentation neither fit nor partial_fit draws random
# samples, so the classifier itself should give the same result for the same
# input.
# NOTE(review): if the accuracy differs between runs, the cause is more likely
# outside the classifier (e.g. the order in which files are listed, or changes
# to the mail folders) — confirm.
model.fit(features_matrix, labels)
print ("Training completed")
print ("testing trained model to predict Test Data labels")
# Predict a 0/1 label for every row of the test matrix.
predicted_labels = model.predict(test_features_matrix)
print ("Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:")
# accuracy_score is the fraction of predicted labels that equal the true labels.
print (accuracy_score(test_labels, predicted_labels))

reading and processing emails from TRAIN and TEST folders
Training Model using Gaussian Naibe Bayes algorithm .....
Training completed
testing trained model to predict Test Data labels
Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:
0.9501424501424501
