# In [36]:
"""
    I do not claim ownership of this code. This code has been adapted from the
    'Machine Learning, Data Science and Generative AI with Python' course by
    Frank Kane on Udemy.com. This is simply a playground with annotations for 
    personal understanding.
"""

import os
import io
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, message)`` for every file found beneath *path*.

    Each file is read as latin1 text. Every line up to and including the
    first blank line is skipped (a blank line exists before the body of the
    email); the remaining lines form the message body. A file with no blank
    line yields an empty message.

    Args:
        path: directory to walk; sub-directories are visited recursively.

    Yields:
        Tuples of (path of the file, body text). Body lines keep their own
        trailing newline and are additionally joined with '\n', so
        consecutive body lines end up separated by a blank line.
    """
    # os.walk() returns a tuple, recursively walking through each directory
    #  until sub-directories are exhausted
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # use a separate name so the `path` argument is not overwritten
            file_path = os.path.join(root, filename)

            inBody = False
            lines = []
            # `with` guarantees the file is closed even if reading raises
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    # a blank line exists before the body of the email
                    elif line == '\n':
                        inBody = True
            message = '\n'.join(lines)
            # yield suspends the function saving state and returning value; this
            #  function becomes a generator function and is iterable
            yield file_path, message
                


def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file yielded by readFiles(path).

    Each row has a 'message' column holding the text yielded for that file
    and a 'class' column holding `classification`; the row index is the
    file path yielded alongside the message.
    """
    # drain the generator once so both the rows and the index see the same pairs
    pairs = list(readFiles(path))
    records = [{'message': body, 'class': classification} for _, body in pairs]
    labels = [file_path for file_path, _ in pairs]
    return DataFrame(records, index=labels)

# Start from an empty frame with the two expected columns, then append the
#  labelled messages of each folder in turn (spam first, then ham)
data = DataFrame({'message': [], 'class': []})
for folder, label in (("emails/spam", "spam"), ("emails/ham", "ham")):
    data = pd.concat([data, dataFrameFromDirectory(folder, label)])
data.head()

# CountVectorizer() converts a collection of text documents to a matrix of
#  token counts; fit_transform() learns the vocabulary and returns the
#  document-term matrix for the training messages
vectorizer = CountVectorizer()
messages = data['message'].values
counts = vectorizer.fit_transform(messages)

# MultinomialNB is a naive Bayes classifier for multinomial models; fit()
#  trains it on the token-count matrix (X) against the class labels (y)
targets = data['class'].values
classifier = MultinomialNB()
classifier.fit(counts, targets)

# New messages must go through the already-fitted vectorizer (transform(),
#  not fit_transform()) so that they are encoded in the same format the
#  model was trained on
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

# => array(['spam', 'ham'], dtype='<U4')




# => MultinomialNB()


# => array(['spam', 'ham'], dtype='<U4')