### For PSET 2, problem 6, spam classification
a) Implement code for processing the spam messages into numpy arrays that can be fed into ML models. Complete get_words, create_dictionary, transform_text functions within our provided src/p06_spam.py.  The provided code will then run your functions and save the resulting dictionary into output/p06_dictionary and a sample of the resulting training matrix into output/p06_sample_train_matrix. 

In [47]:
import collections

import numpy as np

import util
import svm

def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message.
    """
    return message.lower().split(' ')


def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    This function should create a dictionary of word to indices using the provided
    training messages. Use get_words to process each message. 

    Rare words are often not useful for modeling. Please only add words to the dictionary
    if they occur in at least five messages.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """
    messages = [get_words(x) for x in messages]
    # now they key is to iterate through every word, and counting them, and then when the count eventually reaches 5, we add them 
    occurrences = {}
    word_dict = {} # this is the dict, that we add, once occurrences is 5 
    
    for message in messages:
        message_as_set = set(message)
        # for each word in the set, update the status 
        for word in message_as_set:
            if word not in occurrences:
                occurrences[word] = 1
            else:
                # only care about this if not in word_dict already 
                if word not in word_dict:
                    occurrences[word] += 1 
                    if occurrences[word] == 5:
                        word_dict[word] = len(word_dict)
    return word_dict


def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    This function should create a numpy array that contains the number of times each word
    appears in each message. Each row in the resulting array should correspond to each 
    message and each column should correspond to a word.

    Use the provided word dictionary to map words to column indices. Ignore words that 
    are not present in the dictionary. Use get_words to get the words for a message.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
    """
    # basically turn a message into 
    # first, return a list of indices, corresponding to words that exist, returning -1
    messages_indices = [get_words(message) for message in messages]
    # now turn them into a mask 
    # we can map each word to the index, and that index should "light up"
    # for each sentence, we should create the message index 
    # sentence to message index 
    # turn this into a count dictionary
    # counter dict maps index to occurrence
    # create a matrix for the embeddings
    embeddings = np.zeros((len(messages), len(word_dictionary)))
    for idx, message in enumerate(messages_indices):
        counter_dict = {}
        for word in message:
            if word in word_dictionary:
                if word_dictionary[word] in counter_dict:
                    counter_dict[word_dictionary[word]] += 1
                else:
                    counter_dict[word_dictionary[word]] = 1
        # with counter dict, we create an np.zero
        # take the index, and update the value to the key 
        print(counter_dict)
        for k, v in counter_dict.items():
            embeddings[idx][k] = v

    print(embeddings)

In [49]:
test_string = "Hello, my name is Emilia"
test_messages = [
    "Hello my name is Emilia hello",
    "Hello my name is Subaru",
    "Hello world",
    "Hello my name is Acero",
    "Hello I miss you",
    "MY dawg",
    "you are my friend"
]

word_dict = create_dictionary(test_messages)
# transform text
transform_text(test_messages, word_dict)


{0: 2, 1: 1}
{0: 1, 1: 1}
{0: 1}
{0: 1, 1: 1}
{0: 1}
{1: 1}
{1: 1}
[[2. 1.]
 [1. 1.]
 [1. 0.]
 [1. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


In [14]:
def main():
    train_messages, train_labels = util.load_spam_dataset('../data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')
    
    dictionary = create_dictionary(train_messages)

    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)

    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100,:])

main()

FileNotFoundError: [Errno 2] No such file or directory: './output/p06_dictionary'