a) Implement code for processing the spam messages into numpy arrays that can be fed into ML models. Complete get_words, create_dictionary, transform_text functions within our provided src/p06_spam.py. The provided code will then run your functions and save the resulting dictionary into output/p06_dictionary and a sample of the resulting training matrix into output/p06_sample_train_matrix.

In [4]:
import collections

import numpy as np

import util
import svm

def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message.
    """
    return message.lower().split(' ')


def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    This function should create a dictionary of word to indices using the provided
    training messages. Use get_words to process each message. 

    Rare words are often not useful for modeling. Please only add words to the dictionary
    if they occur in at least five messages.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """
    messages = [get_words(x) for x in messages]
    # now they key is to iterate through every word, and counting them, and then when the count eventually reaches 5, we add them 
    occurrences = {}
    word_dict = {} # this is the dict, that we add, once occurrences is 5 
    
    for message in messages:
        message_as_set = set(message)
        # for each word in the set, update the status 
        for word in message_as_set:
            if word not in occurrences:
                occurrences[word] = 1
            else:
                # only care about this if not in word_dict already 
                if word not in word_dict:
                    occurrences[word] += 1 
                    if occurrences[word] == 5:
                        word_dict[word] = len(word_dict)
    return word_dict


def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    This function should create a numpy array that contains the number of times each word
    appears in each message. Each row in the resulting array should correspond to each 
    message and each column should correspond to a word.

    Use the provided word dictionary to map words to column indices. Ignore words that 
    are not present in the dictionary. Use get_words to get the words for a message.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
    """
    # basically turn a message into 
    # first, return a list of indices, corresponding to words that exist, returning -1
    messages_indices = [get_words(message) for message in messages]
    # now turn them into a mask 
    # we can map each word to the index, and that index should "light up"
    # for each sentence, we should create the message index 
    # sentence to message index 
    # turn this into a count dictionary
    # counter dict maps index to occurrence
    # create a matrix for the embeddings
    embeddings = np.zeros((len(messages), len(word_dictionary)))
    for idx, message in enumerate(messages_indices):
        counter_dict = {}
        for word in message:
            if word in word_dictionary:
                if word_dictionary[word] in counter_dict:
                    counter_dict[word_dictionary[word]] += 1
                else:
                    counter_dict[word_dictionary[word]] = 1
        # with counter dict, we create an np.zero
        # take the index, and update the value to the key 
        for k, v in counter_dict.items():
            embeddings[idx][k] = v
    return embeddings



### Main naive bayes code 

In [2]:
def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model.

    This function should fit a Naive Bayes model given a training matrix and labels.

    The function should return the state of that model.

    Feel free to use whatever datatype you wish for the state of the model.

    Args:
        matrix: A numpy array containing word counts for the training data
        labels: The binary (0 or 1) labels for that training data

    Returns: The trained model
    """
    labels_counter = collections.Counter(labels)
    prior_y1 =  labels_counter[1] / len(labels)# calculate the priors, percentage of values where y=1 

    # now calculate the prior values 
    word_probs_y0 = np.zeros((len(matrix[0])))
    word_probs_y1 = np.zeros((len(matrix[0])))

    # denominator is the count of y = target label (0 or 1)
    # numerator is for all words
    # get the word probability 
    # count word_probs_y0
    for xj in range(len(matrix[0])):
        # xj is each word
        # for each word, we need to count all occurrences
        # xj is an index, so we increment the counter is xj is 1, and the label is 0
        # for all messages
        counter = 0
        for idx, message in enumerate(matrix):
            # each message is a row 
            label = labels[idx]
            if label == 0 and message[xj] > 0:
                counter += 1
        # denominator is just lable_counter_y0
        word_probs_y0[xj] = (counter + 1) / (labels_counter[0] + len(matrix[0]))
    # counter for word_probs_y1
    for xj in range(len(matrix[0])):
        counter = 0 
        for idx, message in enumerate(matrix):
            label = labels[idx]
            if label == 1 and message[xj] > 0:
                counter += 1
        word_probs_y1[xj] = (counter + 1) / (labels_counter[1] + len(matrix[0]))
    return word_probs_y1, word_probs_y0, prior_y1


def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    This function should be able to predict on the models that fit_naive_bayes_model
    outputs.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: A numpy array containing word counts

    Returns: A numpy array containg the predictions from the model
    """
    phi_k_y1, phi_k_y0, phi_y = model
    
    sum_log_p_x_y1 = (np.log(phi_k_y1) * matrix).sum(axis=1) + np.log(phi_y)
    sum_log_p_x_y0 = (np.log(phi_k_y0) * matrix).sum(axis=1) + np.log(1 - phi_y)

    return (sum_log_p_x_y1 > sum_log_p_x_y0).astype(int)

In [3]:
def train():
    train_messages, train_labels = util.load_spam_dataset('../data/ds6_train.tsv')
    dictionary = create_dictionary(train_messages)
    util.write_json('../output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)

    np.savetxt('../output/p06_sample_train_matrix', train_matrix[:100,:])
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    return naive_bayes_model, dictionary 

model, dictionary = train() 

Compute your prediction accuracy and then save your resulting predictions to output/p06_naive_bayes_predictions

In [4]:
def main():
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_predictions = predict_from_naive_bayes_model(model, test_matrix)
    np.savetxt('../output/p06_predictions', naive_bayes_predictions)
    accuracy = (naive_bayes_predictions == test_labels).mean()
    print(f"Accuracy for test set {accuracy * 100}")
    
main()

Accuracy for test set 98.38709677419355


### C 
Complete the get_top_five_naive_bayes_words function within the provided code using the above formula in order to obtain the 5 most indicative tokens

In [5]:
### C 
## for each word, we can just take the prior values and multiply by the number we divided by
## so for the case of y=1, multiply by the number of y=1 in the training class 
## and for the case of y=0, multiple by the number of y=0 in the training class 
## 
def get_top_five_naive_bayes_words(model, dictionary):
    """Compute the top five words that are most indicative of the spam (i.e positive) class.

    Ues the metric given in 6c as a measure of how indicative a word is.
    Return the words in sorted form, with the most indicative word first.

    Args:
        model: The Naive Bayes model returned from fit_naive_bayes_model
        dictionary: A mapping of word to integer ids

    Returns: The top five most indicative words in sorted order with the most indicative first
    """
    # *** START CODE HERE ***
    word_probs_y1, word_probs_y0, _ = model
    ## we need to get the number of y=1 and y=0 ..
    ## it probably doesn't matter since we divide by the same number...
    word_indicativeness_tuple_lst = []
    # for each word get y=1 and y=0 
    for word, index in dictionary.items():
        indicativeness = (word_probs_y1[index] / word_probs_y0[index]).item()
        word_indicativeness_tuple_lst.append((word, indicativeness))
    ## now sort by the second value 
    word_indicativeness_tuple_lst_sorted = sorted(word_indicativeness_tuple_lst, key = lambda x:x[1], reverse=True)
    # get the top 5
    most_indicative = []
    for i in range(5):
        most_indicative.append(word_indicativeness_tuple_lst_sorted[i][0])
    util.write_json('../output/p06_top_indicative_words', most_indicative)
    return most_indicative
    # *** END CODE HERE ***
print("Top 5 most indicative words:")
get_top_five_naive_bayes_words(model=model, dictionary=dictionary)

Top 5 most indicative words:


['claim', 'won', 'prize', 'urgent!', 'awarded']

### D: compute best svm radius

In [2]:
### 
# Important note: you do not have to modify this file for your homework.

import numpy as np
np.random.seed(123)


def train_and_predict_svm(train_matrix, train_labels, test_matrix, radius):
    """Train an SVM model and predict the resulting labels on a test set.

    Args: 
        train_matrix: A numpy array containing the word counts for the train set
        train_labels: A numpy array containing the spam or not spam labels for the train set
        test_matrix: A numpy array containing the word counts for the test set
        radius: The RBF kernel radius to use for the SVM

    Return: 
        The predicted labels for each message
    """
    model = svm_train(train_matrix, train_labels, radius)
    return svm_predict(model, test_matrix, radius)


def svm_train(matrix, category, radius):
    state = {}
    M, N = matrix.shape
    Y = 2 * category - 1
    matrix = 1. * (matrix > 0)
    squared = np.sum(matrix * matrix, axis=1)
    gram = matrix.dot(matrix.T)
    K = np.exp(-(squared.reshape((1, -1)) + squared.reshape((-1, 1)) - 2 * gram) / (2 * (radius ** 2)))

    alpha = np.zeros(M)
    alpha_avg = np.zeros(M)
    L = 1. / (64 * M)
    outer_loops = 10

    alpha_avg = 0
    ii = 0
    while ii < outer_loops * M:
        i = int(np.random.rand() * M)
        margin = Y[i] * np.dot(K[i, :], alpha)
        grad = M * L * K[:, i] * alpha[i]
        if margin < 1:
            grad -= Y[i] * K[:, i]
        alpha -= grad / np.sqrt(ii + 1)
        alpha_avg += alpha
        ii += 1

    alpha_avg /= (ii + 1) * M

    state['alpha'] = alpha
    state['alpha_avg'] = alpha_avg
    state['Xtrain'] = matrix
    state['Sqtrain'] = squared
    return state


def svm_predict(state, matrix, radius):
    M, N = matrix.shape

    Xtrain = state['Xtrain']
    Sqtrain = state['Sqtrain']
    matrix = 1. * (matrix > 0)
    squared = np.sum(matrix * matrix, axis=1)
    gram = matrix.dot(Xtrain.T)
    K = np.exp(-(squared.reshape((-1, 1)) + Sqtrain.reshape((1, -1)) - 2 * gram) / (2 * (radius ** 2)))
    alpha_avg = state['alpha_avg']
    preds = K.dot(alpha_avg)
    output = (1 + np.sign(preds)) // 2

    return output


In [16]:
## actual code to compute the best svm radius 
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider
    
    Returns:
        The best radius which maximizes SVM accuracy.
    """
    best_radius = -1 
    accuracy = -1
    for radius in radius_to_consider:
        predicts = train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        # calculate accuracy
        model_accuracy = (predicts == val_labels).sum() / len(predicts)
        if model_accuracy > accuracy:
            accuracy = model_accuracy 
            best_radius = radius 
    return best_radius


def run_compute_best_svm_radius():
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')
    train_messages, train_labels = util.load_spam_dataset('../data/ds6_train.tsv')
    dictionary = create_dictionary(train_messages)
    train_matrix = transform_text(train_messages, dictionary)
    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)
    # get thet train and val matrix 
    # straight from the main() function defined in the pset 
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('../output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))
    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy, optimal_radius))


run_compute_best_svm_radius()

The optimal SVM radius was 0.1
The SVM model had an accuracy of 0.9713261648745519 on the testing set
