# Notebook Imports

In [1]:
import pandas as pd
import numpy as np

# Constants

In [2]:
TRAINING_FILE = 'SpamData/Training/training-data.txt'
TESTING_FILE = 'SpamData/Training/testing-data.txt'
VOCABULARY_LIMIT = 2500

TOKEN_PROBABILTY_SPAM = 'SpamData/Testing/probability_spam.txt'
TOKEN_PROBABILTY_NONSPAM = 'SpamData/Testing/probability_nonspam.txt'
TOKEN_PROBABILTY_ALL = 'SpamData/Testing/probability_all.txt'

TEST_FEATURE_MTX = 'SpamData/Testing/test_features.txt'
TEST_TARGET = 'SpamData/Testing/test_target.txt'

# Load & Read files

In [3]:
# Use numpy no load the training and testing data into our notebook

sparse_train_data = np.loadtxt(TRAINING_FILE, delimiter=' ', dtype=int)
sparse_test_data = np.loadtxt(TESTING_FILE, delimiter=' ', dtype=int)

In [4]:
# Example of what we got.
sparse_train_data[:5]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  2],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

# Create full matrix from sparse matrix

In [5]:
# Function to create a full matrix from a sparse one.

def create_full_mtx(sparse_mtx, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0,VOCABULARY_LIMIT))
    doc_id_names = np.unique(sparse_mtx[:, 0])
    
    train_dataset = pd.DataFrame(index=doc_id_names, columns=column_names)
    train_dataset.fillna(value=0, inplace=True)
    
    for i in range(sparse_mtx.shape[0]):
        doc_nr = sparse_mtx[i][doc_idx]
        word_id = sparse_mtx[i][word_idx]
        label = sparse_mtx[i][cat_idx]
        occcurrence = sparse_mtx[i][freq_idx]
        
        train_dataset.at[doc_nr, 'DOC_ID'] = doc_nr
        train_dataset.at[doc_nr, 'CATEGORY'] = label
        train_dataset.at[doc_nr, word_id] = occcurrence
    
    train_dataset.set_index('DOC_ID', inplace=True)
    return train_dataset

In [6]:
%%time

# Call the function to create a full matrix
full_training_pd = create_full_mtx(sparse_train_data, VOCABULARY_LIMIT)

Wall time: 8.03 s


In [21]:
# This is how it looks our final full matrix
full_training_pd.tail()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5789,0,3,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5790,0,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,0,3,1,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5794,0,1,1,1,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5795,0,3,4,2,0,5,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training the Naive Bayes algorithm

### Get all the necessary variables to use on the Bayes theorem.

In [23]:
# Find the probability of spam in our dataset.
spam_prob = full_training_pd.CATEGORY.sum() / full_training_pd.CATEGORY.size
print('Probability of spam is', spam_prob)

Probability of spam is 0.31133250311332505


In [26]:
# Total number of words or tokens.
token_num = full_training_pd.loc[:, full_training_pd.columns != 'CATEGORY']
token_num.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,2,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Total word count
email_length = token_num.sum(axis=1)
email_length.shape

(4015,)

In [31]:
word_count = email_length.sum()
word_count

431002

In [10]:
# Find the total number of token in spam and not spam
spam_lengths = email_length[full_training_pd.CATEGORY == 1]
spam_wc = spam_lengths.sum()

non_spam_lengths = email_length[full_training_pd.CATEGORY == 0]
non_spam_wc = non_spam_lengths.sum()

### Use the variables to sum the tokens

In [42]:
# Create a subset of all the rows that belong to spam messages
train_token_spam = token_num.loc[full_training_pd.CATEGORY == 1]
train_token_ham = token_num.loc[full_training_pd.CATEGORY == 0]

spam_summary_token = train_token_spam.sum(axis=0) + 1
ham_summary_token = train_token_ham.sum(axis=0) + 1
ham_summary_token.shape

(2500,)

In [34]:
# Testing if we have the correct data, result should be 0.
email_length.shape[0] - spam_lengths.shape[0] - non_spam_lengths.shape[0]

0

### Calculate probability
#### P(Word | Spam) - Probability that a word occurs given that the email is Spam

In [43]:
spam_probability = spam_summary_token / (spam_wc + VOCABULARY_LIMIT)

# Example of the probabilities we get
spam_probability[:5]

0    0.012019
1    0.005168
2    0.006735
3    0.011185
4    0.006686
dtype: float64

In [44]:
# As a test, we should get a value of 1 if we sum all the probabilities
spam_probability.sum()

1.0

#### P(Word | Nonspam) - Probability that a word occurs given that the email is Nonspam

In [45]:
non_spam_probability = ham_summary_token / (non_spam_wc + VOCABULARY_LIMIT)

# Example of the probabilities we get
non_spam_probability[:5]

0    0.021513
1    0.010154
2    0.008020
3    0.003680
4    0.006321
dtype: float64

In [46]:
# As a test, we should get a value of 1 if we sum all the probabilities
non_spam_probability.sum()

1.0

#### P(word) - Probability that a word is there regardless of it being spam or not.

In [47]:
#summed_all_tokens = token_num.sum(axis=0) + 1
#prob_word_all = summed_all_tokens / (word_count + VOCABULARY_LIMIT)
#prob_word_all.sum()
prob_tokens_all = token_num.sum(axis=0) / word_count

In [48]:
prob_tokens_all.sum()

1.0

## Save the Trained model

In [49]:
np.savetxt(TOKEN_PROBABILTY_SPAM, spam_probability)
np.savetxt(TOKEN_PROBABILTY_NONSPAM, non_spam_probability)
np.savetxt(TOKEN_PROBABILTY_ALL, prob_tokens_all)

## Prepare Test data

In [50]:
%%time

# Make a full matrix for test data
test_data = create_full_mtx(sparse_test_data, nr_words=VOCABULARY_LIMIT)

Wall time: 3.95 s


In [52]:
x_test = test_data.loc[:, test_data.columns != 'CATEGORY']
y_test = test_data.CATEGORY

In [53]:
np.savetxt(TEST_FEATURE_MTX, x_test)
np.savetxt(TEST_TARGET, y_test)