In [1]:
import numpy as np
import pandas as pd

### Data Loading

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')
# Show first 5 in dataset
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Drop some unwanted columns that do not provide insight
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
# Show last 5 in dataset
data.tail(5)

Unnamed: 0,label,text
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [4]:
data["label_tag"] = data.label.map({'ham':0, 'spam':1})
data.head(5)

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# get the size of our dataset
print(data.count())
data.label.value_counts()

label        5572
text         5572
label_tag    5572
dtype: int64


ham     4825
spam     747
Name: label, dtype: int64

### Data Preparation

Training data

In [6]:
# first 4572/5572 emails
training_data = data[0:4572]
training_data_length = len(training_data.label)
training_data.head()

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Testing data

In [7]:
# last 1000/5572
test_data = data[-1000:]
test_data_length = len(test_data.label)
test_data.head()

Unnamed: 0,label,text,label_tag
4572,spam,\URGENT! This is the 2nd attempt to contact U!...,1
4573,ham,:( but your not here....,0
4574,ham,Not directly behind... Abt 4 rows behind Ì_...,0
4575,spam,Congratulations ur awarded 500 of CD vouchers ...,1
4576,spam,Had your contract mobile 11 Mnths? Latest Moto...,1


#### What is the shape of our input data

Training data

In [8]:
print(training_data.shape)
print(training_data.label.shape)

(4572, 3)
(4572,)


There are 3 features and 4572 samples in our trtaining set

Test data

In [9]:
print(test_data.shape)
print(test_data.label.shape)

(1000, 3)
(1000,)


### Develop a Predictive Theory

In [10]:
import random

In [11]:
def pretty_print_text_and_label(i):
    print(training_data.label[i] + "\t:\t" + training_data.text[i][:80] + "...")

In [20]:
print("labels \t : \t texts\n")
# choose  a random spam set to analyse
# random.randrange(start, stop, step)
pretty_print_text_and_label(random.randrange(0,4572))
pretty_print_text_and_label(random.randrange(0,4572,4))
pretty_print_text_and_label(random.randrange(0,4572,50))
pretty_print_text_and_label(random.randrange(0,4572,100))
pretty_print_text_and_label(random.randrange(0,4572,200))
pretty_print_text_and_label(random.randrange(0,4572,500))
pretty_print_text_and_label(random.randrange(0,4572,800))
pretty_print_text_and_label(random.randrange(0,4572,1000))

labels 	 : 	 texts

ham	:	K.k:)apo k.good movie....
ham	:	I'm ok wif it cos i like 2 try new things. But i scared u dun like mah. Cos u sa...
ham	:	Probably money worries. Things are coming due and i have several outstanding inv...
ham	:	Wylie update: my weed dealer carlos went to freedom and had a class with lunsfor...
ham	:	No..but heard abt tat.....
ham	:	says the  &lt;#&gt;  year old with a man and money. I'm down to my last  &lt;#&g...
ham	:	He's just gonna worry for nothing. And he won't give you money its no use....
ham	:	He's just gonna worry for nothing. And he won't give you money its no use....


It is very easy to distinguish a spam text from a non-spam text (in this case ham) . Spam text occasionaly contain words like **free**, **sell**, **promotion**, **deal**, **offer**, **discount**, **lucky** e.t.c. This way we can let our network learn some of the words assocaiated with spams and based on such criteria we can classify a text as a spam or not.

#### Theory Validation

In [21]:
from collections import Counter
import numpy as np
import pprint 

In [22]:
spam_counts = Counter()
ham_counts = Counter()
total_counts = Counter()
spam_ham_ratios = Counter()

pp = pprint.PrettyPrinter(indent=4)

In [23]:
for i in range(training_data_length):
    if(training_data.label[i] == 0):
        for word in training_data.text[i].split(" "):
            ham_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in training_data.text[i].split(" "):
            spam_counts[word] += 1
            total_counts[word] += 1

In [24]:
pp.pprint(spam_counts.most_common()[0:30])

[   ('to', 1758),
    ('you', 1368),
    ('I', 1204),
    ('a', 1094),
    ('the', 989),
    ('and', 736),
    ('in', 652),
    ('is', 648),
    ('i', 612),
    ('u', 567),
    ('for', 529),
    ('my', 522),
    ('', 521),
    ('of', 498),
    ('me', 465),
    ('your', 447),
    ('on', 410),
    ('have', 402),
    ('2', 371),
    ('that', 358),
    ('are', 327),
    ('it', 313),
    ('or', 304),
    ('call', 303),
    ('at', 300),
    ('be', 299),
    ('not', 292),
    ('with', 281),
    ('get', 270),
    ('will', 266)]


In [55]:
for word,count in list(total_counts.most_common()):
    if(count > 100):
        spam_ham_ratio = spam_counts[word] / float(ham_counts[word]+1)
        spam_ham_ratios[word] = spam_ham_ratio

pp.pprint(spam_ham_ratios)

Counter({   'to': 1758.0,
            'you': 1368.0,
            'I': 1204.0,
            'a': 1094.0,
            'the': 989.0,
            'and': 736.0,
            'in': 652.0,
            'is': 648.0,
            'i': 612.0,
            'u': 567.0,
            'for': 529.0,
            'my': 522.0,
            '': 521.0,
            'of': 498.0,
            'me': 465.0,
            'your': 447.0,
            'on': 410.0,
            'have': 402.0,
            '2': 371.0,
            'that': 358.0,
            'are': 327.0,
            'it': 313.0,
            'or': 304.0,
            'call': 303.0,
            'at': 300.0,
            'be': 299.0,
            'not': 292.0,
            'with': 281.0,
            'get': 270.0,
            'will': 266.0,
            'U': 253.0,
            'so': 239.0,
            'but': 236.0,
            'can': 235.0,
            '&lt;#&gt;': 235.0,
            'ur': 233.0,
            "I'm": 228.0,
            'You': 223.0,
            'when': 204.

### Transform Text into Numbers

Neural Networks only understand numbers hence we have to find a way to represent our text inputs in a way it can understand

In [28]:
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

13874


We can see that from all our dataset, we have a total of **13874** unique words. Use this to build up our vocabulary vector containing columns of all these words.

Because, **13874**, can be a large size in memory (a matrix of size **13874 by 4572**), lets allocate its memory once with default zeros and will only change its contents accordingly later.

In [29]:
vocab_vector = np.zeros((1, vocab_size))
pp.pprint(vocab_vector.shape)
pp.pprint(vocab_vector)

(1, 13874)
array([[0., 0., 0., ..., 0., 0., 0.]])


Now, let's create a dictionary that allows us to look at every word in our vocabulary and map it to the `vocab_vector` column.

In [31]:
#  Maps a word to its column in the vocab_vector
word_column_dict = {}

for i, word in enumerate(vocab):
    # {key: value} is {word: column}
    word_column_dict[word] = i
    
pp.pprint(word_column_dict)

{   '': 0,
    '!': 11564,
    '!!': 11546,
    '!!!': 9762,
    '!!!!': 12034,
    "!!''.": 8649,
    '!1': 10114,
    '#': 10692,
    '#150': 1340,
    '#5000': 11499,
    '$': 8049,
    '$1': 7903,
    '$2': 2388,
    '$350': 3954,
    '$5.00': 3557,
    '$50': 7240,
    '$50...': 10123,
    '$700': 8683,
    '$900': 5139,
    '$95/pax,': 4221,
    '%': 100,
    '%.': 8176,
    '%of': 13449,
    '&': 6536,
    '&SAM': 11036,
    '&XXX': 9810,
    '&amp;': 11065,
    '&gt;:(': 11325,
    '&it': 12368,
    '&lt;#&gt;': 7042,
    '&lt;)': 12142,
    '&lt;3': 9555,
    '&lt;DECIMAL&gt;': 8014,
    '&lt;EMAIL&gt;': 9052,
    '&lt;TIME&gt;': 1982,
    '&lt;URL&gt;': 11930,
    '&othrs': 6234,
    "'": 10694,
    "''": 7206,
    "''OK'',": 5447,
    "'An": 8518,
    "'Comfort'": 9925,
    "'IF": 12518,
    "'Luxury'": 11391,
    "'MARRIED'": 8201,
    "'Maangalyam": 9297,
    "'Melle": 2458,
    "'Need'": 10352,
    "'SIMPLE'": 10335,
    "'Uptown": 8406,
    "'Wnevr": 12230,
    "'anythin

We are going to use the count of words as the input to our neural network. The `vocab_vector` will have columns for all the words in our training data in the form of `{key: value}` i.e `{word: count}` as held by the `word_column_dict`  python `Dictionary`. The individual word counts in any particular text is updated from 0 to a number based on a word's total count in any single text.

This means that the words with a higher count might have a higher weight in determining whether a text is a spam or not.

In [40]:
def update_input_layer(text):
    pp.pprint(text)
    global vocab_vector
    
    # clear out previous state, reset the vector to be all 0s
    vocab_vector *= 0
    for word in text.split(" "):
        vocab_vector[0][word_column_dict[word]] += 1

update_input_layer(training_data["text"][random.randrange(0,4572,4)])
pp.pprint(vocab_vector)

'Will it help if we propose going back again tomorrow'
array([[0., 0., 0., ..., 0., 0., 0.]])


### Build the SpamClassificationNeuralNetwork

In [41]:
import time
import sys

In [106]:
# Let's tweak our network from before to model these phenomena
class SpamClassificationNeuralNetwork(object):
    def __init__(self, training_data, num_hidden_nodes = 10, num_epochs = 10, learning_rate = 0.01):
        # set our random number generator 
        np.random.seed(1)
        # pre-process data
        self.pre_process_data(training_data)
        
        self.num_features = len(self.vocab)
        self.vocab_vector = np.zeros((1,len(self.vocab)))
        self.num_input_nodes = self.num_features
        self.num_hidden_nodes = num_hidden_nodes
        self.num_epochs = num_epochs
        self.num_output_nodes = 1
        self.learning_rate = learning_rate

        # Initialize weights
        self.weights_i_h = np.random.randn(self.num_input_nodes, self.num_hidden_nodes)
        self.weights_h_o = np.random.randn(self.num_hidden_nodes, self.num_output_nodes)
        
    def forward_backward_propagate(self, text, label):
        ### Forward pass ###
        # Input Layer
        self.update_input_layer(text)
        # Hidden layer
        hidden_layer = self.vocab_vector.dot(self.weights_i_h)
        # Output layer
        output_layer = self.sigmoid(hidden_layer.dot(self.weights_h_o))
        
        ### Backward pass ###
        # Output error
        output_layer_error = output_layer - label 
        output_layer_delta = output_layer_error * self.sigmoid_derivative(output_layer)

        # Backpropagated error - to the hidden layer
        hidden_layer_error = output_layer_delta.dot(self.weights_h_o.T) #
        # hidden layer gradients - no nonlinearity so it's the same as the error
        hidden_layer_delta = output_layer_error

        # update the weights - with grdient descent
        self.weights_h_o -= hidden_layer.T.dot(output_layer_delta) * self.learning_rate
        self.weights_i_h -= self.vocab_vector.T.dot(hidden_layer_delta) * self.learning_rate
        
        if(np.abs(output_layer_error) < 0.5):
                self.correct_so_far += 1
        
        
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_derivative(self,x):
        return x * (1 - x)

        
    def train(self):
        for epoch in range(self.num_epochs):
            self.correct_so_far = 0
            start = time.time()

            for i in range(len(training_data)):
                # Forward and Back Propagation
                self.forward_backward_propagate(training_data["text"][i], training_data["label_tag"][i])

                samples_per_second = i / float(time.time() - start + 0.001)

                sys.stdout.write("\rEpoch: "+ str(epoch)
                                 +" Progress: " + str(100 * i/float(len(training_data)))[:4] 
                                 + " % Speed(samples/sec): " + str(samples_per_second)[0:5] 
                                 + " #Correct: " + str(self.correct_so_far) 
                                 + " #Trained: " + str(i+1) 
                                 + " Training Accuracy: " + str(self.correct_so_far * 100 / float(i+1))[:4] + "%")
            print("")
        
    def pre_process_data(self, training_data):
        vocab = set()
        
        for review in training_data["text"]:
            for word in review.split(" "):
                vocab.add(word)
                
        self.vocab = list(vocab)
        
        
            
    def update_input_layer(self, text):
        global vocab_vector

        # clear out previous state, reset the vector to be all 0s
        self.vocab_vector *= 0
        for word in text.split(" "):
            self.vocab_vector[0][word_column_dict[word]] += 1
            

In [107]:
nn = SpamClassificationNeuralNetwork(training_data, num_epochs = 10, learning_rate=0.01)

In [108]:
nn.train()

Epoch: 0 Progress: 99.9 % Speed(samples/sec): 1114. #Correct: 3219 #Trained: 4572 Training Accuracy: 70.4%
Epoch: 1 Progress: 99.9 % Speed(samples/sec): 1170. #Correct: 3846 #Trained: 4572 Training Accuracy: 84.1%
Epoch: 2 Progress: 99.9 % Speed(samples/sec): 1304. #Correct: 4026 #Trained: 4572 Training Accuracy: 88.0%
Epoch: 3 Progress: 99.9 % Speed(samples/sec): 1385. #Correct: 4120 #Trained: 4572 Training Accuracy: 90.1%
Epoch: 4 Progress: 99.9 % Speed(samples/sec): 1383. #Correct: 4200 #Trained: 4572 Training Accuracy: 91.8%
Epoch: 5 Progress: 99.9 % Speed(samples/sec): 1234. #Correct: 4260 #Trained: 4572 Training Accuracy: 93.1%
Epoch: 6 Progress: 99.9 % Speed(samples/sec): 1215. #Correct: 4294 #Trained: 4572 Training Accuracy: 93.9%
Epoch: 7 Progress: 99.9 % Speed(samples/sec): 1356. #Correct: 4328 #Trained: 4572 Training Accuracy: 94.6%
Epoch: 8 Progress: 99.9 % Speed(samples/sec): 1373. #Correct: 4364 #Trained: 4572 Training Accuracy: 95.4%
Epoch: 9 Progress: 99.9 % Speed(sampl