# SMS Spam Classification
This notebook illustrates classification of SMS as SPAM or NOT SPAM using Recurrent Neural Network and LSTM.

In [33]:
from collections import Counter
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import pandas
import sklearn
import pickle
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import learning_curve    
import joblib

In [34]:
data = pd.read_csv('data/emails.csv',encoding='latin-1')
data.head(10)

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


## Data Preprocessing
Lets save our labels and messages to text files

In [35]:
np.savetxt(r'data\messages.txt', data['text'].values, fmt='%s')
np.savetxt(r'data\labels.txt', data['label'].values, fmt='%s')


In [36]:
with open('data/messages.txt', encoding="ISO-8859-1") as f:
    messages = f.read()
with open('data/labels.txt',encoding="ISO-8859-1") as f:
    labels = f.read()

In [37]:
messages[:5]

'Subje'

In [39]:
data['label'].value_counts()

0    4358
1    1368
Name: label, dtype: int64

### Remove punctuations such a (. , !) etc and seperate using delimiter

In [40]:
from string import punctuation
all_text = ''.join([c for c in messages if c not in punctuation])
messages = all_text.split('\n')

all_text = ' '.join(messages)
words = all_text.split()

In [41]:
print (all_text[:500])
print ("\n")
print (words[:20])

Subject naturally irresistible your corporate identity  lt is really hard to recollect a company  the  market is full of suqgestions and the information isoverwhelminq  but a good  catchy logo  stylish statlonery and outstanding website  will make the task much easier   we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader  it isguite ciear that  without good products  effective business organization and practicable aim it  will be hotat nowadays mar


['Subject', 'naturally', 'irresistible', 'your', 'corporate', 'identity', 'lt', 'is', 'really', 'hard', 'to', 'recollect', 'a', 'company', 'the', 'market', 'is', 'full', 'of', 'suqgestions']


### Building our vocabulary and converting messages to vectors

In [42]:
split_words = Counter(words)
sorted_split_words = sorted(split_words, key=split_words.get, reverse=True)
vocab_to_int = {c : i for i, c in enumerate(sorted_split_words,1)}

# Convert the reviews to integers, same shape as reviews list, but with integers
messages_ints = []
for message in messages:
    messages_ints.append([vocab_to_int[i] for i in message.split()])

In [43]:
print (sorted_split_words[:50])
print ("\n")
print (messages[0])
print (messages_ints[0])
print ("\n")
print (len(messages[0]))
print (len(messages_ints[0]))

['the', 'to', 'and', 'of', 'a', 'you', 'in', 'i', 'for', 'enron', 'on', 'is', 'ect', 'this', 'your', 'be', 'that', 'with', 'we', 'vince', 'will', 'have', 'at', 'from', 'it', 'are', 's', 'as', 'Subject', 'hou', 'com', 'by', 'or', 'if', 'am', 'please', '2000', 'kaminski', 'not', 'subject', 'me', 'would', 'our', 'can', 're', 'cc', 'j', 'my', 'an', '1']


Subject naturally irresistible your corporate identity  lt is really hard to recollect a company  the  market is full of suqgestions and the information isoverwhelminq  but a good  catchy logo  stylish statlonery and outstanding website  will make the task much easier   we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader  it isguite ciear that  without good products  effective business organization and practicable aim it  will be hotat nowadays market  but we do promise that your marketing efforts  will become much more effective  here is the list of clear  benefits  creativeness  hand  made

#### Converting labels to 0 and 1 - SPAM:1 and NOT SPAM:0 

In [44]:
labels = labels.split("\n")
labels = np.array([0 if label == "0" else 1 for label in labels])

In [45]:
print(labels)

[1 1 1 ... 0 0 1]


In [46]:
from collections import Counter

message_lens = Counter([len(x) for x in messages_ints])
print("Zero-length messages: {}".format(message_lens[0]))
print("Maximum message length: {}".format(max(message_lens)))

Zero-length messages: 1
Maximum message length: 5003


In [47]:
messages_ints = [message for message in messages_ints if (len(message)>0)]

### Padding vectors with zeros so that all inputs are of same length

In [48]:
seq_len = 200
num_messages = len(messages)
features = np.zeros([num_messages, seq_len], dtype=int)
for i, row in enumerate(messages_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [50]:
features[1]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,    29,     1,   503,   194, 22179, 22180,
          12,  4890,    76, 22181,    39, 16808, 22182,     3, 22183,
          73, 22184,

### Splitting into training, validation and test data

In [51]:
split_frac1 = 0.8

idx1 = int(len(features) * split_frac1)
train_x, val_x = features[:idx1], features[idx1:]
train_y, val_y = labels[:idx1], labels[idx1:]

split_frac2 = 0.5
idx2 = int(len(val_x) * split_frac2)
val_x, test_x = val_x[:idx2], val_x[idx2:]
val_y, test_y = val_y[:idx2], val_y[idx2:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

print("\t\t\Label Shapes:")
print("Train set: \t\t{}".format(train_y.shape), 
      "\nValidation set: \t{}".format(val_y.shape),
      "\nTest set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(4581, 200) 
Validation set: 	(573, 200) 
Test set: 		(573, 200)
		\Label Shapes:
Train set: 		(4581,) 
Validation set: 	(573,) 
Test set: 		(573,)


### Initial prediction using Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = LogisticRegression()
clf.fit(train_x,train_y)
p = clf.predict(val_x)
print (accuracy_score(val_y,p))

0.7260034904013961


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Defining Hyperparameters

In [53]:

lstm_size = 200
lstm_layers = 1
batch_size = 100
learning_rate = 0.001

### Creating placeholder for inputs, labels and dropout rate 

In [54]:
n_words = len(sorted_split_words)

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None,None], name = "inputs")
    labels_ = tf.placeholder(tf.int32, [None,None], name = "labels")
    keep_prob = tf.placeholder(tf.float32, name = "keep_prob")

In [55]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

Membuat cell LSTM untuk digunakan didalam RNN

In [56]:
with graph.as_default():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Now we need to actually run the data through the RNN nodes. You can use tf.nn.dynamic_rnn to do this. You'd pass in the RNN cell you created (our multiple layered LSTM cell for instance), and the inputs to the network.

Initial_state is the cell state that is passed between the hidden layers in successive time steps. We pass in our cell and the input to the cell, then it does the unrolling and everything else for us. It returns outputs for each time step and the final_state of the hidden layer.

In [57]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


We only care about the final output, we'll be using that as our sentiment prediction. So we need to grab the last output with outputs[:, -1], the calculate the cost from that and labels_.

In [58]:

with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Menghitung tingkat akurasi dan prediksi

In [59]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Creating Batches

In [60]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

### Training

In [61]:
epochs = 1

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
#     joblib.dump(sess, 'checkpoints/best.pkl')
    saver.save(sess, "checkpoints/sentiment.ckpt")


Epoch: 0/1 Iteration: 5 Train loss: 0.086
Epoch: 0/1 Iteration: 10 Train loss: 0.000
Epoch: 0/1 Iteration: 15 Train loss: 0.999
Epoch: 0/1 Iteration: 20 Train loss: 1.000
Epoch: 0/1 Iteration: 25 Train loss: 1.000
Val acc: 0.000
Epoch: 0/1 Iteration: 30 Train loss: 0.999
Epoch: 0/1 Iteration: 35 Train loss: 0.998
Epoch: 0/1 Iteration: 40 Train loss: 0.990
Epoch: 0/1 Iteration: 45 Train loss: 0.921


### Testing 

In [None]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))