In [194]:
import tensorflow as tf
import numpy as np
import pandas as pd
import string
import re
import itertools as it
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
import sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.csr import csr_matrix
from sklearn.neighbors import KNeighborsClassifier

## Raw data processing: extract data from .csv file

In [195]:
# Location of the raw SMS spam CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
SPAM_CSV_PATH = '/Users/zhaomengxuan/Documents/text mining/final project/spam.csv'

# The file is decoded as latin-1.
sms = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [196]:
# Drop the three "Unnamed" columns and give v1/v2 descriptive names.
# errors='ignore' keeps this cell re-runnable: `sms` is rebound in place, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError. rename() already ignores missing keys by default.
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1, errors='ignore')
sms = sms.rename(columns={"v1":"label", "v2":"text"})
sms.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [197]:
# Encode the string label as an integer column: ham -> 0, spam -> 1
label_to_int = {'ham':0, 'spam':1}
sms['simplabel'] = sms['label'].map(label_to_int)
sms.head()

Unnamed: 0,label,text,simplabel
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [198]:
# Class balance: how many messages carry each label
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

## Text cleaning, train/dev/test split and vectorization

In [199]:
# Strip ASCII punctuation from every message.
# The translation table is built once instead of once per message, and the
# column is materialized with .tolist() so the result does not rely on the
# frame carrying a default 0..n-1 index.
punct_table = str.maketrans('', '', string.punctuation)
text = [sent.translate(punct_table) for sent in sms['text'].tolist()]
text[:5]

['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
 'Ok lar Joking wif u oni',
 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
 'U dun say so early hor U c already then say',
 'Nah I dont think he goes to usf he lives around here though']

In [200]:
# Sequential (unshuffled) split by row position: first 70% train, next 10% dev,
# final 20% test.
# NOTE(review): the punctuation-stripped `text` list is only used for its length
# here; the features are sliced from the raw sms['text'] column.
n_rows = len(text)
tr, dev = int(0.7 * n_rows), int(0.8 * n_rows)
split_slices = (slice(None, tr), slice(tr, dev), slice(dev, None))
X_train, X_dev, X_test = (sms['text'][rows] for rows in split_slices)
y_train, y_dev, y_test = (sms["simplabel"][rows] for rows in split_slices)

In [201]:
# Sanity check: features first, then labels, each in train / test / dev order
for part in (X_train, X_test, X_dev, y_train, y_test, y_dev):
    print(part.shape)

(3900,)
(1115,)
(557,)
(3900,)
(1115,)
(557,)


In [202]:
# Learn the bag-of-words vocabulary from the training messages only
# (fit() returns the vectorizer itself, so it can be chained).
vec = CountVectorizer().fit(X_train)
vec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [203]:
# Peek at both ends of the learned vocabulary
feature_names = vec.get_feature_names()
print(feature_names[0:20])
print(feature_names[-20:])

['00', '000', '000pes', '008704050406', '0089', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06']
['zoe', 'zogtorius', 'zoom', 'zouk', 'zyada', 'åð', 'åòharry', 'åòit', 'åômorrow', 'åôrents', 'ì_', 'ì¼1', 'ìï', 'û_', 'ûªm', 'ûªt', 'ûªve', 'ûï', 'ûïharry', 'ûò']


In [204]:
# Encode the training messages with the vocabulary fitted on X_train
Xtr = vec.transform(X_train)

In [205]:
# Encode the test messages with the same vectorizer that was fitted on X_train
Xte = vec.transform(X_test)

In [206]:
# (number of training messages, vocabulary size)
Xtr.shape

(3900, 7223)

In [207]:
# (number of test messages, vocabulary size)
Xte.shape

(1115, 7223)

In [208]:
# Plain NumPy copies of the 0/1 train and test labels for the sklearn models
simpytr = np.array(y_train)
simpyte = np.array(y_test)

## Naive Bayes

In [209]:
# Multinomial Naive Bayes on the training counts, then predict the test set
# (fit() returns the estimator, so construction and fitting are chained).
clf_nb = MultinomialNB().fit(Xtr, simpytr)
pred_nb = clf_nb.predict(Xte)

In [210]:
# First 20 Naive Bayes predictions on the test set (0 = ham, 1 = spam)
pred_nb[:20]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0])

In [211]:
# Precision, recall and F1 of the Naive Bayes predictions against the test
# labels, rounded to 3 decimals. (Label spelling fixed: was "Precission".)
print('Precision:', round(sklearn.metrics.precision_score(simpyte, pred_nb), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_nb), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_nb), 3))

Precission: 0.951
Recall: 0.945
F1 Score: 0.948


In [212]:
# Confusion matrix for Naive Bayes: rows are the true classes, columns the predictions
cm_nb = sklearn.metrics.confusion_matrix(simpyte, pred_nb)
pd.DataFrame(
    cm_nb,
    index=['non-spam', 'spam'],
    columns=['predict non-spam','predict spam'],
)

Unnamed: 0,predict non-spam,predict spam
non-spam,963,7
spam,8,137


## K-NN

In [213]:
# 1-nearest-neighbour classifier on the training counts, then predict the test set
clf_knn = KNeighborsClassifier(n_neighbors=1).fit(Xtr, simpytr)
pred_knn = clf_knn.predict(Xte)

In [214]:
# First 20 k-NN predictions on the test set (0 = ham, 1 = spam)
pred_knn[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [215]:
# Precision, recall and F1 of the k-NN predictions against the test labels,
# rounded to 3 decimals. (Label spelling fixed: was "Precission".)
print('Precision:', round(sklearn.metrics.precision_score(simpyte, pred_knn), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_knn), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_knn), 3))

Precission: 0.988
Recall: 0.579
F1 Score: 0.73


In [216]:
# Confusion matrix for k-NN: rows are the true classes, columns the predictions
cm_knn = sklearn.metrics.confusion_matrix(simpyte, pred_knn)
pd.DataFrame(
    cm_knn,
    index=['non-spam', 'spam'],
    columns=['predict non-spam','predict spam'],
)

Unnamed: 0,predict non-spam,predict spam
non-spam,969,1
spam,61,84


## SVM

In [217]:
# Linear-kernel support vector classifier on the training counts, then predict the test set
clf_svm = svm.SVC(kernel='linear').fit(Xtr, simpytr)
pred_svm = clf_svm.predict(Xte)

In [218]:
# First 20 SVM predictions on the test set (0 = ham, 1 = spam)
pred_svm[:20]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [219]:
# Precision, recall and F1 of the SVM predictions against the test labels,
# rounded to 3 decimals. (Label spelling fixed: was "Precission".)
print('Precision:', round(sklearn.metrics.precision_score(simpyte, pred_svm), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_svm), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_svm), 3))

Precission: 0.992
Recall: 0.89
F1 Score: 0.938


In [220]:
# Confusion matrix for the SVM: rows are the true classes, columns the predictions
cm_svm = sklearn.metrics.confusion_matrix(simpyte, pred_svm)
pd.DataFrame(
    cm_svm,
    index=['non-spam', 'spam'],
    columns=['predict non-spam','predict spam'],
)

Unnamed: 0,predict non-spam,predict spam
non-spam,969,1
spam,16,129


## Multi-layer Perceptron Neural Network

In [221]:
# Encode the development messages with the vectorizer fitted on X_train
Xdev = vec.transform(X_dev)

In [222]:
# Convert the three count matrices to dense arrays for feeding the TensorFlow
# placeholders. Wrapping in csr_matrix first keeps the cell re-runnable once the
# names already hold dense arrays.
Xtr, Xte, Xdev = (csr_matrix(mat).toarray() for mat in (Xtr, Xte, Xdev))

In [223]:
def NNlabel(label):
    """Convert class labels into two-column one-hot rows.

    Each item equal to 0 becomes ``[1, 0]``; every other item becomes
    ``[0, 1]``.

    Parameters
    ----------
    label : iterable
        Class labels (e.g. a list, NumPy array or pandas Series of 0/1).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(label), 2)``. An empty input yields an
        array of shape ``(0, 2)`` so that ``result.shape[1]`` is always 2.
    """
    # Evaluate `item == 0` per element so the comparison matches the original
    # loop exactly, whatever iterable type is passed in.
    is_zero = np.array([item == 0 for item in label], dtype=bool)
    # Column 0 flags label 0, column 1 flags everything else.
    return np.column_stack((is_zero, ~is_zero)).astype(int)

In [224]:
# One-hot targets for the network, in train / test / dev order
ytr, yte, ydev = (NNlabel(labels) for labels in (y_train, y_test, y_dev))

In [225]:
# Network dimensions are read off the data: one input per feature column,
# one output per one-hot target column.
num_input, num_classes = Xtr.shape[1], ytr.shape[1]

# Graph inputs; the leading None leaves the number of rows per feed unspecified
X = tf.placeholder("float", shape=[None, num_input])
Y = tf.placeholder("float", shape=[None, num_classes])

In [226]:
# Width of each of the three hidden layers
n_hidden_1 = 8
n_hidden_2 = 8
n_hidden_3 = 8

# Layer sizes from input to output; consecutive pairs give each weight matrix's shape
layer_dims = [num_input, n_hidden_1, n_hidden_2, n_hidden_3, num_classes]

# Weight matrices, initialized from tf.random_normal and created in the order
# h1, h2, h3, out
weights = {
    key: tf.Variable(tf.random_normal([fan_in, fan_out]))
    for key, fan_in, fan_out in zip(['h1', 'h2', 'h3', 'out'], layer_dims[:-1], layer_dims[1:])
}

# One bias vector per layer, sized to that layer's output width
biases = {
    key: tf.Variable(tf.random_normal([width]))
    for key, width in zip(['b1', 'b2', 'b3', 'out'], layer_dims[1:])
}

In [227]:
# Create model
def neural_net(x):
    """Build the forward pass: three sigmoid hidden layers plus a sigmoid output layer.

    Layer parameters are read from the module-level ``weights`` and ``biases``
    dictionaries, so each layer's width is whatever those tensors' shapes say.

    Parameters
    ----------
    x : tensor
        Input batch; multiplied by ``weights['h1']``.

    Returns
    -------
    tensor
        Sigmoid activations of the output layer (one column per class).
    """
    activation = x
    # Hidden layers: sigmoid(activation @ W + b), applied in order h1 -> h2 -> h3
    for w_key, b_key in (('h1', 'b1'), ('h2', 'b2'), ('h3', 'b3')):
        pre_activation = tf.add(tf.matmul(activation, weights[w_key]), biases[b_key])
        activation = tf.nn.sigmoid(pre_activation)
    # Output layer is also squashed by a sigmoid
    return tf.nn.sigmoid(tf.matmul(activation, weights['out']) + biases['out'])

In [228]:
# Parameters
# Step size passed to the optimizer
learning_rate = 0.1
# Number of training iterations run by the training loop
num_steps = 300
# Loss and accuracy are printed every `display_step` steps (and at step 1)
display_step = 50

In [229]:
# Build the forward pass.
# NOTE: despite the name, `logits` holds sigmoid activations, because
# neural_net applies a sigmoid on its output layer.
logits = neural_net(X)

# Loss: tf.nn.l2_loss of the difference between network output and one-hot
# target, minimized with the FTRL optimizer
residual = logits - Y
loss_op = tf.nn.l2_loss(residual)
optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Accuracy: fraction of rows whose largest output column matches the target column
predicted_class = tf.argmax(logits, 1)
target_class = tf.argmax(Y, 1)
correct_pred = tf.equal(predicted_class, target_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that initializes all variables created so far
init = tf.global_variables_initializer()

In [230]:
# Train the network on the full training set, then report dev and test metrics.
def _report_metrics(y_true, y_pred):
    """Print precision, recall, F1 score and the confusion matrix for one data split."""
    print('precision:', round(sklearn.metrics.precision_score(y_true, y_pred),3))
    print('recall:', round(sklearn.metrics.recall_score(y_true, y_pred),3))
    print('F1 score:',round(sklearn.metrics.f1_score(y_true, y_pred),3))
    print(pd.DataFrame(sklearn.metrics.confusion_matrix(y_true, y_pred), index=['non-spam', 'spam'],
             columns=['predict non-spam','predict spam']))


with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, num_steps+1):
        # One optimization step (backprop) over the whole training set
        sess.run(train_op, feed_dict={X: Xtr, Y: ytr})
        if step % display_step == 0 or step == 1:
            # Loss and accuracy on the training set, measured after this step's update
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: Xtr,
                                                                 Y: ytr})
            print("Step {}, L2 Loss= {:.3f}, Training Accuracy= {:.3f}".format(step, loss, acc))

    print("Optimization Finished!")

    # Precision, recall and F1 on the dev set, using sklearn.
    # sess.run already returns NumPy arrays, so the class index is taken with
    # np.argmax; building tf.argmax ops here would add new nodes to the graph
    # every time the cell runs.
    pred_label = sess.run(logits, feed_dict={X: Xdev})
    pred = np.argmax(pred_label, axis=1)
    y = np.argmax(ydev, axis=1)

    print('\nDevelopment set results:')
    _report_metrics(y, pred)

    # Same metrics on the testing set
    pred_label_te = sess.run(logits, feed_dict={X: Xte})
    pred_te = np.argmax(pred_label_te, axis=1)
    y_te = np.argmax(yte, axis=1)

    print('\nTesting set results:')
    _report_metrics(y_te, pred_te)

Step 1, L2 Loss= 785.555, Training Accuracy= 0.867
Step 50, L2 Loss= 44.931, Training Accuracy= 0.998
Step 100, L2 Loss= 13.183, Training Accuracy= 1.000
Step 150, L2 Loss= 7.195, Training Accuracy= 1.000
Step 200, L2 Loss= 4.278, Training Accuracy= 1.000
Step 250, L2 Loss= 3.151, Training Accuracy= 1.000
Step 300, L2 Loss= 2.471, Training Accuracy= 1.000
Optimization Finished!

Development set results:
precision: 1.0
recall: 0.916
F1 score: 0.956
          predict non-spam  predict spam
non-spam               474             0
spam                     7            76

Testing set results:
precision: 0.971
recall: 0.931
F1 score: 0.951
          predict non-spam  predict spam
non-spam               966             4
spam                    10           135
