In [217]:
import tensorflow as tf
import numpy as np
import pandas as pd
import string
import re
import itertools as it
import nltk
from sklearn.naive_bayes import MultinomialNB
import sklearn
from sklearn import svm
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix

## Raw data processing: extract data from .csv file

In [218]:
sms = pd.read_csv('/Users/zhaomengxuan/Documents/text mining/final project/spam.csv', encoding='latin-1')
sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [219]:
#Drop column and name change
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
sms = sms.rename(columns={"v1":"label", "v2":"text"})
sms.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [220]:
#Count observations in each label
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

## Raw data processing: extract structural features

In [221]:
rawtext = [sms['text'][n] for n in range(len(sms))]
rawtext[:5]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though"]

In [222]:
# structural feature 1: sms message length
sms_length = [len(st) for st in rawtext]
sms_length[:5]

[111, 29, 155, 49, 61]

In [223]:
# structural feature 2: numeric character ratio
numeric = [len(re.findall(r'\d', rawtext[i]))/len(rawtext[i]) for i in range(len(rawtext))]
numeric[:5]

[0.0, 0.0, 0.16129032258064516, 0.0, 0.0]

In [224]:
# structural features 3: non-alphanumeric character ratio
non_alphanumeric_pt = re.compile(r'[^\w\s]+')
non_alphanumeric = [len(''.join(re.findall(non_alphanumeric_pt, rawtext[i])))/len(rawtext[i]) for i in range(len(rawtext))]
non_alphanumeric[:5]

[0.08108108108108109,
 0.20689655172413793,
 0.03870967741935484,
 0.12244897959183673,
 0.03278688524590164]

In [225]:
# structural feature 4: if sms message include url
# include: 1, non: 0
url_index = [rawtext.index(st) for st in rawtext if 'http://' in st]
url_index

[15,
 304,
 517,
 634,
 832,
 880,
 1104,
 3057,
 3172,
 3461,
 3461,
 3860,
 3883,
 4164,
 4204,
 4256,
 4278,
 4353,
 4963]

In [226]:
url = [0] * len(rawtext)
for i in url_index:
    url[i] += 1
url[:5]

[0, 0, 0, 0, 0]

In [227]:
text = [sent.translate(str.maketrans('', '', string.punctuation)) for sent in rawtext]
text[:5]

['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
 'Ok lar Joking wif u oni',
 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
 'U dun say so early hor U c already then say',
 'Nah I dont think he goes to usf he lives around here though']

In [228]:
digits = re.compile(r'\b\d{10,}\b')
text = [re.sub(digits, '<longdigit>', sent) for sent in text]
text[:10]

['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
 'Ok lar Joking wif u oni',
 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
 'U dun say so early hor U c already then say',
 'Nah I dont think he goes to usf he lives around here though',
 'FreeMsg Hey there darling its been 3 weeks now and no word back Id like some fun you up for it still Tb ok XxX std chgs to send å£150 to rcv',
 'Even my brother is not like to speak with me They treat me like aids patent',
 'As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertune for all Callers Press 9 to copy your friends Callertune',
 'WINNER As a valued network customer you have been selected to receivea å£900 prize reward To claim call <longdigit> Claim code KL341 Valid 12 hours only',
 'Had your mobile 11 months or more U R entitled to Update

In [229]:
# structural feature 5: uppercase charactor ratio
def countupper(string):
    n = 0
    for letter in string:
        if letter.isupper(): n += 1
    ratio = n/len(string)
    return ratio
upper = [countupper(st) for st in text]
upper[:5]

[0.029411764705882353,
 0.08695652173913043,
 0.06711409395973154,
 0.046511627906976744,
 0.03389830508474576]

In [230]:
text_lower = [[w.lower() for w in sent.split()] for sent in text]
text_lower[:5]

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  '08452810075over18s'],
 ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say'],
 ['nah',
  'i',
  'dont',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though']]

In [231]:
# structural reature 6: number of terms
terms_count = [len(t) for t in text_lower]
terms_count[:5]

[20, 6, 28, 11, 13]

In [232]:
structural = np.matrix(np.array([sms_length, numeric, non_alphanumeric, url, upper, terms_count])).T
structural.shape

(5572, 6)

## Split data into training (70%), dev (10%)(used later for neural net model) and testing (20%)

In [233]:
tr = int(len(text_lower)*0.7)
dev = int(len(text_lower)*0.8)

In [234]:
text_train, text_dev, text_test = text_lower[:tr], text_lower[tr:dev], text_lower[dev:]

In [235]:
text_train, text_test, text_dev = \
[' '.join(sent) for sent in text_train],\
[' '.join(sent) for sent in text_test],\
[' '.join(sent) for sent in text_dev]

In [236]:
vec = CountVectorizer()
vec.fit(text_train)
Xtrain, Xtest, Xdev = vec.transform(text_train), \
vec.transform(text_test), vec.transform(text_dev)

In [237]:
Xtrain, Xtest, Xdev = \
csr_matrix(Xtrain).toarray(), csr_matrix(Xtest).toarray(), csr_matrix(Xdev).toarray()

In [238]:
Xtrain.shape

(3900, 7600)

In [239]:
structural_train, structural_dev, structural_test = \
structural[:tr], structural[tr:dev], structural[dev:]

In [240]:
Xtr, Xte, Xdev = \
np.concatenate((Xtrain.T, structural_train.T)).T, \
np.concatenate((Xtest.T, structural_test.T)).T, \
np.concatenate((Xdev.T, structural_dev.T)).T

In [241]:
Xtr.shape

(3900, 7606)

In [242]:
Xte.shape

(1115, 7606)

In [243]:
Xdev.shape

(557, 7606)

## Labels for NB/SVM: 0 for non-spam, 1 for spam

In [244]:
simplelabels = []
for i in range(len(sms)):
    if sms['label'][i] == 'ham':
        simplelabels.append(0)
    elif sms['label'][i] == 'spam':
        simplelabels.append(1)
simplelabels = np.array(simplelabels)
simplelabels

array([0, 0, 1, ..., 0, 0, 0])

## Labels for tensorflow: [0,1] for non-spam, [1,0] for spam

In [245]:
labels = []
for i in range(len(sms)):
    if sms['label'][i] == 'ham':
        labels.append([1,0])
    elif sms['label'][i] == 'spam':
        labels.append([0,1])
labels = np.array(labels)

In [246]:
labels

array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

# No. of Spam vs Ham in Each Set

In [247]:
# training set
# 3381 non-spam, 519 spam, 6.5:1
sms.label[:tr].value_counts()

ham     3381
spam     519
Name: label, dtype: int64

In [248]:
# development set
# used later
sms.label[tr:dev].value_counts()

ham     474
spam     83
Name: label, dtype: int64

In [249]:
# testing set
# 970 non-spam, 145 spam, 6.7:1
sms.label[dev:].value_counts()

ham     970
spam    145
Name: label, dtype: int64

# Labels Spliting

In [250]:
# labels for NB, K-NN, SVM
# no dev set
simpytr, simpyte = simplelabels[:tr], simplelabels[dev:]

In [251]:
simpyte.shape

(1115,)

In [252]:
# labels for neural net
# training 70%, dev 10%, testing 20%
ytr, ydev, yte = labels[:tr], labels[tr:dev], labels[dev:]

## Naive Bayes

In [253]:
clf_nb = MultinomialNB()
clf_nb.fit(Xtr, simpytr)
pred_nb = clf_nb.predict(Xte)

In [254]:
pred_nb

array([0, 1, 0, ..., 0, 0, 0])

In [255]:
print('Precission:', round(sklearn.metrics.precision_score(simpyte, pred_nb), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_nb), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_nb), 3))

Precission: 0.962
Recall: 0.862
F1 Score: 0.909


In [256]:
pd.DataFrame(sklearn.metrics.confusion_matrix(simpyte, pred_nb), index=['non-spam', 'spam'], 
             columns=['predict non-spam','predict spam'])

Unnamed: 0,predict non-spam,predict spam
non-spam,965,5
spam,20,125


## K-NN

In [257]:
clf_knn = KNeighborsClassifier(n_neighbors = 1)
clf_knn.fit(Xtr, simpytr)
pred_knn = clf_knn.predict(Xte)

In [258]:
pred_knn[:20]

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [259]:
print('Precission:', round(sklearn.metrics.precision_score(simpyte, pred_knn), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_knn), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_knn), 3))

Precission: 0.768
Recall: 0.89
F1 Score: 0.824


In [260]:
pd.DataFrame(sklearn.metrics.confusion_matrix(simpyte, pred_knn), index=['non-spam', 'spam'], 
             columns=['predict non-spam','predict spam'])

Unnamed: 0,predict non-spam,predict spam
non-spam,931,39
spam,16,129


## SVM

In [261]:
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(Xtr, simpytr)
pred_svm = clf_svm.predict(Xte)

In [262]:
pred_svm[:20]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [263]:
print('Precission:', round(sklearn.metrics.precision_score(simpyte, pred_svm), 3))
print('Recall:', round(sklearn.metrics.recall_score(simpyte, pred_svm), 3))
print('F1 Score:', round(sklearn.metrics.f1_score(simpyte, pred_svm), 3))

Precission: 0.964
Recall: 0.91
F1 Score: 0.936


In [264]:
pd.DataFrame(sklearn.metrics.confusion_matrix(simpyte, pred_svm), index=['non-spam', 'spam'], 
             columns=['predict non-spam','predict spam'])

Unnamed: 0,predict non-spam,predict spam
non-spam,965,5
spam,13,132


## multi-layer Perceptron Neural Network

In [265]:
num_input = Xtr.shape[1]
num_classes = ytr.shape[1]
# tf Graph input
X = tf.placeholder("float", [None, num_input])
Y = tf.placeholder("float", [None, num_classes])

In [266]:
n_hidden_1 = 8
n_hidden_2 = 8
n_hidden_3 = 8

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, num_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [267]:
# Create model
def neural_net(x):
    # Hidden fully connected layer with 500 neurons
    layer_1 = tf.nn.sigmoid((tf.add(tf.matmul(x, weights['h1']), biases['b1'])))
    # Hidden fully connected layer with 500 neurons
    layer_2 = tf.nn.sigmoid((tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])))
    # Hidden fully connected layer with 500 neurons
    layer_3 = tf.nn.sigmoid((tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])))
    # Output fully connected layer with a neuron for each class
    out_layer = tf.nn.sigmoid((tf.matmul(layer_3, weights['out']) + biases['out']))
    return out_layer

In [268]:
# Parameters
learning_rate = 0.1
num_steps = 300
display_step = 50

In [269]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer
loss_op = tf.nn.l2_loss(logits - Y)
optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [270]:
# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, num_steps+1):
        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: Xtr, Y: ytr})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: Xtr,
                                                                 Y: ytr})
            print("Step " + str(step) + ", L2 Loss= " + \
                  "{:.3f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

    # calculate precision, recall and f1 score on dev set, using sklearn
    pred_label = sess.run(logits, feed_dict={X: Xdev})
    pred = sess.run(tf.argmax(pred_label,1))
    y = sess.run(tf.argmax(ydev,1))
    
    print('\nDevelopment set results:')
    print('precision:', round(sklearn.metrics.precision_score(y, pred),3))
    print('recall:', round(sklearn.metrics.recall_score(y, pred),3))
    print('F1 score:',round(sklearn.metrics.f1_score(y, pred),3))
    print(pd.DataFrame(sklearn.metrics.confusion_matrix(y, pred), index=['non-spam', 'spam'], 
             columns=['predict non-spam','predict spam']))
    
    # calculate precision, recall and f1 score on testing set 
    pred_label_te = sess.run(logits, feed_dict={X: Xte})
    pred_te = sess.run(tf.argmax(pred_label_te,1))
    y_te = sess.run(tf.argmax(yte,1))
    
    print('\nTesting set results:')
    print('precision:', round(sklearn.metrics.precision_score(y_te, pred_te),3))
    print('recall:', round(sklearn.metrics.recall_score(y_te, pred_te),3))
    print('F1 score:',round(sklearn.metrics.f1_score(y_te, pred_te),3))
    print(pd.DataFrame(sklearn.metrics.confusion_matrix(y_te, pred_te), index=['non-spam', 'spam'], 
             columns=['predict non-spam','predict spam']))

Step 1, L2 Loss= 504.036, Training Accuracy= 0.867
Step 50, L2 Loss= 406.182, Training Accuracy= 0.867
Step 100, L2 Loss= 42.810, Training Accuracy= 0.995
Step 150, L2 Loss= 18.045, Training Accuracy= 0.998
Step 200, L2 Loss= 13.059, Training Accuracy= 0.998
Step 250, L2 Loss= 11.456, Training Accuracy= 0.998
Step 300, L2 Loss= 10.593, Training Accuracy= 0.998
Optimization Finished!

Development set results:
precision: 0.961
recall: 0.88
F1 score: 0.918
          predict non-spam  predict spam
non-spam               471             3
spam                    10            73

Testing set results:
precision: 0.97
recall: 0.897
F1 score: 0.932
          predict non-spam  predict spam
non-spam               966             4
spam                    15           130
