In [16]:
%matplotlib inline 
import requests
import pprint # for pretty printing
import os # listing and managing file patho
import zipfile # for zip and unzip utilities
import pandas # for data analysis
import csv#导入numpy的库函数
import numpy as np
import matplotlib.pyplot as plt # for plotting
from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer # for converting documents in word count
from cvxopt import matrix, solvers, spmatrix, sparse
from sklearn import svm
from array import array
import scipy as sp

In [17]:
with  open('./data/SMSSpamCollection', 'r') as f:
    sms_messages = f.readlines()
sms_messages = [m.rstrip() for m in sms_messages]
messages = pandas.read_csv('./data/SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE,
                           names=["label", "message"])
messages['length'] = messages['message'].map(lambda text: len(text))
def split_into_tokens(message):
    return TextBlob(message).words 
messages.message.head().apply(split_into_tokens)
def split_into_lemmas(message):
    words = TextBlob(message).words.lower()
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in words]

# see how head portion changes
messages.message.head().apply(split_into_lemmas)
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['message'])
messages_bow = bow_transformer.transform(messages['message'])#messages_bow是数组化的文本了

In [18]:
np.random.seed(1)
N= 5574# number of sms message
percent_step_size = int(5574*20/100)
random_indices = np.random.permutation(range(N))
train_indices = random_indices[:3*percent_step_size]
valid_indices = random_indices[3*percent_step_size: 4*percent_step_size]
test_indices = random_indices[4*percent_step_size: N+1]

X=messages_bow[train_indices]
_Y=messages.label[train_indices]
Y=np.zeros((_Y.shape[0]))
for i,j in zip(range(_Y.shape[0]),_Y):
    if j == 'spam':
        Y[i]=1
    else:
        Y[i]=-1 

In [19]:
#define C, kernel, and matrixs 

C=300
def kernel(x,y):
    gama=10
    b=np.linalg.norm(np.subtract(x, y))**2
    b=np.exp(-gama*b)
    return b  
_X=X.todense()
X_v=messages_bow[valid_indices]
n_v=X_v.shape[0]
_X_v=X_v.todense()
X_t=messages_bow[test_indices]
n_t=X_t.shape[0]
_X_t=X_t.todense()

In [20]:
#train the model
n=X.shape[0]
K=np.zeros((n,n))
for i in range(n):
    for j in range(n):
        K[i,j]=kernel(_X[i],_X[j])

In [21]:
P=matrix(K)
q=matrix(-np.ones((n, 1)))
G_a=matrix(-np.eye(n))
G_b=matrix(np.eye(n))
G=matrix(np.vstack((G_a,G_b)))
h_a=matrix(np.zeros(n))
h_b=matrix(np.ones(n)*C)
h=matrix(np.vstack((h_a,h_b)))
b=matrix(np.zeros(1))
A=matrix(Y.reshape(1,n))
solution=solvers.qp(P, q, G, h, A, b)
arfa=solution['x']
arfa=np.array(arfa)

     pcost       dcost       gap    pres   dres
 0:  7.8789e+06 -4.8913e+07  6e+07  1e-10  6e-14
 1:  2.9998e+06 -6.0098e+06  9e+06  4e-11  5e-14
 2:  4.7361e+05 -7.0087e+05  1e+06  2e-11  9e-15
 3:  6.4121e+04 -8.8496e+04  2e+05  6e-11  7e-15
 4:  7.1742e+03 -1.4511e+04  2e+04  2e-11  2e-15
 5: -1.0784e+00 -2.9843e+03  3e+03  2e-11  9e-16
 6: -6.8608e+02 -1.0186e+03  3e+02  3e-12  2e-16
 7: -7.1185e+02 -7.2194e+02  1e+01  1e-12  8e-17
 8: -7.1196e+02 -7.1209e+02  1e-01  2e-13  8e-17
 9: -7.1196e+02 -7.1196e+02  1e-03  4e-14  8e-17
10: -7.1196e+02 -7.1196e+02  1e-05  2e-15  6e-17
Optimal solution found.


In [22]:
wx=np.zeros(n)
w0=np.zeros(n)
for i in range(n):
    for j in range(n):
        wx[i]+=arfa[j]*Y[j]*K[j,i]
    w0[i]=Y[i]-wx[i] 
bias=np.mean(w0)

In [23]:
#validation process
K_v=np.zeros((n,n_v))
for i in range(n):
    for j in range(n_v):
        K_v[i,j]=kernel(_X[i],_X_v[j])

In [24]:
wx_v=np.zeros(n_v)
Y_v=np.zeros(n_v)
for i in range(n_v):
    for j in range(n):
        wx_v[i]+=arfa[j]*Y[j]*K_v[j,i]
    Y_v[i]=wx_v[i]+bias

prediction=['a']*Y_v.shape[0]
for i in range (Y_v.shape[0]):
    if Y_v[i]<0:
        prediction[i]='ham'
    else:
        prediction[i]='spam'  

print ('accuracy', accuracy_score(messages.label[valid_indices],prediction))   


#Based on validation, set C as 300 and gamma as 10

accuracy 0.893177737882


In [25]:
#Testing
K_t=np.zeros((n,n_t))
for i in range(n):
    for j in range(n_t):
        K_t[i,j]=kernel(_X[i],_X_t[j])

In [26]:
wx_t=np.zeros(n_t)
Y_t=np.zeros(n_t)
for i in range(n_t):
    for j in range(n):
        wx_t[i]+=arfa[j]*Y[j]*K_t[j,i]
    Y_t[i]=wx_t[i]+bias

prediction_t=['a']*Y_t.shape[0]
for i in range (Y_t.shape[0]):
    if Y_t[i]<0:
        prediction_t[i]='ham'
    else:
        prediction_t[i]='spam'  

print ('accuracy', accuracy_score(messages.label[test_indices],prediction_t))   

accuracy 0.897137745975


In [27]:
#SKLearn
svm = SVC(kernel='rbf', gamma=10, C=300)
svm.fit(X, Y)
result_skl = svm.predict(X_t)
prediction_skl=['a']*result_skl.shape[0]
for i in range (result_skl.shape[0]):
    if result_skl[i]>0:
        prediction_skl[i]='spam'
    else:
        prediction_skl[i]='ham' 
print ('accuracy', accuracy_score(messages.label[test_indices],prediction_skl))        

accuracy 0.897137745975
