In [18]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import csv

from sklearn import feature_extraction
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

import sys
reload(sys)
sys.setdefaultencoding('ISO-8859-1')

''' Load the training set '''
pd.set_option('display.max_column', None)
training_set = pd.read_csv('train.csv')
print "size of training_set = " + str(np.shape(training_set))

# we want to separate the txt messages into two groups: ham(1) and spam(0)
ham_or_spam = training_set['label']
training_set_target = []
for i in range(len(ham_or_spam)):
    if ham_or_spam[i] == "ham":
        training_set_target.append(1)
    else:
        training_set_target.append(0)

# store the array in y, because below we transform this one to a matrix
y = training_set_target
training_set_target = np.transpose(np.matrix(training_set_target))
print "size of training_set_target = " + str(np.shape(training_set_target))

training_set_content = training_set['sms'].as_matrix()
print "number of text messages = " + str(np.shape(training_set_content))
print training_set_content[:20]

''' Load the stopwords '''
stop_words = stopwords.words('english')
print "We use %d stopwords from nltk library, such as:" % np.shape(stop_words)[0]
print stop_words[:10]

''' Tokenization '''
def tokenization(text):
    tokens=[]
    for word in nltk.word_tokenize(text):
        if re.search('[a-zA-Z]', word) and word.lower() not in stop_words:
            tokens.append(word.lower())
    return tokens

training_set_corpus_tokenized = []
for i in training_set_content:
    training_set_corpus_tokenized.append(' '.join(tokenization(i)))

''' Applying TF-IDF '''
vectorizer = CountVectorizer(decode_error = 'ignore')
transformer = TfidfTransformer(norm = 'l2', use_idf = True)
tfidf_matrix = transformer.fit_transform(vectorizer.fit_transform(training_set_corpus_tokenized))

print "size of tf-idf matrix = " + str(np.shape(tfidf_matrix))
word = vectorizer.get_feature_names()

X = tfidf_matrix.toarray()
print "size of X = " + str(np.shape(X))

''' Define function for logistic regression '''
def sigmoid(z):
    sigma = 1 / (1 + np.exp(-z))
    return sigma

def loss_function(X, y, w, lmd):
    # X is the training set, 3000 x 3000, w is the weight vector, 3000 x 1
    # y is the target, 3000 x 1, lmd is the hyper parameter for regularization
    m = np.shape(y)[0]
    y = np.transpose(y)
    hypo = sigmoid(np.dot(X, w))
    # avoid the case of log(0)
    for i in range(len(hypo)):
        if hypo[i][0] > 0.99999:
            hypo[i][0] = 0.99999
        elif hypo[i][0] < 0.00001:
            hypo[i][0] = 0.00001
    loss = (1.0 / m) * (-np.dot(y, np.log(hypo)) - np.dot((1 - y), np.log(1 - hypo))) + lmd * np.dot(np.transpose(w), w)
    return loss

def gradient_descent(X, y, w, eta, lmd):
    m = np.shape(y)[0]
    hypo = sigmoid(np.dot(X, w))
    loss_history = []
    prev_loss = 9999.0
    for i in range(1, 1001):
        w = w - (1.0 / m) * eta * np.power(i, -0.9) * np.dot(np.transpose(X), hypo - y) + lmd * w
        crt_loss = loss_function(X, y, w, lmd)[0][0]
        # break when loss starts to grow
        if crt_loss > prev_loss:
            break
        loss_history.append(crt_loss)
        prev_loss = crt_loss
    print loss_history
    return w

def predict(X, y, w):
    predict_hypo = sigmoid(np.dot(X, w))
    p = predict_hypo
    for i in range(np.shape(y)[0]):
        if predict_hypo[i][0] >= 0.5:
            p[i][0] = 1
        else:
            p[i][0] = 0
    return p

from sklearn.metrics import precision_score, recall_score, accuracy_score
def compute_accuracy(X, y, w):
    result = predict(X, y, w)
    wrong_answer = 0.0
    for i in range(np.shape(y)[0]):
        if result[i][0] != y[i][0]:
            wrong_answer = wrong_answer + 1
    accuracy = 1 - wrong_answer / np.shape(y)[0]
    return accuracy
#     print "accuracy_my_model = ", accuracy_score(result, training_set_target);
#     print "precision_my_model = ", precision_score(result, training_set_target);
#     print "recall_my_model = ", recall_score(result, training_set_target);    

# initialize w to be a matrix of dimension 3000 x 1 with all elements to be 0
w = np.transpose(np.matrix(np.zeros(np.shape(X)[1])))
w = gradient_descent(X, training_set_target, w, 1, 0)
print w

accuracy = compute_accuracy(X, training_set_target, w)
print "accuracy_my_model = " + str(accuracy)

''' Load the test data and calculate the accuracy '''
test_set = pd.read_csv('test.csv')
print "size of test_set = " + str(np.shape(test_set))

ham_or_spam = test_set['label']
test_set_target = []
for i in range(len(ham_or_spam)):
    if ham_or_spam[i] == "ham":
        test_set_target.append(1)
    else:
        test_set_target.append(0)
y_test = test_set_target
test_set_target = np.transpose(np.matrix(test_set_target))
print "size of test_set_target = " + str(np.shape(test_set_target))

test_set_content = test_set['sms'].as_matrix()
test_set_corpus_tokenized = []
for i in test_set_content:
    test_set_corpus_tokenized.append(' '.join(tokenization(i)))

print "size of test_set_corpus_tokenized = " + str(np.shape(test_set_corpus_tokenized))
tfidf_test = transformer.transform(vectorizer.transform(test_set_corpus_tokenized))
print "size of tfidf_test = " + str(np.shape(tfidf_test))

X_test = tfidf_test.toarray()
print "size of X_test = " + str(np.shape(X_test))

In [51]:
1

1

In [53]:
accuracy_history=[]
from sklearn.cross_validation import KFold 
# return the lmd that maximize the accuracy
def cross_validation(X, w, y, eta):
    # divide the dataset into 10 folds
    #kf = KFold(len(training_set_target), n_folds = 10)
    training_cost_avg = 0
    test_cost_avg = 0
    #accuracy_history = []
    #for train_index, test_index in kf:
    #for i in range(0,10):
    i=2
    X_train, X_test = np.vstack((X[0:i*300],X[(i+1)*300:3000])), X[i*300:(i+1)*300]
    y_train, y_test = np.vstack((y[0:i*300],y[(i+1)*300:3000])), y[i*300:(i+1)*300]
    lmd = np.linspace(0, 100, num=100)
    for i in range(len(lmd)):
        w = np.transpose(np.matrix(np.zeros(np.shape(X_train)[1])))
        w = gradient_descent(X_train, y_train, w, eta, lmd[i])
        accuracy = compute_accuracy(X_test, y_test, w)
        accuracy_history.append(accuracy)
    # find the index of this best lmd
    best_lmd_index = np.argmax(accuracy_history)
    # find it in the lmd array
    lmd_best = lmd[best_lmd_index]
    return lmd_best

w = np.transpose(np.matrix(np.zeros(np.shape(X)[1])))
lmd_best = cross_validation(X, training_set_target, w, 1)
lmd_best, len(accuracy_history), accuracy_history

(0.0,
 100,
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0])

In [39]:
test_split = []
total=3000;
patch=300;
for i in range(0,10):
    test_split.append([i*300, (i+1)*300]);
test_split    
    

[[0, 300],
 [300, 600],
 [600, 900],
 [900, 1200],
 [1200, 1500],
 [1500, 1800],
 [1800, 2100],
 [2100, 2400],
 [2400, 2700],
 [2700, 3000]]

In [44]:
lmd = np.linspace(0, 1, num=100)
lmd[2]

0.020202020202020204