In [3]:
# Jerry branch
import numpy as np

import datetime

import tensorflow as tf 

from sklearn import linear_model, ensemble, tree, svm, naive_bayes, model_selection

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Flatten, BatchNormalization

from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

BORDER = "===================================================================================="

Using TensorFlow backend.


In [4]:
def load_data(filename, skiprows = 1, delimiter = ' '):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: path of the text file to read, given as a string.
        skiprows: number of lines at the top of the file to skip before
                  parsing (default 1).
        delimiter: string separating the values on each line (default ' ',
                   which matches the behaviour before this parameter existed).
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=delimiter)

In [5]:
def WritePredictionOut(modelName, prediction):
    """
    Function prints the predictions and writes them to a timestamped CSV file.

    The file is created relative to the current working directory and is named
    "<modelName>_<year>.<month>.<day>_<hour>.<minute>.<second>_predictions.csv".
    It contains an "Id,Prediction" header followed by one line per entry of
    prediction, with ids starting at 1 and every value cast to int.

    Inputs:
        modelName: string used as the prefix of the output filename.
        prediction: numpy ndarray of predicted labels; entry i is written with id i + 1.

    Outputs:
        None. The CSV file is produced as a side effect.
    """
    print(prediction)
    print("prediction shape is: {}".format(prediction.shape))

    # Cast the whole array once and assemble the rows with join; repeated
    # string concatenation inside a loop is quadratic in the number of rows.
    labels = prediction.astype(int)
    rows = ["{0},{1}\n".format(i + 1, label) for i, label in enumerate(labels)]
    output = "Id,Prediction\n" + "".join(rows)

    now = datetime.datetime.now()
    filename = "{}_{}.{}.{}_{}.{}.{}_predictions.csv".format(modelName, now.year, now.month, now.day, now.hour, now.minute, now.second)

    # The context manager closes the file even when the write raises.
    with open(filename, 'w') as out_file:
        out_file.write(output)

In [19]:
def ClassifyWithSVM(X_train, Y_train, X_test, C, loss, verbose=0):
    """
    Function fits a linear support vector classifier (svm.LinearSVC) on the
    training data and uses it to label the testing data.
    
    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        C:       passed to LinearSVC as its C argument
        loss:    passed to LinearSVC as its loss argument
        verbose: passed to LinearSVC as its verbose argument; when it equals 1
                 a banner and the training accuracy are printed as well
        
    Outputs:
        model: the fitted LinearSVC instance.
        prediction: a nparray with shape (X_test.shape[0], ) containing the labels the model predicts for X_test
        accuracy: score of the model on the data it was trained on.
    """
    chatty = (verbose == 1)

    if chatty:
        print('\n{0}\nSVM -- C = {1} : loss -- {2}\n{0}\n'.format(BORDER, C, loss))

    classifier = svm.LinearSVC(C=C, loss=loss, verbose=verbose)
    classifier.fit(X_train, Y_train)

    # Scored on the training set itself, so this is not a generalisation estimate.
    train_accuracy = classifier.score(X_train, Y_train)
    if chatty:
        print('Training accuracy: ', train_accuracy)

    return classifier, classifier.predict(X_test), train_accuracy

def CrossValidate(X_train, Y_train, X_test, nFold, C=0.03, loss='hinge', verbose=0):
    """
    Function estimates the validation accuracy of the linear SVM using
    nFold-fold cross validation.

    For every split produced by KFold a model is trained with ClassifyWithSVM
    on the training part and scored on the held-out part.

    Inputs:
        X_train: The training data; indexed with the index arrays KFold yields
        Y_train: The training data labels; indexed the same way as X_train
        X_test:  The testing data; only forwarded to ClassifyWithSVM, the
                 predictions made for it are not used here
        nFold:   number of folds, passed to KFold as n_splits
        C:       forwarded to ClassifyWithSVM (default 0.03)
        loss:    forwarded to ClassifyWithSVM (default 'hinge')
        verbose: forwarded to ClassifyWithSVM (default 0)

    Outputs:
        accuracy: mean of the held-out accuracies over all nFold folds.
    """
    kf = KFold(n_splits = nFold)

    val_acc_per_fold = []

    # Walk over every split KFold yields instead of a fixed five, so the number
    # of trained models always matches the number of scores being averaged.
    for traini, vali in kf.split(X_train, Y_train):
        model, _, _ = ClassifyWithSVM(X_train[traini], Y_train[traini], X_test, C, loss, verbose=verbose)

        # Accuracy on the fold that was held out of training.
        val_acc_per_fold.append(model.score(X_train[vali], Y_train[vali]))

    return np.mean(val_acc_per_fold)


In [75]:
# Read the training file, skipping its first line
training = load_data('data/training_data.txt', skiprows=1)
# Column 0 is used as the label, every remaining column as a feature
Y_train, X_train = training[:, 0], training[:, 1:]

# Read the testing file, skipping its first line.
# The testing set carries no label, so every column is a feature.
X_test = load_data('data/test_data.txt', skiprows=1)

In [82]:
N = X_train.shape[0]
# For each word (column): number of training documents with a non-zero entry.
corpusWordCount = np.sum((X_train != 0).astype(int), axis=0)
# For each training document (row): sum of its entries.
documentTotalWords = np.sum(X_train, axis=1)
zeroWords = np.sum(documentTotalWords == 0)
print("corpusWordCount shape {}".format(corpusWordCount.shape))
print("documentTotalWords shape {}".format(documentTotalWords.shape))
print("documents with 0 words: {}".format(zeroWords))

print("X_train shape {}".format(X_train.shape))

# Inverse document frequency with a +1 in the denominator.
# NOTE: the value is slightly negative for a word present in every training
# document, because N / (1 + N) < 1.
idf = np.log(N / (1 + corpusWordCount))

# Data processing: four views of the training matrix.
X_train_count  = np.copy(X_train)
X_train_binary = (X_train != 0).astype(int)
# log(count + 1) over the whole matrix at once. Rows that sum to zero need no
# special case as long as the entries are non-negative counts (assumed here):
# such rows are all zero and log(0 + 1) == 0 leaves them unchanged.
X_train_freq   = np.log(X_train + 1)
X_train_tfidf  = X_train_freq * idf

print(X_train_freq[0])

# Same statistics for the testing set (these names now refer to the test data).
documentTotalWords = np.sum(X_test, axis=1)
zeroWords = np.sum(documentTotalWords == 0)
print("documentTotalWords shape {}".format(documentTotalWords.shape))
print("documents with 0 words: {}".format(zeroWords))

X_test_count  = np.copy(X_test)
X_test_binary = (X_test != 0).astype(int)
X_test_freq   = np.log(X_test + 1)
# The idf weights come from the training set and are reused unchanged.
X_test_tfidf  = X_test_freq * idf


corpusWordCount shape (1000,)
documentTotalWords shape (20000,)
documents with 0 words: 4
X_train shape (20000, 1000)
[0.         0.         0.         0.         0.         1.09861229
 0.         1.09861229 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.69314718 0.         0.         0.69314718
 0.         0.         0.         0.         0.         0.
 0.69314718 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.69314718
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.       

In [91]:
# Disabled checks that count negative entries in the log features and in idf:
# print(np.sum(X_train_freq < 0))
# print(np.sum(idf < 0))

# print(idf)

# Print the expression behind idf[0], using N and the corpusWordCount entry of word 0.
print("log({} / (1 + {}))".format(N, corpusWordCount[0]))

log(20000 / (1 + 30565.0))


In [20]:
# Candidate values for C: (i + 1) / numCs for i = 0 .. numCs - 1, i.e. 0.01 up to 1.0
numCs = 100
Cs = [(i + 1) / numCs for i in range(numCs)]

# Grid of hyper-parameters to search: both losses crossed with every candidate C.
# A search over 'penalty' ('l1', 'l2') was tried in an earlier version and is left out.
parameters = {'loss':('hinge', 'squared_hinge'), 'C':Cs}
svc = svm.LinearSVC()

clf = model_selection.GridSearchCV(svc, parameters, verbose=1)
clf.fit(X_train, Y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


KeyboardInterrupt: 

In [15]:
# Score clf on the same data it was fitted on (training accuracy, not a validation estimate).
trainingAccuracy = clf.score(X_train, Y_train)
print("trainingAccuracy: {}".format(trainingAccuracy))

traininfAccuracy: 0.8688


In [85]:
# Cross-validate the linear SVM on each feature representation with hand-picked
# hyper-parameters (C=0.03, hinge loss, 5 folds).
# To use the grid-search result instead, print clf.best_params_ and pass
# best_params['C'] / best_params['loss'] to CrossValidate.
feature_runs = [
    ("Binary data accuracy is {}", X_train_binary, X_test_binary),
    ("Count data accuracy is {}",  X_train_count,  X_test_count),
    ("Freq data accuracy is {}",   X_train_freq,   X_test_freq),
    ("tfidf data accuracy is {}",  X_train_tfidf,  X_test_tfidf),
]

for message, train_features, test_features in feature_runs:
    accuracy = CrossValidate(train_features, Y_train, test_features, 5, C=0.03, loss='hinge', verbose = 0)
    print(message.format(accuracy))

Binary data accuracy is 0.84375
Count data accuracy is 0.84825
Freq data accuracy is 0.84755
tfidf data accuracy is 0.8459


In [None]:
# Train one model per feature representation and keep the test predictions of each.
# Each result gets its own name so the count-feature predictions are not overwritten.
_, prediction1, _ = ClassifyWithSVM(X_train_binary, Y_train, X_test_binary, C=0.03, loss='hinge', verbose = 0)
_, prediction2, _ = ClassifyWithSVM(X_train_count,  Y_train, X_test_count,  C=0.03, loss='hinge', verbose = 0)
_, prediction3, _ = ClassifyWithSVM(X_train_freq,   Y_train, X_test_freq,   C=0.03, loss='hinge', verbose = 0)

# Number of test documents on which the binary and count models disagree.
mismatches = np.sum(prediction1 != prediction2)
print(mismatches)

# Disagreements for the remaining pairs of models.
print("binary vs freq mismatches: {}".format(np.sum(prediction1 != prediction3)))
print("count vs freq mismatches: {}".format(np.sum(prediction2 != prediction3)))

In [18]:
# Label the testing data with clf.
# NOTE(review): the saved output of the grid-search cell ends in
# KeyboardInterrupt, so confirm clf is fully fitted before relying on this.
predictions = clf.predict(X_test)

In [19]:
# Show the predicted test labels.
print(predictions)

[1. 1. 0. ... 0. 1. 0.]



SVM

[LibLinear]Training accuracy:  0.873625

SVM

[LibLinear]Training accuracy:  0.8748125

SVM

[LibLinear]Training accuracy:  0.8779375

SVM

[LibLinear]Training accuracy:  0.878

SVM

[LibLinear]Training accuracy:  0.8746875
Accuracy is 0.84825


In [23]:
# Write the test predictions to a timestamped CSV file.
# NOTE(review): the filename prefix says C=0.03 with hinge loss, but
# `predictions` comes from clf.predict, and clf searches over several C and
# loss values — confirm the label matches the model that produced them.
WritePredictionOut("SVC_C0.03_HingeLoss", predictions)

[1. 1. 0. ... 0. 1. 0.]
prediction shape is: (10000,)


# NuSVM

In [None]:
# numNus = 100
# Nus = []
# for i in range(numNus):
#     Nus.append((i + 1) / numNus)
    
    
# svc = svm.NuSVC()
# # parameters = {'penalty':('l1', 'l2'), 'loss':('hinge', 'squared_hinge'), 'C':Cs}
# parameters = {'kernel':('rbf', 'linear', 'poly', 'sigmoid'), 'nu':Nus}
 

# clf = model_selection.GridSearchCV(svc, parameters, verbose=1)
# clf.fit(X_train, Y_train)

# print("Best parameters set found on development set:")
# print()
# print(clf.best_params_)
# print()

Fitting 3 folds for each of 400 candidates, totalling 1200 fits
