In [3]:
# Jerry branch
import numpy as np

import datetime

import tensorflow as tf

from sklearn import linear_model, ensemble, tree, svm, naive_bayes, model_selection

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Flatten, BatchNormalization

from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

# Separator line printed above and below section banners in verbose output.
BORDER = "===================================================================================="

Using TensorFlow backend.


In [4]:
def load_data(filename, skiprows = 1, delimiter = ' '):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: path of the text file to read, given as a string.
        skiprows: number of lines at the top of the file to skip before parsing
                  (default 1 -- presumably a header row; confirm against the data files).
        delimiter: string separating the values on each line (default: a single space).
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=delimiter)

In [22]:
def WritePredictionOut(modelName, prediction):
    """
    Function prints the predictions and writes them to a timestamped CSV file.

    The file is created in the current working directory and is named
    "<modelName>_<year>.<month>.<day>_<hour>.<minute>.<second>_predictions.csv".
    It starts with the header line "Id,Prediction" followed by one row per
    prediction; ids start at 1 and every prediction is cast to int.

    Inputs:
        modelName: string used as the prefix of the output file name.
        prediction: array-like of predicted labels, walked along its first axis.

    Outputs:
        None. The predictions and their shape are printed, and the CSV file is written.
    """
    # Accept any array-like; an ndarray passes through unchanged.
    prediction = np.asarray(prediction)

    print(prediction)
    print("prediction shape is: {}".format(prediction.shape))

    # Collect the rows and join once rather than growing one string row by row.
    rows = ["Id,Prediction\n"]
    rows.extend(
        "{0},{1}\n".format(row_id, label.astype(int))
        for row_id, label in enumerate(prediction, start=1)
    )

    now = datetime.datetime.now()
    filename = "{}_{}.{}.{}_{}.{}.{}_predictions.csv".format(modelName, now.year, now.month, now.day, now.hour, now.minute, now.second)

    # The context manager closes the file even when the write raises.
    with open(filename, 'w') as out_file:
        out_file.write("".join(rows))

In [25]:
def ClassifyWithSVM(X_train, Y_train, X_test, verbose=0, C=0.03, loss='hinge'):
    """
    Function takes training and testing data, and fits a linear support vector
    classifier (sklearn's svm.LinearSVC) to the training data.
    
    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: when 1, prints a banner and the training accuracy; the value is
                 also forwarded to LinearSVC as its own verbose setting.
        C:       regularisation parameter forwarded to LinearSVC (default 0.03).
        loss:    loss function forwarded to LinearSVC (default 'hinge').
        
    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """
    show_progress = (verbose == 1)

    if show_progress:
        print('\n{}\nSVM\n{}\n'.format(BORDER, BORDER))

    model = svm.LinearSVC(C=C, loss=loss, verbose=verbose)
    model.fit(X_train, Y_train)

    # Scored on the data the model was fit on: a training accuracy, not a held-out one.
    accuracy = model.score(X_train, Y_train)

    if show_progress:
        print('Training accuracy: ', accuracy)

    prediction = model.predict(X_test)
    return model, prediction, accuracy

def CrossValidate(X_train, Y_train, X_test, nFold, verbose=0):
    """
    Function estimates the accuracy of ClassifyWithSVM with k-fold cross-validation.

    The training data is split with KFold(n_splits=nFold). For each split a model
    is trained on the training part and scored on the held-out part.

    Inputs:
        X_train: The training data; must support indexing with an integer index array.
        Y_train: The training data labels, indexed the same way as X_train.
        X_test:  The testing data; only forwarded to ClassifyWithSVM.
        nFold:   number of folds, passed to KFold as n_splits.
        verbose: forwarded to ClassifyWithSVM.

    Outputs:
        accuracy: mean of the held-out (validation) accuracies over all folds.
    """
    kf = KFold(n_splits = nFold)

    total_val_acc = []

    # Walk every split the splitter yields so that the number of trained models
    # always equals nFold (the loop used to be fixed at 5 iterations).
    for traini, vali in kf.split(X_train, Y_train):
        model, _, _ = ClassifyWithSVM(X_train[traini], Y_train[traini], X_test, verbose=verbose)

        # Accuracy on the part of the data this fold's model never saw.
        total_val_acc.append(model.score(X_train[vali], Y_train[vali]))

    return np.mean(total_val_acc)


In [7]:
# Read the training file: column 0 is taken as the labels, all remaining columns as the features
training = load_data('data/training_data.txt', skiprows=1)
Y_train = training[:, 0]
X_train = training[:, 1:]

In [8]:
# Read the testing file; it carries no label column, so the whole array is used as features
X_test = load_data('data/test_data.txt', skiprows=1)

In [12]:
# Candidate values for C: 1/numCs, 2/numCs, ..., 1.0
numCs = 100
Cs = [(step + 1) / numCs for step in range(numCs)]

# Only the loss function and C are searched; every other LinearSVC setting keeps its default
parameters = {'loss':('hinge', 'squared_hinge'), 'C':Cs}
svc = svm.LinearSVC()

clf = model_selection.GridSearchCV(svc, parameters, verbose=1)
clf.fit(X_train, Y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  9.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ('hinge', 'squared_hinge'), 'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42...0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
# Score of the fitted grid search on the same data it was fit on, so this is not a held-out estimate
clf.score(X_train, Y_train)

0.87535

In [16]:
# Report the parameter combination selected by the grid search, each line followed by a blank line
print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")

Best parameters set found on development set:

{'C': 0.03, 'loss': 'hinge'}



In [18]:
# Predict labels for the test set with the fitted grid search
predictions = clf.predict(X_test)

In [19]:
# Quick look at the predicted labels
print(predictions)

[1. 1. 0. ... 0. 1. 0.]


In [27]:
# 5-fold cross-validation of the SVM; verbose=1 makes each fold print its training accuracy
accuracy = CrossValidate(X_train, Y_train, X_test, nFold=5, verbose=1)
print("Accuracy is {}".format(accuracy))


SVM

[LibLinear]Training accuracy:  0.873625

SVM

[LibLinear]Training accuracy:  0.8748125

SVM

[LibLinear]Training accuracy:  0.8779375

SVM

[LibLinear]Training accuracy:  0.878

SVM

[LibLinear]Training accuracy:  0.8746875
Accuracy is 0.84825


In [23]:
# Save the predictions to CSV; the name prefix records the model settings (C=0.03, hinge loss)
WritePredictionOut("SVC_C0.03_HingeLoss", predictions)

[1. 1. 0. ... 0. 1. 0.]
prediction shape is: (10000,)


# NuSVM

In [None]:
# Candidate values for nu: 1/numNus, 2/numNus, ..., 1.0
numNus = 100
Nus = [(step + 1) / numNus for step in range(numNus)]

# Search every kernel / nu combination for NuSVC.
# NOTE(review): this rebinds svc, parameters and clf, replacing the LinearSVC search objects.
parameters = {'kernel':('rbf', 'linear', 'poly', 'sigmoid'), 'nu':Nus}
svc = svm.NuSVC()

clf = model_selection.GridSearchCV(svc, parameters, verbose=1)
clf.fit(X_train, Y_train)

# Report the selected combination, each line followed by a blank line
print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")

Fitting 3 folds for each of 400 candidates, totalling 1200 fits
