In [1]:
# Jenny branch
import numpy as np

import datetime

import tensorflow as tf 

from sklearn import linear_model, ensemble


import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Flatten, BatchNormalization

from sklearn.model_selection import KFold

BORDER = "===================================================================================="

Using TensorFlow backend.


In [2]:
def load_data(filename, skiprows=1):
    """
    Read a space-delimited numeric text file into a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing (defaults to 1).

    Outputs:
        The parsed contents of the file, returned as a numpy ndarray.
    """
    contents = np.loadtxt(filename, delimiter=' ', skiprows=skiprows)
    return contents

In [3]:
def WritePredictionOut(modelName, prediction=None):
    """
    Write predicted labels to a dated CSV file in the current working directory.

    The file is named "<modelName>_<year>_<month>_<day>_predictions.csv". It holds
    an "Id,Prediction" header followed by one "<id>,<label>" row per prediction,
    with ids starting at 1.

    Inputs:
        modelName: string used as the prefix of the output file name.
        prediction: predicted labels, exposing .shape and integer indexing
            (e.g. a numpy ndarray). When omitted, the notebook-level variable
            named `prediction` is used, which is what this function read before
            the parameter existed.

    Outputs:
        None. The predictions are printed and written to the CSV file.

    Raises:
        NameError: if no prediction is passed and no notebook-level
            `prediction` variable exists.
    """
    if prediction is None:
        # Backward-compatible fallback: the parameter shadows the global name,
        # so the notebook-level variable has to be looked up explicitly.
        if "prediction" not in globals():
            raise NameError("name 'prediction' is not defined")
        prediction = globals()["prediction"]

    print(prediction)
    print("prediction shape is: {}".format(prediction.shape))

    # Collect the rows and join once instead of repeatedly growing one string.
    rows = ["Id,Prediction\n"]
    rows.extend("{0},{1}\n".format(i + 1, prediction[i]) for i in range(prediction.shape[0]))

    now = datetime.datetime.now()

    # Format the date parts explicitly: they are ints and cannot be
    # concatenated to a str with "+".
    filename = "{}_{}_{}_{}_predictions.csv".format(modelName, now.year, now.month, now.day)

    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as out_file:
        out_file.write("".join(rows))

In [4]:
def ClassifyWithNeuralNetwork(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits a fully connected network
    (Dense layers of 200, 100 and 50 ReLU units, each followed by dropout, then a
    2-way softmax output).

    Inputs:
        X_train: The training data, indexed as (samples, features)
        Y_train: The training data labels, given as class indices (one-hot encoded here)
        X_test:  The testing data
        verbose: when 1, prints a banner, the model summary and the training
                 score/accuracy; the value is also forwarded to model.fit

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """
    if (verbose == 1):
        print('\n{}\nNEURAL NETWORK\n{}\n'.format(BORDER, BORDER))

    # Converting Y values to one hot vector
    num_classes = 2
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # Number of input features per sample
    n_words = X_train.shape[1]

    # Layer set up. Only the first layer declares the input shape; each later
    # layer takes its input size from the layer before it.
    model = Sequential()
    model.add(Dense(200, input_shape=(n_words,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    # One output per class; softmax turns them into class probabilities
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in the model
    if (verbose == 1):
        model.summary()

    ## Labels are one-hot encoded, so the loss is categorical cross-entropy.
    ## RMSprop is the optimizer; 'accuracy' is tracked so evaluate() reports it.
    model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=verbose)

    ## evaluate() returns [loss, accuracy] on the training data
    score = model.evaluate(X_train, Y_train, verbose=0)
    if (verbose == 1):
        print('Training score:', score[0])
        print('Training accuracy:', score[1])

    # Column 0 holds the probability of class 0, column 1 that of class 1
    probabilities = model.predict(X_test, verbose=1)

    # Predict class 1 wherever it is strictly more probable than class 0
    prediction = (probabilities[:, 0] < probabilities[:, 1]).astype(int)

    return model, prediction, score[1]

In [5]:
def ClassifyWithConvolutionNetwork(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits a 1D convolutional network
    (two Conv1D layers with 5 filters each, max pooling, a 100-unit Dense layer and
    a 2-way softmax output).

    Inputs:
        X_train: The training data, indexed as (samples, features)
        Y_train: The training data labels, given as class indices (one-hot encoded here)
        X_test:  The testing data, indexed as (samples, features)
        verbose: when 1, prints a banner, the model summary and the training
                 score/accuracy; the value is also forwarded to model.fit

    Outputs:
        model: passes out the model we trained. It expects inputs with a trailing
               channel axis, i.e. np.expand_dims(X, axis=2).
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """
    if (verbose == 1):
        print('\n{}\nCONVOLUTION NETWORK\n{}\n'.format(BORDER, BORDER))

    # Converting Y values to one hot vector
    num_classes = 2
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # Number of input features per sample
    n_words = X_train.shape[1]

    # For 1D Convolution, add a trailing channel axis: (samples, features, 1)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    model = Sequential()

    model.add(Conv1D(5, 3, padding="same", input_shape=(n_words, 1, )))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.1))

    model.add(Conv1D(5, 2, padding="same"))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.1))

    model.add(Flatten())
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.1))

    ## One output per class; softmax turns them into class probabilities
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in the model
    if (verbose == 1):
        model.summary()

    ## Labels are one-hot encoded, so the loss is categorical cross-entropy.
    ## RMSprop is the optimizer; 'accuracy' is tracked so evaluate() reports it.
    model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=verbose)

    ## evaluate() returns [loss, accuracy] on the training data
    score = model.evaluate(X_train, Y_train, verbose=0)

    if (verbose == 1):
        print('Training score:', score[0])
        print('Training accuracy:', score[1])

    # Column 0 holds the probability of class 0, column 1 that of class 1
    probabilities = model.predict(X_test, verbose=1)

    # Predict class 1 wherever it is strictly more probable than class 0
    prediction = (probabilities[:, 0] < probabilities[:, 1]).astype(int)

    return model, prediction, score[1]

In [6]:
def ClassifyWithLogisticRegression(X_train, Y_train, X_test, verbose=0):
    """
    Fit a scikit-learn LogisticRegression (default settings) on the training data
    and predict labels for the testing data.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: when 1, prints a banner and the training accuracy

    Outputs:
        model: the fitted LogisticRegression estimator.
        prediction: the labels the fitted model predicts for X_test
        accuracy: fraction of training samples the fitted model labels correctly.
    """
    show_details = (verbose == 1)

    if show_details:
        print('\n{}\nLOGISTIC REGRESSION\n{}\n'.format(BORDER, BORDER))

    classifier = linear_model.LogisticRegression()
    classifier.fit(X_train, Y_train)

    # 1 where the fitted model reproduces the training label, 0 elsewhere
    train_hits = (classifier.predict(X_train) == Y_train).astype(int)
    train_accuracy = np.sum(train_hits) / train_hits.shape[0]

    if show_details:
        print('Training accuracy: ', train_accuracy)

    return classifier, classifier.predict(X_test), train_accuracy

In [7]:
def ClassifyWithAdaBoost(X_train, Y_train, X_test, verbose=0):
    """
    Fit a scikit-learn AdaBoostClassifier (default settings) on the training data
    and predict labels for the testing data.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: when 1, prints a banner and the training accuracy

    Outputs:
        model: the fitted AdaBoostClassifier estimator.
        prediction: the labels the fitted model predicts for X_test
        accuracy: fraction of training samples the fitted model labels correctly.
    """
    show_details = (verbose == 1)

    if show_details:
        print('\n{}\nADABOOST\n{}\n'.format(BORDER, BORDER))

    classifier = ensemble.AdaBoostClassifier()
    classifier.fit(X_train, Y_train)

    # 1 where the fitted model reproduces the training label, 0 elsewhere
    train_hits = (classifier.predict(X_train) == Y_train).astype(int)
    train_accuracy = np.sum(train_hits) / train_hits.shape[0]

    if show_details:
        print('Training accuracy: ', train_accuracy)

    return classifier, classifier.predict(X_test), train_accuracy

In [18]:
def CrossValidation_NeuralNetwork(X_train, Y_train, X_test, verbose=0, n_splits=5):
    """
    Run k-fold cross validation of ClassifyWithNeuralNetwork and print a report.

    For every fold a fresh model is trained on the training part of the split and
    evaluated on the held-out part. The per-fold and average training/validation
    accuracies are printed.

    Inputs:
        X_train: The training data, indexed as (samples, features)
        Y_train: The training data labels, given as class indices
        X_test:  The testing data; only forwarded to ClassifyWithNeuralNetwork
        verbose: verbosity passed to model.evaluate on the held-out fold
        n_splits: number of folds (defaults to 5)

    Outputs:
        Always returns 0; the results are printed.
    """
    kf = KFold(n_splits=n_splits)

    total_train_acc = []
    total_val_acc = []

    # The trained model expects one-hot labels, so encode them once up front
    num_classes = 2
    Y = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # perform k-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        # The classifier one-hot encodes its own labels, so it gets the raw ones
        model, prediction, accuracy = ClassifyWithNeuralNetwork(X_train[traini], Y_train[traini], X_test)

        # evaluate() returns [loss, accuracy] on the held-out fold
        val_acc = model.evaluate(X_train[vali], Y[vali], verbose=verbose)

        total_train_acc.append(accuracy)
        total_val_acc.append(val_acc[1])

    # float64 arrays keep the printed report independent of the metric's own dtype
    total_train_acc = np.asarray(total_train_acc, dtype=np.float64)
    total_val_acc = np.asarray(total_val_acc, dtype=np.float64)

    print(BORDER)
    print("CROSS VALIDATION: ClassifyWithNeuralNetwork")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(n_splits))
    print("average val accuracy", np.sum(total_val_acc) / float(n_splits))

    return 0


In [17]:
def CrossValidation_ConvolutionNetwork(X_train, Y_train, X_test, verbose=0, n_splits=5):
    """
    Run k-fold cross validation of ClassifyWithConvolutionNetwork and print a report.

    For every fold a fresh model is trained on the training part of the split and
    evaluated on the held-out part. The per-fold and average training/validation
    accuracies are printed.

    Inputs:
        X_train: The training data, indexed as (samples, features)
        Y_train: The training data labels, given as class indices
        X_test:  The testing data; only forwarded to ClassifyWithConvolutionNetwork
        verbose: verbosity passed to model.evaluate on the held-out fold
        n_splits: number of folds (defaults to 5)

    Outputs:
        Always returns 0; the results are printed.
    """
    kf = KFold(n_splits=n_splits)

    total_train_acc = []
    total_val_acc = []

    # The trained model expects one-hot labels, so encode them once up front
    num_classes = 2
    Y = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # The trained model expects a trailing channel axis on its inputs
    X = np.expand_dims(X_train, axis=2)

    # perform k-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        # The classifier reshapes and one-hot encodes its own inputs, so it gets the raw arrays
        model, prediction, accuracy = ClassifyWithConvolutionNetwork(X_train[traini], Y_train[traini], X_test)

        # evaluate() returns [loss, accuracy] on the held-out fold
        val_acc = model.evaluate(X[vali], Y[vali], verbose=verbose)

        total_train_acc.append(accuracy)
        total_val_acc.append(val_acc[1])

    # float64 arrays keep the printed report independent of the metric's own dtype
    total_train_acc = np.asarray(total_train_acc, dtype=np.float64)
    total_val_acc = np.asarray(total_val_acc, dtype=np.float64)

    print(BORDER)
    print("CROSS VALIDATION: ClassifyWithConvolutionNetwork")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(n_splits))
    print("average val accuracy", np.sum(total_val_acc) / float(n_splits))

    return 0

In [10]:
def CrossValidation_sklearn(modelFunction, X_train, Y_train, X_test, verbose=0, n_splits=5):
    """
    Run k-fold cross validation of a scikit-learn based classifier function and
    print a report.

    For every fold a fresh model is trained on the training part of the split and
    scored on the held-out part. The per-fold and average training/validation
    accuracies are printed.

    Inputs:
        modelFunction: callable taking (X_train, Y_train, X_test) and returning
                       (model, prediction, accuracy), where model exposes
                       score(X, Y); e.g. ClassifyWithLogisticRegression
        X_train: The training data, indexed as (samples, features)
        Y_train: The training data labels
        X_test:  The testing data; only forwarded to modelFunction
        verbose: accepted for signature parity with the other cross validation
                 helpers; not used here
        n_splits: number of folds (defaults to 5)

    Outputs:
        Always returns 0; the results are printed.
    """
    kf = KFold(n_splits=n_splits)

    total_train_acc = []
    total_val_acc = []

    # perform k-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        model, prediction, accuracy = modelFunction(X_train[traini], Y_train[traini], X_test)

        # Score the fitted model on the held-out fold
        val_acc = model.score(X_train[vali], Y_train[vali])

        total_train_acc.append(accuracy)
        total_val_acc.append(val_acc)

    # float64 arrays keep the printed report in the same form as before
    total_train_acc = np.asarray(total_train_acc, dtype=np.float64)
    total_val_acc = np.asarray(total_val_acc, dtype=np.float64)

    print(BORDER)
    print("CROSS VALIDATION: " + modelFunction.__name__)
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(n_splits))
    print("average val accuracy", np.sum(total_val_acc) / float(n_splits))

    return 0


In [11]:
# Read the training set, skipping the first line of the file.
# Column 0 holds the labels; the remaining columns are the features.
training = load_data('data/training_data.txt', skiprows=1)
Y_train = training[:, 0]
X_train = training[:, 1:]

In [12]:
# Read the testing set, skipping the first line of the file.
# The testing set carries no labels, so every column is kept as a feature.
X_test = load_data('data/test_data.txt', skiprows=1)

In [19]:
# Cross-validate every model on the training set and print one report per model.
# X_test is only forwarded to the classifier functions; validation accuracy is
# measured on the held-out folds of the training data.
CrossValidation_NeuralNetwork(X_train, Y_train, X_test)
CrossValidation_ConvolutionNetwork(X_train, Y_train, X_test)

# Both scikit-learn models share one helper and are selected by passing the classifier function.
CrossValidation_sklearn(ClassifyWithLogisticRegression, X_train, Y_train, X_test)
CrossValidation_sklearn(ClassifyWithAdaBoost, X_train, Y_train, X_test)

CROSS VALIDATION: ClassifyWithNeuralNetwork
training accuracy [0.9993125 0.99925   0.9991875 0.9991875 0.9990625]
val accuracy [0.832   0.83825 0.8345  0.837   0.82875]
average training accuracy 0.9992000000000001
average val accuracy 0.8341000000000001
CROSS VALIDATION: ClassifyWithNeuralNetwork
training accuracy [0.9958125 0.99025   0.9975625 0.9869375 0.99625  ]
val accuracy [0.83275 0.8105  0.81325 0.809   0.81225]
average training accuracy 0.9933624999999999
average val accuracy 0.81555
CROSS VALIDATION: ClassifyWithLogisticRegression
training accuracy [0.8780625 0.8784375 0.8806875 0.882375  0.877375 ]
val accuracy [0.85    0.8415  0.842   0.84075 0.85   ]
average training accuracy 0.8793875
average val accuracy 0.8448499999999999
CROSS VALIDATION: ClassifyWithAdaBoost
training accuracy [0.802375  0.8026875 0.7985    0.7965    0.800625 ]
val accuracy [0.7925  0.812   0.78975 0.791   0.787  ]
average training accuracy 0.8001375
average val accuracy 0.79445


0

In [None]:
# model, prediction, accuracy = ClassifyWithNeuralNetwork(X_train, Y_train, X_test)
# model, prediction, accuracy = ClassifyWithConvolutionNetwork(X_train, Y_train, X_test)
# model, prediction, accuracy = ClassifyWithLogisticRegression(X_train, Y_train, X_test)
# model, prediction, accuracy = ClassifyWithAdaBoost(X_train, Y_train, X_test)