In [18]:
import numpy as np

import tensorflow as tf 

from sklearn import linear_model

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout

# Horizontal rule printed on the line above and the line below each classifier's
# banner title (see the print calls at the top of the Classify* functions).
BORDER = "----------------------------------------------------------------------------"

In [2]:
def load_data(filename, skiprows = 1):
    """
    Reads a space-delimited numeric text file into a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing (defaults to 1).

    Outputs:
        The parsed contents of the file, as returned by np.loadtxt
    """
    data = np.loadtxt(
        filename,
        delimiter=' ',
        skiprows=skiprows,
    )
    return data

In [15]:
def ClassifyWithNeuralNetwork(X_train, Y_train, X_test):
    """
    Trains a small fully connected network on the training data and predicts
    labels for the testing data.

    The network has three hidden Dense layers (200, 100 and 50 units), each
    followed by a ReLU activation and Dropout(0.15), and a 2-unit softmax
    output. It is trained for 10 epochs (batch size 128) with RMSprop on
    categorical cross-entropy. The model summary, the Keras training log and
    the loss/accuracy measured on the training set are printed.

    Inputs:
        X_train: The training data; 2-D, X_train.shape[1] is used as the input width
        Y_train: The training data labels; one-hot encoded into 2 classes, so
                 presumably 0/1 values
        X_test:  The testing data; presumably the same number of columns as X_train

    Outputs:
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
    """
    print('\n{}\nNEURAL NETWORK\n{}\n'.format(BORDER, BORDER))

    # Converting Y values to one hot vectors, as required by categorical_crossentropy
    num_classes = 2
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # Width of the input layer: one unit per feature column
    n_features = X_train.shape[1]

    # Layer set up. Only the first layer declares input_shape; in a Sequential
    # model every later layer takes its input size from the previous layer.
    model = Sequential()
    model.add(Dense(200, input_shape=(n_features,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))

    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in the model
    model.summary()

    ## The labels are one-hot encoded above, so the loss is 'categorical_crossentropy';
    ## RMSprop is used as the optimizer.
    model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=1)

    ## evaluate returns [loss, accuracy] (the metrics given to model.compile above),
    ## measured here on the training set itself
    score = model.evaluate(X_train, Y_train, verbose=0)
    print('Training score:', score[0])
    print('Training accuracy:', score[1])

    # One row per test sample: column 0 is the output for class 0, column 1 for class 1
    probabilities = model.predict(X_test, verbose=1)

    # Label 1 only when class 1 scores strictly higher than class 0 (ties give 0)
    prediction = (probabilities[:, 0] < probabilities[:, 1]).astype(int)

    return prediction


In [16]:
def ClassifyWithLogisticRegression(X_train, Y_train, X_test):
    """
    Fits a scikit-learn LogisticRegression (default settings) on the training
    data, prints its accuracy on that same training data, and predicts labels
    for the testing data.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data

    Outputs:
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
    """
    banner = '\n{}\nLOGISTIC REGRESSION\n{}\n'.format(BORDER, BORDER)
    print(banner)

    classifier = linear_model.LogisticRegression()
    classifier.fit(X_train, Y_train)

    # 1 where the fitted model reproduces the training label, 0 elsewhere
    train_hits = (classifier.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]
    print('Training accuracy: ', accuracy)

    return classifier.predict(X_test)

In [5]:
# Load the training set, skipping the first line of the file (presumably a header).
# Column 0 holds the labels; every remaining column is a feature.
training = load_data('data/training_data.txt', skiprows=1)
Y_train, X_train = training[:, 0], training[:, 1:]

In [6]:
# Load the testing set, skipping the first line of the file.
# The testing set has no label column, so the whole array is used as features.
X_test = load_data('data/test_data.txt', skiprows=1)

In [20]:
# Keep each model's output under its own name; previously both calls assigned to
# `prediction`, so the neural network's result was overwritten and lost.
nn_prediction = ClassifyWithNeuralNetwork(X_train, Y_train, X_test)
lr_prediction = ClassifyWithLogisticRegression(X_train, Y_train, X_test)

# `prediction` stays bound to the logistic regression output, exactly as before,
# so later cells that read `prediction` see the same values.
prediction = lr_prediction


----------------------------------------------------------------------------
NEURAL NETWORK
----------------------------------------------------------------------------

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 200)               200200    
_________________________________________________________________
activation_9 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               20100     
_________________________________________________________________
activation_10 (Activation)   (None, 100)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 1

In [12]:
print(prediction)
print("prediction shape is: {}".format(prediction.shape))

# Build the CSV body in a single pass: a header line, then one "Id,Prediction"
# row per prediction with 1-based ids. Joining a list avoids repeatedly
# re-copying a growing string.
rows = ["{0},{1}\n".format(row_id, label) for row_id, label in enumerate(prediction, start=1)]
output = "Id,Prediction\n" + "".join(rows)

# The context manager closes the file even if the write raises.
with open('predictions.csv', 'w') as csv_file:
    csv_file.write(output)


[1. 1. 0. ... 0. 1. 0.]
prediction shape is: (10000,)
