In [5]:
# Jenny branch
import numpy as np

import datetime

import tensorflow as tf 

from sklearn import linear_model, ensemble, tree, svm, naive_bayes, model_selection, pipeline, feature_selection
from sklearn import linear_model, ensemble, tree, svm, naive_bayes
from scipy.stats import uniform as sp_rand

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Flatten, BatchNormalization, GlobalAveragePooling1D
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import RandomizedSearchCV

from collections import Counter



# Separator line printed around section headings in the verbose and
# cross-validation report output of the functions in this notebook.
BORDER = "===================================================================================="

Using TensorFlow backend.


In [6]:
def load_data(filename, skiprows = 1):
    """
    Reads a space-delimited numeric text file into a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing (default 1).

    Outputs:
        The parsed file contents, exactly as produced by np.loadtxt.
    """
    loadtxt_options = {'skiprows': skiprows, 'delimiter': ' '}
    data = np.loadtxt(filename, **loadtxt_options)
    return data

In [7]:
def WritePredictionOut(modelName, prediction):
    """
    Prints the predictions and writes them to a timestamped CSV file in the
    current working directory.

    The file is named
    "<modelName>_<year>.<month>.<day>_<hour>.<minute>.<second>_predictions.csv"
    and contains the header "Id,Prediction" followed by one "<id>,<label>" row
    per prediction, with ids starting at 1 and labels cast to int.

    Inputs:
        modelName: string used as the prefix of the output file name.
        prediction: numpy array of predictions; its first axis is iterated and
                    each element must support .astype(int).

    Outputs:
        None. The result is the file written to disk.
    """
    print(prediction)
    print("prediction shape is: {}".format(prediction.shape))

    # Build all rows first and join once instead of growing a string in a loop.
    rows = ["{0},{1}\n".format(i + 1, prediction[i].astype(int))
            for i in range(prediction.shape[0])]
    output = "Id,Prediction\n" + "".join(rows)

    now = datetime.datetime.now()
    filename = "{}_{}.{}.{}_{}.{}.{}_predictions.csv".format(
        modelName, now.year, now.month, now.day, now.hour, now.minute, now.second)

    # The context manager closes the file even if write() raises.
    with open(filename, 'w') as out_file:
        out_file.write(output)

In [8]:
def ClassifyWithNeuralNetwork(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits a fully-connected
    network (two Dense(128) + ReLU + Dropout(0.5) blocks followed by a softmax
    output over 2 classes).

    Inputs:
        X_train: The training data
        Y_train: The training data labels (one-hot encoded internally into 2 classes)
        X_test:  The testing data
        verbose: When 1, prints a banner, the model summary and the training
                 score; also forwarded to keras fit/predict.

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """
    if (verbose == 1):
        print('\n{}\nNEURAL NETWORK\n{}\n'.format(BORDER, BORDER))

    # Converting Y values to one hot vector
    num_classes = 2
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # Width of the input layer: one unit per column of X_train.
    n_words = X_train.shape[1]

    # Layer set up.
    model = Sequential()

    model.add(Dense(128, input_shape=(n_words, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in the model
    if (verbose == 1):
        model.summary()

    ## Labels are one-hot encoded, so the loss is categorical cross-entropy;
    ## the optimizer is Adam.
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=256, epochs=4, verbose=verbose, validation_split=0.0)

    ## score is [loss, accuracy] measured on the training data itself
    score = model.evaluate(X_train, Y_train, verbose=0)
    if (verbose == 1):
        print('Training score:', score[0])
        print('Training accuracy:', score[1])

    prediction = model.predict(X_test, verbose=verbose)

    # Pick class 1 only when its softmax output is strictly larger than class 0's.
    zeros = prediction[:, 0]
    ones  = prediction[:, 1]

    prediction = (zeros < ones).astype(int)

    return model, prediction, score[1]

In [9]:
def ClassifyWithConvolutionNetwork(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits a 1D convolutional
    network (Conv1D(128, 3) -> Flatten -> Dropout -> Dense(64) + ReLU ->
    Dropout -> softmax over 2 classes).

    Inputs:
        X_train: The training data (2D; a trailing channel axis is added internally)
        Y_train: The training data labels (one-hot encoded internally into 2 classes)
        X_test:  The testing data (2D; a trailing channel axis is added internally)
        verbose: When 1, prints a banner, the model summary and the training
                 score; also forwarded to keras fit/predict.

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """
    if (verbose == 1):
        # Banner names this model so its output is distinguishable from the
        # dense network's when both run in the same ensemble.
        print('\n{}\nCONVOLUTION NETWORK\n{}\n'.format(BORDER, BORDER))

    # Converting Y values to one hot vector
    num_classes = 2
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # Length of the sequence axis fed to Conv1D: one step per column of X_train.
    n_words = X_train.shape[1]

    # For 1D Convolution, change dimension of input
    # (adds a trailing axis of size 1 to match input_shape=(n_words, 1)).
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    model = Sequential()

    model.add(Conv1D(128, 3, padding="same", input_shape=(n_words, 1, )))

    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    ## Output layer predicts a probability for each of the classes
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in the model
    if (verbose == 1):
        model.summary()

    ## Labels are one-hot encoded, so the loss is categorical cross-entropy;
    ## the optimizer is Adam.
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=128, epochs=3, verbose=verbose, validation_split=0.0)

    ## score is [loss, accuracy] measured on the training data itself
    score = model.evaluate(X_train, Y_train, verbose=0)

    if (verbose == 1):
        print('Training score:', score[0])
        print('Training accuracy:', score[1])

    prediction = model.predict(X_test, verbose=verbose)

    # Pick class 1 only when its softmax output is strictly larger than class 0's.
    zeros = prediction[:, 0]
    ones  = prediction[:, 1]

    prediction = (zeros < ones).astype(int)

    return model, prediction, score[1]

In [10]:
def ClassifyWithLogisticRegression(X_train, Y_train, X_test, verbose=0):
    """
    Fits an L1-penalised logistic regression (saga solver) on the training
    data and labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted LogisticRegression estimator.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nLOGISTIC REGRESSION\n{}\n'.format(BORDER, BORDER))

    model = linear_model.LogisticRegression(penalty='l1', solver='saga')
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [None]:
def ClassifyWithLogisticRegressionOptimized(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits using logistic
    regression whose regularization strength is chosen by the built-in cross
    validation of LogisticRegressionCV (default settings).

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner, the training accuracy and the
                 selected regularization strength.

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """

    if (verbose == 1):
        print('\n{}\nLOGISTIC REGRESSION OPTIMIZED\n{}\n'.format(BORDER, BORDER))

    model = linear_model.LogisticRegressionCV()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_train)

    correctlyClassified = (Y_pred == Y_train).astype(int)
    accuracy = np.sum(correctlyClassified) / correctlyClassified.shape[0]

    if (verbose == 1):
        print('Training accuracy: ', accuracy)
        # LogisticRegressionCV takes `Cs` as its parameter and exposes the
        # selected value(s) as the fitted attribute `C_`; it has no plain `C`.
        print(model.C_)

    prediction = model.predict(X_test)

    return model, prediction, accuracy

In [11]:
def ClassifyWithLinearRegression(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, fits an ordinary least-squares
    linear regression to the labels and turns its continuous output into class
    labels by thresholding at 0.5.

    Inputs:
        X_train: The training data
        Y_train: The training data labels (assumed to be 0/1, as the
                 neural-network classifiers in this file assume with
                 num_classes = 2 -- TODO confirm)
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """

    if (verbose == 1):
        print('\n{}\nLINEAR REGRESSION\n{}\n'.format(BORDER, BORDER))

    model = linear_model.LinearRegression()
    model.fit(X_train, Y_train)

    # LinearRegression.predict returns real values, which practically never
    # equal a label exactly; threshold them before comparing or returning.
    Y_pred = (model.predict(X_train) >= 0.5).astype(int)

    correctlyClassified = (Y_pred == Y_train).astype(int)
    accuracy = np.sum(correctlyClassified) / correctlyClassified.shape[0]

    if (verbose == 1):
        print('Training accuracy: ', accuracy)

    prediction = (model.predict(X_test) >= 0.5).astype(int)
    return model, prediction, accuracy

In [12]:
def ClassifyWithAdaBoost(X_train, Y_train, X_test, verbose=0):
    """
    Fits an AdaBoostClassifier (default settings) on the training data and
    labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted AdaBoostClassifier.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nADABOOST\n{}\n'.format(BORDER, BORDER))

    model = ensemble.AdaBoostClassifier()
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [13]:
def ClassifyWithDecisionTree(X_train, Y_train, X_test, verbose=0):
    """
    Fits a DecisionTreeClassifier (default settings) on the training data and
    labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted DecisionTreeClassifier.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nDECISION TREE\n{}\n'.format(BORDER, BORDER))

    model = tree.DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [14]:
def ClassifyWithRandomForest(X_train, Y_train, X_test, verbose=0):
    """
    Fits a RandomForestClassifier (default settings) on the training data and
    labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted RandomForestClassifier.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nRANDOM FOREST\n{}\n'.format(BORDER, BORDER))

    model = ensemble.RandomForestClassifier()
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [15]:
def ClassifyWithSVM(X_train, Y_train, X_test, verbose=0):
    """
    Fits a linear SVM (LinearSVC with L1 penalty, dual=False, C=0.1) on the
    training data and labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy; the value
                 is also forwarded to LinearSVC.

    Outputs:
        model: the fitted LinearSVC.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nSVM\n{}\n'.format(BORDER, BORDER))

    model = svm.LinearSVC(penalty='l1', verbose=verbose, dual=False, C=0.1)
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [16]:
def ClassifyWithSVC(X_train, Y_train, X_test, verbose=0):
    """
    Fits a kernel SVM (SVC with rbf kernel, C=0.03) on the training data and
    labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy; the value
                 is also forwarded to SVC.

    Outputs:
        model: the fitted SVC.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nSVC\n{}\n'.format(BORDER, BORDER))

    model = svm.SVC(kernel="rbf", C=0.03, verbose=verbose)
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [None]:
def ClassifyWithPipeline(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, and fits a two-step pipeline:
    SelectKBest(k='all') feature selection followed by logistic regression
    (default settings).

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner, the training accuracy and the
                 test-set predictions.

    Outputs:
        model: passes out the model we trained.
        prediction: a nparray with shape (X_test.shape[0], ) containing the predicted labels our model generated
        accuracy: training accuracy of model.
    """

    if (verbose == 1):
        print('\n{}\nPIPELINE\n{}\n'.format(BORDER, BORDER))

    select = feature_selection.SelectKBest(k='all')
    classifier = linear_model.LogisticRegression()

    transform = [('feature_selection', select),
        ('classifier', classifier)]

    model = pipeline.Pipeline(transform)
    model.fit( X_train, Y_train )

    Y_pred = model.predict(X_train)

    correctlyClassified = (Y_pred == Y_train).astype(int)
    accuracy = np.sum(correctlyClassified) / correctlyClassified.shape[0]

    # Report only when asked, like the other ClassifyWith* functions.
    if (verbose == 1):
        print('Training accuracy: ', accuracy)

    prediction = model.predict(X_test)

    if (verbose == 1):
        print('prediction', prediction)

    return model, prediction, accuracy

In [17]:
def ClassifyWithBernoulliNB(X_train, Y_train, X_test, verbose=0):
    """
    Fits a Bernoulli naive Bayes classifier (alpha=0.01) on the training data
    and labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted BernoulliNB.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\nBernoulli Naive Bayes\n{}\n'.format(BORDER, BORDER))

    model = naive_bayes.BernoulliNB(alpha=0.01)
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [18]:
def ClassifyWithMultinomialNB(X_train, Y_train, X_test, verbose=0):
    """
    Fits a multinomial naive Bayes classifier (alpha=.01) on the training data
    and labels the test data with it.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner and the training accuracy.

    Outputs:
        model: the fitted MultinomialNB.
        prediction: labels predicted by the model for X_test.
        accuracy: fraction of training samples the model labels correctly.
    """
    report = (verbose == 1)
    if report:
        print('\n{}\n Multinomial Naive Bayes\n{}\n'.format(BORDER, BORDER))

    model = naive_bayes.MultinomialNB(alpha=.01)
    model.fit(X_train, Y_train)

    # 1 where the predicted training label matches the true one, else 0.
    train_hits = (model.predict(X_train) == Y_train).astype(int)
    accuracy = np.sum(train_hits) / train_hits.shape[0]

    if report:
        print('Training accuracy: ', accuracy)

    return model, model.predict(X_test), accuracy

In [19]:
def ClassifyWithEnsemble(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, trains five different
    classifiers (dense network, convolution network, logistic regression,
    linear SVM, Bernoulli naive Bayes) and labels each test sample with the
    most common label among their predictions.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner; also forwarded to each classifier.

    Outputs:
        prediction: a float nparray with shape (X_test.shape[0], ) containing the voted labels
    """

    if (verbose == 1):
        print('\n{}\nEnsemble\n{}\n'.format(BORDER, BORDER))

    _, pred_NN, _ = ClassifyWithNeuralNetwork(X_train, Y_train, X_test, verbose=verbose)
    _, pred_CN, _ = ClassifyWithConvolutionNetwork(X_train, Y_train, X_test, verbose=verbose)
    _, pred_LR, _ = ClassifyWithLogisticRegression(X_train, Y_train, X_test, verbose=verbose)
    _, pred_SVM, _ = ClassifyWithSVM(X_train, Y_train, X_test, verbose=verbose)
    _, pred_NB, _ = ClassifyWithBernoulliNB(X_train, Y_train, X_test, verbose=verbose)

    # One tuple of five votes per test sample.
    ballots = zip(pred_NN, pred_CN, pred_LR, pred_SVM, pred_NB)

    # Majority vote, built in one pass instead of np.append in a loop (which
    # re-allocates the whole array per sample). dtype=float keeps the float
    # array that appending to np.array([]) used to produce.
    Y_pred = np.array(
        [int(Counter(ballot).most_common(1)[0][0]) for ballot in ballots],
        dtype=float)

    return Y_pred

In [20]:
def ClassifyWithBagging(X_train, Y_train, X_test, verbose=0):
    """
    Function takes training and testing data, trains ClassifyWithNeuralNetwork
    five times on the same training data and labels each test sample with the
    most common label among the five predictions.

    Note: every run receives the full, identical X_train/Y_train (no
    resampling is done here), so the runs differ only through whatever
    randomness the network training itself has.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data
        verbose: When 1, prints a banner; also forwarded to each training run.

    Outputs:
        prediction: a float nparray with shape (X_test.shape[0], ) containing the voted labels
    """

    if (verbose == 1):
        print('\n{}\nBagging\n{}\n'.format(BORDER, BORDER))

    # Odd number of voters, so a two-label vote cannot tie.
    n_models = 5
    member_preds = []
    for _ in range(n_models):
        _, member_pred, _ = ClassifyWithNeuralNetwork(X_train, Y_train, X_test, verbose=verbose)
        member_preds.append(member_pred)

    # Majority vote per test sample, built in one pass instead of np.append in
    # a loop. dtype=float keeps the float array that appending to
    # np.array([]) used to produce.
    Y_pred = np.array(
        [int(Counter(ballot).most_common(1)[0][0]) for ballot in zip(*member_preds)],
        dtype=float)

    return Y_pred

In [21]:
def CrossValidation_NeuralNetwork(X_train, Y_train, X_test, nFold, verbose=0):
    """
    Runs nFold-fold cross validation of ClassifyWithNeuralNetwork and prints
    the per-fold and average training / validation accuracy.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data (only forwarded to ClassifyWithNeuralNetwork;
                 the resulting predictions are not used here)
        nFold:   Number of folds
        verbose: Verbosity passed to model.evaluate on the validation fold

    Outputs:
        Always 0; the results are reported through print.
    """
    kf = KFold(n_splits = nFold)

    total_train_acc = []
    total_val_acc = []

    # model.evaluate needs one-hot labels, matching what
    # ClassifyWithNeuralNetwork trains on internally.
    num_classes = 2
    Y = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # perform nFold-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        model, _, accuracy = ClassifyWithNeuralNetwork(X_train[traini], Y_train[traini], X_test)

        # evaluate returns [loss, accuracy]; index 1 is the accuracy.
        val_acc = model.evaluate(X_train[vali], Y[vali], verbose=verbose)

        total_train_acc = np.append(total_train_acc, accuracy)
        total_val_acc = np.append(total_val_acc, val_acc[1])

    print(BORDER)
    print("CROSS VALIDATION: ClassifyWithNeuralNetwork")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    # Average over the actual number of folds rather than a hard-coded 5.
    print("average training accuracy", np.sum(total_train_acc) / float(nFold))
    print("average val accuracy", np.sum(total_val_acc) / float(nFold))

    return 0


In [22]:
def CrossValidation_ConvolutionNetwork(X_train, Y_train, X_test, nFold, verbose=0):
    """
    Runs nFold-fold cross validation of ClassifyWithConvolutionNetwork and
    prints the per-fold and average training / validation accuracy.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data (only forwarded to
                 ClassifyWithConvolutionNetwork; the resulting predictions are
                 not used here)
        nFold:   Number of folds
        verbose: Verbosity passed to model.evaluate on the validation fold

    Outputs:
        Always 0; the results are reported through print.
    """
    kf = KFold(n_splits = nFold)

    total_train_acc = []
    total_val_acc = []

    # model.evaluate needs one-hot labels, matching what
    # ClassifyWithConvolutionNetwork trains on internally.
    num_classes = 2
    Y = keras.utils.to_categorical(Y_train, num_classes=num_classes)

    # For 1D Convolution, change dimension of input: the trained model expects
    # a trailing axis of size 1, so the validation rows need it too.
    X = np.expand_dims(X_train, axis=2)

    # perform nFold-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        model, _, accuracy = ClassifyWithConvolutionNetwork(X_train[traini], Y_train[traini], X_test)

        # evaluate returns [loss, accuracy]; index 1 is the accuracy.
        val_acc = model.evaluate(X[vali], Y[vali], verbose=verbose)

        total_train_acc = np.append(total_train_acc, accuracy)
        total_val_acc = np.append(total_val_acc, val_acc[1])

    print(BORDER)
    print("CROSS VALIDATION: ClassifyWithConvolutionNetwork")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    # Average over the actual number of folds rather than a hard-coded 5.
    print("average training accuracy", np.sum(total_train_acc) / float(nFold))
    print("average val accuracy", np.sum(total_val_acc) / float(nFold))

    return 0

In [23]:
def CrossValidation_sklearn(modelFunction, X_train, Y_train, X_test, nFold, verbose=0):
    """
    Runs nFold-fold cross validation of one of the ClassifyWith* functions
    whose returned model has a score(X, y) method, and prints the per-fold and
    average training / validation accuracy.

    Inputs:
        modelFunction: callable taking (X_train, Y_train, X_test, verbose=...)
                       and returning (model, prediction, accuracy)
        X_train: The training data
        Y_train: The training data labels
        X_test:  The testing data (only forwarded to modelFunction)
        nFold:   Number of folds
        verbose: Forwarded to modelFunction

    Outputs:
        Always 0; the results are reported through print.
    """
    splitter = KFold(n_splits = nFold)
    folds = list(splitter.split(X_train, Y_train))

    train_scores = []
    val_scores = []

    for fold_no in range(nFold):
        fit_rows, held_out_rows = folds[fold_no]
        model, _, train_acc = modelFunction(
            X_train[fit_rows], Y_train[fit_rows], X_test, verbose=verbose)

        # Score the fitted model on the rows it did not see.
        train_scores.append(train_acc)
        val_scores.append(model.score(X_train[held_out_rows], Y_train[held_out_rows]))

    # float64 arrays, as np.append onto an empty list would give.
    total_train_acc = np.array(train_scores, dtype=np.float64)
    total_val_acc = np.array(val_scores, dtype=np.float64)

    print(BORDER)
    print("CROSS VALIDATION: " + modelFunction.__name__)
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(nFold))
    print("average val accuracy", np.sum(total_val_acc) / float(nFold))

    return 0


In [24]:
def CrossValidation_Ensemble(X_train, Y_train, X_test, nFold, verbose=0):
    """
    Runs nFold-fold cross validation of the five-classifier majority vote
    (Bernoulli naive Bayes, logistic regression, linear SVM, dense network,
    convolution network) and prints the training accuracy of every member in
    every fold, the voted validation accuracy per fold, and their averages.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  Not used; each fold predicts on its own held-out rows.
        nFold:   Number of folds
        verbose: Forwarded to each classifier

    Outputs:
        None; the results are reported through print.
    """
    nModels = 5
    kf = KFold(n_splits = nFold)

    total_train_acc = []
    total_val_acc = []

    # perform nFold-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        X_fit, Y_fit, X_val = X_train[traini], Y_train[traini], X_train[vali]

        _, pred_NB, acc_NB = ClassifyWithBernoulliNB(X_fit, Y_fit, X_val, verbose=verbose)
        _, pred_LR, acc_LR = ClassifyWithLogisticRegression(X_fit, Y_fit, X_val, verbose=verbose)
        _, pred_SVM, acc_SVM = ClassifyWithSVM(X_fit, Y_fit, X_val, verbose=verbose)
        _, pred_NN, acc_NN = ClassifyWithNeuralNetwork(X_fit, Y_fit, X_val, verbose=verbose)
        _, pred_CN, acc_CN = ClassifyWithConvolutionNetwork(X_fit, Y_fit, X_val, verbose=verbose)

        # Majority vote per held-out sample. Built in one pass (no np.append
        # in a loop) and without reusing the fold loop's index name.
        ballots = zip(pred_NB, pred_NN, pred_CN, pred_LR, pred_SVM)
        Y_pred = np.array(
            [int(Counter(ballot).most_common(1)[0][0]) for ballot in ballots],
            dtype=float)

        total_train_acc.extend([acc_NN, acc_CN, acc_LR, acc_SVM, acc_NB])

        correctlyClassified = (Y_pred == Y_train[vali]).astype(int)
        val_acc = np.sum(correctlyClassified) / correctlyClassified.shape[0]
        total_val_acc.append(val_acc)

    # float64 arrays so the printed report looks as it did with np.append.
    total_train_acc = np.array(total_train_acc, dtype=np.float64)
    total_val_acc = np.array(total_val_acc, dtype=np.float64)

    print(BORDER)
    print("CROSS VALIDATION: Ensemble")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(nModels * nFold))
    print("average val accuracy", np.sum(total_val_acc) / float(nFold))

In [25]:
def CrossValidation_Bagging(X_train, Y_train, X_test, nFold, verbose=0):
    """
    Runs nFold-fold cross validation of the five-run ClassifyWithNeuralNetwork
    majority vote and prints the training accuracy of every run in every fold,
    the voted validation accuracy per fold, and their averages.

    Inputs:
        X_train: The training data
        Y_train: The training data labels
        X_test:  Not used; each fold predicts on its own held-out rows.
        nFold:   Number of folds
        verbose: Forwarded to each training run

    Outputs:
        The average validation accuracy over the folds, so callers can compare
        configurations (callers that ignore the return value are unaffected).
    """
    nModels = 5
    kf = KFold(n_splits = nFold)

    total_train_acc = []
    total_val_acc = []

    # perform nFold-fold validation
    for traini, vali in kf.split(X_train, Y_train):
        fold_preds = []
        for _ in range(nModels):
            _, member_pred, member_acc = ClassifyWithNeuralNetwork(
                X_train[traini], Y_train[traini], X_train[vali], verbose=verbose)
            fold_preds.append(member_pred)
            total_train_acc.append(member_acc)

        # Majority vote per held-out sample. Built in one pass (no np.append
        # in a loop) and without reusing the fold loop's index name.
        Y_pred = np.array(
            [int(Counter(ballot).most_common(1)[0][0]) for ballot in zip(*fold_preds)],
            dtype=float)

        correctlyClassified = (Y_pred == Y_train[vali]).astype(int)
        val_acc = np.sum(correctlyClassified) / correctlyClassified.shape[0]
        total_val_acc.append(val_acc)

    # float64 arrays so the printed report looks as it did with np.append.
    total_train_acc = np.array(total_train_acc, dtype=np.float64)
    total_val_acc = np.array(total_val_acc, dtype=np.float64)

    avg_val_acc = np.sum(total_val_acc) / float(nFold)

    print(BORDER)
    print("CROSS VALIDATION: Bagging")
    print(BORDER)
    print("training accuracy", total_train_acc)
    print("val accuracy", total_val_acc)
    print("average training accuracy", np.sum(total_train_acc) / float(nModels * nFold))
    print("average val accuracy", avg_val_acc)

    return avg_val_acc

In [26]:
# Read the training file, skipping its first line.
training = load_data('data/training_data.txt', skiprows=1)
# Column 0 is taken as the label; every remaining column is a feature.
Y_train = training[:, 0]
X_train = training[:, 1:]

In [27]:
# Read the test file, skipping its first line.
# The test set carries no label column, so the whole array is features.
X_test = load_data('data/test_data.txt', skiprows=1)

In [None]:
# ---- Training-set statistics -------------------------------------------
N = X_train.shape[0]
# Per column: number of training rows in which the entry is non-zero.
corpusWordCount = (X_train != 0).astype(int).sum(axis=0)
# Per row: sum of all entries.
documentTotalWords = X_train.sum(axis=1)
zeroWords = (documentTotalWords == 0).sum()

for label, value in (("corpusWordCount shape {}", corpusWordCount.shape),
                     ("documentTotalWords shape {}", documentTotalWords.shape),
                     ("documents with 0 words: {}", zeroWords),
                     ("X_train shape {}", X_train.shape)):
    print(label.format(value))

# Inverse document frequency, computed from the training set only and reused
# for the test set below.
idf = np.log(N / (1 + corpusWordCount))

# ---- Training-set feature representations ------------------------------
X_train_count  = np.copy(X_train)
X_train_binary = (X_train != 0).astype(int)
X_train_freq   = np.copy(X_train)
# log(x + 1) on every row whose total is non-zero; other rows are left as is.
nonEmptyDocs = documentTotalWords != 0
X_train_freq[nonEmptyDocs] = np.log(X_train_freq[nonEmptyDocs] + 1)

X_train_tfidf  = X_train_freq * idf

print(X_train_freq[0])

# ---- Same representations for the test set -----------------------------
documentTotalWords = X_test.sum(axis=1)
zeroWords = (documentTotalWords == 0).sum()
print("documentTotalWords shape {}".format(documentTotalWords.shape))
print("documents with 0 words: {}".format(zeroWords))

X_test_count  = np.copy(X_test)
X_test_binary = (X_test != 0).astype(int)
X_test_freq   = np.copy(X_test)
nonEmptyDocs = documentTotalWords != 0
X_test_freq[nonEmptyDocs] = np.log(X_test_freq[nonEmptyDocs] + 1)

X_test_tfidf  = X_test_freq * idf

In [None]:
# Cross-validate the five-network vote on each feature representation, in the
# order count, binary, freq, tfidf.
val_count, val_binary, val_freq, val_tfidf = [
    CrossValidation_Bagging(features_train, Y_train, features_test, 5, verbose = 1)
    for features_train, features_test in (
        (X_train_count, X_test_count),
        (X_train_binary, X_test_binary),
        (X_train_freq, X_test_freq),
        (X_train_tfidf, X_test_tfidf),
    )
]

# Summary of what each run returned.
print(BORDER, "Bagging", BORDER, sep="\n")
print("Count: ", val_count)
print("Binary: ",  val_binary)
print("Freq: ", val_freq)
print("tfidf: ", val_tfidf)

In [None]:
# 5-fold cross validation of every model on the raw features.
# Keras-based models have their own drivers because they are scored with
# model.evaluate rather than model.score.
CrossValidation_NeuralNetwork(X_train, Y_train, X_test, 5)
CrossValidation_ConvolutionNetwork(X_train, Y_train, X_test, 5)

# scikit-learn models
CrossValidation_sklearn(ClassifyWithLinearRegression, X_train, Y_train, X_test, 5)
CrossValidation_sklearn(ClassifyWithLogisticRegression, X_train, Y_train, X_test, 5)
# Function name corrected from "ClAassifyWithAdaBoost", which is not defined
# and raised NameError.
CrossValidation_sklearn(ClassifyWithAdaBoost, X_train, Y_train, X_test, 5)
CrossValidation_sklearn(ClassifyWithDecisionTree, X_train, Y_train, X_test, 5)
CrossValidation_sklearn(ClassifyWithRandomForest, X_train, Y_train, X_test, 5)

CrossValidation_sklearn(ClassifyWithSVM, X_train, Y_train, X_test, 5, verbose=1)
CrossValidation_sklearn(ClassifyWithSVC, X_train, Y_train, X_test, 5, verbose=1)

CrossValidation_sklearn(ClassifyWithBernoulliNB, X_train, Y_train, X_test, 5)
CrossValidation_sklearn(ClassifyWithMultinomialNB, X_train, Y_train, X_test, 5)

# Voting combinations
CrossValidation_Ensemble(X_train, Y_train, X_test, 5)
CrossValidation_Bagging(X_train, Y_train, X_test, 5)

CROSS VALIDATION: ClassifyWithNeuralNetwork
training accuracy [0.9081875 0.90625   0.9110625 0.9130625 0.915125 ]
val accuracy [0.85925 0.845   0.843   0.847   0.8555 ]
average training accuracy 0.9107375
average val accuracy 0.84995


In [30]:
# Train the five-network vote on the full training set, then write its
# test-set labels to a timestamped CSV.
pred = ClassifyWithBagging(X_train, Y_train, X_test, verbose=1)
WritePredictionOut(modelName='Bagging', prediction=pred)


Bagging


NEURAL NETWORK

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_76 (Dense)             (None, 128)               128128    
_________________________________________________________________
activation_76 (Activation)   (None, 128)               0         
_________________________________________________________________
dropout_51 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_77 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_77 (Activation)   (None, 128)               0         
_________________________________________________________________
dropout_52 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_78 (Dense)             (None, 2)           