# ASSISTments Data Mining Competition 2017 - Results

## Imports and constants

In [None]:
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide matplotlib defaults: a large figure size (width, height)
# so the curves stay readable once exported.
plt.rcParams['figure.figsize'] = (20.0, 10.0)
# Larger default font size for the text drawn on the plots.
plt.rcParams.update({'font.size': 22})

In [None]:
def plot(layer, dims, metric, title, ylabel):
    """Plot one metric curve per hidden dimension for a given layer count.

    For every value in ``dims`` the file
    ``Results/results_<hidden_dim>_<layer>.pickle`` is loaded and the
    sequence stored under ``results[metric]`` is drawn on a shared figure,
    labelled with the hidden dimension. The figure is then saved to
    ``Plots/plot_<metric>_<layer>.pdf``.

    Args:
        layer: Number of layers; used in the result file names, the plot
            title and the output file name.
        dims: Iterable of hidden dimensions to overlay on the same figure.
        metric: Key looked up in each loaded results object (e.g. 'accs').
        title: Title prefix; ' - <layer> layers' is appended to it.
        ylabel: Label of the y axis.
    """
    f = plt.figure()

    for hidden_dim in dims:
        name = str(hidden_dim) + '_' + str(layer)
        # Use a context manager so the file handle is closed right after
        # loading instead of leaking one handle per hidden dimension.
        # NOTE: pickle.load can execute arbitrary code; only load result
        # files produced by trusted training runs.
        with open('Results/results_' + name + '.pickle', "rb") as pickle_results:
            results = pickle.load(pickle_results)

        plt.plot(results[metric], label=str(hidden_dim))

    # Figure decorations do not depend on the loop variable, so apply them
    # once after all curves have been drawn.
    plt.legend(prop={'size':20})
    plt.title(title + ' - ' + str(layer) + ' layers', fontsize=35)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)

    f.savefig('Plots/plot_' + metric + '_' + str(layer) + '.pdf', bbox_inches='tight')

## Training set

We can see that with 2 or 3 layers for our multilayer RNN, the system is able to learn. The greater the hidden dimension, the greater the risk of overfitting and the faster the learning.

In [None]:
plot(3, [16, 20, 24, 28, 32, 40, 48], 'accs', 'Accuracy on the training set over time', 'Accuracy')

In [None]:
plot(4, [16, 20, 24, 28, 32], 'accs', 'Accuracy on the training set over time', 'Accuracy')

In [None]:
plot(3, [16, 20, 24, 28, 32], 'aucs', 'ROC AUC on the training set over time', 'ROC AUC')

In [None]:
plot(4, [16, 20, 24, 28, 32], 'aucs', 'ROC AUC on the training set over time', 'ROC AUC')

## Validation set

On the validation set, we can see the effects of overfitting when the number of layers or the hidden dimension is too large.

Initially, the accuracy is concentrated around 75% because the model predicts only "0" for all inputs. After some training, the accuracy either drops, because of overfitting or because the model is too simplistic, or it rises above 75%, meaning that the model is able to make accurate predictions.

In [None]:
plot(3, [16, 20, 24, 28, 32], 'val_accs', 'Accuracy on the validation set over time', 'Accuracy')

In [None]:
plot(4, [16, 20, 24, 28, 32], 'val_accs', 'Accuracy on the validation set over time', 'Accuracy')

We can see that with a hidden dimension of 32 and 3 layers, the accuracy reaches 90%.

In [None]:
plot(3, [16, 20, 24, 28, 32, 40, 48], 'val_aucs', 'ROC AUC on the validation set over time', 'ROC AUC')

In [None]:
plot(4, [16, 20, 24, 28, 32], 'val_aucs', 'ROC AUC on the validation set over time', 'ROC AUC')

If we isolate the best-performing configuration (a hidden dimension of 20 and 4 layers):

In [None]:
def plot_one(layer, dim, metric, ylabel):
    """Plot a single metric curve for one (hidden dimension, layers) pair.

    Loads ``Results/results_<dim>_<layer>.pickle``, draws the sequence
    stored under ``results[metric]`` and saves the figure to
    ``Plots/plot_<metric>_<layer>_<dim>.pdf``.

    Args:
        layer: Number of layers; used in the result file name, the plot
            title and the output file name.
        dim: Hidden dimension; used in the same three places.
        metric: Key looked up in the loaded results object
            (e.g. 'val_aucs').
        ylabel: Label of the y axis, also used as the start of the title.
    """
    f = plt.figure()

    name = str(dim) + '_' + str(layer)
    # Use a context manager so the file handle is closed once the results
    # are loaded rather than left open.
    # NOTE: pickle.load can execute arbitrary code; only load result files
    # produced by trusted training runs.
    with open('Results/results_' + name + '.pickle', "rb") as pickle_results:
        results = pickle.load(pickle_results)

    plt.plot(results[metric], label=str(dim))
    plt.title(ylabel + ' for hidden dim ' + str(dim) + ' with ' + str(layer) + ' layers', fontsize=35)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)

    f.savefig('Plots/plot_' + metric + '_' + str(layer) +  '_' + str(dim) + '.pdf', bbox_inches='tight')

In [None]:
plot_one(4, 20, 'val_aucs', 'AUC ROC')