# ASSISTments Data Mining Competition 2017 - Optional Semester Project

## Imports and constants

In [1]:
# Imports
import numpy as np
import pandas as pd
import functools
import tensorflow as tf

# Constants
DATA_DIR = 'Data/'

## Loading the data

We load all the data into dataframes

In [2]:
# Load the pre-pickled train and test action logs (train first, then test).
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
student_train_logs, student_test_logs = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('student_train_logs', 'student_test_logs')
)

In [3]:
# Distinct student ids, in order of first appearance in the training logs.
train_idx = pd.unique(student_train_logs['ITEST_id'])

We load the training labels into a dataframe

In [4]:
# Read the labels, indexed by student id (path built from DATA_DIR).
train_labels = pd.read_csv(DATA_DIR + 'training_label.csv', index_col='ITEST_id')
# Keep exactly one row per student. DataFrame.drop_duplicates ignores the
# index: it would merge two different students whose columns happen to match
# and keep a student who is listed twice with differing columns, so we
# de-duplicate on the student id itself (first occurrence in the file wins).
train_labels = train_labels[~train_labels.index.duplicated(keep='first')]
train_labels = train_labels.sort_index()

## Feature engineering

### Train input data

In order to use an RNN (LSTM), we need to create sequences of actions. A sequence is a matrix in which each row is an action taken by a student that follows a chronological order. An action is a row in the provided dataset (see above). In summary, we produce a Tensor (a 3D array) with the following dimensions:

* First: All the students (one sequence of actions per student)
* Second: A list of actions for a specific student
* Third: A specific action (which in turn is a vector of metrics (see above))

The output that interests us is the last output of the sequence, i.e. the output produced for a student's last action, which takes into account all previous actions of that student. This output represents whether the student will choose a STEM career or not.

**A problem that we will encounter and will need to solve is that sequences have dynamic lengths**: not all students have performed the same number of actions while learning, whereas Tensors need to be of a fixed, predefined size.

A possible solution could be to use *padding*. Padding means normalizing the size of all sequences (to the maximum of all sequences size) and add 0 vectors for sequences that are smaller than the maximum size.

In [5]:
# Creation of the input tensor
# Creation of the input tensor: one 2-D array (actions x features) per student,
# in the same student order as `train_idx`.
# Grouping the log once replaces a full boolean scan of the dataframe per
# student, and `.values` returns the group's rows in their original order
# without the slow per-row `iterrows()` loop.
actions_by_student = student_train_logs.groupby('ITEST_id', sort=False)
train_input = [actions_by_student.get_group(idx).values for idx in train_idx]

For example, the first student has taken 504 actions while the second student has taken 129 actions.

We retrieve the biggest sequence and increase the size of all other sequences to be equal to the maximum with 0 vectors

In [6]:
# Length of the longest student sequence (0 when there are no sequences).
max_sequence_size = max((len(sequence) for sequence in train_input), default=0)
max_sequence_size

2742

In [7]:
# Number of features per action, read off the first action of the first student.
n_features = len(train_input[0][0])

# Append all-zero rows to every sequence so each one ends up with exactly
# `max_sequence_size` rows; the filler keeps the sequence's own dtype.
train_input_padd = [
    np.concatenate((
        sequence,
        np.zeros((max_sequence_size - len(sequence), n_features), dtype=sequence.dtype),
    ))
    for sequence in train_input
]

In [8]:
# Sanity check: shape of the first student's padded sequence, (rows, features).
train_input_padd[0].shape

(2742, 51)

### Train output data

The training output is a list of one-hot vectors of size 2 (one for each student). If the value of index 0 is 1, then it is non-STEM, if value at index 1 is 1, then it is STEM.

In [9]:
# Build the labels in the same student order as the inputs: `train_input`
# follows `train_idx` (order of first appearance in the logs) whereas
# `train_labels` is sorted by ITEST_id, so iterating the label frame directly
# can pair a student's sequence with another student's label.
stem_flags = train_labels.loc[train_idx, 'isSTEM']

# One-hot encode each label: [1, 0] is non-STEM, [0, 1] is STEM.
train_output = [[0, 1] if flag == 1 else [1, 0] for flag in stem_flags]

In [10]:
# Class balance: column sums of the one-hot labels, i.e. [non-STEM, STEM] counts.
np.sum(train_output, axis=0)

array([350, 117])

### Test data

We will split the training data into two sets, one for training and the other for testing. We hold out the first 5% of the students (in their current order, without shuffling) for testing and use the rest as training data. Ultimately we will train with everything before predicting.

**TODO** Improve this by using (stratified) k-fold cross-validation, because we have relatively little data

In [11]:
# Hold out the leading 5% of the students (rounded up) as the test split;
# everything after that is used for training.
NUM_TEST = int(np.ceil(0.05 * len(train_input_padd)))
print(NUM_TEST)
X_test, X_train = train_input_padd[:NUM_TEST], train_input_padd[NUM_TEST:]
y_test, y_train = train_output[:NUM_TEST], train_output[NUM_TEST:]

24


In [12]:
# Report how many students ended up in each split.
n_train_students = len(X_train)
n_test_students = len(X_test)
print('Number of students for training:', n_train_students)
print('Number of students for testing:', n_test_students)

Number of students for training: 443
Number of students for testing: 24


## Neural network implementation

Here we first create placeholders to hold our data. The dimensions of the data are [Batch Size, Sequence Length, Input Dimension]. Here our values are:

* Batch size: to be defined at runtime
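* Sequence length: Fixed to the padded maximum in the placeholder; the true length (different for each student) is recovered inside the model from the non-zero rows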
* Input Dimension: Known (number of features of the data)

In [13]:
def lazy_property(function):
    """Turn a zero-argument method into a read-only property computed once.

    The first access calls ``function(self)`` and stores the result on the
    instance under ``'_' + function.__name__``; every later access returns
    that stored value without calling ``function`` again.
    """
    cache_name = '_' + function.__name__
    not_computed = object()  # sentinel: distinguishes "missing" from a cached None

    def wrapper(self):
        cached = getattr(self, cache_name, not_computed)
        if cached is not_computed:
            setattr(self, cache_name, function(self))
            cached = getattr(self, cache_name)
        return cached

    return property(functools.wraps(function)(wrapper))


class VariableSequenceClassification:
    """LSTM classifier for zero-padded, variable-length sequences (TF1 graph mode).

    Every graph node is exposed as a ``lazy_property`` so that it is added to
    the graph only once, no matter how often it is accessed.

    Args:
        data: float tensor/placeholder indexed as [batch, time step, feature].
            Padding time steps must be all-zero rows, because the true
            sequence length is inferred from the non-zero rows.
        target: tensor/placeholder of one-hot labels, [batch, num_classes];
            its second dimension must be statically known.
        num_hidden: number of units in the LSTM cell.
        num_layers: stored but currently unused -- the network always has a
            single LSTM layer. NOTE(review): stack cells or drop the parameter.
    """

    def __init__(self, data, target, num_hidden=70, num_layers=2):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        # Touch the lazy properties so the whole graph (including the
        # variables) is built at construction time.
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        """Per-example count of time steps that have at least one non-zero feature."""
        # 1.0 for a time step with any non-zero feature, 0.0 for an all-zero one.
        used = tf.sign(tf.reduce_max(tf.abs(self.data), axis=2))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        """Softmax class probabilities computed from the last relevant LSTM output."""
        # Recurrent network. Uses the instance's own input tensor rather than
        # whatever module-level name `data` happens to be bound to.
        output, _ = tf.nn.dynamic_rnn(
            tf.nn.rnn_cell.LSTMCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        last = self._last_relevant(output, self.length)
        # Softmax layer.
        weight, bias = self._weight_and_bias(
            self._num_hidden, int(self.target.get_shape()[1]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
        """Cross-entropy between the one-hot targets and the predictions, summed over the batch."""
        # Clip before the log: a saturated softmax (probability exactly 0)
        # would otherwise produce -inf and turn the loss into NaN.
        clipped = tf.clip_by_value(self.prediction, 1e-10, 1.0)
        cross_entropy = -tf.reduce_sum(self.target * tf.log(clipped))
        return cross_entropy

    @lazy_property
    def optimize(self):
        """Training op: one Adam step (learning rate 0.0003) minimising ``cost``."""
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        """Fraction of examples whose arg-max prediction differs from the target class."""
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        """Create variables: truncated-normal weights (stddev 0.01) and biases set to 0.1."""
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

    @staticmethod
    def _last_relevant(output, length):
        """Pick, for each example, the RNN output at time step ``length - 1``."""
        batch_size = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        output_size = int(output.get_shape()[2])
        # Flatten [batch, time, units] to [batch * time, units] and compute the
        # flat row of each example's last relevant step.
        # NOTE(review): an example whose rows are all zero has length 0, which
        # makes this index point one row before the example's own block.
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, output_size])
        relevant = tf.gather(flat, index)
        return relevant

## Flow execution

We run the computation graph defined above on our data

In [14]:
# Show the held-out one-hot labels ([1, 0] = non-STEM, [0, 1] = STEM).
y_test

[[0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0]]

In [15]:
num_classes = 2
row_size = n_features
batch_size = 50
# Round up so the trailing partial batch is trained on as well. Flooring
# silently dropped up to batch_size - 1 students in every epoch (and skipped
# training entirely when len(X_train) < batch_size).
no_of_batches = int(np.ceil(len(X_train) / batch_size))

# Placeholders: the batch dimension is left open, sequences are padded to
# max_sequence_size and every action has row_size features.
data = tf.placeholder(tf.float32, [None, max_sequence_size, row_size])
target = tf.placeholder(tf.float32, [None, num_classes])
model = VariableSequenceClassification(data, target)
# NOTE(review): the session is kept open (no `with` block) because later
# notebook cells may reuse `sess`; close it explicitly once it is not needed.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(4):
    for batch_no in range(no_of_batches):
        ptr = batch_no * batch_size
        # Slicing past the end is safe: the last batch is simply shorter.
        inp, out = X_train[ptr:ptr+batch_size], y_train[ptr:ptr+batch_size]
        sess.run(model.optimize, {data: inp, target: out})
    # A single forward pass over the test split yields both the error rate and
    # the class probabilities, instead of evaluating the network twice.
    error, predictions = sess.run(
        [model.error, model.prediction], {data: X_test, target: y_test})
    print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
    print(predictions)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch  1 error 37.5%
[[ 0.5075289   0.49247113]
 [ 0.50085479  0.49914521]
 [ 0.51198596  0.4880141 ]
 [ 0.49997818  0.50002187]
 [ 0.50667328  0.49332681]
 [ 0.51198596  0.4880141 ]
 [ 0.50492579  0.49507415]
 [ 0.5119859   0.48801413]
 [ 0.5122174   0.4877826 ]
 [ 0.49997818  0.50002187]
 [ 0.51278174  0.48721829]
 [ 0.51112705  0.48887298]
 [ 0.51112705  0.48887298]
 [ 0.49997818  0.50002187]
 [ 0.51332843  0.4866716 ]
 [ 0.50953275  0.49046725]
 [ 0.51147872  0.48852125]
 [ 0.50873464  0.49126539]
 [ 0.5087347   0.49126536]
 [ 0.51147866  0.48852128]
 [ 0.49997818  0.50002187]
 [ 0.50413603  0.495864  ]
 [ 0.51147866  0.48852128]
 [ 0.51147872  0.48852125]]
Epoch  2 error 20.8%
[[ 0.51706856  0.48293138]
 [ 0.51935345  0.48064655]
 [ 0.52961993  0.47038007]
 [ 0.51123428  0.48876569]
 [ 0.52701777  0.47298226]
 [ 0.52961993  0.47038007]
 [ 0.51531875  0.48468119]
 [ 0.52961987  0.4703801 ]
 [ 0.53158599  0.46841398]
 [ 0.51123428  0.48876569]
 [ 0.53279454  0.46720549]
 [ 0.5285254