# ASSISTments Data Mining Competition 2017 - Optional Semester Project

## Imports and constants

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
DATA_DIR = 'Data/'

## Loading the data

We load all the data into dataframes

In [3]:
student_logs = pd.concat([
    pd.read_csv(DATA_DIR + 'student_log_' + str(i) + '.csv') for i in range(1, 11)
], ignore_index=True)

student_logs.head()

  if self.run_code(code, result):


Unnamed: 0,AveCarelessness,AveCorrect,AveKnow,AveResBored,AveResConf,AveResEngcon,AveResFrust,AveResGaming,AveResOfftask,ITEST_id,...,timeOver80,timeSinceSkill,timeTaken,totalFrAttempted,totalFrPastWrongCount,totalFrPercentPastWrong,totalFrSkillOpportunities,totalFrSkillOpportunitiesByScaffolding,totalFrTimeOnSkill,totalTimeByPercentCorrectForskill
0,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,49.0,0,0,0.0,0,0.0,0.0,0.0
1,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,4.0,1,0,0.0,1,0.0,49.0,106.0
2,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,6.0,2,0,0.0,0,0.0,0.0,0.0
3,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,18.0,3,1,0.0,1,0.0,0.0,0.0
4,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,2.0,3,1,1.0,1,1.0,6.0,77.999999


In [4]:
train_labels = pd.read_csv('Data/training_label.csv', index_col='ITEST_id').sort_index()
train_labels.drop_duplicates(subset=None, keep='first', inplace=True)

train_labels.head()

Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS,isSTEM
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2,0.438492,32,1
27,1,0.348837,21,0
33,2,0.686391,52,0
35,2,0.379658,34,0
37,3,0.305785,-999,0


In [5]:
test_labels = pd.read_csv(DATA_DIR + 'validation_test_label.csv', index_col='ITEST_id').sort_index()
test_labels.head()

Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,2,0.438492,32
101,4,0.403553,29
161,1,0.483425,40
164,2,0.256983,9
176,2,0.575949,50


We only keep actions for students for which we have labels in train_labels and test_labels. We also sort by student ID and by startTime of in order to have a chronological suite of actions

In [7]:
train_idx = train_labels.index.values
test_idx = test_labels.index.values

student_train_logs = student_logs[student_logs['ITEST_id'].isin(train_idx)].sort_values(by=['ITEST_id', 'startTime'])
student_test_logs = student_logs[student_logs['ITEST_id'].isin(test_idx)].sort_values(by=['ITEST_id', 'startTime'])
print('Training data shape:', student_train_logs.shape)
print('Test data shape:', student_test_logs.shape)

Training data shape: (251488, 77)
Test data shape: (91038, 77)


## Feature engineering

### Train input data

In order to use an RNN (LSTM), we need to create sequences of actions. A sequence is a matrix in which each row is an action taken by a student that follows a chronological order. An action is a row in the provided dataset (see above). In summary, we produce a Tensor (a 3D array) with the following dimensions:

* First: All the students actions
* Second: A list of actions for a specific student
* Third: A specific action (which in turn is a vector of metrics (see above))

The output that interests us is the last output of the sequence. Which is the output consisting of the last action of a specific student taking into account all previous actions of that student. This output will represent if the student will choose a STEM career or not.

**A problem that we will encounter and will need to solve is that sequences have dynamic lengths**. Because Tensors need to be of fixed predefined size. Meaning that not all students have done the same number of actions while learning.

A possible solution could be to reduce the actions taken to a fixed minimum number, but that wouldn't work because we need to be able to give a prediction even for students with very few actions taken.

In [8]:
# Creation of the input tensor
train_input = []

for idx in train_idx:
    specific_student_actions = student_train_logs[student_train_logs['ITEST_id'] == idx]
    student_actions = []
    
    for action_idx, action in specific_student_actions.iterrows():
        student_actions.append(np.array(action))
    
    train_input.append(np.array(student_actions))

For example, first student has taken 504 actions while the second student has taken 129 actions

In [9]:
print(train_input[0].shape)
print(train_input[1].shape)

(504, 77)
(129, 77)


### Train output data

The training output is a list of one-hot vectors of size 2 (one for each student). If the value of index 0 is 1, then it is non-STEM, if value at index 1 is 1, then it is STEM.

In [10]:
train_output = []

for idx, row in train_labels.iterrows():
    temp_arr = ([0] * 2)
    
    if row['isSTEM'] == 1:
        temp_arr[1] = 1
    else:
        temp_arr[0] = 1
        
    train_output.append(temp_arr)

In [11]:
len(train_output)

467

### Test data

We will split the training data into two sets, one for training and the other for testing. We will randomly take 10% of the data for testing and use the rest as training data. Ultimately we will train with everything before predicting.

**TODO** Améliorer ça et utiliser kFold cross validation (stratified), parce qu'on a relativement peu de données

In [12]:
NUM_TEST = int(np.ceil(len(train_input) / 10))
print(NUM_TEST)
X_test = train_input[NUM_TEST:]
y_test = train_output[NUM_TEST:]
X_train = train_input[:NUM_TEST]
y_train = train_output[:NUM_TEST]

47


In [18]:
print('Number of students for training:', len(X_train))
print('Number of students for testing:', len(X_test))

Number of students for training: 47
Number of students for testing: 420


## Neural network implementation

Here we fist create placeholders to hold our data. The dimensions for data are [Batch Size, Sequence Length, Input Dimension]. Here our values are:

* Batch size: to be defined at runtime
* Sequence length: Different for each student
* Input Dimension: Known (number of features of the data)

For our target

In [20]:
# Input dimension
input_dimension = len(train_input[0][0])
data = tf.placeholder(tf.float32, [None, None, input_dimension])
target = tf.placeholder(tf.float32, [None, 16])

77


## Flow execution