In [11]:
import os
import csv
import random
import time
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc

import logging
logger = logging.getLogger(__name__)

In [12]:
DATA_DIR = './data/'
#train_file = os.path.join(DATA_DIR, 'builder_train.csv')
#test_file = os.path.join(DATA_DIR, 'builder_test.csv')
train_file = os.path.join(DATA_DIR, '0910_b_train.csv')
test_file = os.path.join(DATA_DIR, '0910_b_test.csv')

In [13]:
def read_data_from_csv(filename):
    rows = []
    max_num_problems_answered = 0
    num_problems = 0
    
    print("Reading {0}".format(filename))
    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            rows.append(row)
    print("{0} lines was read".format(len(rows)))
    
    # tuples stores the student answering sequence as 
    # ([num_problems_answered], [problem_ids], [is_corrects])
    tuples = []
    for i in range(0, len(rows), 3):
        # numbers of problem a student answered
        num_problems_answered = int(rows[i][0])
        
        # only keep student with at least 3 records.
        if num_problems_answered < 3:
            continue
        
        problem_ids = rows[i+1]
        is_corrects = rows[i+2]
        
        invalid_ids_loc = [i for i, pid in enumerate(problem_ids) if pid=='']        
        for invalid_loc in invalid_ids_loc:
            del problem_ids[invalid_loc]
            del is_corrects[invalid_loc]
        
        tup =(num_problems_answered, problem_ids, is_corrects)
        tuples.append(tup)
        
        if max_num_problems_answered < num_problems_answered:
            max_num_problems_answered = num_problems_answered
        
        pid = max(int(pid) for pid in problem_ids if pid!='')
        if num_problems < pid:
            num_problems = pid
    # add 1 to num_problems because 0 is in the pid
    num_problems+=1

    #shuffle the tuple
    random.shuffle(tuples)

    print ("max_num_problems_answered:", max_num_problems_answered)
    print ("num_problems:", num_problems)
    print("The number of students is {0}".format(len(tuples)))
    print("Finish reading data.")
    
    return tuples, max_num_problems_answered, num_problems

In [14]:
def padding(question_seq, question_corr, target_length):
    num_questions = len(question_seq)
    pad_length = target_length - num_questions
    question_seq += [-1]*pad_length
    question_corr += [0]*pad_length
    return (question_seq, question_corr)

In [15]:
start_time = time.time()

students_train, max_num_problems_answered_train, num_problems = \
read_data_from_csv(train_file)

students_train = [padding(q_seq, q_corr, max_num_problems_answered_train) 
                  for _, q_seq, q_corr in students_train]

end_time = time.time()
print("time used: {0}s".format(end_time-start_time))

Reading ./data/0910_b_train.csv
10116 lines was read
max_num_problems_answered: 1219
num_problems: 124
The number of students is 3134
Finish reading data.
time used: 0.29899144172668457s


## Student Model

### Placeholder Explanation
X is the one-hot encoded input sequence of a student.
y is the one-hot encoded correct sequence of a student.

For example, the student i has a seq [1, 3, 1, 1, 2] with correct map [0, 1, 1, 1, 0]. The X_seq will be one hot encoded as:
$$
\left[
    \begin{array}{ccccc}
        0&1&0&0&0\\
        0&0&0&1&0\\
        0&1&0&0&0\\
        0&1&0&0&0\\
    \end{array}
\right]
$$

The X_corr map will be one hot encoded as:
$$
\left[
    \begin{array}{ccccc}
        0&0&0&0&0\\
        0&0&0&1&0\\
        0&1&0&0&0\\
        0&1&0&0&0\\
    \end{array}
\right]
$$

Then, it will be concatenated into $X^i$:
$$
\left[
    \begin{array}{ccccc|ccccc}
        0&1&0&0&0&0&0&0&0&0\\
        0&0&0&1&0&0&0&0&1&0\\
        0&1&0&0&0&0&1&0&0&0\\
        0&1&0&0&0&0&1&0&0&0\\
    \end{array}
\right]
$$

The last question '2' is not used in the $X^i$ because it is the last record that the student has and therefore used in $y$.
So, $y$ would be seq [3, 1, 1, 2] with corr map [1, 1, 1, 0]
$$
\left[
    \begin{array}{ccccc}
        0&0&0&1&0\\
        0&1&0&0&0\\
        0&1&0&0&0\\
        0&0&0&0&0\\
    \end{array}
\right]
$$


In [16]:
def seq_corr_to_onehot(seq, corr, num_steps, num_problems):
    seq_oh = tf.one_hot(seq, depth=num_problems)
    seq_oh_flat = tf.reshape(seq_oh, [-1, num_problems])
    
    # element-wise multiplication between Matrix and Vector
    # the i-th column of Matrixelement-wisedly multiply the i-th element in the Vector
    corr_flat = tf.reshape(corr, [-1])
    corr_mat = tf.multiply(tf.transpose(seq_oh_flat), tf.cast(corr_flat, dtype=tf.float32))
    corr_mat = tf.transpose(corr_mat)
    corr_mat = tf.reshape(corr_mat, shape=[-1, num_steps, num_problems])
    
    return seq_oh, corr_mat

In [17]:
# network configuration
batch_size = 32
num_layers = 1
state_size = 200
num_steps = max_num_problems_answered_train-1
input_size = num_problems * 2
output_size = num_problems

inputs_seq = tf.placeholder(tf.int32, [None, num_steps])
inputs_corr = tf.placeholder(tf.int32, [None, num_steps])
X_seq, X_corr = seq_corr_to_onehot(inputs_seq, inputs_corr, num_steps, num_problems)
X = tf.concat([X_seq, X_corr], axis=2, name='X')
X = tf.cast(X, dtype=tf.float32)

targets_seq = tf.placeholder(tf.int32, [None, num_steps])
targets_corr = tf.placeholder(tf.int32, [None, num_steps])
y_seq, y_corr = seq_corr_to_onehot(targets_seq, targets_corr, num_steps, num_problems)

init_state = tf.placeholder(tf.float32, [num_layers, 2, None, state_size])
state_per_layer_list  = tf.unstack(init_state, axis=0)
rnn_tuple_state = tuple([tf.contrib.rnn.LSTMStateTuple(
            state_per_layer_list[idx][0],
            state_per_layer_list[idx][1]
        ) for idx in range(num_layers)])

In [18]:
rnn_tuple_state

(LSTMStateTuple(c=<tf.Tensor 'strided_slice:0' shape=(?, 200) dtype=float32>, h=<tf.Tensor 'strided_slice_1:0' shape=(?, 200) dtype=float32>),)

### Network Configuration
There are basically 2 elements needed to construct the LSTM network
1. The cell, and
2. The rnn structure.

The cell is defined via the tf.contrib.rnn library. It supports the multilayer RNN as well. 

The RNN is defined via the tf.nn.dynamic_rnn. It is parameterized by the cell defined, the input X, and a initial state.

In [19]:
# build up the network
with tf.variable_scope('cell'):
    cell = tf.contrib.rnn.LSTMCell(num_units=state_size,
                                   forget_bias=1.0,
                                   state_is_tuple=True)
    
    cell = tf.contrib.rnn.DropoutWrapper(cell,
                                        output_keep_prob=1.0)
    
    cell = tf.contrib.rnn.MultiRNNCell([cell]*num_layers, state_is_tuple=True)

with tf.variable_scope('rnn'):
    states_series, current_state = tf.nn.dynamic_rnn(cell, 
                                                    X,
                                                    initial_state=rnn_tuple_state,
                                                    time_major=False)

print("the states series is:\n", states_series)
print("\nthe current_state is:\n", current_state)

the states series is:
 Tensor("rnn/rnn/transpose:0", shape=(?, 1218, 200), dtype=float32)

the current_state is:
 (LSTMStateTuple(c=<tf.Tensor 'rnn/rnn/while/Exit_2:0' shape=(?, 200) dtype=float32>, h=<tf.Tensor 'rnn/rnn/while/Exit_3:0' shape=(?, 200) dtype=float32>),)


In [20]:
W_yh = tf.Variable(tf.random_normal([state_size, output_size]), name="W_yh")
b_yh = tf.Variable(tf.constant(0.1, shape=[output_size,]), name="b_yh")

states_series = tf.reshape(states_series, [-1, state_size])
logits_flat = tf.matmul(states_series, W_yh) + b_yh
y_seq_flat = tf.cast(tf.reshape(y_seq, [-1, output_size]), dtype=tf.float32)
y_corr_flat = tf.cast(tf.reshape(y_corr, [-1, output_size]), dtype=tf.float32)

target_logits = tf.multiply(logits_flat, y_seq_flat)
target_logits = tf.reduce_sum(target_logits, axis=1)

target_labels = tf.multiply(y_corr_flat, y_seq_flat)
target_labels = tf.reduce_sum(target_labels, axis=1)

loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=target_logits, 
                                               labels=target_labels)
total_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

In [23]:
def run_train(sess):
    sess.run(tf.global_variables_initializer())
    for epoch_idx in range(num_epochs):
        num_students = len(students_train)
        
        iteration = 0
        for batch_idx in range(0, num_students, batch_size):
            start_idx = batch_idx
            end_idx = min(num_students, batch_idx+batch_size)
            
            new_batch_size = end_idx - start_idx
            _current_state = np.zeros((num_layers, 2, new_batch_size, state_size))
            
            inputs_seq_batch = np.array([tup[0][:-1] for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            inputs_corr_batch = np.array([tup[1][:-1] for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            
            y_seq_batch = np.array([tup[0][1:] for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            y_corr_batch = np.array([tup[1][1:] for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            
#             print(inputs_seq_batch.shape, 
#                  inputs_corr_batch.shape,
#                  y_seq_batch.shape,
#                  y_corr_batch.shape)

            _optimizer, _current_state = sess.run([optimizer, current_state],
                    feed_dict={
                    inputs_seq: inputs_seq_batch,
                    inputs_corr: inputs_corr_batch,
                    targets_seq: y_seq_batch,
                    targets_corr: y_corr_batch,
                    init_state: _current_state,
                })
            
            if iteration%100 == 0:
                _total_loss= sess.run([total_loss],
                    feed_dict={
                    inputs_seq: inputs_seq_batch,
                    inputs_corr: inputs_corr_batch,
                    targets_seq: y_seq_batch,
                    targets_corr: y_corr_batch,
                    init_state: _current_state,
                })
                print("Epoch {0}, batch {1}, loss value: {2}".format(epoch_idx, batch_idx, _total_loss))
            
            iteration+=1

                
                
def run_test(sess):
    pass

In [24]:
WITH_CONFIG = True
num_epochs = 25

start_time = time.time()
if WITH_CONFIG:
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        run_train(sess)
else:
    with tf.Session() as sess:
        run_train(sess)
           
end_time = time.time()

print("program run for: {0}s".format(end_time-start_time))

Epoch 0, batch 0, loss value: [0.68823177]
Epoch 1, batch 0, loss value: [0.68385661]
Epoch 2, batch 0, loss value: [0.68093604]
Epoch 3, batch 0, loss value: [0.6787768]


KeyboardInterrupt: 

In [None]:
X = np.random.randint(124, size=(3, 10))
X

In [None]:
with tf.Session():
    x = tf.cast([[0,1,0,0,1]], dtype=tf.int32)
    zero = tf.constant(0, dtype=tf.int32)
    where = tf.not_equal(x, zero)
    indices = tf.where(where)
    print(tf.gather(x, indices).eval())

In [None]:
num_problems