In [1]:
import os
import csv
import random
import time
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc

import logging
logger = logging.getLogger(__name__)

In [2]:
DATA_DIR = './data/'
train_file = os.path.join(DATA_DIR, 'builder_train.csv')
test_file = os.path.join(DATA_DIR, 'builder_test.csv')
# train_file = os.path.join(DATA_DIR, '0910_b_train.csv')
# test_file = os.path.join(DATA_DIR, '0910_b_test.csv')

In [3]:
def read_data_from_csv(filename):
    rows = []
    max_num_problems_answered = 0
    num_problems = 0
    
    print("Reading {0}".format(filename))
    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            rows.append(row)
    print("{0} lines was read".format(len(rows)))
    
    # tuples stores the student answering sequence as 
    # ([num_problems_answered], [problem_ids], [is_corrects])
    tuples = []
    for i in range(0, len(rows), 3):
        # numbers of problem a student answered
        num_problems_answered = int(rows[i][0])
        
        # only keep student with at least 3 records.
        if num_problems_answered <= 2:
            continue
        
        problem_ids = rows[i+1]
        is_corrects = rows[i+2]
        
        invalid_ids_loc = [i for i, pid in enumerate(problem_ids) if pid=='']        
        for invalid_loc in invalid_ids_loc:
            del problem_ids[invalid_loc]
            del is_corrects[invalid_loc]
        
        tup =(num_problems_answered, problem_ids, is_corrects)
        tuples.append(tup)
        
        if max_num_problems_answered < num_problems_answered:
            max_num_problems_answered = num_problems_answered
        
        pid = max(int(pid) for pid in problem_ids if pid!='')
        if num_problems < pid:
            num_problems = pid
    # add 1 to num_problems because 0 is in the pid
    num_problems+=1

    #shuffle the tuple
    random.shuffle(tuples)

    print ("max_num_problems_answered:", max_num_problems_answered)
    print ("num_problems:", num_problems)
    print("The number of students is {0}".format(len(tuples)))
    print("Finish reading data.")
    
    return tuples, max_num_problems_answered, num_problems

In [4]:
def padding(question_seq, question_corr, target_length):
    num_questions = len(question_seq)
    pad_length = target_length - num_questions
    question_seq += [-1]*pad_length
    question_corr += [0]*pad_length
    return (question_seq, question_corr)

In [5]:
start_time = time.time()

students_train, max_num_problems_answered_train, num_problems = \
read_data_from_csv(train_file)

students_train = [padding(q_seq, q_corr, max_num_problems_answered_train) 
                  for _, q_seq, q_corr in students_train]

end_time = time.time()
print("time used: {0}s".format(end_time-start_time))

Reading ./data/builder_train.csv
10083 lines was read
max_num_problems_answered: 4290
num_problems: 124
The number of students is 3135
Finish reading data.
time used: 0.548058271408081s


## Student Model

### Data processing
We define 4 placeholders here
1. inputs_seq: the padded students answering sequence.
2. inputs_corr: the padded students correctness sequence.
3. y_seq: the padded student answering sequence which roll 1 unit to the right.
4. y_seq_corr: the padded student correctness sequence which roll 1 unit to the right.

The seq is first one-hot encoded and then used for finding the one-hot encoded correctmapping. For example, the student i has a seq [1 3 1 1] with correct map [0 1 1 1]. The seq will be one hot encoded as:
$$
\left[
    \begin{array}{ccccc}
        0&1&0&0&0\\
        0&0&0&1&0\\
        0&1&0&0&0\\
        0&1&0&0&0\\
    \end{array}
\right]
$$

The corr will be one hot encoded as:
$$
\left[
    \begin{array}{ccccc}
        0&0&0&0&0\\
        0&0&0&1&0\\
        0&1&0&0&0\\
        0&1&0&0&0\\
    \end{array}
\right]
$$

Then, it will be concatenated into $X^i$:
$$
\left[
    \begin{array}{ccccc|ccccc}
        0&1&0&0&0&0&0&0&0&0\\
        0&0&0&1&0&0&0&0&1&0\\
        0&1&0&0&0&0&1&0&0&0\\
        0&1&0&0&0&0&1&0&0&0\\
    \end{array}
\right]
$$

In [6]:
batch_size = 32

# network configuration
num_layers = 1
state_size = 200
num_steps = max_num_problems_answered_train
input_size = num_problems


# It is expected to receives a sequence of question_id and the correctness
# that student answered. For example, the inputs sequences may be 
# [15, 25, 25, -1, -1] and the inputs_correctness is [1, 0, 1, 0, 0].
inputs_seq = tf.placeholder(tf.int32, [batch_size, num_steps])
inputs_corr = tf.placeholder(tf.int32, [batch_size, num_steps])

# The inputs_sequence is then one-hot encoded
inputs_seq_oh = tf.one_hot(inputs_seq, depth=num_problems, name='inputs_seq_oh1')
print(inputs_seq_oh)

# Compute the correct matrix
inputs_seq_oh = tf.cast(inputs_seq_oh, tf.int32, name='inputs_seq_oh2')
inputs_seq_flat = tf.reshape(inputs_seq_oh, [-1, num_problems], name='inputs_seq_flat1')
inputs_corr_flat = tf.reshape(inputs_corr, [-1], name='inputs_corr_flat1')
inputs_corr_flat = tf.transpose(tf.multiply(tf.transpose(inputs_seq_flat), inputs_corr_flat), name='inputs_corr_flat1')
inputs_corr_oh = tf.reshape(inputs_corr_flat, [-1, num_steps, num_problems], name='inputs_corr_oh1')

# Concatenate the sequence and correctness into the input tensor X
X = tf.concat((inputs_seq_oh, inputs_corr_oh), axis=2)
X = tf.cast(X, tf.float32)

# Do the same thing on y
y_seq = tf.placeholder(tf.int32, [batch_size, num_steps])
y_corr = tf.placeholder(tf.int32, [batch_size, num_steps])

y_seq_oh = tf.one_hot(y_seq, depth=num_problems)
y_seq_oh = tf.cast(y_seq_oh, tf.int32)
y_seq_flat = tf.reshape(y_seq_oh, [-1, num_problems])
y_corr_flat = tf.reshape(y_corr, [-1])
y_corr_flat = tf.transpose(tf.multiply(tf.transpose(y_seq_flat), y_corr_flat))
y_corr_oh = tf.reshape(y_corr_flat, [-1, num_steps, num_problems])

Tensor("inputs_seq_oh1:0", shape=(32, 4290, 124), dtype=float32)


### Network Configuration
There are basically 2 elements needed to construct the LSTM network
1. The cell, and
2. The rnn structure.

The cell is defined via the tf.contrib.rnn library. It supports the multilayer RNN as well.

The RNN is defined via the tf.nn.dynamic_rnn. It is parameterized by the cell defined, the input X, and a initial state.

In [7]:
# build up the network
with tf.variable_scope('cell'):
    # single layer
    cell = tf.contrib.rnn.LSTMCell(num_units=state_size,
                                   forget_bias=1.0,
                                   state_is_tuple=True)
    
    # multi layers
    cell = tf.contrib.rnn.MultiRNNCell([cell]*num_layers, state_is_tuple=True)

with tf.variable_scope('rnn'):
    _init_state = cell.zero_state(batch_size, dtype=tf.float32)
    states_series, current_state = tf.nn.dynamic_rnn(cell, 
                                                    X,
                                                    initial_state=_init_state,
                                                    time_major=False)

print("the states series is:\n", states_series)
print("\nthe current_state is:\n", current_state)

the states series is:
 Tensor("rnn/rnn/transpose:0", shape=(32, 4290, 200), dtype=float32)

the current_state is:
 (LSTMStateTuple(c=<tf.Tensor 'rnn/rnn/while/Exit_2:0' shape=(32, 200) dtype=float32>, h=<tf.Tensor 'rnn/rnn/while/Exit_3:0' shape=(32, 200) dtype=float32>),)


In [8]:
W_yh = tf.Variable(tf.random_normal([state_size, num_problems]))
b_yh = tf.Variable(tf.constant(0.1, shape=[num_problems,]))

states_series = tf.reshape(states_series, [-1, state_size])
labels = tf.reshape(y_corr_oh, [-1, num_problems])
labels = tf.cast(labels, tf.float32)

logits = tf.matmul(states_series, W_yh) + b_yh
target_logits = tf.multiply(logits, labels)
y_pred = tf.sigmoid(logits)

print(y_pred)
print(labels)
print(target_logits)

Tensor("Sigmoid:0", shape=(137280, 124), dtype=float32)
Tensor("Cast_2:0", shape=(137280, 124), dtype=float32)
Tensor("Mul_2:0", shape=(137280, 124), dtype=float32)


In [9]:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=target_logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

# auc = tf.metrics.auc(labels, y_pred)

In [10]:
USING_GPU = False
num_epochs = 30

In [11]:
def run_train(sess):
    sess.run(tf.global_variables_initializer())
    for epoch_idx in range(num_epochs):
        print("Epochs: ", epoch_idx)
        num_students = len(students_train)
        for batch_idx in range(0, num_students, batch_size):
            start_idx = batch_idx
            end_idx = min(num_students, batch_idx+batch_size)
            inputs_seq_batch = np.array([tup[0] for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            inputs_corr_batch = np.array([tup[1] for tup in students_train[start_idx:end_idx]], dtype=np.int32)

            # shift y to right with 1
            y_seq_batch = np.array([np.roll(tup[0], 1) for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            y_corr_batch = np.array([np.roll(tup[1], 1) for tup in students_train[start_idx:end_idx]], dtype=np.int32)
            #print(inputs_seq_batch.dtype, inputs_corr_batch.dtype, y_seq_batch.dtype, y_corr_batch.dtype)
            
            _optimizer= sess.run([optimizer],
                    feed_dict={
                    inputs_seq: inputs_seq_batch,
                    inputs_corr: inputs_corr_batch,
                    y_seq: y_seq_batch,
                    y_corr: y_corr_batch,
                })

            if batch_idx%100 == 0:
                _loss= sess.run([loss],
                    feed_dict={
                    inputs_seq: inputs_seq_batch,
                    inputs_corr: inputs_corr_batch,
                    y_seq: y_seq_batch,
                    y_corr: y_corr_batch,
                })
                print("Epoch {0}, batch {1}, loss value: {2}".format(epoch_idx, batch_idx, _loss))

                
def run_test(sess):
    pass

In [12]:
start_time = time.time()
if USING_GPU:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        run_train(sess)
else:
    with tf.Session() as sess:
        run_train(sess)
           
end_time = time.time()

print("program run for: {0}s".format(end_time-start_time))

Epochs:  0
Epoch 0, batch 0, loss value: [0.72782922]
Epoch 0, batch 800, loss value: [0.72770357]
Epoch 0, batch 1600, loss value: [0.72780907]
Epoch 0, batch 2400, loss value: [0.72778744]


ValueError: Cannot feed value of shape (31, 4290) for Tensor 'Placeholder:0', which has shape '(32, 4290)'

In [9]:
if True:
    with tf.Session():
        # It is expected to receives a sequence of question_id and the correctness
        # that student answered. For example, the inputs sequences may be 
        # [15, 25, 25, -1, -1] and the inputs_correctness is [1, 0, 1, 0, 0].
        batch_size =3
        num_problems = 5
        num_steps = 4
        inputs_seq = np.random.randint(num_problems, size=(batch_size, num_steps))
        inputs_corr = np.random.randint(2, size=(3,4))
        print(inputs_seq)
        print(inputs_corr)

        # The inputs_sequence is then one-hot encoded
        inputs_seq = tf.one_hot(inputs_seq, depth=num_problems)

        # Compute the correct map
        inputs_seq = tf.cast(inputs_seq, tf.int32)
        inputs_seq_flat = tf.reshape(inputs_seq, [-1, num_problems])
        inputs_corr_flat = tf.reshape(inputs_corr, [-1])
        inputs_corr_flat = tf.transpose(tf.multiply(tf.transpose(inputs_seq_flat), inputs_corr_flat))
        inputs_corr = tf.reshape(inputs_corr_flat, [-1, num_steps, num_problems])
        
        print(inputs_seq.eval())
        print(inputs_corr.eval())

[[2 1 1 4]
 [1 3 1 1]
 [4 0 1 3]]
[[1 1 1 0]
 [0 1 1 1]
 [1 0 1 1]]
[[[0 0 1 0 0]
  [0 1 0 0 0]
  [0 1 0 0 0]
  [0 0 0 0 1]]

 [[0 1 0 0 0]
  [0 0 0 1 0]
  [0 1 0 0 0]
  [0 1 0 0 0]]

 [[0 0 0 0 1]
  [1 0 0 0 0]
  [0 1 0 0 0]
  [0 0 0 1 0]]]
[[[0 0 1 0 0]
  [0 1 0 0 0]
  [0 1 0 0 0]
  [0 0 0 0 0]]

 [[0 0 0 0 0]
  [0 0 0 1 0]
  [0 1 0 0 0]
  [0 1 0 0 0]]

 [[0 0 0 0 1]
  [0 0 0 0 0]
  [0 1 0 0 0]
  [0 0 0 1 0]]]


In [None]:
X = np.random.randint(124, size=(3, 10))
X

In [None]:
help(tf.gather)

In [None]:
x = np.roll([1,2,3,4], 1)
loc = x[x>3]
x[loc]

In [None]:
x = [1,2,3,4]
del x[-2]
x