In [19]:
import tensorflow as tf
import numpy as np
import pickle, os

In [None]:
# Directory holding the pickled A2009 train/test files.
PICKLE_DIR = './data/A2009/'
train_file = os.path.join(PICKLE_DIR, 'A2009_train_default_0000.pkl')
test_file = os.path.join(PICKLE_DIR, 'A2009_test_default_0000.pkl')

# Only the test file is opened in this cell; train_file is just a path here.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(test_file, 'rb') as f:
    data = pickle.load(f)
    
# Bare expression so the notebook displays the loaded object.
data

In [None]:
def seq_corr_to_onehot(seq, corr, num_steps, num_problems):
    """Encode a problem sequence and its correctness values as one-hot tensors.

    :param seq: problem ids; one-hot encoded along a new last axis.
    :param corr: per-step values multiplied into the one-hot rows; must hold
        as many elements as ``seq``.
    :param num_steps: size of the middle axis of the reshaped ``corr_mat``.
    :param num_problems: one-hot depth.
    :return: ``(seq_oh, corr_mat, X)`` where ``seq_oh`` is the one-hot encoding
        of ``seq``, ``corr_mat`` is that encoding scaled row by row with
        ``corr`` and reshaped to ``[-1, num_steps, num_problems]``, and
        ``X = 2 * corr_mat - seq_oh``.
    """
    seq_oh = tf.one_hot(seq, depth=num_problems)

    # One row per step; a [N, 1] column of corr values broadcasts across the
    # num_problems axis, scaling the i-th row by the i-th corr value.
    step_rows = tf.reshape(seq_oh, [-1, num_problems])
    step_weights = tf.reshape(tf.cast(corr, dtype=tf.float32), [-1, 1])
    corr_mat = tf.reshape(step_rows * step_weights,
                          shape=[-1, num_steps, num_problems])

    # For 0/1 corr values: +1 at the problem index when corr is 1, -1 when it is 0.
    X = 2 * corr_mat - seq_oh

    return seq_oh, corr_mat, X

In [None]:
# Toy example: five interactions over five problems.
inputs_seq = np.array([1, 3, 1, 2, 2])
inputs_corr = np.array([0, 1, 1, 0, 0])
num_steps = 5
num_problems = 5
X_seq, X_corr, X = seq_corr_to_onehot(inputs_seq, inputs_corr, num_steps, num_problems)

# Evaluate and print the three tensors in turn.
with tf.Session() as sess:
    print(sess.run(X_seq))
    print(sess.run(X_corr))
    print(sess.run(X))
                           

In [None]:
# Recorded test AUC values for the two runs compared below (25 values each).
test_auc_v2 = [
    0.7717, 0.79113, 0.80031, 0.80366, 0.808,
    0.80957, 0.81121, 0.81041, 0.81076, 0.81417,
    0.81375, 0.81561, 0.81528, 0.81637, 0.8129,
    0.81567, 0.81418, 0.81347, 0.81554, 0.81457,
    0.81277, 0.81336, 0.81009, 0.80954, 0.81269,
]

test_auc_v1 = [
    0.7407, 0.76954, 0.78276, 0.79058, 0.79748,
    0.8008, 0.80528, 0.80584, 0.80985, 0.81029,
    0.81104, 0.81355, 0.81339, 0.81442, 0.81444,
    0.81567, 0.81576, 0.81645, 0.81656, 0.81576,
    0.8167, 0.81646, 0.81776, 0.81636, 0.81645,
]

# Sanity check: both series should have the same length.
len(test_auc_v1), len(test_auc_v2)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# One x position per recorded AUC value.
x = np.arange(25)
y1 = np.asarray(test_auc_v1)
y2 = np.asarray(test_auc_v2)

# One line+marker trace per series.
trace0 = go.Scatter(x=x, y=y1, name='old input representation', mode='lines+markers')
trace1 = go.Scatter(x=x, y=y2, name='new input representation', mode='lines+markers')

data = [trace0, trace1]
py.iplot(data, filename='test_auc')

In [None]:
import matplotlib.pyplot as plt

x = np.arange(25)
y1 = np.asarray(test_auc_v1)
y2 = np.asarray(test_auc_v2)

# test_auc_v1 in red, test_auc_v2 in blue, drawn on the same axes.
plt.plot(x, y1, 'r')
plt.plot(x, y2, 'b')
plt.show()

In [None]:
with tf.Session() as sess:
    x = np.array([[[0,0,1,0,0], [0,1,0,0,0], [0,0,0,0,0], [0,0,0,0,0]]])
    # Max over the last axis: 1 for rows containing a 1, 0 for all-zero rows.
    row_max = tf.reduce_max(x, axis=2)
    print(sess.run(row_max))

In [None]:
# Turn the flat vector into a single-row (1, 3) matrix.
np.reshape([1, 3, 5], (1, 3))

In [None]:
with tf.Session() as sess:
    seq_length = 10
    # seq_length - 1 wrapped into a length-1 vector, i.e. [9].
    one_less = tf.subtract(seq_length, 1)
    input_length = tf.expand_dims(one_less, 0)
    print(sess.run(input_length))

In [None]:
def batch_with_dynamic_pad(problems_and_corrects,
                           batch_size,
                           queue_capacity,
                           add_summaries=True):
    """
    Batches input problems and corrects, returns the input and target sequences.

    Every (problem, correct) pair is split into an input sequence (all steps
    except the last) and a target sequence (all steps except the first), so the
    target at step t is the input at step t + 1. The per-student sequences are
    then batched with ``tf.train.batch_join``; ``dynamic_pad=True`` pads the
    sequences of a batch to a common length.

    ---
    :param problems_and_corrects: iterable of (problem, correct) pairs of
        rank-1 tensors.
    :param batch_size: number of sequences per batch.
    :param queue_capacity: capacity of the batching queue.
    :param add_summaries: currently unused; kept for interface compatibility.
    :return: (problem_seqs, correct_seqs, target_problem_seqs,
        target_correct_seqs), each with the batch as the leading dimension.
    """
    enqueue_list = []
    for problem, correct in problems_and_corrects:
        seq_length = tf.shape(problem)[0]
        # E.g. for seq_length 10, input_length is the vector [9]
        input_length = tf.expand_dims(tf.subtract(seq_length, 1), 0)

        # E.g. problem = [1, 2, 3, 4, 5] and input_length = [4]
        # -> problem_seq is [1, 2, 3, 4]
        problem_seq = tf.slice(problem, [0], input_length)
        correct_seq = tf.slice(correct, [0], input_length)
        # -> target_problem_seq is [2, 3, 4, 5]
        target_problem_seq = tf.slice(problem, [1], input_length)
        target_correct_seq = tf.slice(correct, [1], input_length)

        enqueue_list.append([problem_seq, correct_seq,
                             target_problem_seq, target_correct_seq])

    print("batching...")

    # Each entry of enqueue_list is ONE example (a whole sequence), so
    # enqueue_many must be False. With enqueue_many=True the first dimension is
    # read as the example index, which would turn every time step into its own
    # scalar example and leave nothing for dynamic_pad to pad.
    problem_seqs, correct_seqs, target_problem_seqs, target_correct_seqs = tf.train.batch_join(
        enqueue_list,
        batch_size=batch_size,
        capacity=queue_capacity,
        enqueue_many=False,
        dynamic_pad=True,
        name="batch_and_pad"
    )

    return problem_seqs, correct_seqs, target_problem_seqs, target_correct_seqs

In [1]:
from utils import read_data_from_csv
# Read the training CSV. The three returned values are unpacked as the student
# records, num_steps and num_problems, rebinding any earlier values of the
# latter two names in the kernel.
students, num_steps, num_problems = read_data_from_csv('data/skill_id_train.csv')

Reading data/skill_id_train.csv
10119 lines was read
max_num_problems_answered: 1219
num_problems: 124
The number of students is 3137
Finish reading data.


In [2]:
# Keep fields 1 and 2 of every student record
# (presumably the problem and correct sequences -- verify against read_data_from_csv).
p_and_c = [(record[1], record[2]) for record in students]

In [8]:
import numpy as np
def pad(data, target_length):
    """Right-pad the 1-D sequence ``data`` with -1 up to ``target_length``."""
    return np.pad(data, (0, target_length - len(data)), 'constant', constant_values=(-1))


def one_hot(indices, depth):
    """One-hot encode ``indices`` with ``depth`` classes.

    An extra all-zero row is appended to the identity matrix, so the padding
    index -1 selects that row and encodes as all zeros.
    """
    encoding = np.concatenate((np.eye(depth), [np.zeros(depth)]))
    return encoding[indices]


class BatchGenerator:
    """Cycles through (problem, correct) sequence pairs in fixed-size batches."""

    def __init__(self, num_problems, problems_and_corrects, batch_size):
        """
        :param num_problems: number of distinct problems (stored, not used by next_batch).
        :param problems_and_corrects: indexable collection of (problem, correct) sequences.
        :param batch_size: number of sequences returned per batch.
        """
        self.cursor = 0
        self.num_problems = num_problems
        self.problems_and_corrects = problems_and_corrects
        self.num_samples = len(problems_and_corrects)
        self.batch_size = batch_size

    def next_batch(self, is_shuffle=True):
        """Return the next batch as time-shifted input/target sequences.

        All sequences of the batch are padded with -1 to the longest problem
        sequence in the batch. Inputs are steps ``0..T-2`` and targets are
        steps ``1..T-1`` of every padded sequence, so target step t lines up
        with input step t for the same student.

        :param is_shuffle: currently unused; kept for interface compatibility.
        :return: (input_problems, input_corrects, target_problems,
            target_corrects), each a list with one array per sequence.
        """
        problem_seqs = []
        correct_seqs = []
        max_seq_length = 0
        for _ in range(self.batch_size):
            problem = self.problems_and_corrects[self.cursor][0]
            correct = self.problems_and_corrects[self.cursor][1]
            max_seq_length = max(max_seq_length, len(problem))

            problem_seqs.append(problem)
            correct_seqs.append(correct)
            self._update_cursor()
        problem_seqs = [pad(problem, max_seq_length) for problem in problem_seqs]
        correct_seqs = [pad(correct, max_seq_length) for correct in correct_seqs]

        # Shift along the TIME axis of each sequence. Slicing the outer list
        # instead would drop or shift whole students and misalign the batch.
        input_problems = [seq[:-1] for seq in problem_seqs]
        input_corrects = [seq[:-1] for seq in correct_seqs]

        target_problems = [seq[1:] for seq in problem_seqs]
        # Targets for correctness come from correct_seqs, not problem_seqs.
        target_corrects = [seq[1:] for seq in correct_seqs]

        return input_problems, input_corrects, target_problems, target_corrects

    def _update_cursor(self):
        """Advance the cursor by one sample, wrapping around at the end."""
        self.cursor = (self.cursor + 1) % self.num_samples

    def shuffle(self):
        """Placeholder: shuffling is not implemented."""
        pass

In [9]:
# Batch generator over the (problem, correct) pairs: 124 problems, 32 sequences per batch.
tr = BatchGenerator(num_problems=124, problems_and_corrects=p_and_c, batch_size=32)

In [18]:
# Pull one batch from the generator and inspect the input correctness sequences.
# Each call advances the generator's cursor, so re-running this cell yields a different batch.
input_problems, input_corrects, target_problems, target_corrects = tr.next_batch()
print(input_corrects)

[array([ 1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -

In [None]:
# Embedding table with 2*num_probs+2 rows of width embedding_size,
# initialised uniformly in [-1, 1).
# NOTE(review): `num_probs` and `embedding_size` are not defined in any visible cell
# (other cells use `num_problems`) -- confirm both exist before running.
embeddings = tf.Variable(tf.random_uniform([2*num_probs+2, embedding_size], -1.0, 1.0))

In [5]:
import numpy as np
import tensorflow as tf
with tf.Session() as sess:
    num_problems = 4
    # -1 marks padding in both the problem and the correct sequences.
    problem_seqs = tf.constant([[1,2,3,-1,-1], [2,3,3,3,-1], [1,3,0,3,0]])
    correct_seqs = tf.constant([[1,1,1,-1,-1], [1,0,1,1,-1], [1,0,0,1,1]])
    print("num_problem:", num_problems)
    print("problem_seqs:\n", sess.run(problem_seqs))
    print("correct_seqs:\n", sess.run(correct_seqs))

    seqs_shape = tf.shape(problem_seqs)
    batch_size = seqs_shape[0]
    seq_length = seqs_shape[1]

    print("batch size: ", sess.run(batch_size))
    print("seq_length: ", sess.run(seq_length))

    # problem_seqs_oh has shape (batch_size, num_steps, num_problems);
    # one_hot turns the padding index -1 into an all-zero row.
    problem_seqs_oh = tf.one_hot(problem_seqs, depth=num_problems)
    problem_seqs_oh_flat = tf.reshape(problem_seqs_oh, [-1, num_problems])

    # Scale the i-th one-hot row by the i-th correct value: a [N, 1] column
    # broadcasts across the num_problems axis.
    correct_seqs_flat = tf.reshape(correct_seqs, [-1])
    corr_mat = problem_seqs_oh_flat * tf.expand_dims(
        tf.cast(correct_seqs_flat, dtype=tf.float32), 1)
    corr_mat = tf.reshape(corr_mat, shape=[batch_size, -1, num_problems])

    # Inputs are steps 0..T-2, outputs are steps 1..T-1 of every sequence.
    input_problem_seqs = problem_seqs_oh[:, :-1, :]
    input_correct_seqs = corr_mat[:, :-1, :]
    output_problem_seqs = problem_seqs_oh[:, 1:, :]
    output_correct_seqs = corr_mat[:, 1:, :]

    X = tf.concat([input_problem_seqs, input_correct_seqs], axis=2)
    y = output_problem_seqs
    labels = output_correct_seqs

    print("X:\n", sess.run(X))
    print("y:\n", sess.run(y))
    print("labels:\n", sess.run(labels))

num_problem: 4
problem_seqs:
 [[ 1  2  3 -1 -1]
 [ 2  3  3  3 -1]
 [ 1  3  0  3  0]]
correct_seqs:
 [[ 1  1  1 -1 -1]
 [ 1  0  1  1 -1]
 [ 1  0  0  1  1]]
batch size:  3
seq_length:  5
X:
 [[[ 0.  1.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  1.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  1.]
  [ 0.  0.  0.  0. -0. -0. -0. -0.]]

 [[ 0.  0.  1.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  1.]
  [ 0.  0.  0.  1.  0.  0.  0.  1.]]

 [[ 0.  1.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  1.]]]
y:
 [[[ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  1.]
  [ 0.  0.  0.  1.]
  [ 0.  0.  0.  1.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  1.]
  [ 1.  0.  0.  0.]
  [ 0.  0.  0.  1.]
  [ 1.  0.  0.  0.]]]
labels:
 [[[ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]
  [-0. -0. -0. -0.]
  [-0. -0. -0. -0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  