In [1]:
import os
import time

import numpy as np
import tensorflow as tf

from dkt import model as DKTModel
from dkt.load_data import ASSISTment2009
from utils import BasicDKT, GaussianInputNoiseDKT

"""
Assignable variables:
num_runs: int
num_epochs: int
keep_prob: float
is_early_stopping: boolean
early_stopping: int
batch_size: int
hidden_layer_structure: tuple
data_dir: str
train_file_name: str
test_file_name: str
ckpt_save_dir: str
"""


# Locations of the train / test CSV files.
DATA_DIR = './data/'
train_file, test_file = 'skill_id_train.csv', 'skill_id_test.csv'
train_path = os.path.join(DATA_DIR, train_file)
test_path = os.path.join(DATA_DIR, test_file)

# Hyper-parameters handed to the DKT model constructor.
network_config = dict(
    batch_size=32,
    hidden_layer_structure=(200,),
    rnn_cell=tf.contrib.rnn.LSTMCell,
    learning_rate=0.01,
    keep_prob=0.5,
)

In [2]:
# Load the train/test splits. The batch size is read from network_config so the
# data pipeline and the network cannot silently drift apart.
data = ASSISTment2009(train_path, test_path, batch_size=network_config['batch_size'])

# Let TensorFlow allocate GPU memory on demand rather than all at once.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# NOTE(review): the checkpoint directory is named 'original_gaussian' while the
# wrapper instantiated below is BasicDKT (not GaussianInputNoiseDKT) -- confirm
# that this pairing is intended.
model_name = 'original_gaussian'
save_dir = './checkpoints/' + model_name + '/'

# initialize model
dkt = BasicDKT(sess=sess,
               data=data,
               network_config=network_config,
               num_epochs=1000,
               num_runs=5,
               save_dir=save_dir)

# Build the model graph; no training call appears in the visible cells, the
# weights are restored from the checkpoint in a later cell instead.
dkt.model.build_graph()

Reading ./data/skill_id_train.csv
10119 lines was read
max_num_problems_answered: 1219
num_problems: 124
The number of students is 3137
Finish reading data.
Reading ./data/skill_id_test.csv
2532 lines was read
max_num_problems_answered: 1114
num_problems: 124
The number of students is 784
Finish reading data.


In [3]:
# Restore previously trained weights; the recorded log below shows them being
# read from ./checkpoints/original_gaussian/model.
dkt.load_model()

INFO:tensorflow:Restoring parameters from ./checkpoints/original_gaussian/model


In [4]:
# Evaluate the restored model. The recorded output is a 2-tuple -- presumably
# (AUC, loss); TODO confirm against BasicDKT.evaluate.
dkt.evaluate()

(0.88901371909681326, 0.38593635715619512)

# Obtain the hidden layer output
Because the hidden layer is large, plotting its raw activations directly is hard to interpret.
To make the hidden layer easier to visualize, the rough idea is to extract the hidden layer outputs of all students and then perform PCA over those vectors. Afterwards, check the proportion of variance explained and the corresponding eigenvalues.

In [None]:
# NOTE(review): `trainest_data` is not referenced anywhere else in this notebook
# and this cell was never executed (`In [None]`) -- possibly a typo; confirm the
# attribute actually exists on the BasicDKT wrapper.
dkt.trainest_data

In [None]:
from sklearn.decomposition import PCA

In [None]:
def get_pca_model(data, n_components):
    """Fit a PCA on ``data`` and print how much variance it retains.

    Args:
        data: samples to fit on, passed unchanged to ``PCA.fit``.
        n_components: number of principal components to keep. The original
            implementation ignored this argument and always used 50.

    Returns:
        The fitted ``PCA`` instance.
    """
    pca = PCA(n_components=n_components)
    pca.fit(data)

    explained = pca.explained_variance_ratio_
    print(explained)
    # "PoV" = proportion of variance retained by the kept components.
    print("PoV:", sum(explained))
    return pca

In [None]:
# Open a fresh session and restore the saved variables into it.
# NOTE(review): `saver`, `save_path` and `evaluate` are not defined in any
# visible cell of this notebook -- this cell depends on state from elsewhere.
sess = tf.Session()
print("Loading the saved variable to the current session.")
saver.restore(sess=sess, save_path=save_path)

auc_test, loss_test = evaluate(sess, is_train=False)
# {0} is the AUC and {1} is the loss; the original used {0} for both slots,
# so the AUC was printed twice and the loss never shown.
print("auc_test: {0:.5}, loss_test: {1:.5}".format(auc_test, loss_test))

In [None]:
# Gather the first-hidden-layer output of every answered step of every
# training student into one flat list (the input for the PCA fit).
hl1_outputs = []
for student in students_train:
    # student[0] is the number of questions this student answered
    num_question_answered = student[0]

    # hidden layer output sequence of the student (per the original note, in
    # shape [max_num_steps, hl1_size]); keep only the answered steps
    hl1 = get_student_hidden_layer(sess, student=student, layer_num=1)[:num_question_answered]

    hl1_outputs.extend(hl1)

In [None]:
# Cache the collected hidden layer outputs on disk (this stores the raw
# `hl1_outputs` list, not a fitted PCA model) so the PCA can be refitted later
# without recomputing them.
import pickle
with open('data/original_1hl_hl1_newfile_outputs.pkl', 'wb') as f:
    pickle.dump(hl1_outputs, f)

# Visualization
In the following, the student output and hidden layer will be visualized.

In [None]:
# Open a fresh session and restore the saved variables into it.
# NOTE(review): `saver`, `save_path` and `evaluate` are not defined in any
# visible cell of this notebook -- this cell depends on state from elsewhere.
# It also rebinds `sess` without closing the session created earlier.
sess = tf.Session()
print("Loading the saved variable to the current session.")
saver.restore(sess=sess, save_path=save_path)

auc_test, loss_test = evaluate(sess, is_train=False)
# {0} is the AUC and {1} is the loss; the original used {0} for both slots,
# so the AUC was printed twice and the loss never shown.
print("auc_test: {0:.5}, loss_test: {1:.5}".format(auc_test, loss_test))

In [None]:
# Display the trainable variables currently registered with TensorFlow, as a
# quick sanity check of what the restored model contains.
tf.trainable_variables()

In [None]:
import matplotlib.pyplot as plt
import math
%matplotlib inline
#http://bokeh.pydata.org/en/0.10.0/docs/gallery/cat_heatmap_chart.html

def plot_heatmap(data, x_labels, y_labels, second_x_labels=None,
                 fig_size_inches=(15, 5), cmap=plt.cm.Blues):
    """Draw ``data`` as a table-like heatmap and show it.

    Args:
        data: 2-D array-like passed to ``Axes.pcolor``; rows correspond to
            ``y_labels`` and columns to ``x_labels``.
        x_labels: sequence of tick labels, one per column.
        y_labels: sequence of tick labels, one per row.
        second_x_labels: optional sequence of per-column labels drawn on a
            twin x-axis titled "Correct Label". ``None`` disables it.
        fig_size_inches: ``(width, height)`` of the figure in inches. Any
            indexable pair works (lists are still accepted).
        cmap: colormap (object or name) forwarded to ``pcolor``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(fig_size_inches[0], fig_size_inches[1]))
    ax.pcolor(data, cmap=cmap)

    # turn off the frame
    ax.set_frame_on(False)

    # put the major ticks at the middle of each cell
    ax.set_xticks(np.arange(len(x_labels)) + 0.5, minor=False)
    ax.set_yticks(np.arange(len(y_labels)) + 0.5, minor=False)

    # want a more natural, table-like display
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # set the label
    ax.set_xticklabels(x_labels, minor=False)
    ax.set_yticklabels(y_labels, minor=False)
    ax.set_xlabel("the skill id answered at the time step")
    ax.set_ylabel("the skill id of the output layer")

    # second axis label
    # `is not None` instead of `!= None`: comparing a numpy array with `!=`
    # is element-wise and would make the `if` raise on an ambiguous truth value.
    if second_x_labels is not None:
        ax2 = ax.twiny()
        ax2.set_xticks(np.arange(len(second_x_labels)) + 0.5, minor=False)
        ax2.set_xticklabels(second_x_labels)
        # The twin axis keeps its own x-limits; copy the main axis limits so
        # each secondary label sits exactly above its heatmap column.
        ax2.set_xlim(ax.get_xlim())
        ax2.set_xlabel("Correct Label")
        ax2.xaxis.tick_top()

    plt.show()

In [None]:
# Collect the indices of test students whose history is short enough to plot:
# 30-50 answered questions covering 5-10 distinct skill ids.
targets = []
for i, student in enumerate(students_test):
    num_question_answered = student[0]
    # distinct skill ids the student touched; -1 entries are skipped
    question_ids_answered = np.sort(
        np.array([int(qid) for qid in set(student[1]) if qid != -1])
    )
    num_distict_question = len(question_ids_answered)

    if 30 <= num_question_answered <= 50 and 5 <= num_distict_question <= 10:
        targets.append(i)

print(targets)

In [None]:
# selecting one student to visualize
# bad example: 598
# good example: 30, 738
sid = 126
student = students_test[sid]
# student[0]: number of answered questions; student[1]: skill id sequence;
# student[2]: correctness sequence (as read by the slices below)
num_question_answered = student[0]
# sorted distinct skill ids the student answered; -1 entries are excluded
question_ids_answered = np.sort(np.array([int(qid) for qid in set(student[1]) if qid != -1]))

# keep only the answered prefix of each sequence
question_seq = student[1][:num_question_answered]
correct_seq = student[2][:num_question_answered]

print(num_question_answered)
print(question_seq)
print(correct_seq)

In [None]:
# Build a synthetic variant of the selected student: keep the first 40 steps,
# then append 10 extra steps on skill id '45' that are all answered correctly ('1').
# NOTE(review): the original comment said "question-82" but the code appends '45' -- confirm which is intended.
# NOTE(review): the hard-coded 40 presumably equals student[0] for sid 126 -- otherwise `a` and the
# real sequence length (40 + 10) disagree; TODO confirm.
# NOTE(review): this cell is not idempotent -- it overwrites `student`, so re-running it adds 10 to
# student[0] again.
a= student[0] + 10
# 1218-40-10+1 = 1169 padding entries, for a total length of 1219 (the max_num_problems_answered
# reported when loading the training file).
# NOTE(review): the skill sequence is padded with 0 while the correctness sequence is padded with -1;
# since only -1 is filtered below, 0 ends up in `question_ids_answered` -- confirm this is intended.
b = student[1][:40] + ['45']*10 + [0]*(1218-40-10+1)
c = student[2][:40] + ['1']*10 + [-1]*(1218-40-10+1)
student = (a, b, c)
num_question_answered = student[0]
question_ids_answered = np.sort(np.array([int(qid) for qid in set(student[1]) if qid != -1]))

# keep only the answered prefix of each sequence
question_seq = student[1][:num_question_answered]
correct_seq = student[2][:num_question_answered]

print(num_question_answered)
print(question_seq)
print(correct_seq)

## Visualizing the Output Layer

In [None]:
# Model output for the selected student, restricted to the answered time steps
# (rows) and to the skill ids the student actually answered (columns), then
# flipped so that skills run along the rows and time steps along the columns.
output_layer = get_student_output_layer(sess, student)
output_layer = output_layer[:num_question_answered, question_ids_answered].T

In [None]:
# Rows: the distinct skill ids the student answered; columns: time steps labelled
# with the skill id answered at that step; twin top axis: correctness per step.
plot_heatmap(output_layer, x_labels=question_seq, y_labels=question_ids_answered, second_x_labels=correct_seq)

## Visualizing the Hidden Layer

In [None]:
# Load the cached hidden layer outputs and fit the PCA on them.
# NOTE: pickle.load executes arbitrary code from the file -- only load files
# produced by this notebook.
import pickle
with open('data/original_1hl_hl1_newfile_outputs.pkl', 'rb') as f:
    hl1_outputs = pickle.load(f)

# the file is no longer needed once its content is in memory
pca = get_pca_model(hl1_outputs, n_components=50)

In [None]:
# First-hidden-layer output of the selected student, answered steps only.
hl1 = get_student_hidden_layer(sess, student=student, layer_num=1)[:num_question_answered]
# transpose so that each row is plotted against the time steps on the x-axis
hl1_orginal = np.transpose(hl1)
print(hl1_orginal.shape)

In [None]:
# Project the student's hidden layer outputs onto the fitted principal components.
hl1_pca = pca.transform(hl1)
# transposed copy used for plotting (the name `hl1_pac` is kept because a
# later cell reads it)
hl1_pac = hl1_pca.T
print(hl1_pac.shape)

In [None]:
# One "<skill id>(<correctness>)" label per answered time step.
x_labels = [
    "{}({})".format(question_seq[step], correct_seq[step])
    for step in range(num_question_answered)
]
print(x_labels)

In [None]:
# Raw hidden layer output: one row per hidden unit, one column per time step.
# 'RdBu' runs from red (low values) to blue (high values).
# NOTE(review): plot_heatmap calls pcolor without vmin/vmax, so the colour scale spans the data's
# min..max; white marks zero only if that range happens to be symmetric around zero.
plot_heatmap(hl1_orginal, 
             x_labels=x_labels, 
             y_labels=range(hl1_orginal.shape[0]),
#              second_x_labels=correct_seq, 
             fig_size_inches=[20, 15],
            cmap='RdBu')

In [None]:
# PCA-projected hidden layer output: one row per principal component, one column per time step.
# 'RdBu' runs from red (low values) to blue (high values).
# NOTE(review): plot_heatmap calls pcolor without vmin/vmax, so the colour scale spans the data's
# min..max; white marks zero only if that range happens to be symmetric around zero.
plot_heatmap(hl1_pac, 
             x_labels=x_labels, 
             y_labels=range(hl1_pac.shape[0]),
#              second_x_labels=correct_seq, 
             fig_size_inches=[15, 15],
            cmap='RdBu')

In [None]:
# Release the TensorFlow session.
# NOTE(review): `sess` is rebound several times earlier in the notebook without
# closing the previous session, so only the most recent one is closed here.
sess.close()