### Confirm we can use a GPU to run the model

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

### Define constants

In [None]:
# --- Input files -----------------------------------------------------------
data = "data/kuze_data/evaluations_per_ans_with_taxonomy_ids_PPL.csv"
factorized_taxonomies = "data/kuze_data/factorized_math_taxonomies.csv"
factorized_students = "data/kuze_data/factorized_student_ids.csv"

# --- Output locations ------------------------------------------------------
best_model_weights = "weights/bestmodel"
log_dir = "logs"

# --- Model and training hyper-parameters -----------------------------------
optimizer = "adam"
lstm_units = 200
dropout_rate = 0.3
batch_size = 64
epochs = 1

# --- Data split ------------------------------------------------------------
test_fraction = 0.2
validation_fraction = 0.2

# --- Verbosity flag passed to fit() and evaluate() -------------------------
verbose = 1

### Pre-processing

In [None]:
import sys

# Make the local `deepkt` package importable by adding its parent directory
# to the module search path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not find `deepkt` on another machine unless the path is adjusted.
sys.path.append('/home/grenouille/Documents/jenga/final_project/code/kuze_dkt_imp')

In [None]:
from deepkt import deepkt, data_util, metrics

# Load the batched dataset together with its size and feature/skill counts.
dataset, length, nb_features, nb_taxonomies = data_util.load_dataset(
    data,
    factorized_taxonomies,
    factorized_students,
    batch_size=batch_size,
    shuffle=True,
)

# Split into train / test / validation subsets.
train_set, test_set, val_set = data_util.split_dataset(
    dataset=dataset,
    total_size=length,
    test_fraction=test_fraction,
    val_fraction=validation_fraction,
)

# Sizes reported below are derived from `length * batch_size`
# (presumably `length` is the number of batches - confirm against
# data_util.load_dataset). The validation share is taken from what is left
# after removing the test share.
set_size = length * batch_size
test_set_size = set_size * test_fraction
val_set_size = (set_size - test_set_size) * validation_fraction
train_set_size = set_size - test_set_size - val_set_size

summary_rows = (
    ("Total number of students: %d", set_size),
    ("Training set size: %d", train_set_size),
    ("Validation set size: %d", val_set_size),
    ("Testing set size: %d", test_set_size),
    ("Number of skills: %d", nb_taxonomies),
    ("Number of features in the input: %d", nb_features),
)

print("============== Data Summary ==============")
for template, value in summary_rows:
    print(template % value)
print("========================================= ")

### Building the model

In [None]:
# Instantiate the DKT model from the hyper-parameters defined above.
model_kwargs = dict(
    nb_features=nb_features,
    nb_taxonomies=nb_taxonomies,
    hidden_units=lstm_units,
    dropout_rate=dropout_rate,
)
student_model = deepkt.DKTModel(**model_kwargs)

# Metrics tracked during training and evaluation.
evaluation_metrics = [
    metrics.BinaryAccuracy(),
    metrics.AUC(),
    metrics.Precision(),
    metrics.Recall(),
]
student_model.compile(optimizer=optimizer, metrics=evaluation_metrics)

student_model.summary()

### Train the model

In [None]:
# Callbacks: CSV training log, checkpoint of the best weights only, and
# TensorBoard event files.
training_callbacks = [
    tf.keras.callbacks.CSVLogger(f"{log_dir}/train.log"),
    tf.keras.callbacks.ModelCheckpoint(
        best_model_weights,
        save_best_only=True,
        save_weights_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
]

history = student_model.fit(
    dataset=train_set,
    epochs=epochs,
    verbose=verbose,
    validation_data=val_set,
    callbacks=training_callbacks,
)

### Load the model with the best validation loss

In [None]:
# Restore the weights written to `best_model_weights` by the ModelCheckpoint
# callback (configured with save_best_only=True) during training.
student_model.load_weights(best_model_weights)

### Test the model

In [None]:
# Evaluate the model on the held-out test split.
result = student_model.evaluate(test_set, verbose=verbose)

In [None]:
# Display the value returned by evaluate() as the cell output.
result

In [None]:
# Persist the trained model under the name 'student_prediction'.
student_model.save('student_prediction')

In [None]:
# Display the model's input shape as the cell output.
student_model.input_shape

In [None]:
# Display the model's output shape as the cell output.
student_model.output_shape

### Prediction

In [None]:
def preprocess_for_prediction(dataframe):
    """Turn prediction rows into a padded, one-hot encoded tf.data.Dataset.

    Rows are grouped by ``student_id``; each group contributes one element
    made of its ``factorized_student_id`` and ``factorized_taxonomy_id``
    columns. Both are one-hot encoded and the elements are batched (64 per
    batch), padding with -1.

    The one-hot depths are read from the module-level ``students`` and
    ``taxonomies`` DataFrames, which must be loaded before this is called.

    Args:
        dataframe: DataFrame with ``student_id``, ``factorized_student_id``
            and ``factorized_taxonomy_id`` columns.

    Returns:
        The padded, batched ``tf.data.Dataset``.
    """
    def _id_columns(student_rows):
        # One (student ids, taxonomy ids) pair per student.
        return (
            student_rows['factorized_student_id'],
            student_rows['factorized_taxonomy_id'],
        )

    per_student = dataframe.groupby('student_id').apply(_id_columns)

    def _sequences():
        return per_student

    encoded = tf.data.Dataset.from_generator(
        generator=_sequences,
        output_types=(tf.int32, tf.int32),
    )

    # Factorized ids start at 0, so the one-hot depth is the largest id + 1.
    student_depth = int(students['factorized_student_id'].max() + 1)
    taxonomy_depth = int(taxonomies['factorized_taxonomy_code'].max() + 1)

    def _one_hot(factorized_student_id, factorized_taxonomy_code):
        return (
            tf.one_hot(factorized_student_id, depth=student_depth),
            tf.one_hot(factorized_taxonomy_code, depth=taxonomy_depth),
        )

    encoded = encoded.map(_one_hot)

    student_padding = tf.constant(-1, dtype=tf.float32)
    taxonomy_padding = tf.constant(-1, dtype=tf.float32)
    return encoded.padded_batch(
        batch_size=64,
        padding_values=(student_padding, taxonomy_padding),
        padded_shapes=([None, None], [None, None]),
    )

In [None]:
def process_student_data(dataset):
    """Recover categorical student ids from the one-hot student data.

    Only the first batch is read (``dataset[0]``); its first component holds
    the student information, laid out as
    (sequences in the batch, steps per sequence, one-hot student vector).

    Args:
        dataset: Indexable sequence of batches, e.g. ``list(tf_dataset)``.

    Returns:
        A tuple ``(student_id_list, student_val_list)``. The first list holds
        the argmax position of every one-hot vector, the second the value
        stored at that position (callers treat -1 there as padding).
    """
    student_batch = dataset[0][0]
    student_id_list, student_val_list = [], []
    for sequence in student_batch:
        for encoded in sequence:
            position = np.argmax(encoded)
            student_id_list.append(position)
            student_val_list.append(encoded[position].numpy())
    return student_id_list, student_val_list

In [None]:
def process_taxonomy_data(dataset):
    """Recover categorical taxonomy ids from the one-hot taxonomy data.

    Only the first batch is read (``dataset[0]``); its second component holds
    the taxonomy information, laid out as
    (sequences in the batch, steps per sequence, one-hot taxonomy vector).

    Args:
        dataset: Indexable sequence of batches, e.g. ``list(tf_dataset)``.

    Returns:
        A tuple ``(taxonomy_id_list, taxonomy_val_list)``. The first list
        holds the argmax position of every one-hot vector, the second the
        value stored at that position (callers treat -1 there as padding).
    """
    taxonomy_batch = dataset[0][1]
    taxonomy_id_list, taxonomy_val_list = [], []
    for sequence in taxonomy_batch:
        for encoded in sequence:
            position = np.argmax(encoded)
            taxonomy_id_list.append(position)
            taxonomy_val_list.append(encoded[position].numpy())
    return taxonomy_id_list, taxonomy_val_list

In [None]:
def preprocess_prediction_data(predictions):
    """Flatten the two outer axes of ``predictions`` into one list.

    ``predictions`` is laid out as
    (sequences, steps per sequence, one value per taxonomy).

    Returns:
        A list with one per-taxonomy prediction array for every
        (sequence, step) pair, in sequence-major order.
    """
    return [step for sequence in predictions for step in sequence]

In [None]:
def process_prediction_data(predictions, taxonomy_id_list):
    """Pick, for every step, the prediction of the taxonomy seen at that step.

    Args:
        predictions: Sequentially ordered list of arrays, each holding the
            predictions for all taxonomies at one step.
        taxonomy_id_list: Taxonomy id of interest for each step, in the same
            order as ``predictions``.

    Returns:
        A list with ``predictions[k][taxonomy_id_list[k]]`` for every ``k``;
        its length equals ``len(taxonomy_id_list)``.
    """
    taxonomy_predictions = [
        predictions[position][taxonomy_code]
        for position, taxonomy_code in enumerate(taxonomy_id_list)
    ]
    assert len(taxonomy_predictions) == len(taxonomy_id_list)
    return taxonomy_predictions

In [None]:
def post_prediction_preprocessing(dataset, predictions):
    """Match the input dataset to its predictions in a pandas DataFrame.

    The dataset is padded with -1 so that it conforms to the expected
    dimensions. The ``one-hot_student_value`` / ``one-hot_taxonomy_value``
    columns hold the value found at the argmax of each one-hot vector; a -1
    there marks a padding step, and such rows are dropped.

    Args:
        dataset: The tf.data.Dataset that was passed to ``predict``.
        predictions: The model output for ``dataset``.

    Returns:
        A DataFrame with the columns ``factorized_student_id``,
        ``one-hot_student_value``, ``factorized_taxonomy_id``,
        ``one-hot_taxonomy_value``, ``prediction`` (rounded to 4 decimals)
        and ``predicted_answer`` (1 when ``prediction >= 0.5``, else 0).
    """
    # Materialize the dataset as a list for easy indexing.
    batches = list(dataset)
    student_id_list, student_val_list = process_student_data(batches)
    taxonomy_id_list, taxonomy_val_list = process_taxonomy_data(batches)
    flat_predictions = preprocess_prediction_data(predictions)
    taxonomy_predictions = process_prediction_data(
        flat_predictions, taxonomy_id_list)

    # Round to 4 decimal places for readability.
    taxonomy_predictions = [round(value, 4) for value in taxonomy_predictions]

    column_names = ['factorized_student_id', 'one-hot_student_value',
                    'factorized_taxonomy_id', 'one-hot_taxonomy_value',
                    'prediction']
    prediction_df = pd.DataFrame(
        list(zip(student_id_list,
                 student_val_list,
                 taxonomy_id_list,
                 taxonomy_val_list,
                 taxonomy_predictions)),
        columns=column_names)

    # Drop padding steps. Take an explicit copy so that the column added
    # below is written to an independent frame rather than to a slice of the
    # unfiltered one (avoids pandas' SettingWithCopyWarning).
    is_real_step = (
        (prediction_df['one-hot_student_value'] != -1)
        & (prediction_df['one-hot_taxonomy_value'] != -1)
    )
    prediction_df = prediction_df[is_real_step].copy()

    # A prediction of at least 0.5 counts as a predicted correct answer (1),
    # anything lower as incorrect (0).
    prediction_df['predicted_answer'] = (
        prediction_df['prediction'].ge(0.5).astype('int'))
    return prediction_df

In [None]:
# Load the evaluations to predict on.
# NOTE(review): this rebinds `data`, which earlier held the CSV path passed
# to data_util.load_dataset; re-running that earlier cell after this one
# would pass a DataFrame instead of a path.
data = pd.read_csv('data/kuze_data/predictor_evaluations.csv')

In [None]:
# Keep only the math evaluations. Take an explicit copy because new columns
# are assigned to `prediction_data` in later cells; assigning to a filtered
# slice would trigger pandas' SettingWithCopyWarning.
prediction_data = data[data['subject'] == 'math'].copy()

In [None]:
# Lookup table of factorized taxonomy codes; also read as a module-level
# name by preprocess_for_prediction.
taxonomies = pd.read_csv(factorized_taxonomies)

In [None]:
# Lookup table of factorized student ids; also read as a module-level name
# by preprocess_for_prediction.
students = pd.read_csv(factorized_students)

In [None]:
# Translate each row's taxonomy_id_0 into its factorized code using the
# taxonomies lookup table (rows without a match get NaN).
prediction_data['factorized_taxonomy_id'] = prediction_data['taxonomy_id_0'].map(
    taxonomies.set_index('taxonomy_id_0')['factorized_taxonomy_code'])

In [None]:
# Translate each row's student_id into its factorized id using the students
# lookup table (rows without a match get NaN).
prediction_data['factorized_student_id'] = prediction_data['student_id'].map(
    students.set_index('student_id')['factorized_student_id'])

In [None]:
# Number of rows to predict on.
shape = prediction_data.shape[0]

In [None]:
# due to limitations in dimensionality we want each dataframe we predict on to have
# about 95 items. Always ask for at least one partition: with fewer than 95
# rows the integer division yields 0, and np.array_split rejects 0 sections.
no_of_dataframes = max(1, shape // 95)

In [None]:
# Order the rows by student with a stable sort, so every student's rows keep
# their relative order. preprocess_for_prediction groups rows by student_id,
# so predictions come back ordered by student; since they are written back
# to the rows by position later on, the rows must be in that same order.
ordered_prediction_data = prediction_data.sort_values('student_id', kind='mergesort')

# split the data into n dataframes (each with at least 95 rows when there are
# at least 95 rows in total). Splitting row positions and selecting with iloc
# gives the same partition sizes as np.array_split on the frame itself.
row_positions = np.array_split(np.arange(len(ordered_prediction_data)), no_of_dataframes)
partitions = [ordered_prediction_data.iloc[positions] for positions in row_positions]

In [None]:
# carry out prediction on each partition of the data and collect the
# returned dataframes in a list. Loop-local names are used on purpose so
# that the notebook-level `prediction_data` and `dataset` are not overwritten
# by per-partition intermediates.
predicted_partitions = []
for partition in partitions:
    partition_dataset = preprocess_for_prediction(partition)
    partition_predictions = student_model.predict(partition_dataset)
    predicted_partitions.append(
        post_prediction_preprocessing(partition_dataset, partition_predictions))

In [None]:
# Reassemble the input rows and the per-partition predictions, each with a
# fresh 0..n-1 index.
prediction_data = pd.concat(partitions, ignore_index=True)
predictions = pd.concat(predicted_partitions, ignore_index=True)

In [None]:
# Every input row must have exactly one prediction row, because predictions
# are attached to the input rows by position further down.
assert prediction_data.shape[0] == predictions.shape[0]

In [None]:
# Number of rows that will receive a predicted answer.
rows = prediction_data.shape[0]

In [None]:
# Predicted answers (0/1) as a plain array, in prediction-row order.
answer_predictions = predictions['predicted_answer'].values

In [None]:
# Attach the predicted answers to the input rows by position, in a single
# vectorized assignment instead of one `.at` write per row. Both frames were
# concatenated with ignore_index=True and checked to have the same number of
# rows, so position k in the array belongs to row k.
prediction_data['answer_selection_prediction'] = answer_predictions

In [None]:
def _percentage(part, whole):
    """Return ``part`` out of ``whole`` as a truncated integer percentage.

    Returns 0 when ``whole`` is 0 so that a group without any recorded
    answers does not fail on the division.
    """
    if whole == 0:
        return 0
    return int((part / whole) * 100)


def _text_or_empty(value):
    """Return ``value``, or '' when it is a missing value (NaN/None/NA)."""
    return '' if pd.isna(value) else value


def get_aggregated_evaluation_performance(dataframe, with_preds=True):
    """Aggregate per-answer rows into one row per (evaluation, student).

    Args:
        dataframe: Per-answer rows with the columns ``evaluation_id``,
            ``student_id``, ``answer_selection_correct``,
            ``student_first_name``, ``student_last_name``,
            ``date_of_evaluation``, ``subject``, ``class_name``,
            ``class_grade`` and ``school_name``; additionally
            ``answer_selection_prediction`` when ``with_preds`` is true.
        with_preds: When true, also compute the predicted performance from
            ``answer_selection_prediction``; when false, the predicted
            performance is reported as 0.

    Returns:
        A DataFrame with one row per (evaluation_id, student_id) holding the
        number of answered questions, the actual and predicted percentage of
        correct answers (truncated to integers), and the first value seen in
        the group for the descriptive columns.

    Raises:
        AssertionError: If ``with_preds`` is true and the number of
            predictions differs from the number of recorded answers.
    """
    column_names = ['evaluation_id', 'student_id', 'total_number_of_questions', 'actual_performance (%)',
                    'predicted_performance (%)', 'date_of_evaluation', 'subject', 'student_full_name',
                    'class_name', 'class_grade', 'school_name']
    records = []

    grouped_data = dataframe.groupby(['evaluation_id', 'student_id'])
    for (evaluation_id, student_id), group in grouped_data:
        answers = group['answer_selection_correct']
        # Missing answers are not counted as questions.
        total_nu_questions = answers.count()
        # Counting matches directly yields 0 when no answer is correct,
        # without relying on a label lookup that may be absent.
        actual_perc = _percentage(answers.eq(1).sum(), total_nu_questions)

        if with_preds:  # if prediction data is included
            predicted = group['answer_selection_prediction'].astype('int')

            # ensure actual no of questions done matches no of questions predicted
            assert total_nu_questions == predicted.count(), (
                "number of predictions differs from number of answers")

            predicted_perc = _percentage(predicted.eq(1).sum(), total_nu_questions)
        else:
            predicted_perc = 0

        first_name = _text_or_empty(group['student_first_name'].iloc[0])
        last_name = _text_or_empty(group['student_last_name'].iloc[0])
        full_name = first_name + ' ' + last_name

        # pd.Timestamp accepts strings, numpy datetimes and datetime objects,
        # so the date is extracted regardless of how the column was loaded.
        evaluation_date = pd.Timestamp(group['date_of_evaluation'].iloc[0]).date()

        records.append((
            evaluation_id,
            student_id,
            total_nu_questions,
            actual_perc,
            predicted_perc,
            evaluation_date,
            group['subject'].iloc[0],
            full_name,
            group['class_name'].iloc[0],
            group['class_grade'].iloc[0],
            group['school_name'].iloc[0],
        ))

    return pd.DataFrame(records, columns=column_names)

In [None]:
# Keep the rows dated before 2021-07-01.
# NOTE(review): `ds_evaluation_per_ans_sci_prediction_df` is not defined
# anywhere in the visible notebook, so this cell raises NameError unless the
# frame was created elsewhere in the session - confirm its origin.
mask = (ds_evaluation_per_ans_sci_prediction_df['date_of_evaluation'] < '2021-07-01')

training_data = ds_evaluation_per_ans_sci_prediction_df.loc[mask]

# training_data.dropna(subset=['answer_selection_correct'], inplace=True)

In [None]:
# Aggregate the earlier rows without predictions; their predicted
# performance is reported as 0.
aggregated_training_data = get_aggregated_evaluation_performance(training_data, with_preds=False)

In [None]:
# Aggregate the rows that carry model predictions (actual and predicted
# performance per evaluation and student).
performance_data = get_aggregated_evaluation_performance(prediction_data)

In [None]:
# Stack both aggregates into a single frame with a fresh index.
aggregated_performance_data = pd.concat([aggregated_training_data, performance_data], ignore_index=True)