## Load exercise-solution and student metadata

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from os.path import exists
from os import mkdir

from shared.load import (
    load_solution_medatadata,
    load_student_metadata,
    load_edit_log_data,
)

# Load the raw exercise-solution metadata and the student metadata through the
# project loaders imported from `shared.load`.
# NOTE: `load_solution_medatadata` is spelled exactly as in the import statement
# above; the call must keep that spelling to resolve.
raw_solution_metadata = load_solution_medatadata()
student_metadata = load_student_metadata()

## Load attempt data from edit.log (multiple entries for each attempt)

In [None]:
edit_log_attempt_pkl_path = "../__pkl__/edit_log_attempts.pkl"

# Make sure the directory that holds the pickles (serialized data) is present.
if not exists("../__pkl__"):
    mkdir("../__pkl__")

# Load the edit-log data from scratch only when no pickle is on disk yet and
# store it for later runs; otherwise reuse the pickled frame.
# NOTE: the pickle is reused whenever the file exists -- delete it to force a reload.
if not exists(edit_log_attempt_pkl_path):
    attempts_el = load_edit_log_data()
    attempts_el.to_pickle(edit_log_attempt_pkl_path)
else:
    attempts_el = pd.read_pickle(edit_log_attempt_pkl_path)

## Filter attempt data

In [None]:
# Report the number of distinct attempts before any filtering.
print(f"loaded {attempts_el['attempt'].nunique()} attempts from edit log data")

# Step 1: keep only attempts that have at least three edit-log rows.
n = attempts_el["attempt"].nunique()
attempts_el = attempts_el.groupby("attempt").filter(
    lambda rows: rows.shape[0] >= 3
)
print(f"removed {n - attempts_el['attempt'].nunique()} attempts with less than 3 entries")

# Step 2: drop attempts containing any row whose share of green tests exceeds
# 0.75 while `duration_effective` is still below 1.
n = attempts_el["attempt"].nunique()
attempts_el = attempts_el.groupby("attempt").filter(
    lambda rows: not (
        rows.loc[
            rows["n_tests_green"] / (rows["n_tests_green"] + rows["n_tests_red"]) > 0.75,
            "duration_effective",
        ]
        < 1
    ).any()
)
print(f"removed {n - attempts_el['attempt'].nunique()} attempts which after less than 1 minute already have > 75 % correctness")

# Step 3: keep only attempts whose largest `duration_effective` reaches 1.
n = attempts_el["attempt"].nunique()
attempts_el = attempts_el.groupby("attempt").filter(
    lambda rows: 1 <= rows["duration_effective"].max()
)
print(f"removed {n - attempts_el['attempt'].nunique()} attempts with effective duration less than 1 minute")

# Final count of attempts that survived all three filters.
print(f"{attempts_el['attempt'].nunique()} attempts left")

## Prepare dataframe with metrics (single entry for each attempt)


In [None]:
from shared.prepare import prepare_solution_metadata, prepare_record_data

# Unpack the three values returned by `prepare_solution_metadata` for the raw
# solution metadata loaded earlier.
(
    solution_metadata,
    solution_metadata_grouped,
    solution_metadata_grouped_scaled,
) = prepare_solution_metadata(raw_solution_metadata)

In [None]:
from shared.extract import extract_exercise_metadata

# Bare expression at the end of the cell: the result of the call is not assigned,
# so in a notebook it is rendered as the cell output.
# NOTE(review): whether the call also modifies its arguments cannot be seen from here.
extract_exercise_metadata(attempts_el, solution_metadata)

In [None]:
dfs_pickle_file_path = "../__pkl__/dfs.pkl"

# Build the per-attempt record data only when no pickle is on disk yet and store
# it for later runs; otherwise reuse the pickled object.
# NOTE: an existing pickle is reused regardless of the current `attempts_el`
# contents -- delete the file after changing the filters above.
if not exists(dfs_pickle_file_path):
    # Only the second element of the returned pair is used.
    _, dfs = prepare_record_data(attempts_el, solution_metadata)
    dfs.to_pickle(dfs_pickle_file_path)
else:
    dfs = pd.read_pickle(dfs_pickle_file_path)

## Fit Model

In [None]:
from algorithms.fit import HyperParameters, fit_and_evaluate, split_and_prepare_data
from algorithms.models import (
    NN_SVD_UserItemTimeBias_SplineUserBiasShift
)

# Data and target metric used for fitting.
data_set = dfs
metric = "metric_ps_a_sqrt_x_log_bounded_min_max"

# NOTE(review): test_set_size=0 presumably means no hold-out split -- confirm
# against `split_and_prepare_data`.
train_set, test_set, meta_data = split_and_prepare_data(
    df=data_set,
    metric=metric,
    test_set_size=0,
    n_days_grouped=1,
)

# Hyper-parameters are handed to `HyperParameters` directly as keyword arguments.
hyper_params = HyperParameters(
    n_iterations=75,
    gamma=0.1914764221915456,
    kernel_factor=20.0,
    learning_rate=0.024837016310023538,
    reg_param_bi=0.1822901581932833,
    reg_param_bt=0.9741206785079223,
    reg_param_bu=0.15622270691447798,
    reg_param_spline_bu=0.7774414688511561,
    reg_param_u=0.01,
    reg_param_i=0.01,
    matrix_i_init=solution_metadata_grouped_scaled,  # fix exercise factors
    n_factors=solution_metadata_grouped_scaled.shape[1] - 1,
)

# Fit the model and keep the score, the predictions and the fitted parameters.
score, predictions, params = fit_and_evaluate(
    model=NN_SVD_UserItemTimeBias_SplineUserBiasShift,
    train_set=train_set,
    test_set=test_set,
    meta_data=meta_data,
    hyper_params=hyper_params,
    verbose=True,
)

## Plot model params

In [None]:
# Bar chart of the fitted per-exercise biases (largest first). Every bar is
# annotated with the number of rows that exercise has in `data_set`, and a
# min / max / mean / std summary is placed in the upper-right corner.
if "biases_i_pd" in params:
    biases_i_pd = params["biases_i_pd"]
    fig, axes = plt.subplots(figsize=(17, 5), dpi=300)
    # Shuffle before sorting (kept from the original; presumably so that equal
    # biases do not always appear in the same order).
    biases_i_pd_sample = biases_i_pd.sample(frac=1).sort_values(ascending=False)
    p = axes.bar(biases_i_pd_sample.index, biases_i_pd_sample.values)
    axes.set_title("Exercise-Bias")
    # Count the rows per exercise once instead of re-scanning `data_set` for
    # every bar; exercises without any row get 0, as before.
    exercise_counts = (
        data_set["exercise"]
        .value_counts()
        .reindex(biases_i_pd_sample.index, fill_value=0)
    )
    axes.bar_label(
        p,
        labels=[f"{count}" for count in exercise_counts],
        rotation=90,
        padding=5,
    )
    # Rotate the tick labels matplotlib already produces. Calling
    # set_xticklabels() without fixing the tick positions first triggers a
    # warning and attaches the labels to the wrong ticks on a numeric axis.
    axes.tick_params(axis="x", labelrotation=90)
    axes.set_xmargin(0.01)
    axes.set_ymargin(0.075)
    axes.text(
        0.99,
        0.95,
        # Backslashes are doubled: "\m" and "\s" are invalid escape sequences
        # in a normal (non-raw) string literal.
        f"$min$ = {round(biases_i_pd.min(), 3)}\n"
        f"$max$ = {round(biases_i_pd.max(), 3)}\n"
        f"$\\mu$ = {round(biases_i_pd.mean(), 3)}\n"
        f"$\\sigma$ = {round(biases_i_pd.std(), 3)}",
        horizontalalignment="right",
        verticalalignment="top",
        transform=axes.transAxes,
        fontsize=14,
    )

In [None]:
# Bar chart of the fitted per-student biases (largest first) with a
# min / max / mean / std summary placed in the upper-right corner.
if "biases_u_pd" in params:
    # Sort once, largest bias first, so the bars are drawn in descending order.
    biases_u_pd = params["biases_u_pd"].sort_values(ascending=False)
    fig, axes = plt.subplots(figsize=(17, 5), dpi=300)
    p = axes.bar(biases_u_pd.index, biases_u_pd.values, label="User Biases")
    axes.set_title("Student-Bias")
    # Rotate the tick labels matplotlib already produces. Calling
    # set_xticklabels() without fixing the tick positions first triggers a
    # warning and attaches the labels to the wrong ticks on a numeric axis.
    axes.tick_params(axis="x", labelrotation=90)
    axes.set_xmargin(0.01)
    axes.text(
        0.99,
        0.95,
        # Backslashes are doubled: "\m" and "\s" are invalid escape sequences
        # in a normal (non-raw) string literal.
        f"$min$ = {round(biases_u_pd.min(), 3)}\n"
        f"$max$ = {round(biases_u_pd.max(), 3)}\n"
        f"$\\mu$ = {round(biases_u_pd.mean(), 3)}\n"
        f"$\\sigma$ = {round(biases_u_pd.std(), 3)}",
        horizontalalignment="right",
        verticalalignment="top",
        transform=axes.transAxes,
        fontsize=14,
    )