# iBKT Model Fit & Predictive Validity Evaluation

This notebook demonstrates how to evaluate an Individualized Bayesian Knowledge Tracing (iBKT) model using Model Fit (Log-Likelihood) and Predictive Validity (AUC-ROC) on mock session data for coding and non-coding tasks.

In [16]:
import math, random
from collections import defaultdict
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve

# Import your BKT from the project (preferred)
try:
    from user_learning.adaptive_engine import BKTParams, bkt_update_once
except Exception:
    # Fallback minimal BKT to keep the notebook runnable
    from dataclasses import dataclass
    @dataclass
    class BKTParams:
        p_L0: float = 0.20
        p_T: float = 0.10
        p_T_wrong: float = 0.00
        p_S: float = 0.10
        p_G: float = 0.20
        decay_wrong: float = 0.85
        min_floor: float = 1e-6
        max_ceiling: float = 1-1e-6
    def bkt_update_once(p_know: float, correct: bool, p: BKTParams) -> float:
        if correct:
            num = p_know * (1 - p.p_S)
            den = num + (1 - p_know) * p.p_G
            pT = p.p_T
        else:
            num = p_know * p.p_S
            den = num + (1 - p_know) * (1 - p.p_G)
            pT = p.p_T_wrong
        post = 0.0 if den == 0 else num / den
        p_next = post + (1 - post) * pT
        if not correct:
            p_next *= p.decay_wrong
        return max(p.min_floor, min(p.max_ceiling, p_next))

random.seed(7)
np.random.seed(7)


## Mock data generator (coding + non-coding)


We will create mock response data for both coding and non-coding sessions. Each entry represents a learner's response sequence for a subtopic.

In [17]:
def make_mock_df_with_sessions(num_users=3, noncoding_sessions=6, coding_sessions=6,
                               subtopic_pool=range(101, 121), p_correct_mu=0.7, p_correct_sigma=0.15):
    rows = []
    base = datetime.now()
    diffs = ["beginner","intermediate","advanced","master"]
    sid = 1

    for u in range(1, num_users+1):
        # Non-coding sessions (≤10 Q each, subtopics can vary per question)
        for _ in range(noncoding_sessions):
            n_q = np.random.randint(6, 11)  # 6..10
            p_acc = float(np.clip(np.random.normal(p_correct_mu, p_correct_sigma), 0.3, 0.95))
            for q_idx in range(1, n_q+1):
                sub = int(np.random.choice(subtopic_pool))
                diff = np.random.choice(diffs, p=[0.35,0.40,0.20,0.05])
                is_correct = int(np.random.rand() < p_acc)
                tstamp = base + timedelta(minutes=sid, seconds=q_idx*15)
                rows.append({
                    "user_id": u,
                    "session_id": f"NC-{u}-{sid}",
                    "q_idx": q_idx,
                    "subtopic_id": sub,
                    "game_type": "non_coding",
                    "is_correct": is_correct,
                    "estimated_difficulty": diff,
                    "timestamp": tstamp
                })
            sid += 1

        # Coding sessions (exactly 1 Q each)
        for _ in range(coding_sessions):
            p_acc = float(np.clip(np.random.normal(p_correct_mu, p_correct_sigma), 0.3, 0.95))
            sub = int(np.random.choice(subtopic_pool))
            diff = np.random.choice(diffs, p=[0.35,0.40,0.20,0.05])
            is_correct = int(np.random.rand() < p_acc)
            tstamp = base + timedelta(minutes=sid)
            rows.append({
                "user_id": u,
                "session_id": f"C-{u}-{sid}",
                "q_idx": 1,
                "subtopic_id": sub,
                "game_type": "coding",
                "is_correct": is_correct,
                "estimated_difficulty": diff,
                "timestamp": tstamp
            })
            sid += 1

    df = pd.DataFrame(rows).sort_values(["user_id","timestamp","session_id","q_idx"]).reset_index(drop=True)
    return df

df = make_mock_df()
df.head()


Unnamed: 0,user_id,subtopic_id,game_type,is_correct,estimated_difficulty,timestamp
0,1,101,coding,1,master,2025-10-21 14:02:36.841166
1,1,101,non_coding,1,beginner,2025-10-21 14:03:06.841166
2,1,101,non_coding,1,intermediate,2025-10-21 14:03:35.841166
3,1,101,coding,1,intermediate,2025-10-21 14:04:04.841166
4,1,101,non_coding,1,advanced,2025-10-21 14:04:33.841166


## Evaluate Log-Likelihood and AUC-ROC (overall)


In [18]:
# Simple iBKT update function (mock parameters)
def ibkt_forward(responses, p_init=0.3, p_transit=0.2, p_guess=0.25, p_slip=0.1):
    p_know = p_init
    preds = []
    for obs in responses:
        # Predict before seeing response
        p_correct = p_know * (1 - p_slip) + (1 - p_know) * p_guess
        preds.append(p_correct)
        # Update knowledge state
        if obs == 1:
            num = p_know * (1 - p_slip)
            denom = num + (1 - p_know) * p_guess
        else:
            num = p_know * p_slip
            denom = num + (1 - p_know) * (1 - p_guess)
        p_know = num / denom if denom > 0 else p_know
        # Transit to next state
        p_know = p_know + (1 - p_know) * p_transit
    return preds

## Stratified metrics (coding vs non-coding, difficulty bands)



In [19]:
results = []
for session_type, sessions in mock_data.items():
    all_preds = []
    all_obs = []
    log_likelihood = 0.0
    n = 0
    for session in sessions:
        preds = ibkt_forward(session['responses'])
        obs = session['responses']
        # Log-likelihood: sum log(p) for correct, log(1-p) for incorrect
        for p, o in zip(preds, obs):
            if o == 1:
                log_likelihood += np.log(max(p, 1e-8))
            else:
                log_likelihood += np.log(max(1-p, 1e-8))
            n += 1
        all_preds.extend(preds)
        all_obs.extend(obs)
    avg_log_likelihood = log_likelihood / n if n > 0 else float('nan')
    auc = roc_auc_score(all_obs, all_preds) if len(set(all_obs)) > 1 else float('nan')
    results.append({'Session Type': session_type, 'Log-Likelihood': avg_log_likelihood, 'AUC-ROC': auc})

## Output Metrics in Summary Table

Display the Log-Likelihood and AUC-ROC for each session type in a concise table.

In [20]:
# Create and print summary table
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Session Type,Log-Likelihood,AUC-ROC
0,coding,-0.903199,0.229167
1,non_coding,-0.741343,0.520833


In [21]:
# --- Sanity checks specific to PyGrounds session structure ---

# 1. Non-coding sessions must have ≤10 questions
assert (
    df[df.game_type == "non_coding"]
    .groupby("session_id")["q_idx"]
    .max()
    .le(10)
    .all()
), "Non-coding session exceeds 10 questions!"

# 2. Coding sessions must have exactly 1 question
assert (
    df[df.game_type == "coding"]
    .groupby("session_id")["q_idx"]
    .max()
    .eq(1)
    .all()
), "A coding session has more than one question!"

print("✅ Session structure checks passed.")


# --- Extra breakdowns: useful for debugging model behavior ---

# Overall performance
overall = eval_ibkt_ll_auc(df)
print(f"Overall LL={overall['avg_log_likelihood']:.4f} | AUC={overall['auc_roc']:.3f}")

# Breakdown by game type (coding vs non-coding)
by_game = eval_by_group(df, "game_type")
print("\n=== Breakdown by Game Type ===")
display(by_game)

# Breakdown by difficulty
by_diff = eval_by_group(df, "estimated_difficulty")
print("\n=== Breakdown by Difficulty ===")
display(by_diff)

# Breakdown by subtopic (optional)
by_subtopic = eval_by_group(df, "subtopic_id")
print("\n=== Breakdown by Subtopic (sample of first 10) ===")
display(by_subtopic.head(10))


KeyError: 'session_id'