In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import json
from collections import defaultdict, Counter
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Converts numpy arrays to Pytorch tensors for custom dataset
class TutorDataset(Dataset):
    def __init__(self, states, actions):
        self.states = torch.tensor(states, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.long)

    def __len__(self):
        return len(self.actions)

    def __getitem__(self, i):
        return self.states[i], self.actions[i]

In [3]:
# stores sequences of state-action-reward transitions (s,a,r,s',a',done)
class TransitionBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.next_actions = []
        self.dones = []

    def push(self, s, a, r, s2, a2, done):
        self.states.append(s)
        self.actions.append(a)
        self.rewards.append(r)
        self.next_states.append(s2)
        self.next_actions.append(a2)
        
        self.dones.append(done)

    def __len__(self):
        return len(self.actions)

# converts tabular tutoring data into transitions, encode features, organize data based on convo turns
def build_transition_buffer(
    df: pd.DataFrame,
    reward_fn,
    meta_map: dict,
    orig_to_idx: dict,
    episode_column: str = None
):
    """
    Build a transition buffer for offline RL using a 50-action index map and
    its corresponding metadata map.

    Args:
        df: DataFrame with tutoring data; must contain 'next_action_id' and state features.
        reward_fn: function(prev_state, action_idx, next_state, action_meta) -> reward
        meta_map: nested metadata dict mapping category->strategy->(level->idx) for 50 actions
        orig_to_idx: dict mapping original action IDs -> compact indices (0..49)
        episode_column: optional col to group episodes; if None assumes ordered by 'convo_turn'

    Returns:
        buffer: TransitionBuffer of (state, action_idx, reward, next_state, done)
        orig_to_idx: same dict mapping original IDs -> indices (for model sizing)
    """
    # Flatten meta_map into flat_idx_to_meta
    flat_idx_to_meta = {}
    for cat, strategies in meta_map.items():
        for strat, levels in strategies.items():
            for lvl, idx in levels.items():
                flat_idx_to_meta[idx] = {
                    'category': cat,
                    'strategy': strat,
                    'level': lvl
                }

    # State feature setup
    state_feats = ['misconception_type','convo_turn','previous_action_id',
                   'listen_to_feedback','problem_progress','progress_delta',
                   'correct_solution','next_action_hint_strength']
    cat_feats = ['misconception_type','previous_action_id',
                 'listen_to_feedback','correct_solution']
    num_feats = [f for f in state_feats if f not in cat_feats]

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(df[cat_feats])

    # Group episodes
    if episode_column:
        episodes = df.sort_values([episode_column,'convo_turn']).groupby(episode_column)
    else:
        df = df.sort_values('convo_turn')
        df['_tmp_ep'] = 1
        episodes = df.groupby('_tmp_ep')

    buffer = TransitionBuffer()

    # Build transitions
    for _, ep_df in episodes:
        prev_state = None
        prev_idx = None
        prev_meta = None

        for _, row in ep_df.iterrows():
            orig_id = row.get('next_action_id')
            if pd.isna(orig_id) or int(orig_id) not in orig_to_idx:
                continue
            orig_id = int(orig_id)
            idx = orig_to_idx[orig_id]

            # encode state
            cat = encoder.transform(pd.DataFrame([row[cat_feats]], columns=cat_feats))
            num = row[num_feats].to_numpy()
            state = np.hstack((num, cat.flatten()))

            meta = flat_idx_to_meta.get(idx)
            # Read done flag from data
            is_done = row.get('done', False)  # Use the actual 'done' column

            if prev_state is not None:
                if is_done > 0.0:
                    terminal_r = new_terminal(prev_state)
                    buffer.push(prev_state,
                                  prev_idx,
                                  terminal_r,
                                  state,
                                  idx,
                                  done=True)
            
                else:
                    r = reward_fn(prev_state, prev_idx, state, prev_meta)
                    buffer.push(prev_state, prev_idx, r, state, idx, done=is_done)

            prev_state = state
            prev_idx = idx
            prev_meta = meta
    # convert lists to arrays
    buffer.states = np.array(buffer.states)
    buffer.actions = np.array(buffer.actions, dtype=int)
    buffer.rewards = np.array(buffer.rewards, dtype=float)
    buffer.next_states = np.array(buffer.next_states)
    buffer.next_actions = np.array(buffer.next_actions)
    buffer.dones = np.array(buffer.dones, dtype=bool)

    print(buffer.states.shape)
    return buffer, orig_to_idx


# Maximum raw progress
MAX_PROGRESS = 50.0
def new_terminal(state):
    raw_progress = state[4]  # problem_progress in state
    normalized_progress = min(raw_progress, MAX_PROGRESS) / MAX_PROGRESS
    # Higher reward if correct solution was achieved
    return 5.0 if state[6] > 0 else 2.0 * normalized_progress

"""
   The big idea behind the reward function: Good tutors don't just give answers right away... they start with questions 
   and guidance to help students think for themselves, 
   then provide more direct help if the student struggles. 
   This "scaffold then tell" approach is built into the reward function.

   Key design principles:
   1. Always reward student progress, regardless of tutor strategy
   2. Early in a conversation: reward scaffolding (questions, focus), penalize giving answers
   3. Later in a conversation: reduce scaffolding bonuses, reward effective direct instruction
   4. Use a smooth transition between these phases rather than an abrupt switch
   5. Scale penalties based on how "telling" the hint is (revealing answers vs gentle hints)
   6. Keep it simple and avoid hard thresholds or complex calculations

   The reward transitions happen over ~8 turns, which is carefully calibrated for our dataset:
   - With average episode lengths of 15-25 turns, using 8 as the transition denominator makes it so that:
     * First ~3 turns: Strong scaffolding emphasis (turn_progress < 0.4)
     * Middle ~4 turns: Balanced transition phase (turn_progress 0.4-0.7)
     * Remaining turns: Increasing emphasis on problem progress
   - This 8-turn transition worked better than:
     * 5-turn (too fast, not enough guiding emphasis)
     * 10-turn (too slow, many episodes ended before reaching full direct instruction phase)
"""
def hybrid_reward(state, action_id, next_state=None, action_meta=None):
    # Extract key state information
    progress_delta = state[5]
    turn = state[1]

    # Core progress reward and step penalty
    progress_reward = 5.0 * progress_delta
    step_penalty = -0.1

    # Single transition factor (0 to 1) based on turn number
    # Represents how far into the conversation we are
    turn_progress = min(1.0, turn / 8.0)

    # Action-specific adjustment
    strategy_bonus = 0.0

    if action_meta is not None:
        cat = action_meta['category']

        # Guiding/scaffolding actions (Focus, Probing)
        if cat in ['Focus', 'Probing']:
            # Decreasing bonus for scaffolding actions as conversation progresses
            strategy_bonus = 0.2 * (1.0 - turn_progress)

        # Telling actions
        elif cat == 'Telling':
            # Simple hint severity factor
            severity = 1.0
            if 'strategy' in action_meta:
                if action_meta['strategy'] == 'Full Reveal (Answer)':
                    severity = 1.5
                elif action_meta['strategy'] == 'Conceptual Hint':
                    severity = 0.6

            # Early: penalty for telling, Late: bonus for effective telling
            early_penalty = -0.3 * (1.0 - turn_progress) * severity
            late_bonus = 0.0
            if progress_delta > 0:  # Only reward effective telling
                late_bonus = 0.2 * turn_progress

            strategy_bonus = early_penalty + late_bonus

    return progress_reward + step_penalty + strategy_bonus


In [4]:
# Use the predefined action mappings from system
action_map = {0: 0, 1: 1, 2: 2, 3: 3, 5: 4, 6: 5, 7: 6, 8: 7, 11: 8, 12: 9,
          13: 10, 16: 11, 17: 12, 18: 13, 20: 14, 21: 15, 22: 16, 23: 17,
          26: 18, 27: 19, 28: 20, 31: 21, 32: 22, 36: 23, 37: 24, 38: 25,
          41: 26, 42: 27, 43: 28, 45: 29, 46: 30, 47: 31, 48: 32, 54: 33,
          55: 34, 56: 35, 57: 36, 58: 37, 59: 38, 60: 39, 65: 40, 66: 41,
          67: 42, 70: 43, 71: 44, 72: 45, 73: 46, 75: 47, 76: 48, 77: 49}

# Define the action metadata map
action_meta_map = {
    "Focus": {
        "Seek Next Step": {1: 0, 2: 1, 3: 2},
        "Confirm Calculation": {1: 5, 2: 6, 3: 7, 4: 8},
        "Re-direct to Sub-Problem": {2: 11, 3: 12, 4: 13},
        "Highlight Missing Info": {2: 16, 3: 17, 4: 18}
    },
    "Probing": {
        "Ask for Explanation": {1: 20, 2: 21, 3: 22, 4: 23},
        "Seek Self-Correction": {2: 26, 3: 27, 4: 28},
        "Hypothetical Variation": {2: 31, 3: 32},
        "Check Understanding/Concept": {2: 36, 3: 37, 4: 38},
        "Encourage Comparison": {2: 41, 3: 42, 4: 43}
    },
    "Telling": {
        "Partial Reveal (Strategy)": {1: 45, 2: 46, 3: 47, 4: 48},
        "Full Reveal (Answer)": {1: 54, 2: 55, 3: 56, 4: 57, 5: 58, 6: 59},
        "Corrective Explanation": {1: 60}
    },
    "Generic": {
        "Acknowledgment/Praise": {1: 65, 2: 66, 3: 67},
        "Summarize Progress": {1: 70, 2: 71, 3: 72, 4: 73},
        "General Inquiry/Filler": {1: 75, 2: 76, 3: 77}
    }
}

feature_names = [
    'misconception_type', 'convo_turn', 'previous_action_id',
    'listen_to_feedback', 'problem_progress', 'progress_delta',
    'correct_solution', 'next_action_hint_strength'
]

csv_file_path = "data.csv"
# First build the transition buffer with all required parameters
df = pd.read_csv(csv_file_path)

buffer, action_mapping = build_transition_buffer(
    df=df,
    reward_fn=hybrid_reward,
    meta_map=action_meta_map,
    orig_to_idx=action_map,
    episode_column=None
)

(12016, 64)


In [5]:
12016 * 0.8

9612.800000000001

In [6]:
train_buffer = TransitionBuffer()
train_buffer.states = buffer.states[:9612]
train_buffer.actions = buffer.actions[:9612]
train_buffer.rewards = buffer.rewards[:9612]
train_buffer.next_states = buffer.next_states[:9612]
train_buffer.next_actions = buffer.next_actions[:9612]
train_buffer.dones = buffer.dones[:9612]

test_buffer = TransitionBuffer()
test_buffer.states = buffer.states[9612:]
test_buffer.actions = buffer.actions[9612:]
test_buffer.rewards = buffer.rewards[9612:]
test_buffer.next_states = buffer.next_states[9612:]
test_buffer.next_actions = buffer.next_actions[9612:]
test_buffer.dones = buffer.dones[9612:]



In [7]:
import pickle
file = open('train_replay_buffer.pkl', 'wb')
pickle.dump(train_buffer, file)
file.close()

file = open('test_replay_buffer.pkl', 'wb')
pickle.dump(test_buffer, file)
file.close()