### Sketch of hierarchical value iteration (VI)

In [None]:
from typing import Any, Tuple, Union, TypeVar

# Opaque placeholder types: this sketch never looks inside a state or an action.
State = TypeVar("State")
Action = TypeVar("Action")

# Discrete distribution over states, written as (state, probability) pairs.
StateDist = list[tuple[State, float]]

# Discrete distribution whose outcomes are (state, float) pairs, written as
# ((state, float), float) entries -- presumably ((next_state, reward), probability).
StateRewardDist = list[tuple[tuple[State, float], float]]

class MDP:
    """Interface sketch of a Markov decision process.

    Nothing is implemented here: every method is a stub that returns
    ``None``, and ``state_list`` is only annotated, never assigned.
    """

    # Annotation only; presumably the enumeration of all states.
    state_list: list[State]

    def actions(self, state: State) -> list[Action]:
        """Stub; intended to list the actions available in ``state``."""

    def next_state_reward_dist(self, state: State, action: Action) -> StateRewardDist:
        """Stub; intended to give the outcome distribution of ``action`` in ``state``."""

    def is_terminal(self, state: State) -> bool:
        """Stub; intended to tell whether ``state`` is terminal."""

class SubTask:
    """Interface sketch of one node in a task hierarchy over an ``MDP``.

    Nothing is implemented here: every method is a stub that returns
    ``None``, and ``mdp`` is only annotated, never assigned.
    """

    # Annotation only; the underlying MDP this sub-task operates on.
    mdp: MDP

    def child_subtasks(self, state: State) -> list[Union["SubTask", Action]]:
        """Stub; intended to list the children (sub-tasks or actions) usable in ``state``."""

    def continuation_prob(self, state: State) -> float:
        """Stub; intended to give the probability of continuing from ``state``."""

    def exit_reward(self, state: State) -> float:
        """Stub; intended to give the reward received when exiting in ``state``."""

    def exit_distribution(self, state: State) -> StateDist:
        """Stub; intended to give (exit_state, probability) pairs from ``state``.

        The original note on this method reads "eq 8"; the reference it
        points to is not part of this file.
        """


In [None]:
def value(subtask: SubTask, s: State) -> float:
    """Return the optimal value of state ``s`` inside ``subtask``.

    Evaluates, by plain recursion,

        V_j(s) = max_a sum_{s', r} p(s', r | s, a) [r + gamma_j(s) * V_j(s')]
                 + (1 - gamma_j(s)) * eps_j(s)

    where ``gamma_j`` is ``subtask.continuation_prob``, ``eps_j`` is
    ``subtask.exit_reward`` and ``a`` ranges over ``subtask.child_subtasks(s)``.

    Args:
        subtask: The sub-task whose value function is evaluated.
        s: The state to evaluate.

    Returns:
        ``0`` if ``s`` is terminal in the underlying MDP; otherwise the best
        expected child value plus the exit term. When ``s`` has no children
        the maximum is taken over an empty set and stays at ``-inf``.

    NOTE(review): there is no memoisation or horizon, so the recursion only
    ends when every path reaches a terminal state -- confirm before running
    this on a state space with cycles.
    """
    if subtask.mdp.is_terminal(s):
        return 0  # HACK: do we need to add pseudoreward???
    continue_prob = subtask.continuation_prob(s)

    # max_a \sum_{s', r} p(s', r | s, a) [r + gamma_j(s)*V(s')]
    max_qval = float("-inf")
    for a in subtask.child_subtasks(s):
        # Build the semi-MDP outcomes of `a` as flat (next_state, reward, prob).
        if isinstance(a, SubTask):
            # The child's value at `s` is the same for every exit state, so
            # the (recursive) call is made once rather than once per exit.
            child_value = value(a, s)
            outcomes = [
                (ns, child_value - a.exit_reward(ns), prob)
                for ns, prob in a.exit_distribution(s)
            ]
        else:
            # `Action` is a TypeVar, so isinstance(a, Action) raises TypeError;
            # any child that is not a SubTask is treated as a primitive action.
            # StateRewardDist entries are ((next_state, reward), prob) pairs,
            # which are flattened here to match the sub-task branch.
            outcomes = [
                (ns, r, prob)
                for (ns, r), prob in subtask.mdp.next_state_reward_dist(s, a)
            ]
        # Expected value of the child: Q(subtask, s, a).
        qval = sum(
            prob * (r + continue_prob * value(subtask, ns))
            for ns, r, prob in outcomes
        )
        max_qval = max(max_qval, qval)

    # + (1 - gamma_j(s))*eps_j(s)
    return max_qval + (1 - continue_prob) * subtask.exit_reward(s)
