In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

from typing import Iterable, Iterator, TypeVar, Callable, Mapping, Tuple
from rl.distribution import Categorical
from rl.iterate import last
from rl.markov_decision_process import MarkovDecisionProcess, Policy, TransitionStep, NonTerminal
from rl.policy import DeterministicPolicy, RandomPolicy, UniformPolicy
import rl.markov_process as mp
from rl.markov_process import MarkovRewardProcess
from rl.returns import returns
import itertools


## Tabular MC Prediction

In [3]:
S = TypeVar('S')
def tabular_mc_prediction(
    mrp: MarkovRewardProcess[S],
    traces: Iterable[Iterable[Tuple[NonTerminal[S], float]]],
    gamma: float,
) -> Mapping[NonTerminal[S], float]:
    count: Mapping[NonTerminal[S], int] = {s: 0 for s in mrp.non_terminal_states}
    sum_return: Mapping[NonTerminal[S], float] = {s: 0.0 for s in mrp.non_terminal_states}
    for trace in traces:
        for state_return in trace:
            state, return_ = state_return
            count[state] += 1
            sum_return[state] += return_
            
    value_function: Mapping[NonTerminal[S], float] = {}
    for state in mrp.non_terminal_states:
        value_function[state] = sum_return[state] / count[state]
    return value_function

### Test on SimpleInventory MRPFinite

In [4]:
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.distribution import Choose
from rl.iterate import last
from itertools import islice
from pprint import pprint

In [5]:
# exact value
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9
si_mrp: SimpleInventoryMRPFinite = SimpleInventoryMRPFinite(
    capacity = user_capacity,
    poisson_lambda = user_poisson_lambda,
    holding_cost = user_holding_cost,
    stockout_cost = user_stockout_cost
)
si_mrp.display_value_function(gamma=user_gamma)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345}


In [6]:
# tabular_MC
episodes_num = 60000
def episode(
    mrp: MarkovRewardProcess[S],
    gamma = float,
    episode_length_tolerance: float = 1e-6
):
    max_steps = round(math.log(episode_length_tolerance)/ math.log(gamma))
    start_state_distribution = Choose(si_mrp.non_terminal_states)
    state: State[S] = start_state_distribution.sample()
    reward: float = 0.0
    state_reward_pair = []; i = 0;
    while isinstance(state, NonTerminal) and i <= 2 * max_steps:
        next_distribution = mrp.transition_reward(state)
        next_state, reward = next_distribution.sample()
        state_reward_pair.append((state,reward))
        state = next_state
        i = i + 1
    
    accumulate_return: float = 0.0
    state_return_pair = []
    for i in range (2*max_steps,-1,-1):
        state, reward = state_reward_pair[i]
        accumulate_return = gamma * accumulate_return + reward
        if i < max_steps:
            state_return_pair.append((state, accumulate_return))      
    return state_return_pair  

episodes = []
for i in range (episodes_num):
    episodes.append(episode(si_mrp, gamma = user_gamma, episode_length_tolerance = 1e-6))

tabular_mc_prediction(si_mrp, episodes, user_gamma)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.51513860043066,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.930186469270527,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.348531656197533,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.93292531437083,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.346984361537285,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.34950462107344}

#### We see that the Value Function computed by Tabular Monte-Carlo Prediction with 60000 trace experiences is within 0.01 of the exact Value Function, for each of the states.

## Tabular TD Prediction

In [7]:
def tabular_td_prediction(
    mrp: MarkovRewardProcess[S],
    traces: Iterable[Iterable[Tuple[NonTerminal[S], float, NonTerminal[S]]]],
    gamma: float,
    alpha_func: Callable[[int], float]
) -> Mapping[NonTerminal[S], float]:
    count: Mapping[NonTerminal[S], int] = {s: 0 for s in mrp.non_terminal_states}
    value_function: Mapping[NonTerminal[S], float] = {s: 0.0 for s in mrp.non_terminal_states}
    for trace in traces:
        for state_pair in trace:
            state, reward, next_state = state_pair
            count[state] += 1
            n = count[state]
            alpha = alpha_func(n)
            value_function[state] = (1-alpha) * value_function[state] + alpha * (reward + gamma * value_function[next_state])
    return value_function

### Test on SimpleInventory MRPFinite

In [9]:
# exact value
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9
si_mrp: SimpleInventoryMRPFinite = SimpleInventoryMRPFinite(
    capacity = user_capacity,
    poisson_lambda = user_poisson_lambda,
    holding_cost = user_holding_cost,
    stockout_cost = user_stockout_cost
)
si_mrp.display_value_function(gamma=user_gamma)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345}


In [8]:
# tabular_TD
episodes_num = 60000
episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
gamma: float = 0.9

# learning rate function
def learning_rate_schedule(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    def lr_func(n: int) -> float:
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr_func

def episode(
    mrp: MarkovRewardProcess[S],
    episode_length: int 
):
    start_state_distribution = Choose(si_mrp.non_terminal_states)
    state: State[S] = start_state_distribution.sample()
    state_reward_pair = []; 
    for i in range (episode_length):
        next_distribution = mrp.transition_reward(state)
        next_state, reward = next_distribution.sample()
        state_reward_pair.append((state,reward,next_state))
        state = next_state  
    return state_reward_pair 

episodes = []
for i in range (episodes_num):
    episodes.append(episode(si_mrp, episode_length))


tabular_td_prediction(si_mrp, episodes, user_gamma, learning_rate_schedule(initial_learning_rate, half_life, exponent))

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.5302286312704,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -28.00534413402638,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.328497787464332,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.95427280815482,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.39868260310957,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.3936992936391}

#### We see that the Value Function computed by Tabular Monte-Carlo Prediction with 60000 trace experiences is within 0.06 of the exact Value Function, for each of the states.

### How to generate traces using yield? 

In [27]:
trace_gen = si_mrp.simulate_reward(start_state_distribution)
trace = next(trace_gen)
print(trace)
while True:
    try:
        next(trace)
    except StopIteration:
        break

TransitionStep(state=NonTerminal(state=InventoryState(on_hand=1, on_order=0)), next_state=NonTerminal(state=InventoryState(on_hand=0, on_order=1)), reward=-4.678794411714423)


TypeError: 'TransitionStep' object is not an iterator