Code for 1 2 3

In [23]:
import itertools
import numpy as np

from typing import Iterable, Callable, Mapping, TypeVar, List
from rl.markov_process import TransitionStep, ReturnStep
from rl.returns import returns
from rl.function_approx import Tabular
from rl.iterate import last
from collections import defaultdict
from rl.monte_carlo import mc_prediction
from rl.td import td_prediction
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.distribution import Constant

In [50]:
# Generic placeholder for the state type; it parameterises the step and
# mapping annotations of the prediction functions defined in this cell.
S=TypeVar('S')
def tb_mc_prediction(transitions:Iterable[Iterable[TransitionStep[S]]],
                    gamma:float,
                    tol:float=1e-06)->List[Mapping[S,float]]:
    """Tabular every-visit Monte Carlo prediction with a running-average update.

    Each trace is turned into return steps via ``returns(trace, gamma, tol)``.
    For every return step the visit count of its state is incremented and the
    state's estimate is moved towards ``step.return_`` with step size
    ``1 / count``. Visit counts and estimates carry over from one trace to
    the next; they are not reset per trace.

    Args:
        transitions: Traces (episodes) of steps; each trace is passed
            unchanged to ``returns``. Annotated as ``TransitionStep`` because
            the traces are converted to return steps here rather than by the
            caller.
        gamma: Discount factor forwarded to ``returns``.
        tol: Tolerance forwarded to ``returns``.

    Returns:
        One value-function snapshot per trace, in trace order. Every snapshot
        is an independent plain ``dict`` holding the estimates of all states
        visited so far. An empty ``transitions`` yields an empty list.
    """
    # Running tables. defaultdict lets a state that first shows up in a later
    # trace start from 0 instead of raising KeyError.
    values = defaultdict(float)
    visits = defaultdict(int)
    snapshots: List[Mapping[S, float]] = []
    for trace in transitions:
        for step in returns(trace, gamma, tol):
            state = step.state
            visits[state] += 1
            estimate = values[state]
            # Incremental mean: new = old + (G - old) / n.
            values[state] = estimate + 1.0 / visits[state] * (step.return_ - estimate)
        # Copy so that later traces cannot modify snapshots already returned.
        snapshots.append(dict(values))
    return snapshots
def tb_td_prediction(transitions:Iterable[TransitionStep[S]],
                    gamma:float)->List[Mapping[S,float]]:
    """Tabular TD(0) prediction with a per-state ``1 / count`` step size.

    For every step the visit count of ``step.state`` is incremented and its
    estimate is moved towards the one-step target
    ``step.reward + gamma * V(step.next_state)``. States without an estimate
    yet are treated as 0.

    Args:
        transitions: Steps exposing ``state``, ``reward`` and ``next_state``,
            processed in iteration order.
        gamma: Discount factor applied to the next state's estimate.

    Returns:
        One value-function snapshot per processed step, in step order. Each
        snapshot is an independent plain ``dict``, so earlier entries keep
        the estimates as they were at that step. Memory therefore grows with
        (number of steps) x (number of states seen). An empty input yields an
        empty list.
    """
    values = defaultdict(float)
    visits = defaultdict(int)
    snapshots: List[Mapping[S, float]] = []
    for step in transitions:
        state = step.state
        visits[state] += 1
        estimate = values[state]
        # Reading the next state's value registers it with the default 0.0,
        # so it also shows up in the snapshots.
        td_target = step.reward + gamma * values[step.next_state]
        values[state] = estimate + 1.0 / visits[state] * (td_target - estimate)
        # Append a copy: appending `values` itself would make every history
        # entry alias the same dict and thus show only the final estimates.
        snapshots.append(dict(values))
    return snapshots

In [51]:
# Parameters handed to SimpleInventoryMRPFinite below.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor used by every prediction call in this notebook section.
user_gamma = 0.9

# Sampling sizes for the Monte Carlo comparison (previously inline literals).
user_num_traces = 100
user_trace_length = 1000

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)
start = InventoryState(on_hand = 0, on_order = 0)

# Every trace starts from `start` and is cut off after `user_trace_length`
# steps; traces are materialised as lists so both predictors below consume
# exactly the same data.
sample = [
    list(itertools.islice(si_mrp.simulate_reward(Constant(start)), user_trace_length))
    for _ in range(user_num_traces)
]

# NOTE: despite their names, `reward` and `reward2` hold value-function
# estimates (state -> value), not rewards.
# `reward`: final snapshot of the hand-written tabular MC predictor.
reward = tb_mc_prediction(sample, user_gamma)[-1]
print(reward)
# `reward2`: final Tabular approximation produced by the library's
# mc_prediction on the same traces, printed for side-by-side comparison.
reward2 = last(mc_prediction(sample, Tabular(), user_gamma)).values_map
print(reward2)

{InventoryState(on_hand=0, on_order=0): -35.48668974722294, InventoryState(on_hand=0, on_order=2): -28.325958710622196, InventoryState(on_hand=1, on_order=0): -28.79412051879173, InventoryState(on_hand=0, on_order=1): -27.827449105824858, InventoryState(on_hand=1, on_order=1): -29.221346226213964, InventoryState(on_hand=2, on_order=0): -30.146375426668925}
{InventoryState(on_hand=0, on_order=0): -35.48668974722309, InventoryState(on_hand=0, on_order=2): -28.325958710622164, InventoryState(on_hand=1, on_order=0): -28.794120518791715, InventoryState(on_hand=0, on_order=1): -27.82744910582485, InventoryState(on_hand=1, on_order=1): -29.22134622621395, InventoryState(on_hand=2, on_order=0): -30.146375426668886}


In [52]:
# Length of the single long trace used for the TD comparison (previously an
# inline literal).
user_td_num_steps = 100000

# Reuses `si_mrp`, `start` and `user_gamma` from the previous cell. The trace
# is materialised as a list so both predictors below see the same steps.
sample2 = list(itertools.islice(si_mrp.simulate_reward(Constant(start)), user_td_num_steps))

# NOTE: as in the previous cell, `reward` / `reward2` are value-function
# estimates (state -> value), not rewards.
# `reward`: last entry of the hand-written tabular TD predictor's history.
reward = tb_td_prediction(sample2, user_gamma)[-1]
print(reward)
# `reward2`: final Tabular approximation from the library's td_prediction on
# the same trace, printed for side-by-side comparison.
reward2 = last(td_prediction(sample2, Tabular(), user_gamma)).values_map
print(reward2)

defaultdict(<function tb_td_prediction.<locals>.<lambda> at 0x7ff869013550>, {InventoryState(on_hand=0, on_order=0): -24.775000173865344, InventoryState(on_hand=0, on_order=2): -17.612531898647614, InventoryState(on_hand=1, on_order=0): -18.195668598431055, InventoryState(on_hand=1, on_order=1): -18.57732451012334, InventoryState(on_hand=0, on_order=1): -17.190344188108035, InventoryState(on_hand=2, on_order=0): -19.535314370593206})
{InventoryState(on_hand=0, on_order=0): -24.775000173865042, InventoryState(on_hand=0, on_order=2): -17.612531898647305, InventoryState(on_hand=1, on_order=0): -18.195668598430817, InventoryState(on_hand=1, on_order=1): -18.577324510122814, InventoryState(on_hand=0, on_order=1): -17.19034418810812, InventoryState(on_hand=2, on_order=0): -19.535314370593106}
