# Assignment 13

In [1]:
# Change notebook width
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class "container" to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

# Problem 2

In [4]:
from dataclasses import dataclass
from typing import *
from rl.markov_decision_process import *
from rl.markov_process import *
from rl.distribution import *
from rl.dynamic_programming import *
from rl.returns import *
from scipy.stats import poisson
import pprint as pp
import numpy as np
import itertools
import time
from rl.monte_carlo import mc_prediction
from rl.td import td_prediction
from rl.function_approx import Tabular
from rl.function_approx import FunctionApprox
from rl.markov_process import FiniteMarkovRewardProcess
import matplotlib.pyplot as plt
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap

In [5]:
@dataclass(frozen=True)
class InventoryState:
    """Immutable inventory state made of an on-hand and an on-order quantity.

    Instances are hashable on their two fields, so they can be used as
    dictionary keys.
    """
    on_hand: int   # quantity currently on hand
    on_order: int  # quantity currently on order

    def inventory_position(self) -> int:
        """Return the inventory position, ``on_hand + on_order``."""
        return self.on_hand + self.on_order

    def __eq__(self, other):
        """Compare two states field by field.

        The comparison is duck-typed: any object exposing ``on_hand`` and
        ``on_order`` is compared on those two attributes. Objects that lack
        them (for example ``None``) yield ``NotImplemented`` so that Python
        falls back to its default handling and ``==`` evaluates to ``False``
        instead of raising ``AttributeError``.
        """
        try:
            return (self.on_hand == other.on_hand) and (self.on_order == other.on_order)
        except AttributeError:
            return NotImplemented


class SimpleInventoryMRP(MarkovRewardProcess[InventoryState]):
    """Inventory Markov reward process whose transitions are sampled.

    Demand in each step is drawn from a Poisson distribution with mean
    ``poisson_lambda``.
    """

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        """Store the model parameters.

        Args:
            capacity: level used to compute the next on-order quantity.
            poisson_lambda: mean of the Poisson demand.
            holding_cost: cost per unit of on-hand inventory.
            stockout_cost: cost per unit of demand exceeding the inventory
                position.
        """
        self.capacity = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

    def transition_reward(
        self,
        state: InventoryState
    ) -> SampledDistribution[Tuple[InventoryState, float]]:
        """Return a sampled distribution over ``(next_state, reward)`` pairs.

        Each sample draws one Poisson demand and derives the next state and
        the reward from it and from ``state``.
        """

        def draw_transition(state=state) -> Tuple[InventoryState, float]:
            # One demand realisation drives both the next state and the reward.
            demand: int = np.random.poisson(self.poisson_lambda)
            position: int = state.inventory_position()

            # On-hand is what is left of the position after demand (floored
            # at 0); on-order is the gap between capacity and the position
            # (floored at 0).
            left_over = max(position - demand, 0)
            to_order = max(self.capacity - position, 0)

            # Holding cost applies to the current on-hand quantity; stockout
            # cost applies to the demand that the position could not cover.
            holding_term: float = - self.holding_cost * state.on_hand
            stockout_term: float = self.stockout_cost * max(demand - position, 0)

            return InventoryState(left_over, to_order), holding_term - stockout_term

        return SampledDistribution(draw_transition)


class SimpleInventoryMRPFinite(FiniteMarkovRewardProcess[InventoryState]):
    """Inventory Markov reward process with an explicit transition table.

    The state space is every ``InventoryState(alpha, beta)`` with
    ``alpha + beta <= capacity``; demand is Poisson with mean
    ``poisson_lambda``.
    """

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        """Store the parameters and build the transition/reward table.

        Args:
            capacity: largest inventory position that is enumerated.
            poisson_lambda: mean of the Poisson demand.
            holding_cost: cost per unit of on-hand inventory.
            stockout_cost: cost per unit of unmet demand.
        """
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        self.poisson_distr = poisson(poisson_lambda)
        # The table is built from the attributes set above, so this call has
        # to come last.
        super().__init__(self.get_transition_reward_map())

    def get_transition_reward_map(self) -> RewardTransition[InventoryState]:
        """Return, for every state, a ``Categorical`` over ``(next_state, reward)``.

        For a state with inventory position ``ip``:

        * demand ``i < ip`` leads to ``InventoryState(ip - i, capacity - ip)``
          with probability ``pmf(i)`` and reward ``-holding_cost * on_hand``;
        * all demand ``>= ip`` is lumped into the single outcome
          ``InventoryState(0, capacity - ip)`` with probability
          ``1 - cdf(ip - 1)``. Its reward subtracts ``stockout_cost`` times
          the expected shortfall *conditional on that outcome*, so that the
          probability-weighted mean reward equals
          ``-holding_cost * on_hand - stockout_cost * E[max(D - ip, 0)]``.
        """
        d: Dict[InventoryState, Categorical[Tuple[InventoryState, float]]] = {}
        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state = InventoryState(alpha, beta)
                ip = state.inventory_position()
                beta1 = self.capacity - ip
                base_reward = - self.holding_cost * state.on_hand

                # Outcomes where demand is fully met: only holding cost.
                sr_probs_map: Dict[Tuple[InventoryState, float], float] =\
                    {(InventoryState(ip - i, beta1), base_reward):
                     self.poisson_distr.pmf(i) for i in range(ip)}

                # Probability that demand reaches or exceeds the position.
                probability = 1 - self.poisson_distr.cdf(ip - 1)

                # Unconditional expected shortfall E[max(D - ip, 0)] for
                # Poisson demand.
                expected_shortfall = (probability * (self.poisson_lambda - ip) +
                                      ip * self.poisson_distr.pmf(ip))

                # This outcome only occurs with `probability`, so the reward
                # attached to it must be the shortfall conditional on it;
                # attaching the unconditional value would count the
                # probability twice once the distribution is averaged.
                # The guard covers a numerically zero tail probability.
                if probability > 0:
                    conditional_shortfall = expected_shortfall / probability
                else:
                    conditional_shortfall = 0.0

                reward = base_reward - self.stockout_cost * conditional_shortfall
                sr_probs_map[(InventoryState(0, beta1), reward)] = probability
                d[state] = Categorical(sr_probs_map)
        return d

In [9]:
# Generic type variables used in the annotations below: S for states, A for actions.
S = TypeVar('S')
A = TypeVar('A')
# Value function: maps a state to a float.
V = Mapping[S, float]
# Action-value (Q) function: maps a (state, action) pair to a float.
Q = Mapping[Tuple[S, A], float]

def initiate_SARSA(mdp):
    """Build the tabular containers that SARSA starts from.

    Args:
        mdp: object exposing ``mapping`` (state -> mapping keyed by action, or
            ``None`` for a state without actions) and ``non_terminal_states``.

    Returns:
        A tuple ``(states, actions, q, counts_per_state_action, Policy)``:

        * ``states``: ``mdp.non_terminal_states``, unchanged;
        * ``actions``: for every key of ``mdp.mapping``, the list of its
          actions, or ``None`` when the mapping entry is ``None``;
        * ``q``: ``0.`` for every (state, action) pair of the states above;
        * ``counts_per_state_action``: ``0`` for the same pairs;
        * ``Policy``: a ``FinitePolicy`` giving every action of a state the
          same weight.
    """
    # Init Dicts
    q: Q = {}
    actions: Dict[InventoryState, Optional[List[int]]] = {}
    counts_per_state_action: Dict[Tuple[S, A], int] = {}
    policy_map: Dict[S, Optional[Categorical[A]]] = {}

    # Get States
    states: List[InventoryState] = mdp.non_terminal_states

    # Populate actions. A None entry is kept as None instead of calling
    # .keys() on it, which would raise AttributeError.
    for s in mdp.mapping:
        action_map = mdp.mapping[s]
        actions[s] = None if action_map is None else list(action_map.keys())

    # Loop over states
    for state in states:
        available = actions[state]

        # Check for "no actions" before iterating, so the None case is
        # actually reachable.
        if available is None:
            policy_map[state] = None
            continue

        # Set values and counts to zero for all state action pairs
        for action in available:
            q[(state, action)] = 0.
            counts_per_state_action[(state, action)] = 0

        # Equal weight on every available action
        policy_map[state] = Categorical({action: 1 for action in available})

    # Get Policy from policy map
    Policy: FinitePolicy[S, A] = FinitePolicy(policy_map)

    return states, actions, q, counts_per_state_action, Policy


def get_Q_vf_π(q :Q) -> Tuple[V, FinitePolicy[S,A]]:
    """Derive the greedy value function and policy from a Q table.

    Args:
        q: mapping from ``(state, action)`` to a value.

    Returns:
        ``(vf, policy)`` where ``vf[state]`` is the largest Q value recorded
        for that state and ``policy`` is a ``FinitePolicy`` that assigns each
        state a ``Constant`` on the action achieving it. On ties the action
        encountered first while iterating ``q`` is kept.
    """
    best_value: V = {}
    greedy_choice: Mapping[S, Optional[Constant[A]]] = {}

    for (state, action), value in q.items():
        # Keep the incumbent unless this entry is strictly better.
        if state in best_value and not value > best_value[state]:
            continue
        best_value[state] = value
        greedy_choice[state] = Constant(action)

    return (best_value, FinitePolicy(greedy_choice))

In [13]:
def tabular_sarsa(
        mdp:FiniteMarkovDecisionProcess,
        states: List[S],
        actions:Mapping[S,List[A]],
        γ: float,
        q: Q,
        counts :Mapping[Tuple[S, A], int],
        policy :FinitePolicy[S, A],
        ε:float = 0.1,
        αi:float = 0.05,
        num_episodes:int = 10000,
        half_life:float = 1000.0,
        exponent:float = 0.5
) -> Q:
    """Tabular SARSA control with an ε-greedy policy.

    Args:
        mdp: process queried through ``mdp.step(state, action).sample()``,
            which yields ``(next_state, reward)``.
        states: states a trajectory may start from.
        actions: state -> list of actions. A state with no entry, a ``None``
            entry or an empty list is treated as terminal.
        γ: discount factor.
        q: Q table; updated in place and returned.
        counts: visit count per (state, action); updated in place.
        policy: behaviour policy to start from. The mapping returned by
            ``policy.policy_map`` is assigned into on every step.
        ε: exploration probability of the ε-greedy policy.
        αi: initial learning rate.
        num_episodes: number of *transitions* to simulate (one ``mdp.step``
            per loop iteration), not a number of complete episodes.
        half_life: decay parameter of the learning rate.
        exponent: decay parameter of the learning rate.

    Returns:
        The updated ``q`` (the same object that was passed in).

    The learning rate for the n-th visit of a pair is
    ``αi / (1 + ((n - 1) / half_life) ** exponent)``.
    """

    def is_terminal(s) -> bool:
        # None, a missing entry, a None entry and an empty list all mean
        # that there is no action to take from `s`.
        return not actions.get(s)

    def random_start():
        # Equal weight on every candidate start state.
        return Categorical({s: 1 for s in states}).sample()

    state = random_start()
    need_action = True  # only true at the start of a trajectory

    for _ in range(int(num_episodes)):
        if need_action:
            action = policy.act(state).sample()
            need_action = False

        # Get Next State and Reward
        next_state, reward = mdp.step(state, action).sample()

        # Decide on terminality *before* touching the policy or Q table for
        # next_state; a terminal successor contributes a value of 0.
        terminal = is_terminal(next_state)
        if terminal:
            next_action = None
            next_q = 0.
        else:
            next_action = policy.act(next_state).sample()
            next_q = q[(next_state, next_action)]

        # Increment state action count
        counts[(state, action)] += 1
        # Get Learning rate
        α = αi / (1 + ((counts[(state, action)] - 1) / half_life) ** exponent)
        # SARSA update towards reward + γ * Q(S', A')
        q[(state, action)] += α * (reward + γ * next_q - q[(state, action)])

        # Make the policy ε-greedy with respect to q at the state just updated
        state_actions = actions[state]
        action_probs = {a: ε / len(state_actions) for a in state_actions}
        greedy_action = state_actions[
            np.argmax([q[(state, a)] for a in state_actions])
        ]
        action_probs[greedy_action] += 1 - ε
        new_policy = policy.policy_map
        new_policy[state] = Categorical(action_probs)
        policy = FinitePolicy(new_policy)

        if terminal:
            # Start over with a new sample; its action is drawn next iteration
            state = random_start()
            need_action = True
        else:
            # SARSA executes the very action A' it bootstrapped from
            state, action = next_state, next_action

    return q

In [15]:
# Parameters of the capacity-constrained inventory MDP and the discount factor.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
γ = 0.9

mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

# Value iteration result, printed first for comparison with SARSA below.
# `pp.pprint` is used because the notebook imports the module as `pp`; a bare
# `pprint` name is only available if a wildcard import happens to leak it.
print("Ye Old MDP Value Iteration")
opt_vf_vi, opt_policy_vi = value_iteration_result(mdp, gamma=γ)
pp.pprint(opt_vf_vi)
print()
print(opt_policy_vi)
print()


# Zero-initialised Q table / counts and the starting policy.
states, actions, q, counts, Policy = initiate_SARSA(mdp)

# NOTE(review): no random seed is set in the visible cells, so the SARSA
# numbers printed below will differ from run to run.
print("SARSA")
q_sarsa = tabular_sarsa(mdp,
                        states,
                        actions,
                        γ,
                        q,
                        counts,
                        Policy)

# Greedy value function and policy read off the learned Q table.
sarsa_vf, sarsa_policy = get_Q_vf_π(q_sarsa)
print()
pp.pprint(sarsa_vf)
print()
pp.pprint(sarsa_policy)
print()




Ye Old MDP Value Iteration
{InventoryState(on_hand=0, on_order=0): -34.89484576629397,
 InventoryState(on_hand=1, on_order=0): -28.660950216301437,
 InventoryState(on_hand=0, on_order=2): -27.991890076067463,
 InventoryState(on_hand=0, on_order=1): -27.66095021630144,
 InventoryState(on_hand=1, on_order=1): -28.991890076067467,
 InventoryState(on_hand=2, on_order=0): -29.991890076067463}

For State InventoryState(on_hand=0, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 with Probability 1.000


SARSA

{InventoryState(on_hand=0, on_order=0): -35.31084044326533,
 InventoryState(on_hand