In [1]:
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from scipy.stats import poisson
from __future__ import annotations
from policy import FiniteDeterministicPolicy
from markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from markov_decision_process import FiniteMarkovDecisionProcess
from distribution import Categorical, SampledDistribution, FiniteDistribution
from typing import (Callable, Dict, Generic, Iterator, Iterable,
                    Mapping, Optional, Sequence, Tuple, TypeVar)

Assume that transfer inventory from one store to another is decided before order inventory from the common supplier, and goods also take holding cost if it is on transportation. We assume $m_t$ to be the number of units that are transformed from store 1 to 2 at time t ($m_t$<0 means we transform good from store 2 to 1), and denote $o^{(l)}_t$ to be the order made from store $l=1,2$ at time t. Then in this two stores case, the action $a_t = (m_t, o^{(1)}_t, o^{(2)}_t)$.

In [27]:

@dataclass(frozen=True)
class InventoryState:
    on_hand_1: int
    on_order_1: int
    on_hand_2: int
    on_order_2: int

    def inventory_position_1(self) -> int:
        return self.on_hand_1 + self.on_order_1
    def inventory_position_2(self) -> int:
        return self.on_hand_2 + self.on_order_2
    
    

InvOrderMapping = Mapping[
    InventoryState,
    Mapping[int, Categorical[Tuple[InventoryState, float]]]
]


class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):

    def __init__(
        self,
        capacity_1: int,
        capacity_2: int,
        poisson_lambda_1: float,
        poisson_lambda_2: float,
        holding_cost_1: float,
        holding_cost_2: float,
        stockout_cost_1: float,
        stockout_cost_2: float,
        trans_cost_supplier: float,
        trans_cost_betweenstore: float,
    ):
        self.capacity_1: int = capacity_1; 
        self.capacity_2: int = capacity_2;
        self.poisson_lambda_1: float = poisson_lambda_1; 
        self.poisson_lambda_2: float = poisson_lambda_2;
        self.holding_cost_1: float = holding_cost_1; 
        self.holding_cost_2: float = holding_cost_2;
        self.stockout_cost_1: float = stockout_cost_1; 
        self.stockout_cost_2: float = stockout_cost_2;
        self.trans_cost_supplier: float = trans_cost_supplier; 
        self.trans_cost_betweenstore: float = trans_cost_betweenstore

        self.poisson_distr_1 = poisson(poisson_lambda_1)
        self.poisson_distr_2 = poisson(poisson_lambda_2)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d = {}

        for alpha1 in range(self.capacity_1 + 1):
            for beta1 in range(self.capacity_1 + 1 - alpha1):
                for alpha2 in range(self.capacity_2 + 1):
                    for beta2 in range(self.capacity_2 + 1 - alpha2):
                        state: InventoryState = InventoryState(alpha1, beta1, alpha2, beta2)
                        ip_1: int = state.inventory_position_1()
                        ip_2: int = state.inventory_position_2()
                
                        d1 = {}  
                        largest_move_to_2 = min(alpha1, self.capacity_2 - ip_2)
                        largest_move_to_1 = min(alpha2, self.capacity_1 - ip_1)
                        for move in range (largest_move_to_1 + largest_move_to_2 + 1):
                            move_from_1: int = move - largest_move_to_1
                            new_ip_1 = ip_1 - move_from_1
                            new_ip_2 = ip_2 + move_from_1
                            
                            base_reward: float = - self.holding_cost_1 * alpha1 - self.holding_cost_2 * alpha2
                        
                            if move_from_1 != 0 :
                                base_reward = base_reward - self.trans_cost_betweenstore
                                
                            for order1 in range(self.capacity_1 - new_ip_1 + 1):
                                for order2 in range(self.capacity_2 - new_ip_2 + 1):
                                    
                                    sr_probs_dict = {}

                                    if order1 != 0 :
                                        base_reward = base_reward - self.trans_cost_supplier
                                    if order2 != 0 :
                                        base_reward = base_reward - self.trans_cost_supplier
                                    
                                    
                                    probability1: float = 1 - self.poisson_distr_1.cdf(new_ip_1 - 1)
                                    probability2: float = 1 - self.poisson_distr_2.cdf(new_ip_2 - 1)
                                    reward1 = base_reward - self.stockout_cost_1 * (probability1 * (self.poisson_lambda_1 - new_ip_1) +\
                                                                                    new_ip_1 * self.poisson_distr_1.pmf(new_ip_1))
                                    reward2 = base_reward - self.stockout_cost_2 * (probability2 * (self.poisson_lambda_2 - new_ip_2) +\
                                                                                new_ip_2 * self.poisson_distr_2.pmf(new_ip_2))
                                    reward = reward1 + reward2 - base_reward

                                    for i1 in range (new_ip_1):
                                        for i2 in range (new_ip_2):
                                            sr_probs_dict[(InventoryState(new_ip_1 - i1, order1, new_ip_2 - i2, order2), base_reward)] = self.poisson_distr_1.pmf(i1) * self.poisson_distr_1.pmf(i2)
                                        sr_probs_dict[(InventoryState(new_ip_1-i1, order1, 0, order2), reward2)] = self.poisson_distr_1.pmf(i1) * probability2
                                    for i2 in range (new_ip_2):
                                        sr_probs_dict[(InventoryState(0, order1, new_ip_2 - i2, order2), reward1)] = probability1 * self.poisson_distr_2.pmf(i2)       
                                    sr_probs_dict[(InventoryState(0,order1, 0, order2), reward)] = probability1 * probability2
  
                                    d1[(move_from_1,order1,order2)] = Categorical(sr_probs_dict)
                        d[state] = d1
        return d

In [37]:
from dynamic_programming import policy_iteration_result
from dynamic_programming import value_iteration_result

if __name__ == '__main__':
    from pprint import pprint
    
    user_capacity_1 = 2; 
    user_capacity_2 = 2
    user_poisson_lambda_1 = 1.0; 
    user_poisson_lambda_2 = 1.0
    user_holding_cost_1 = 2.0; 
    user_holding_cost_2 = 2.0
    user_stockout_cost_1 = 10.0; 
    user_stockout_cost_2 = 10.0
    user_trans_cost_supplier = 2.0; 
    user_trans_cost_betweenstore = 3.0
    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity_1 = user_capacity_1, capacity_2 = user_capacity_2,
            poisson_lambda_1 = user_poisson_lambda_1, poisson_lambda_2 = user_poisson_lambda_2,
            holding_cost_1 = user_holding_cost_1, holding_cost_2 = user_holding_cost_2,
            stockout_cost_1 = user_stockout_cost_1, stockout_cost_2 = user_stockout_cost_2,
            trans_cost_supplier = user_trans_cost_supplier, 
            trans_cost_betweenstore = user_trans_cost_betweenstore
        )


    
    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(
        si_mdp,
        gamma=user_gamma
    )
    print(opt_policy_pi)
    print()
    

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    print(opt_policy_vi)
    print()

MDP Policy Iteration Optimal Value Function and Optimal Policy
--------------
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=0): Do Action (0, 0, 2)
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=1): Do Action (0, 1, 1)
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=2): Do Action (0, 2, 0)
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=1, on_order_2=0): Do Action (0, 1, 1)
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=1, on_order_2=1): Do Action (0, 2, 0)
For State InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=2, on_order_2=0): Do Action (0, 2, 0)
For State InventoryState(on_hand_1=0, on_order_1=1, on_hand_2=0, on_order_2=0): Do Action (0, 0, 2)
For State InventoryState(on_hand_1=0, on_order_1=1, on_hand_2=0, on_order_2=1): Do Action (0, 0, 1)
For State InventoryState(on_hand_1=0, on_order_1=1, on_hand_2=0, on_order_2=2): Do Action (0, 1, 0)
For State InventorySta

For state InventoryState(on_hand_1=2, on_order_1=0, on_hand_2=0, on_order_2=0), when the fixed transportation cost K1 make from the supplier is 2 and the transportation cost K2 from moving between the two stores is 1, the Action (1,0,1) is optimal; But when we change K2 to 3, then Action (0,0,2) is optimal. This makes perfect sense because when the cost K2 is large, we don't want to transform good between the two stores.