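State Space: $S=\{0,w_1,\ldots,w_n\}$, where $0$ is the unemployed state and $w_i$ is the state of holding a job that pays wage $w_i$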

Action Space: $A=\{\text{reject},\text{accept}\}$

Transition function: P(s1,a,s2)
1. if $s1\neq 0$ (employed), for either action $a$: $P(s1,a,0)=\alpha$ (the job is lost) and $P(s1,a,s1)=1-\alpha$ (the job is kept)
2. if $s1=0$ (unemployed): $P(s1,\text{accept},w_i)=p_i$ for $i=1,\ldots,n$, and $P(s1,\text{reject},0)=1$

Reward function: R(s1,a,s2)=log(s1) if s1>0
R(s1,a,s2)=0 if s1=0

In [19]:
from dataclasses import dataclass
from typing import Tuple, Dict,Mapping, Iterator
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import FinitePolicy, StateActionMapping
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical, Constant
from scipy.stats import poisson
from rl.dynamic_programming import value_iteration_result,value_iteration,almost_equal_vfs
from rl.iterate import converged, iterate,converge
import math
import matplotlib.pyplot as plt

# State of the MDP: the wage currently being earned.
@dataclass(frozen=True)
class SalaryState:
    """Immutable, hashable state holding the current salary.

    The rest of this file uses ``SalaryState(0)`` for the unemployed state
    and ``SalaryState(w)`` for a job that pays wage ``w``.
    """

    salary: int

# Action encoding used throughout this file: 0 = reject, 1 = accept.
# Alias for the state -> action -> outcome-distribution mapping that
# SimpleSalaryMDP builds and passes to the FiniteMarkovDecisionProcess
# constructor.
SalaryStateMapping = StateActionMapping[SalaryState, int]


class SimpleSalaryMDP(FiniteMarkovDecisionProcess[SalaryState, int]):
    """Finite MDP of a worker who is either unemployed or holding a job.

    States:
        ``SalaryState(0)`` is the unemployed state; ``SalaryState(w)`` is the
        state of holding a job paying wage ``w`` for each ``w`` in ``wages``.

    Actions:
        ``0`` = reject, ``1`` = accept.

    Dynamics:
        * Unemployed + reject: stay unemployed with probability 1, reward 0.
        * Unemployed + accept: move to ``SalaryState(wages[i])`` with weight
          ``prob[i]``, reward 0.
        * Employed at wage ``w`` (either action): reward ``log(w)``; move to
          the unemployed state with probability ``alpha``, otherwise stay.
    """

    def __init__(
        self,
        alpha: float,
        wages,
        prob
    ):
        """Validate the parameters and build the underlying finite MDP.

        Args:
            alpha: Probability of losing the current job in one step.
            wages: Sequence of strictly positive wages, one per possible job.
            prob: Sequence of offer weights, ``prob[i]`` belonging to
                ``wages[i]``; must be the same length as ``wages``.

        Raises:
            ValueError: If ``alpha`` is not in ``[0, 1]``, ``wages`` is
                empty, ``wages`` and ``prob`` differ in length, or a wage is
                not strictly positive.
        """
        if not 0 <= alpha <= 1:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
        if len(wages) == 0:
            raise ValueError("wages must contain at least one wage")
        if len(wages) != len(prob):
            # Without this check a longer ``prob`` is silently truncated and
            # a shorter one fails with an unhelpful IndexError.
            raise ValueError(
                f"wages and prob must have the same length, "
                f"got {len(wages)} and {len(prob)}"
            )
        if any(wage <= 0 for wage in wages):
            # log(wage) is undefined for wage <= 0, and wage 0 would collide
            # with the unemployed state SalaryState(0).
            raise ValueError("all wages must be strictly positive")

        self.alpha: float = alpha
        self.wages = wages
        self.prob = prob
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> SalaryStateMapping:
        """Build the state -> action -> (next state, reward) distribution map.

        Returns:
            A dict keyed by ``SalaryState`` whose values map each action
            (``0`` or ``1``) to a ``Categorical`` over
            ``(next_state, reward)`` pairs.
        """
        unemployed = SalaryState(0)
        mapping: Dict[
            SalaryState,
            Dict[int, Categorical[Tuple[SalaryState, float]]]
        ] = {}

        # Accepting while unemployed draws one of the wages. Weights are
        # accumulated so that a wage listed more than once keeps its total
        # weight instead of only the last entry.
        offer_weights: Dict[Tuple[SalaryState, float], float] = {}
        for wage, weight in zip(self.wages, self.prob):
            outcome = (SalaryState(wage), 0.0)
            offer_weights[outcome] = offer_weights.get(outcome, 0.0) + weight

        mapping[unemployed] = {
            # Rejecting keeps the worker unemployed with certainty.
            0: Categorical({(unemployed, 0.0): 1.0}),
            1: Categorical(offer_weights),
        }

        # While employed the chosen action has no effect: the worker earns
        # log(wage) and loses the job with probability alpha.
        for wage in self.wages:
            employed = SalaryState(wage)
            reward = math.log(wage)
            mapping[employed] = {
                action: Categorical({
                    (unemployed, reward): self.alpha,
                    (employed, reward): 1 - self.alpha,
                })
                for action in range(2)
            }
        return mapping

In [20]:
# Possible wages and the probability attached to each one.
salary = [1, 2, 3]
prob = [0.2, 0.3, 0.5]

# alpha is the per-step probability of moving from a job back to state 0.
si_mdp1 = SimpleSalaryMDP(alpha=0.1, wages=salary, prob=prob)

# Run value iteration with gamma=0.9; the result is unpacked into the
# optimal value function and the optimal policy.
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp1, gamma=0.9)

In [21]:
opt_vf_vi

{SalaryState(salary=0): 6.2524282258739845,
 SalaryState(salary=1): 2.961625384312576,
 SalaryState(salary=2): 6.609768439164322,
 SalaryState(salary=3): 8.743795323518912}

In [22]:
opt_policy_vi

For State SalaryState(salary=0):
  Do Action 1 with Probability 1.000
For State SalaryState(salary=1):
  Do Action 0 with Probability 1.000
For State SalaryState(salary=2):
  Do Action 0 with Probability 1.000
For State SalaryState(salary=3):
  Do Action 0 with Probability 1.000