In [2]:
import numpy as np

In [122]:
class MP:
    """Markov process described by a square transition matrix.

    Parameters
    ----------
    transition : numpy.ndarray
        Square array; ``computeStationnary`` treats it as row-stochastic
        (``transition[s, s2]`` is the probability of going from ``s`` to
        ``s2``), since it looks for a left eigenvector of the matrix.
    """

    def __init__(self, transition):
        self.transition = transition

    def computeStationnary(self):
        """Return a stationary distribution of the process.

        The stationary distribution ``pi`` satisfies ``pi @ P == pi``, i.e. it
        is an eigenvector of ``P.T - I`` for the eigenvalue 0. The eigenvector
        whose eigenvalue is closest to zero is selected and rescaled so that
        its entries sum to one.

        Returns
        -------
        numpy.ndarray
            Real 1-D array with one entry per state, summing to one.
        """
        n_states = self.transition.shape[0]
        kernel = self.transition.transpose() - np.identity(n_states)
        values, vectors = np.linalg.eig(kernel)
        zero_index = np.argmin(np.abs(values))
        zero_vect = vectors[:, zero_index]
        # np.linalg.eig returns complex arrays as soon as any eigenvalue is
        # complex (e.g. periodic chains). Dividing by the sum removes any common
        # phase, so only the real part of the normalised vector is meaningful.
        return np.real(zero_vect / np.sum(zero_vect))

    
class MRP(MP):
    """Markov reward process: a Markov process with a reward and a discount.

    Parameters
    ----------
    transition : forwarded unchanged to ``MP.__init__``.
    reward : stored as ``self.reward`` (presumably one reward per state --
        TODO confirm against callers).
    gamma : stored as ``self.gamma``; the discount factor.
    """

    def __init__(self, transition, reward, gamma):
        # The base class is responsible for storing the transition matrix.
        super(MRP, self).__init__(transition)
        self.gamma = gamma
        self.reward = reward
          
class MRP_2(MP):
    """Markov reward process whose reward is attached to transitions.

    Parameters
    ----------
    transition : forwarded unchanged to ``MP.__init__``.
    reward : stored as ``self.reward``. ``cast`` multiplies it element-wise
        with ``transition``, so it is expected to have the same ``(S, S)``
        shape, with ``reward[s, s2]`` the reward for moving from ``s`` to
        ``s2`` -- TODO confirm against callers.
    gamma : stored as ``self.gamma``; the discount factor.
    """

    def __init__(self, transition, reward, gamma):
        super().__init__(transition)
        self.reward = reward
        self.gamma = gamma

    def cast(self):
        """Convert to an ``MRP`` carrying the expected reward of each state.

        For every state ``s`` the new reward is
        ``sum_k reward[s, k] * transition[s, k]``.

        Returns
        -------
        MRP
            Built from the same transition matrix and discount factor.
        """
        # Row-wise sum of the element-wise product; this is the diagonal of
        # reward @ transition.T without forming the full (S, S) product.
        expected_reward = np.sum(
            np.asarray(self.reward) * np.asarray(self.transition), axis=1
        )
        return MRP(self.transition, expected_reward, self.gamma)

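The Bellman equation for the value function $v$ of an MRP can be expressed in matrix form, with $P$ the transition matrix and $R$ the reward vector, as

<center>
    <br>
    $v = R + \gamma P v$
    
</center>

Since this equation is linear in $v$, it can be solved directly as $v = (I - \gamma P)^{-1} R$ whenever $I - \gamma P$ is invertible (e.g. when $\gamma < 1$).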

In [27]:
def value(mrp):
    """Solve the Bellman equation ``v = R + gamma * P @ v`` for ``v``.

    Parameters
    ----------
    mrp : object exposing ``transition`` (square numpy array), ``reward`` and
        ``gamma`` attributes.

    Returns
    -------
    numpy.ndarray
        The solution ``v`` of ``(I - gamma * P) @ v = R``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``I - gamma * P`` is singular.
    """
    n_states = mrp.transition.shape[0]
    system = np.identity(n_states) - mrp.gamma * mrp.transition
    # Solving the linear system directly avoids forming the explicit inverse.
    return np.linalg.solve(system, mrp.reward)

A Markov Decision Process is defined by:

   - A finite set of states $\mathcal S$ 
   
   - A finite set of actions $\mathcal A$ 

   - A transition probability $\mathcal P^{a}_{ss'}$, which gives the probability of moving from state $s$ to state $s'$ after taking action $a$.
   
   - A reward function $\mathcal R^{a}_s = \mathbb E[R_{t+1} \mid S_t = s, A_t = a]$
   
   - A discount factor $\gamma$ in $[0,1]$
   

A policy $\pi$ is a probability distribution over actions given the current state

<center>
    $\pi(a|s) = \mathbb P [A_t = a | S_t = s]$
</center>

The value function is the expected return $G_t$ (the discounted sum of future rewards) obtained when starting from state $s$ and following policy $\pi$

<center>
    <br>
    $v_\pi(s) = \mathbb E_\pi[G_t \mid S_t = s]$
</center>

In [125]:
class MDP():
    """Markov decision process stored as a collection of MRPs.

    Parameters
    ----------
    gamma : stored as ``self.gamma``; the discount factor.
    transition, reward : optional. When ``mrp_list`` is not given, one ``MRP``
        is built from ``transition[i]`` and ``reward[i]`` for every ``i`` in
        ``range(transition.shape[0])`` (presumably one index per action --
        TODO confirm).
    mrp_list : optional pre-built sequence of MRPs. When given it is stored
        as-is and ``transition`` / ``reward`` are ignored.

    Raises
    ------
    ValueError
        If neither ``mrp_list`` nor both ``transition`` and ``reward`` are
        supplied.
    """

    def __init__(self, gamma, transition=None, reward=None,  mrp_list=None):
        self.gamma = gamma
        # `is not None` rather than `!= None`: the latter is element-wise on
        # numpy arrays and would not yield a single boolean.
        if mrp_list is not None:
            self.mrps = mrp_list
        elif transition is None or reward is None:
            raise ValueError(
                "MDP needs either mrp_list or both transition and reward"
            )
        else:
            self.mrps = [
                MRP(transition[i], reward[i], gamma)
                for i in range(transition.shape[0])
            ]

In [69]:
class Policy():
    """Stochastic policy stored as a table of action probabilities.

    Parameters
    ----------
    policy : numpy.ndarray
        2-D array indexed as ``policy[action, state]``; column ``s`` holds the
        probability of each action in state ``s`` and must sum to one for
        sampling to succeed.
    """

    def __init__(self, policy):
        self.policy = policy

    def forward(self, s):
        """Sample an action index for state ``s``.

        Returns
        -------
        int
            Index in ``range(number_of_actions)`` drawn from column ``s``.
        """
        action_probs = self.policy[:, s]
        # The candidates are the rows (actions) of the table, so their count is
        # shape[0]; it has to match the length of the probability column.
        return np.random.choice(self.policy.shape[0], p=action_probs)

In [121]:
def toMRP(mdp, policy):
    """Collapse an MDP and a policy into the MRP induced by that policy.

    For every state ``s`` the per-action rewards and transition rows are
    averaged with the policy weights ``policy[a, s]``.

    Parameters
    ----------
    mdp : object exposing ``gamma`` and ``mrps``, where ``mrps[a]`` has
        ``reward`` and ``transition`` attributes indexable by state.
    policy : 2-D array indexed as ``policy[action, state]``, or an object
        exposing such an array as its ``policy`` attribute.

    Returns
    -------
    MRP
        With an ``(S, S)`` transition matrix, a length-``S`` reward vector and
        the MDP's discount factor.
    """
    # Accept either the raw probability table or a wrapper that stores it
    # under `.policy`.
    table = getattr(policy, "policy", policy)
    A, S = table.shape
    transition = np.zeros((S, S))
    reward = np.zeros(S)
    for s in range(S):
        for a in range(A):
            weight = table[a, s]
            reward[s] += weight * mdp.mrps[a].reward[s]
            transition[s] += weight * mdp.mrps[a].transition[s]
    return MRP(transition, reward, mdp.gamma)

In [127]:
class MDP_2():
    """Markov decision process built from ``MRP_2`` components.

    Parameters
    ----------
    transition, reward : indexed along their first axis; one ``MRP_2`` is
        built from ``transition[i]`` and ``reward[i]`` for every ``i`` in
        ``range(transition.shape[0])`` (presumably one index per action --
        TODO confirm).
    gamma : stored as ``self.gamma``; the discount factor.
    """

    def __init__(self, transition, reward, gamma):
        self.gamma = gamma
        self.mrps = [
            MRP_2(transition[i], reward[i], gamma)
            for i in range(transition.shape[0])
        ]

    def cast(self):
        """Return an ``MDP`` whose MRPs are the ``cast()`` of each component."""
        return MDP(self.gamma, mrp_list=[mrp.cast() for mrp in self.mrps])