In [2]:
import numpy as np

In [122]:
class MP:
    """Markov process described by its transition matrix.

    ``transition[i]`` is treated as the distribution over next states when
    the chain is in state ``i`` (rows are expected to sum to one).
    """

    def __init__(self, transition):
        # Square array; only ``shape`` and ``transpose`` are used below.
        self.transition = transition

    def computeStationnary(self):
        """Return a stationary distribution of the chain.

        The distribution ``pi`` satisfies ``pi @ transition == pi``, i.e. it
        spans the null space of ``transition.T - I``. It is obtained as the
        eigenvector whose eigenvalue is closest to zero, rescaled so that its
        entries sum to one.

        Returns:
            A real 1-D array with one entry per state.
        """
        size = self.transition.shape[0]
        kernel = self.transition.transpose() - np.identity(size)
        values, vectors = np.linalg.eig(kernel)
        zero_index = np.argmin(np.abs(values))
        # ``eig`` switches the whole output to a complex dtype as soon as one
        # eigenvalue is complex (e.g. periodic chains). The eigenvector of the
        # real eigenvalue 0 still has a zero imaginary part, so dropping it
        # only restores a real-valued result.
        zero_vector = np.real(vectors[:, zero_index])
        return zero_vector / np.sum(zero_vector)

    
class MRP(MP):
    """Markov reward process: a Markov process plus rewards and a discount.

    Attributes set by the constructor:
        transition: stored by the ``MP`` base constructor.
        reward: reward data, stored as given.
        gamma: discount factor, stored as given.
    """

    def __init__(self, transition, reward, gamma):
        # Let the base class keep the transition matrix, then attach the
        # reward-specific data.
        super(MRP, self).__init__(transition)
        self.reward, self.gamma = reward, gamma

In [27]:
def MRPValue(mrp):
    """Return the state values of a Markov reward process.

    The values ``v`` solve the linear Bellman equation
    ``(I - gamma * P) v = r`` where ``P`` is ``mrp.transition``, ``r`` is
    ``mrp.reward`` and ``gamma`` is ``mrp.gamma``.

    Args:
        mrp: Object exposing ``transition`` (square array), ``reward`` and
            ``gamma``.

    Returns:
        Array of state values, one entry per state.

    Raises:
        numpy.linalg.LinAlgError: If ``I - gamma * P`` is singular.
    """
    size = mrp.transition.shape[0]
    system = np.identity(size) - mrp.gamma * mrp.transition
    # Solve the system directly rather than forming the inverse: same result,
    # fewer operations and better numerical accuracy.
    return np.linalg.solve(system, mrp.reward)

In [1]:
def fromDeterministic(policy):
    """Return, for every column of ``policy``, the row index of its maximum.

    With a policy laid out as ``policy[action, state]`` this yields the
    action carrying the largest weight in each state.
    """
    policy_matrix = np.asanyarray(policy)
    return policy_matrix.argmax(axis=0)

In [125]:
class MDP():
    """Finite Markov decision process stored as one reward process per action.

    ``self.mrps[a]`` exposes the ``transition`` matrix and ``reward`` vector
    that apply when action ``a`` is taken. ``self.S`` is the number of states
    and ``self.A`` the number of actions.

    Two policy representations are used:
      * stochastic: an ``(A, S)`` array with ``policy[a, s]`` the probability
        of choosing action ``a`` in state ``s``;
      * deterministic: a length-``S`` vector of action indices.
    """

    def __init__(self, gamma, transition=None, reward=None,  mrp_list=None):
        """Build the MDP from per-action arrays or from ready-made MRPs.

        Args:
            gamma: Discount factor shared by every action.
            transition: Per-action transition matrices (``transition[a]``);
                only used when ``mrp_list`` is None.
            reward: Per-action reward vectors (``reward[a]``); only used when
                ``mrp_list`` is None.
            mrp_list: Optional sequence of objects exposing ``transition`` and
                ``reward``. When given, it is used as-is.

        Raises:
            ValueError: If no usable description is supplied, if the two
                arrays disagree on the number of actions, or if there is no
                action at all.
        """
        self.gamma = gamma
        if mrp_list is None:
            if transition is None or reward is None:
                raise ValueError(
                    "provide either mrp_list or both transition and reward")
            if len(transition) != len(reward):
                raise ValueError(
                    "transition and reward must describe the same actions")
            self.mrps = [MRP(t, r, gamma) for t, r in zip(transition, reward)]
        else:
            self.mrps = list(mrp_list)
        if not self.mrps:
            raise ValueError("an MDP needs at least one action")
        self.S = self.mrps[0].transition.shape[0]
        self.A = len(self.mrps)

    def _one_hot(self, actions):
        """Turn a vector of action indices into an (A, S) 0/1 policy matrix."""
        actions = np.asarray(actions, dtype=int)
        matrix = np.zeros((self.A, self.S))
        matrix[actions, np.arange(self.S)] = 1.0
        return matrix

    def _action_values(self, value):
        """Return the (A, S) array ``reward_a[s] + gamma * P_a[s] . value``."""
        return np.array([
            mrp.reward + self.gamma * np.dot(mrp.transition, value)
            for mrp in self.mrps
        ])

    def toMRP(self, policy):
        """Return the MRP obtained by following a stochastic policy.

        Args:
            policy: ``(A, S)`` array, ``policy[a, s]`` being the probability
                of action ``a`` in state ``s``.

        Returns:
            An ``MRP`` whose row ``s`` of the transition matrix and entry
            ``s`` of the reward are the policy-weighted averages over actions.

        Raises:
            ValueError: If ``policy`` does not have shape ``(A, S)``.
        """
        policy = np.asarray(policy, dtype=float)
        if policy.shape != (self.A, self.S):
            raise ValueError(
                "policy must have shape (%d, %d), got %r"
                % (self.A, self.S, policy.shape))
        transitions = np.array([mrp.transition for mrp in self.mrps])
        rewards = np.array([mrp.reward for mrp in self.mrps])
        # Weight each action's row/reward by its probability and sum over
        # the action axis.
        transition = np.sum(policy[:, :, np.newaxis] * transitions, axis=0)
        reward = np.sum(policy * rewards, axis=0)
        return MRP(transition, reward, self.gamma)

    def evaluation(self, policy):
        """Return the state values of a stochastic ``(A, S)`` policy."""
        return MRPValue(self.toMRP(policy))

    def iteration(self, start=None):
        """Run policy iteration from a deterministic starting policy.

        Each round evaluates the current policy exactly, then switches every
        state to a greedy action with respect to the one-step look-ahead
        values, but only where that is a genuine improvement.

        Args:
            start: Length-``S`` vector of action indices. Defaults to action
                0 in every state. The argument is not modified.

        Returns:
            Tuple ``(policy, value)``: the final action index per state (int
            array) and the state values of that policy.

        Raises:
            ValueError: If ``start`` does not have shape ``(S,)``.
        """
        if start is None:
            policy = np.zeros(self.S, dtype=int)
        else:
            # Copy so the caller's array is left untouched.
            policy = np.array(start, dtype=int)
            if policy.shape != (self.S,):
                raise ValueError(
                    "start must have shape (%d,), got %r"
                    % (self.S, policy.shape))
        states = np.arange(self.S)
        while True:
            value = self.evaluation(self._one_hot(policy))
            action_values = self._action_values(value)
            greedy = np.argmax(action_values, axis=0)
            best = action_values[greedy, states]
            current = action_values[policy, states]
            # Ignore differences at rounding-noise level; otherwise tied
            # actions could make the policy flip back and forth forever.
            better = ~np.isclose(best, current, rtol=1e-9, atol=1e-12)
            if not better.any():
                return policy, value
            policy = np.where(better, greedy, policy)

    def valueIteration(self, start=0, eps=1e-8):
        """Approximate the optimal state values by value iteration.

        Repeats the Bellman optimality backup
        ``v[s] <- max_a (reward_a[s] + gamma * P_a[s] . v)`` until the largest
        absolute change over states drops below ``eps``. Convergence is
        expected for ``gamma < 1``.

        Args:
            start: Initial value, a scalar or a length-``S`` array.
            eps: Positive stopping threshold on the largest change.

        Returns:
            Length-``S`` array of state values.

        Raises:
            ValueError: If ``eps`` is not strictly positive.
        """
        if eps <= 0:
            raise ValueError("eps must be strictly positive")
        value = np.full(self.S, start, dtype=float)
        while True:
            updated = self._action_values(value).max(axis=0)
            dv = np.max(np.abs(updated - value))
            value = updated
            if dv < eps:
                return value
        

In [69]:
class Policy():
    """Stochastic policy backed by a probability matrix.

    ``policy[:, s]`` is used as the probability vector over actions in state
    ``s``, so rows index actions and columns index states.
    """

    def __init__(self,policy):
        self.policy = policy

    def forward(self,s):
        """Sample an action index for state ``s``.

        Args:
            s: Column index of the state in ``self.policy``.

        Returns:
            An action index drawn according to ``self.policy[:, s]``.
        """
        action_probabilities = self.policy[:, s]
        # The candidates are the actions (rows), so their count must match
        # the length of the probability vector rather than the state count.
        return np.random.choice(len(action_probabilities), p=action_probabilities)