In [317]:
import numpy as np
import scipy.linalg
import operator

## 1. Markov Process (MP) Representation

In [23]:
class MarkovProcess(object):

    def __init__(self, transition_proba):
        """
        Initialize the MarkovProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary
            {state : {state : probability}} representing the probabilities
            of transitioning from one state to another in the Markov
            Process. Every inner dictionary must contain every state.

        Three attributes
        ----------
        transition_proba: dictionary of dictionary

        states: list
            the state space of the Markov Process, in the key order of
            `transition_proba`

        stationary_dist: dictionary of float
            {state: proba} to represent the long-run probability of being in
            each state; None until calculate_stationary_distribution() runs
        """
        self.transition_proba = transition_proba
        self.states = list(transition_proba.keys())
        self.stationary_dist = None

    def get_transition_matrix(self):
        """
        Return the transition probabilities as a 2-D array.

        Entry [i, j] is the probability of moving from self.states[i] to
        self.states[j]. Entries are looked up by state key, so the result
        does not depend on the key order of the inner dictionaries.
        """
        num = len(self.states)
        rows = [[self.transition_proba[src][dst] for dst in self.states]
                for src in self.states]
        return np.array(rows, dtype=float).reshape(num, num)

    def calculate_stationary_distribution(self):
        """
        Compute the stationary distribution pi solving pi = pi * T.

        pi is the left (row) eigenvector of the transition matrix for the
        eigenvalue 1, normalized to sum to one. The result is stored in
        self.stationary_dist and also returned.

        Raises
        ------
        ValueError
            if no eigenvalue lies within 1e-8 of 1 (the rows of the
            transition matrix do not describe a probability distribution).
        """
        T = self.get_transition_matrix()
        # Only the left eigenvectors are needed for pi = pi * T.
        eigenvalues, left_vectors = scipy.linalg.eig(T, left=True, right=False)
        candidates = np.flatnonzero(np.abs(eigenvalues - 1) < 1e-8)
        if candidates.size == 0:
            raise ValueError('transition matrix has no eigenvalue equal to 1; '
                             'check that every row sums to one')
        # If several eigenvalues equal 1 (reducible chain) the stationary
        # distribution is not unique; the first matching eigenvector is used.
        # np.real drops the zero imaginary part without a ComplexWarning.
        eig_vec = np.real(left_vectors[:, candidates[0]])
        eig_vec = eig_vec / np.sum(eig_vec)  # normalize
        self.stationary_dist = {state: eig_vec[i]
                                for i, state in enumerate(self.states)}
        return self.stationary_dist
        

Example: the student Markov process

In [111]:
# Transition probabilities of the student Markov process, written as a matrix:
# row = current state, column = next state, both in STUDENT_STATES order.
STUDENT_STATES = ['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']
transition_rows = [
    [0.9, 0.1, 0,   0,   0,   0,   0],    # Facebook
    [0.5, 0,   0.5, 0,   0,   0,   0],    # Class 1
    [0,   0,   0,   0.8, 0,   0.2, 0],    # Class 2
    [0,   0,   0,   0,   0.6, 0,   0.4],  # Class 3
    [0,   0,   0,   0,   0,   1,   0],    # Pass
    [0,   0,   0,   0,   0,   1,   0],    # Sleep (absorbing)
    [0,   0.2, 0.4, 0.4, 0,   0,   0],    # Pub
]
T_0 = {source: dict(zip(STUDENT_STATES, row))
       for source, row in zip(STUDENT_STATES, transition_rows)}

In [112]:
# Build the student Markov process and solve for its stationary distribution
# up front, then report the state space followed by the distribution.
mp = MarkovProcess(T_0)
mp.calculate_stationary_distribution()

# The trailing '\n' item reproduces the blank lines between the two reports.
print('The state space of the student MP problem:', mp.states, '\n', sep='\n')
print('The stationary distribution of the student MP problem:',
      mp.stationary_dist, sep='\n')

The state space of the student MP problem:
['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']


The stationary distribution of the student MP problem:
{'Facebook': 0.0, 'Class 1': 0.0, 'Class 2': 0.0, 'Class 3': 0.0, 'Pass': 0.0, 'Sleep': 1.0, 'Pub': 0.0}




## 2. Markov Reward Process (MRP) Representation

In [341]:
class MarkovRewardProcess(object):

    def __init__(self, transition_proba, reward, gamma):
        """
        Initialize the MarkovRewardProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary
            {state : {state : probability}} representing the probabilities
            of transitioning from one state to another in the Markov
            (Reward) Process.

        reward: dictionary of dictionary
            {state : {state : reward}} representing the immediate reward
            of transitioning from one state to another in the Markov
            Reward Process.

        gamma: float
            a float between 0 and 1 to discount long-run rewards

        Five attributes
        ----------
        transition_proba: dictionary of dictionary

        states: list
            the state space of the Markov Reward Process, in the key order
            of `transition_proba`

        reward: dictionary of dictionary

        gamma: float

        values: dictionary of float
            {state: value} to represent the value function of each state;
            None until get_value_function() runs
        """
        self.transition_proba = transition_proba
        self.states = list(transition_proba.keys())
        self.reward = reward
        self.gamma = gamma

        self.values = None

    def is_terminal(self):
        """
        Check whether each state is self-absorbing, i.e. returns to itself
        with probability one.

        Return a dictionary {state: bool}.
        """
        # A small absolute tolerance is used instead of `== 1` so that
        # probabilities produced by floating-point sums are still recognized.
        return {state: bool(np.isclose(row[state], 1.0, rtol=0.0, atol=1e-9))
                for state, row in self.transition_proba.items()}

    def _to_matrix(self, nested):
        """Turn a {state: {state: x}} dictionary into a 2-D array in self.states order."""
        num = len(self.states)
        rows = [[nested[src][dst] for dst in self.states] for src in self.states]
        return np.array(rows, dtype=float).reshape(num, num)

    def get_transition_matrix(self):
        """
        A helper function to transform the transition dictionary to a 2-D matrix
        """
        return self._to_matrix(self.transition_proba)

    def get_expected_reward(self):
        """
        Transform the r(s,s') representation to R(s) = E[r(s,s')] representation
        Return a dictionary {state: reward}
        """
        T = self.get_transition_matrix()
        R = self._to_matrix(self.reward)
        # R(s) = sum over s' of T[s, s'] * r[s, s'].
        expected = np.sum(T * R, axis=1)
        return {state: expected[i] for i, state in enumerate(self.states)}

    def get_value_function(self):
        """
        Solve the Bellman equations
            v = R + gamma * T * v
        for the non-terminal states, subject to the terminal condition
            v(s) = R(s) for every self-absorbing state s.

        The result is stored in self.values and returned as a dictionary
        {state: value}.
        """
        expected = self.get_expected_reward()
        terminal_flags = self.is_terminal()
        R = np.array([expected[s] for s in self.states], dtype=float)
        terminal = np.array([terminal_flags[s] for s in self.states], dtype=bool)
        live = ~terminal
        T = self.get_transition_matrix()

        v_vec = np.empty(len(self.states))
        # A terminal state is worth exactly its (one-off) expected reward.
        v_vec[terminal] = R[terminal]
        if live.any():
            T_live = T[np.ix_(live, live)]
            T_exit = T[np.ix_(live, terminal)]
            # Value reached through a terminal state lies one step ahead, so it
            # is discounted by gamma exactly like the non-terminal part.
            rhs = R[live] + self.gamma * np.dot(T_exit, v_vec[terminal])
            v_vec[live] = np.linalg.solve(
                np.eye(T_live.shape[0]) - self.gamma * T_live, rhs)

        self.values = {state: v_vec[i] for i, state in enumerate(self.states)}
        return self.values

Example: the student Markov reward process

In [115]:
# Transition rewards of the student Markov reward process, written as a matrix:
# row = current state, column = next state, both in STUDENT_STATES order.
STUDENT_STATES = ['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']
reward_rows = [
    [-1, -1,  0,  0,  0,  0,  0],  # Facebook
    [-2,  0, -2,  0,  0,  0,  0],  # Class 1
    [ 0,  0,  0, -2,  0, -2,  0],  # Class 2
    [ 0,  0,  0,  0, -2,  0, -2],  # Class 3
    [ 0,  0,  0,  0,  0, 10,  0],  # Pass
    [ 0,  0,  0,  0,  0,  0,  0],  # Sleep
    [ 0,  1,  1,  1,  0,  0,  0],  # Pub
]
R_0 = {source: dict(zip(STUDENT_STATES, row))
       for source, row in zip(STUDENT_STATES, reward_rows)}

In [116]:
# Build the student Markov reward process (discount 0.9), evaluate it first,
# then report the per-state expected reward and the long-run value.
my_mrp = MarkovRewardProcess(T_0, R_0, 0.9)
immediate_reward = my_mrp.get_expected_reward()
long_run_value = my_mrp.get_value_function()

# The trailing '\n' item reproduces the blank lines between the two reports.
print('The expectation of immediate reward of each state is',
      immediate_reward, '\n', sep='\n')
print('The long-run value of each state is', long_run_value, sep='\n')

The expectation of immediate reward of each state is
{'Facebook': -1.0, 'Class 1': -2.0, 'Class 2': -2.0, 'Class 3': -2.0, 'Pass': 10.0, 'Sleep': 0.0, 'Pub': 1.0}


The long-run value of each state is
{'Facebook': -7.637608431059512, 'Class 1': -5.0127289100145225, 'Class 2': 0.9426552976939073, 'Class 3': 4.087021246797093, 'Pass': 10.0, 'Sleep': -2.026954639512266e-16, 'Pub': 1.908392352214145}


## 3. Markov Decision Process (MDP) Representation

In [377]:
class MarkovDecisionProcess(object):
    def __init__(self, transition_proba, reward, gamma, tol):
        """
        Initialize the MarkovDecisionProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary of dictionary
            {state : {action: {state : probability}}} representing the probabilities
            of transitioning from one state to another in the Markov
            Decision Process.

        reward: dictionary of dictionary of dictionary
            {state : {action: {state : reward}}} representing the immediate reward
            of transitioning from one state to another in the Markov
            Decision Process.

        gamma: float
            a float between 0 and 1 to discount long-run rewards

        tol: float
            the tolerance level of error in iteration

        Eight attributes
        ----------
        transition_proba

        states: list
            the state space, in the key order of `transition_proba`

        actions: list
            the action space, taken from the actions of the first state
            (every state is expected to offer the same actions)

        reward

        gamma

        tol

        policy: dictionary of dictionary
            {state: {action: proba}}; None until policy_iteration() runs

        values: dictionary of float
            {state: value}; None until policy_iteration() or
            value_iteration() runs
        """
        self.transition_proba = transition_proba
        self.reward = reward
        self.gamma = gamma
        self.tol = tol
        self.states = list(transition_proba.keys())
        self.actions = list(transition_proba[self.states[0]].keys())

        self.policy = None
        self.values = None

    def dict_to_matrix_1(self, dictionary):
        """
        Convert {state: {action: {state: x}}} into a float array of shape
        (n_states, n_actions, n_states), ordered like self.states / self.actions.
        """
        n_s, n_a = len(self.states), len(self.actions)
        rows = [[[dictionary[s][a][t] for t in self.states]
                 for a in self.actions]
                for s in self.states]
        return np.array(rows, dtype=float).reshape(n_s, n_a, n_s)

    def dict_to_matrix_2(self, dictionary):
        """
        Convert {state: {action: x}} into a float array of shape
        (n_states, n_actions), ordered like self.states / self.actions.
        """
        n_s, n_a = len(self.states), len(self.actions)
        rows = [[dictionary[s][a] for a in self.actions] for s in self.states]
        return np.array(rows, dtype=float).reshape(n_s, n_a)

    def _matrix_to_dict_2(self, matrix):
        """Inverse of dict_to_matrix_2: (n_states, n_actions) array -> {state: {action: x}}."""
        return {s: {a: matrix[i, k] for k, a in enumerate(self.actions)}
                for i, s in enumerate(self.states)}

    def _terminal_mask(self, T):
        """Boolean vector marking states that loop back to themselves with probability one under every action."""
        self_loop = np.array([T[i, :, i] for i in range(len(self.states))])
        return np.all(np.isclose(self_loop, 1.0, rtol=0.0, atol=1e-9), axis=1)

    def _policy_matrices(self, policy_dict):
        """
        Average the dynamics over a (possibly stochastic) policy.

        Return (T_pi, R_pi), both of shape (n_states, n_states):
        T_pi[s, s'] = sum_a pi(a|s) * T[s, a, s'] and R_pi[s, s'] is the
        expected transition reward given that s -> s' happened under pi.
        """
        num = len(self.states)
        P = self.dict_to_matrix_2(policy_dict)
        T = self.dict_to_matrix_1(self.transition_proba)
        R = self.dict_to_matrix_1(self.reward)

        T_pi = np.zeros((num, num))
        R_avg = np.zeros((num, num))
        joint = np.zeros((num, num))
        for a in range(len(self.actions)):
            weight = P[:, a][:, None]
            T_pi += weight * T[:, a, :]
            R_avg += weight * R[:, a, :]
            joint += weight * T[:, a, :] * R[:, a, :]

        # Rewards must be weighted by how likely each action is to have caused
        # the transition; averaging rewards and probabilities separately gives
        # the wrong expected reward under a stochastic policy. Transitions that
        # cannot happen keep the plain policy-averaged reward (it is never used).
        R_pi = R_avg.copy()
        reachable = T_pi > 0
        R_pi[reachable] = joint[reachable] / T_pi[reachable]
        return T_pi, R_pi

    def get_expected_reward(self):
        """
        Calculate the expected reward per (state,action)

        Return {state: {action: reward}}
        """
        R = self.dict_to_matrix_1(self.reward)
        T = self.dict_to_matrix_1(self.transition_proba)
        # R(s, a) = sum over s' of T[s, a, s'] * r[s, a, s'].
        return self._matrix_to_dict_2(np.sum(T * R, axis=2))

    def to_MRP(self, policy_dict):
        """
        Given a policy {state: {action: proba}}, represent the problem as a
        Markov Reward Process with the same discount factor.
        """
        T_pi, R_pi = self._policy_matrices(policy_dict)
        T_dict = {s: {t: T_pi[i, j] for j, t in enumerate(self.states)}
                  for i, s in enumerate(self.states)}
        R_dict = {s: {t: R_pi[i, j] for j, t in enumerate(self.states)}
                  for i, s in enumerate(self.states)}
        return MarkovRewardProcess(T_dict, R_dict, self.gamma)

    def get_state_value_function(self, policy_dict):
        """
        Given a policy, return the value function of the corresponding MRP problem

        Return {state: value}
        """
        mrp = self.to_MRP(policy_dict)
        return mrp.get_value_function()

    def get_state_act_value_function(self, policy_dict):
        """
        Given a policy, return the action value function
            q(s, a) = R(s, a) + gamma * sum_s' T[s, a, s'] * v(s')
        where v is the state value function of the policy.

        Return {state: {action: value}}
        """
        v_dict = self.get_state_value_function(policy_dict)
        v_vec = np.array([v_dict[s] for s in self.states], dtype=float)
        R_matrix = self.dict_to_matrix_2(self.get_expected_reward())
        T = self.dict_to_matrix_1(self.transition_proba)
        # np.dot contracts the last axis of T with v_vec -> (n_states, n_actions).
        q_matrix = R_matrix + self.gamma * np.dot(T, v_vec)
        return self._matrix_to_dict_2(q_matrix)

    def get_improved_policy(self, policy_dict):
        """
        Given a policy, find the best possible action for each state based
        on the action value function

        Return a deterministic policy {state: {action: 0 or 1}}; ties go to
        the first action in self.actions order.
        """
        q = self.get_state_act_value_function(policy_dict)
        greedy_policy = {}
        for state, action_value in q.items():
            greedy_action = max(action_value.items(), key=operator.itemgetter(1))[0]
            greedy_policy[state] = {action: 1 if action == greedy_action else 0
                                    for action in self.actions}
        return greedy_policy

    def policy_iteration(self):
        """
        Alternate policy evaluation and greedy improvement, starting from the
        uniform random policy, until the state values move by less than
        self.tol between two consecutive policies.

        The final policy and its values are stored in self.policy and
        self.values. Return the policy {state: {action: proba}}.
        """
        # initialization
        policy_dict = {s: {a: 1. / len(self.actions) for a in self.actions}
                       for s in self.states}
        v_max = self.get_state_value_function(policy_dict)
        epsilon = 1000
        while epsilon >= self.tol:
            policy_dict = self.get_improved_policy(policy_dict)
            v = self.get_state_value_function(policy_dict)
            epsilon = max(abs(v[s] - v_max[s]) for s in v)
            v_max = v

        self.policy = policy_dict
        self.values = v_max
        return policy_dict

    def value_iteration(self):
        """
        Iterate the Bellman optimality backup
            v(s) <- max_a [ R(s, a) + gamma * sum_s' T[s, a, s'] * v(s') ]
        until the largest change is below self.tol.

        States that are self-absorbing under every action are terminal: their
        value is held at max_a R(s, a), the same one-off terminal condition
        that MarkovRewardProcess.get_value_function uses for policy evaluation.
        Without it an absorbing state with non-zero reward would keep
        collecting reward and the iteration would never converge for gamma = 1.

        The result is stored in self.values and returned as {state: value}.
        """
        T = self.dict_to_matrix_1(self.transition_proba)
        R_matrix = self.dict_to_matrix_2(self.get_expected_reward())
        terminal = self._terminal_mask(T)

        v = np.zeros(len(self.states))
        v[terminal] = np.max(R_matrix[terminal], axis=1)

        while True:
            old_values = v.copy()
            backup = np.max(R_matrix + self.gamma * np.dot(T, old_values), axis=1)
            v = np.where(terminal, old_values, backup)
            if np.max(np.abs(v - old_values)) < self.tol:
                break

        self.values = {state: v[i] for i, state in enumerate(self.states)}
        return self.values

Example: Smoov & Curly's Bogus Journey (a 4x4 grid world with a wall at cell (1,2) and two absorbing cells, (3,2) and (3,3))

https://www.youtube.com/playlist?list=PLw0qyFP7t4IYuI9XcDIGsARfO1QKMrpbN

In [367]:
# Lay out the grid world: every (x, y) cell of the 4x4 grid except the wall.
# T starts with probability 0 everywhere (filled in by the next cell); R pays
# a reward that depends only on the cell being left: +10 in (3,3), -10 in
# (3,2) and -1 everywhere else.
WALL = (1, 2)
DIRECTIONS = ['Up', 'Right', 'Down', 'Left']
cells = [(x, y) for x in range(4) for y in range(4) if (x, y) != WALL]

T = {}
R = {}
for cell in cells:
    if cell == (3, 3):
        payoff = 10
    elif cell == (3, 2):
        payoff = -10
    else:
        payoff = -1
    # Each (cell, direction) pair gets its own dictionary so that later
    # assignments to one entry never leak into another.
    T[cell] = {direction: {target: 0 for target in cells}
               for direction in DIRECTIONS}
    R[cell] = {direction: {target: payoff for target in cells}
               for direction in DIRECTIONS}

In [368]:
# Fill in the grid-world dynamics from a single rule instead of a hand-typed
# table: the agent moves in the chosen direction with probability 0.8 and slips
# to either perpendicular direction with probability 0.1 each. A move that
# would leave the grid or enter the wall (any cell that is not a key of T)
# leaves the agent where it is. 'Up' increases the second coordinate, 'Right'
# the first. Cells (3,2) and (3,3) are absorbing under every action.
#
# The hand-typed table this replaces followed the same rule except for three
# copy-paste slips: (0,0) 'Down' slipped to (0,1) instead of (1,0), and the
# (2,2) 'Right' / 'Left' rows were copies of the (2,1) rows.
STEP = {'Up': (0, 1), 'Right': (1, 0), 'Down': (0, -1), 'Left': (-1, 0)}
SLIP = {'Up': ('Left', 'Right'), 'Down': ('Left', 'Right'),
        'Right': ('Up', 'Down'), 'Left': ('Up', 'Down')}
ABSORBING_CELLS = [(3, 2), (3, 3)]


def _move(cell, direction):
    """Return the cell reached from `cell` heading `direction`, or `cell` itself if blocked."""
    dx, dy = STEP[direction]
    target = (cell[0] + dx, cell[1] + dy)
    return target if target in T else cell


for cell in T:
    for direction in STEP:
        row = T[cell][direction]
        # Reset first so that re-running this cell gives the same table.
        for target in row:
            row[target] = 0
        if cell in ABSORBING_CELLS:
            row[cell] = 1
            continue
        row[_move(cell, direction)] += 0.8
        for side in SLIP[direction]:
            row[_move(cell, side)] += 0.1

In [381]:
# Undiscounted grid world (gamma = 1) with an iteration tolerance of 1.
my_mdp = MarkovDecisionProcess(T, R, gamma=1, tol=1)

In [382]:
# Run policy iteration and display the resulting policy, a dictionary
# {state: {action: probability}} with a single action chosen per state.
optimal_policy = my_mdp.policy_iteration()
optimal_policy

{(0, 0): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (0, 1): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (0, 2): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (0, 3): {'Up': 0, 'Right': 1, 'Down': 0, 'Left': 0},
 (1, 0): {'Up': 0, 'Right': 1, 'Down': 0, 'Left': 0},
 (1, 1): {'Up': 0, 'Right': 1, 'Down': 0, 'Left': 0},
 (1, 3): {'Up': 0, 'Right': 1, 'Down': 0, 'Left': 0},
 (2, 0): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (2, 1): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (2, 2): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (2, 3): {'Up': 0, 'Right': 1, 'Down': 0, 'Left': 0},
 (3, 0): {'Up': 0, 'Right': 0, 'Down': 0, 'Left': 1},
 (3, 1): {'Up': 0, 'Right': 0, 'Down': 0, 'Left': 1},
 (3, 2): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0},
 (3, 3): {'Up': 1, 'Right': 0, 'Down': 0, 'Left': 0}}

In [None]:
# Run value iteration; the method writes its result to my_mdp.values.
my_mdp.value_iteration()

In [None]:
# Display the {state: value} dictionary stored on the MDP instance.
my_mdp.values