# Class 2

In [7]:
import numpy as np
import scipy.linalg

## 1. Markov Process (MP) Representation

In [89]:
class MarkovProcess(object):
    """Finite Markov Process described by nested transition dictionaries."""

    def __init__(self,transition_proba):
        """
        Initialize the MarkovProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary
            {state : {state : probability}} representing the probabilities
            of transitioning from one state to another in the Markov
            Process. Target states missing from an inner dictionary are
            treated as having probability 0.

        Attributes
        ----------
        transition_proba: dictionary of dictionary
            the dictionary passed in, stored as-is

        states: list
            the keys of transition_proba, in insertion order; this order
            fixes the row/column order of the transition matrix

        stationary_dist: dictionary or None
            {state : probability}; None until
            calculate_stationary_distribution() has been called
        """
        self.transition_proba = transition_proba
        self.states = list(transition_proba.keys())
        self.stationary_dist = None

    def get_transition_matrix(self):
        """
        Convert the probability dictionary to a dense matrix.

        Entries are placed by state *name*, so the key order of the inner
        dictionaries does not have to match the order of self.states.

        Returns
        -------
        numpy.ndarray of shape (n_states, n_states)
            T[i, j] is the probability of moving from self.states[i] to
            self.states[j].

        Raises
        ------
        ValueError
            if an inner dictionary names a target state that is not a key
            of transition_proba.
        """
        index = {state: i for i, state in enumerate(self.states)}
        num = len(self.states)
        T = np.zeros((num, num))
        for state, proba in self.transition_proba.items():
            for target, p in proba.items():
                if target not in index:
                    raise ValueError(
                        'Unknown target state %r in the transitions of %r'
                        % (target, state))
                T[index[state], index[target]] = p
        return T

    def calculate_stationary_distribution(self):
        """
        Compute a stationary distribution and store it in stationary_dist.

        The stationary distribution solves pi = pi*T, i.e. it is a left
        (row) eigenvector of the transition matrix for eigenvalue 1,
        rescaled so that its entries sum to 1. If several eigenvalues are
        within 1e-8 of 1, the first one returned by scipy is used.

        Raises
        ------
        ValueError
            if the transition matrix has no eigenvalue within 1e-8 of 1.
        """
        T = self.get_transition_matrix()
        # Only the left eigenvectors are needed.
        value, left = scipy.linalg.eig(T, left=True, right=False)
        unit = np.flatnonzero(np.abs(value - 1) < 1e-8)
        if unit.size == 0:
            raise ValueError(
                'Transition matrix has no eigenvalue equal to 1; '
                'check that every row of probabilities sums to 1.')
        # scipy returns complex vectors; take the real part explicitly
        # rather than casting, which would emit a ComplexWarning.
        eig_vec = np.real(left[:, unit[0]])
        eig_vec = eig_vec/np.sum(eig_vec) # normalize
        self.stationary_dist = {
            state: float(eig_vec[i]) for i, state in enumerate(self.states)
        }
        

Example:

In [90]:
# States of the student Markov chain, in the order used for every row of T.
_student_states = ['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']

# Non-zero one-step transition probabilities; every omitted pair is 0.
_nonzero_proba = {
    'Facebook': {'Facebook': 0.9, 'Class 1': 0.1},
    'Class 1': {'Facebook': 0.5, 'Class 2': 0.5},
    'Class 2': {'Class 3': 0.8, 'Sleep': 0.2},
    'Class 3': {'Pass': 0.6, 'Pub': 0.4},
    'Pass': {'Sleep': 1},
    'Sleep': {'Sleep': 1},
    'Pub': {'Class 1': 0.2, 'Class 2': 0.4, 'Class 3': 0.4},
}

# Dense {state : {state : probability}} table with an explicit entry for every pair.
T = {
    src: {dst: _nonzero_proba[src].get(dst, 0) for dst in _student_states}
    for src in _student_states
}

In [91]:
# Build the student chain and solve for its stationary distribution up front;
# the solver prints nothing, so doing it before the report leaves the output unchanged.
mp = MarkovProcess(T)
mp.calculate_stationary_distribution()

# A single print call: the explicit '\n' item reproduces the blank lines
# that separate the two sections of the report.
print('The state space of the student MP problem:',
      mp.states,
      '\n',
      'The stationary distribution of the student MP problem:',
      mp.stationary_dist,
      sep='\n')

The state space of the student MP problem:
['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']


The stationary distribution of the student MP problem:
{'Facebook': 0.0, 'Class 1': 0.0, 'Class 2': 0.0, 'Class 3': 0.0, 'Pass': 0.0, 'Sleep': 1.0, 'Pub': 0.0}




## 2. Markov Reward Process (MRP) Representation

In [78]:
class MarkovRewardProcess(object):
    """Finite Markov Reward Process described by nested dictionaries."""

    def __init__(self,transition_proba,reward,gamma):
        """
        Initialize the MarkovRewardProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary
            {state : {state : probability}} representing the probabilities
            of transitioning from one state to another in the Markov
            (Reward) Process. Missing target states count as probability 0.

        reward: dictionary of dictionary
            {state : {state : reward}} representing the immediate reward
            of transitioning from one state to another in the Markov
            Reward Process. Missing target states count as reward 0.

        gamma: float
            a float between 0 and 1 to discount long-run rewards

        Attributes
        ----------
        transition_proba: dictionary of dictionary

        states: list
            the keys of transition_proba, in insertion order; this order
            fixes the row/column order of every matrix built below

        reward: dictionary of dictionary

        gamma: float

        values: dictionary or None
            {state : value}; None until value_function() has been called
        """
        self.transition_proba = transition_proba
        self.states = list(transition_proba.keys())
        self.reward = reward
        self.gamma = gamma

        self.values = None

    def _to_matrix(self, nested):
        """
        Convert a {state : {state : number}} dictionary to a dense matrix.

        Entries are placed by state *name*, so the key order of the inner
        dictionaries does not have to match the order of self.states.
        Raises ValueError for a target state that is not in self.states.
        """
        index = {state: i for i, state in enumerate(self.states)}
        num = len(self.states)
        M = np.zeros((num, num))
        for state in self.states:
            for target, x in nested[state].items():
                if target not in index:
                    raise ValueError(
                        'Unknown target state %r in the entries of %r'
                        % (target, state))
                M[index[state], index[target]] = x
        return M

    def get_transition_matrix(self):
        """
        Return the (n_states, n_states) transition matrix T, where T[i, j]
        is the probability of moving from self.states[i] to self.states[j].
        """
        return self._to_matrix(self.transition_proba)

    def get_expected_reward(self):
        """
        Return {state : expected immediate reward}.

        For each state the reward of every transition is weighted by the
        probability of that transition: sum_j T[i, j] * R[i, j].
        """
        T = self.get_transition_matrix()
        R = self._to_matrix(self.reward)
        # Row-wise weighted sum; no need for the full T.R^T product.
        R_vec = np.sum(T*R, axis=1)
        return {state: float(R_vec[i]) for i, state in enumerate(self.states)}

    def value_function(self):
        """
        Solve the Bellman equation v = R + gamma*T*v for v.

        The result is stored in self.values and returned as
        {state : value}. numpy.linalg.solve raises LinAlgError when
        (I - gamma*T) is singular.
        """
        expected = self.get_expected_reward()
        R = np.array([expected[state] for state in self.states])
        T = self.get_transition_matrix()
        v_vec = np.linalg.solve(np.eye(T.shape[0])-self.gamma*T, R)
        self.values = {
            state: float(v_vec[i]) for i, state in enumerate(self.states)
        }
        return self.values

Example:

In [79]:
# States of the student Markov reward process, in the order used for every row of R.
_reward_states = ['Facebook', 'Class 1', 'Class 2', 'Class 3', 'Pass', 'Sleep', 'Pub']

# Non-zero transition rewards; every omitted pair has reward 0.
_nonzero_reward = {
    'Facebook': {'Facebook': -1, 'Class 1': -1},
    'Class 1': {'Facebook': -2, 'Class 2': -2},
    'Class 2': {'Class 3': -2, 'Sleep': -2},
    'Class 3': {'Pass': -2, 'Pub': -2},
    'Pass': {'Sleep': 10},
    'Sleep': {},
    'Pub': {'Class 1': 1, 'Class 2': 1, 'Class 3': 1},
}

# Dense {state : {state : reward}} table with an explicit entry for every pair.
R = {
    src: {dst: _nonzero_reward[src].get(dst, 0) for dst in _reward_states}
    for src in _reward_states
}

In [80]:
# Student MRP with discount factor 0.9.
my_mrp = MarkovRewardProcess(T,R,0.9)

# A single print call: the explicit '\n' item reproduces the blank lines
# that separate the two sections of the report.
print('The expectation of immediate reward of each state is',
      my_mrp.get_expected_reward(),
      '\n',
      'The long-run value of each state is',
      my_mrp.value_function(),
      sep='\n')

The expectation of immediate reward of each state is
{'Facebook': -1.0, 'Class 1': -2.0, 'Class 2': -2.0, 'Class 3': -2.0, 'Pass': 10.0, 'Sleep': 0.0, 'Pub': 1.0}


The long-run value of each state is
{'Facebook': -7.637608431059512, 'Class 1': -5.0127289100145225, 'Class 2': 0.9426552976939073, 'Class 3': 4.087021246797093, 'Pass': 10.0, 'Sleep': -2.026954639512266e-16, 'Pub': 1.908392352214145}


## 3. Markov Decision Process (MDP) Representation

In [None]:
class MarkovDecisionProcess(object):
    """Finite Markov Decision Process solved by value iteration."""

    def __init__(self,transition_proba,reward,gamma,tol):
        """
        Initialize the MarkovDecisionProcess instance.

        Parameters
        ----------
        transition_proba: dictionary of dictionary of dictionary
            {state : {action : {state : probability}}} representing the
            probabilities of transitioning from one state to another
            when an action is taken. Missing target states count as
            probability 0. States may offer different sets of actions.

        reward: dictionary of dictionary of dictionary
            {state : {action : {state : reward}}} representing the
            immediate reward of each transition. Missing target states
            count as reward 0.

        gamma: float
            a float between 0 and 1 to discount long-run rewards

        tol: float
            value iteration stops once the largest change of any state
            value between two sweeps is below this threshold

        Attributes
        ----------
        transition_proba, reward, gamma, tol: as passed in

        states: list
            the keys of transition_proba, in insertion order

        actions: list
            every action offered by at least one state, in order of
            first appearance

        policy: dictionary or None
            {state : action}; None until extract_policy() succeeds

        values: dictionary or None
            {state : value}; None until value_iteration() has been called
        """
        self.transition_proba = transition_proba
        self.reward = reward
        self.gamma = gamma
        self.tol = tol
        self.states = list(transition_proba.keys())
        # Ordered union of the actions of all states, so that states with
        # different action sets share one action axis.
        self.actions = []
        for action_dict in transition_proba.values():
            for action in action_dict:
                if action not in self.actions:
                    self.actions.append(action)

        self.policy = None
        self.values = None

    def to_MRP(self,policy_dict):
        """
        Fix one action per state and return the resulting
        MarkovRewardProcess.

        Parameters
        ----------
        policy_dict: dictionary
            {state : action}; only the states listed here become states
            of the returned process.
        """
        T = {}
        R = {}
        for state,policy in policy_dict.items():
            T[state] = self.transition_proba[state][policy]
            R[state] = self.reward[state][policy]
        return MarkovRewardProcess(T,R,self.gamma)

    def _state_index(self):
        """Return {state : position in self.states}."""
        return {state: i for i, state in enumerate(self.states)}

    def _action_index(self):
        """Return {action : position in self.actions}."""
        return {action: j for j, action in enumerate(self.actions)}

    def get_transition_matrix(self):
        """
        Return the transition tensor T of shape
        (n_states, n_actions, n_states).

        T[i, j, k] is the probability of moving from self.states[i] to
        self.states[k] under self.actions[j]. Entries are placed by name,
        not by dictionary order. Rows of actions that a state does not
        offer are all zero.

        Raises
        ------
        ValueError
            if a target state is not a key of transition_proba.
        """
        s_index = self._state_index()
        a_index = self._action_index()
        num_s = len(self.states)
        num_a = len(self.actions)
        T = np.zeros((num_s,num_a,num_s))
        for state, proba_1 in self.transition_proba.items():
            for action, proba_2 in proba_1.items():
                for target, p in proba_2.items():
                    if target not in s_index:
                        raise ValueError(
                            'Unknown target state %r for state %r, action %r'
                            % (target, state, action))
                    T[s_index[state], a_index[action], s_index[target]] = p
        return T

    def get_expected_reward(self):
        """
        Return the expected immediate reward R of shape
        (n_states, n_actions).

        R[i, j] = sum_k T[i, j, k] * reward[state_i][action_j][state_k].
        Actions that a state does not offer get -inf so that a max or
        argmax over actions never selects them.

        Raises
        ------
        ValueError
            if a reward names a target state that is not a key of
            transition_proba.
        """
        T = self.get_transition_matrix()
        s_index = self._state_index()
        a_index = self._action_index()
        R = np.full((len(self.states), len(self.actions)), -np.inf)
        for i, state in enumerate(self.states):
            for action in self.transition_proba[state]:
                j = a_index[action]
                total = 0.0
                for target, r in self.reward[state][action].items():
                    if target not in s_index:
                        raise ValueError(
                            'Unknown target state %r in the rewards of '
                            'state %r, action %r' % (target, state, action))
                    total += T[i, j, s_index[target]] * r
                R[i, j] = total
        return R

    def value_iteration(self):
        """
        Run synchronous value iteration until the values change by less
        than self.tol, store the result in self.values and return it.

        Each sweep computes, from the previous sweep's values,
        v[i] = max_j ( R[i, j] + gamma * sum_k T[i, j, k] * v_old[k] ).

        NOTE(review): there is no iteration cap, so the loop only ends
        if the updates actually converge (presumably gamma < 1) - confirm
        before using gamma = 1.

        Raises
        ------
        ValueError
            if some state offers no action at all.
        """
        without_action = [s for s in self.states if not self.transition_proba[s]]
        if without_action:
            raise ValueError(
                'States without any action: %r' % (without_action,))

        num_s = len(self.states)
        v = np.zeros(num_s)
        if num_s > 0:
            T = self.get_transition_matrix()
            R = self.get_expected_reward()
            while True:
                OldValues = v
                # (S, A, S) @ (S,) -> (S, A): expected next value per action.
                v = np.max(R + self.gamma*(T @ OldValues), axis=1)
                if np.max(np.abs(v-OldValues)) < self.tol:
                    break

        self.values = {
            state: float(v[i]) for i, state in enumerate(self.states)
        }
        return self.values

    def extract_policy(self):
        """
        Derive the greedy policy from self.values.

        For every state the action with the largest
        R[i, j] + gamma * sum_k T[i, j, k] * v[k] is chosen (the first one
        on ties). The result is stored in self.policy as {state : action},
        the format accepted by to_MRP(), and returned.

        If value_iteration() has not been run yet, a reminder is printed
        and None is returned.
        """
        if self.values is None:
            print('Do value iteration first!')
            return None

        policy = {}
        if self.states:
            T = self.get_transition_matrix()
            R = self.get_expected_reward()
            v = np.array([self.values[state] for state in self.states])
            best = np.argmax(R + self.gamma*(T @ v), axis=1)
            for i, state in enumerate(self.states):
                policy[state] = self.actions[best[i]]
        self.policy = policy
        return self.policy
                

In [85]:
# Scratch check: a 3 x 2 x 2 integer array of ones, shaped like a tiny
# (state, action, next-state) tensor.
a = np.ones((3, 2, 2), dtype=int)
a

array([[[1, 1],
        [1, 1]],

       [[1, 1],
        [1, 1]],

       [[1, 1],
        [1, 1]]])

In [87]:
# Product of the first 2 x 2 slice of `a` with a length-2 vector.
b = a[0].dot([1, 1])

In [88]:
# Shape of the matrix-vector product computed above.
np.shape(b)

(2,)