In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. The environment

In [163]:
class Environment():
    def __init__(self):
        self.n_states = 12
        self.h = 3
        self.w = 4
        self.states_locations = [(0,0), (0,1), (0,2), (0,3),
                    (1,0), (1,1), (1,2),(1,3),
                    (2,0), (2,1), (2,2),(2,3)]
        self.states_index = np.array([i for i in range(0,self.n_states)]).reshape(self.h,self.w)
        self.current_state= 0 #index of the current state

        self.terminal_states = [7,11]
        self.impossible_states = [5]
        self.n_actions = 4
        # in order E,W, N, S
        self.actions = [(0,-1),(0,+1),(-1,0),(+1,0)]
        self.action_symbols = np.array(['<','>','^','v'])
        self.rewards = np.array([-0.04, -0.04, -0.04, -0.04,
               -0.04, 0.0, -0.04,  -1.0,
               -0.04, -0.04, -0.04,   1.0])
        self.initial_reward = self.rewards[self.current_state]
        # initialize transition model
        self.transition_model = self.generate_transition_model()

        
    def step(self, action):
        # transition to new state
        self.current_state = np.random.choice(self.n_states, 
                     p=self.transition_model[:,action,self.current_state])
        if self.current_state in self.terminal_states:
            end_state = self.current_state
            reward = self.rewards[end_state]

            print("Episode finished")
            #Reinitialize
            self.__init__()
            return end_state, reward, True # Last bool indicates that episode finished
                
        return self.current_state, self.rewards[self.current_state], False
        
    def out_of_bounds(self,state_location):
        if state_location[0] in range(0,self.h) and state_location[1] in range(0,self.w):
            if self.states_index[state_location] in self.impossible_states:
                return True
            return False
        else:
            return True
        
    def generate_transition_model(self):
        P = np.zeros((self.n_states,self.n_actions,self.n_states))

        for s in range(0,self.n_states):
            for a in range(0,self.n_actions):
                if s in self.terminal_states or s in self.impossible_states:
                    continue


                s_location = self.states_locations[s]

                sp_location =  (s_location[0] + self.actions[a][0],s_location[1] + self.actions[a][1])
                if self.out_of_bounds(sp_location):
                    sp_location = s_location
                sp = self.states_index[sp_location]
                prob = 0.8
                P[sp,a,s]+=prob

                opposite_actions = 1-np.abs(self.actions[a])

                sp_location =  (s_location[0] + opposite_actions[0], s_location[1] +opposite_actions[1])
                if self.out_of_bounds(sp_location):
                    sp_location = s_location
                sp = self.states_index[sp_location]
                prob = 0.1
                P[sp,a,s]+=prob


                sp_location =  (s_location[0] - opposite_actions[0], s_location[1] -opposite_actions[1])
                if self.out_of_bounds(sp_location):
                    sp_location = s_location
                sp = self.states_index[sp_location]
                prob = 0.1
                P[sp,a,s]+=prob
        return P

In [124]:
class Agent():
    def __init__(self):
        self.n_states = 12
        self.h = 3
        self.w = 4
        self.states_locations = [(0,0), (0,1), (0,2), (0,3),
                    (1,0), (1,1), (1,2),(1,3),
                    (2,0), (2,1), (2,2),(2,3)]
        self.states_index = np.array([i for i in range(0,self.n_states)]).reshape(self.h,self.w)
        self.n_actions = 4
        # in order E,W, N, S
        self.actions = [(0,-1),(0,+1),(-1,0),(+1,0)]
        self.action_symbols = np.array(['<','>','^','v'])
        
        # Defining policy
        policy_symbols = np.array([['v' ,'<', '<' ,'<'],
                                   ['v', None ,'v', None],
                                   ['>' ,'>', '>', None]])
        self.policy = self.translate_policy(policy_symbols)
        
    def translate_policy(self,policy_symbols):
        policy_symbols = policy_symbols.ravel()
        policy = np.zeros_like(policy_symbols, dtype=int)
        for i, symbol in enumerate(policy_symbols):
            if symbol is None:
                policy[i] = -1
            else:
                policy[i] = np.argmax(self.action_symbols==symbol)

        return policy
    
    def step(self, state):
        action = self.policy[state]
        print("Action: ",self.action_symbols[action])
        return action


In [165]:
def TD_episode(utility, alpha=0.1, gamma=0.9):
    agent = Agent()
    env = Environment()
    state = env.current_state
    reward = env.initial_reward
    episode_finished=False
    print(utility.reshape(3,4))
    while not episode_finished:
        action = agent.step(state)
        state_prime, reward_prime, episode_finished = env.step(action)
        utility[state] = utility[state]+alpha*(reward_prime+gamma*utility[state_prime]-utility[state])
        print(env.states_locations[state]," to ", env.states_locations[state_prime])
        print(utility.reshape(3,4))
        
        state, reward = state_prime, reward_prime
        
    return utility

In [178]:
utility_i = np.zeros(12)
for i in range(0,1):
    df = pd.DataFrame([[0]+list(utility_i)],columns=['iter','(0, 0)',
                                 '(0, 1)',
                                 '(0, 2)',
                                 '(0, 3)',
                                 '(1, 0)',
                                 'x',
                                 '(1, 2)',
                                 '-1',
                                 '(2, 0)',
                                 '(2, 1)',
                                 '(2, 2)',
                                 '1'])
    
    utility = TD_episode(utility)
    df.loc[i+1] = [i+1]+list(utility)

[[-0.0124636  0.         0.         0.       ]
 [-0.0182956  0.         0.01103    0.       ]
 [-0.011038   0.01436    0.2503     0.       ]]
Action:  v
(0, 0)  to  (1, 0)
[[-0.01686384  0.          0.          0.        ]
 [-0.0182956   0.          0.01103     0.        ]
 [-0.011038    0.01436     0.2503      0.        ]]
Action:  v
(1, 0)  to  (2, 0)
[[-0.01686384  0.          0.          0.        ]
 [-0.02145946  0.          0.01103     0.        ]
 [-0.011038    0.01436     0.2503      0.        ]]
Action:  >
(2, 0)  to  (2, 1)
[[-0.01686384  0.          0.          0.        ]
 [-0.02145946  0.          0.01103     0.        ]
 [-0.0126418   0.01436     0.2503      0.        ]]
Action:  >
(2, 1)  to  (2, 2)
[[-0.01686384  0.          0.          0.        ]
 [-0.02145946  0.          0.01103     0.        ]
 [-0.0126418   0.031451    0.2503      0.        ]]
Action:  >
Episode finished
(2, 2)  to  (2, 3)
[[-0.01686384  0.          0.          0.        ]
 [-0.02145946  0.       

In [179]:
df

Unnamed: 0,iter,"(0, 0)","(0, 1)","(0, 2)","(0, 3)","(1, 0)",x,"(1, 2)",-1,"(2, 0)","(2, 1)","(2, 2)",1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,-0.016864,0.0,0.0,0.0,-0.021459,0.0,0.01103,0.0,-0.012642,0.031451,0.32527,0.0
