In [7]:
import gym

In [8]:
# Build the Taxi-v3 environment and keep the object stored on the `.env`
# attribute of what gym.make() returns.
# NOTE(review): presumably this strips gym's outer TimeLimit wrapper so that
# episodes are not cut short -- confirm for the installed gym version.
envi = gym.make("Taxi-v3").env
# Draw the current state of the environment in the cell output.
envi.render()

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+



In [9]:
# Report the action space and the state (observation) space of the environment.
for template, space in (
    ("Action space {}", envi.action_space),
    ("State space {}", envi.observation_space),
):
    print(template.format(space))

Action space Discrete(6)
State space Discrete(500)


In [10]:
# Encode a hand-picked configuration of the environment (current taxi location
# and end point) into a single state number.
# NOTE(review): the four arguments look like (taxi row, taxi column, passenger
# index, destination index) -- confirm against the gym Taxi-v3 source.
state = envi.encode(4,2,4,0)
print("State : ",state)
# Manually put the environment into the encoded state, then draw it.
envi.s = state
envi.render()

State :  456
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| :[42m_[0m|B: |
+---------+



In [11]:
# When the Taxi environment is created, a reward table called "P" is also created.
# It is indexed as P[state][action], so it has #states rows and #actions columns.
# NOTE(review): each P[state][action] entry presumably holds the transition and
# reward information for that pair -- confirm against the gym Taxi-v3 source.
print("No of columns/actions = {}".format(len(envi.P[0])))
print("No of rows/state = {}".format(len(envi.P)))  # 500 = (5*5)*(4+1)*4: presumably taxi cells * passenger locations (4 stops + in taxi) * destinations

No of columns/actions = 6
No of rows/state = 500


In [12]:
# Baseline WITHOUT reinforcement learning: brute-force the task by taking
# random actions until the episode ends, i.e. until the environment reports
# done = True (the passenger has been dropped off and the reward is 20).

# Seed the action sampler so that Restart & Run All reproduces the same episode.
SEED = 42
envi.action_space.seed(SEED)

# Start from the hand-picked configuration, which encodes to state 456.
# Encoding it here instead of reusing `state` keeps this cell re-runnable:
# `state` is overwritten inside the loop below.
envi.s = envi.encode(4, 2, 4, 0)

epochs = 0      # number of timesteps taken
penalties = 0   # number of -10 rewards received

done = False  # becomes True once the passenger is dropped off

# one dictionary per timestep, kept for the replay further down
frames = []

while not done:
    # choose a random action from the action space
    action = envi.action_space.sample()
    # apply the action and collect what it did to the environment
    state, reward, done, info = envi.step(action)

    if reward == -10:  # wrong pickup or drop-off action
        penalties += 1

    currFrame = {
        "frame": envi.render(mode="ansi"),  # ansi graphic of the state
        "state": state,                     # state number
        "action": action,                   # what action caused it
        "reward": reward,                   # reward of the action
    }
    frames.append(currFrame)

    epochs += 1

print("Number of timesteps taken {}".format(epochs))
print("Number of penalties incurred {}".format(penalties))

Number of timesteps taken 813
Number of penalties incurred 233


In [13]:
# Every entry of `frames` is a dictionary holding the rendered graphic, the
# action that led to that state, and the reward received for taking it.

# Example: show the graphic stored for the very first step.
print("The first frame's graphic", frames[0]["frame"], sep="\n")

The first frame's graphic
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| :[42m_[0m|B: |
+---------+
  (East)



In [17]:
# Replay the recorded frames as an animation, up to the passenger drop-off.
# The output area is cleared before each frame so frames replace one another
# instead of stacking up; the pause between frames acts as the refresh rate.
import IPython.display as jupyter
import time 

simulate = False

if not simulate:
    print("Simulation not simulated")
else:
    for frame in frames:
        jupyter.clear_output(wait = True)
        time.sleep(0.1)
        print(frame["frame"])

Simulation not simulated


In [None]:
#the above simulation was not good
#the agent wasn't learning from its previous steps and has no memory of
#the best action for each state

#the Q-learning algorithm will give our agent some memory
#the agent will use the environment's reward system to learn, over time, the best
#action to take in a given state
#it looks at the reward for a particular action in a
#particular state and checks whether the action was beneficial
#if the action was beneficial, it remembers that by updating the Q-value table
#the Q-value table maps each (state, action) pair to a Q-value, i.e. how good
#it is to take that action in that state

#in the Q-learning algorithm,
#alpha is the learning rate, i.e. the rate at which the Q-values are updated
#gamma is the discount factor, i.e. how much importance we give to the
#long-term reward rather than the immediate reward

#we update the Q-value by keeping (1 - α) * Q(state, action) and adding the learned value,
#which combines the reward for taking the current action in the current state with the
#discounted maximum Q-value of the next state we reach by taking that action:
#  Q(state, action) <- (1 - α) * Q(state, action) + α * (reward + γ * max_a Q(next state, a))
#here gamma controls the importance we give to the long-term reward

#hence we learn the proper action to take in the current state by looking
#at the reward for the current state-action combo and the best value of the next state
#hence our taxi will consider the best route
#the number of rows in the Q-table = #states
#the number of columns in the Q-table = #actions

#initially the Q-values are initialized to zero and then updated during training
#to the values that optimize the agent's traversal through the environment for
#maximum reward

