### Reinforcement learning - Example
### Computer Vision

In [16]:
import numpy as np
# Reward table indexed as R[state, action]; in this example the action index is
# also the index of the state the agent moves to.
#   -1  -> move not allowed (filtered out by the `>= 0` test in available_actions)
#    0  -> allowed move with no immediate reward
#   100 -> allowed move into state 5, the goal used later in the notebook
R = np.matrix( [ [-1, -1, -1, -1, 0, -1 ],  # from state 0: can move to 4
                 [-1, -1, -1, 0, -1, 100 ],  # from state 1: can move to 3 or 5
                 [-1, -1, -1, 0, -1, -1 ],  # from state 2: can move to 3
                 [-1, 0, 0, -1, 0, -1 ],  # from state 3: can move to 1, 2 or 4
                 [0, -1, -1, 0, -1, 100 ],  # from state 4: can move to 0, 3 or 5
                 [-1, 0, -1, -1, 0, 100 ],  # from state 5: can move to 1, 4 or stay in 5
               ])

In [17]:
R

matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [  0,  -1,  -1,   0,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

In [18]:
# Q-table with one row per state and one column per action, initialised to
# zero. It is kept as an np.matrix (not a plain ndarray) so that Q[row] stays
# two-dimensional, which the np.where(...)[1] lookups below rely on.
Q = np.matrix(np.zeros((6, 6), dtype=float))

In [19]:
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [20]:
initial_state = 3

In [21]:
def available_actions(state):
    """Return the indices of the actions that may be taken from ``state``.

    An action is available when its entry in row ``state`` of the global
    reward matrix ``R`` is non-negative.

    Args:
        state: Row index into ``R``.

    Returns:
        1-D array of column indices whose reward is ``>= 0``.
    """
    allowed_mask = R[state] >= 0
    # R is an np.matrix, so the mask is 1 x N and np.where yields a
    # (row_indices, column_indices) pair; the columns are the action ids.
    index_axes = np.where(allowed_mask)
    return index_axes[1]

def next_action(available_actions):
    """Pick one action uniformly at random from ``available_actions``.

    Args:
        available_actions: Non-empty 1-D sequence/array of action indices.
            (The parameter name shadows the module-level function of the
            same name inside this body only.)

    Returns:
        The chosen action as a plain Python ``int``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when the sequence
            is empty.
    """
    # Draw a scalar directly. The previous form requested a length-1 array and
    # passed it to int(), a conversion deprecated since NumPy 1.25.
    return int(np.random.choice(available_actions))

In [22]:
available_actions(3)

array([1, 2, 4])

In [23]:
available_action = available_actions(initial_state)

In [24]:
available_action

array([1, 2, 4])

In [25]:
action = next_action(available_action)

In [26]:
# Q-learning update rule used below (0.8 is the discount factor, gamma):
# Q(state, action) = R(state, action) + 0.8 * Max[Q(next_state, all_actions)]

In [27]:
def update_q_matrix(current_state, action, gamma=0.8):
    """Apply one Q-learning update to the global ``Q`` table in place.

    Implements ``Q(s, a) = R(s, a) + gamma * max(Q(next_state, :))``. The row
    ``Q[action]`` is used as the next state's row, i.e. taking ``action``
    moves the agent to the state with the same index.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action taken (also the next state's row).
        gamma: Discount factor applied to the best next-state value.
            Defaults to 0.8, the value previously hard-coded here.

    Returns:
        None. ``Q[current_state, action]`` is overwritten.
    """
    # Every index that ties for the maximum holds the same value, so the max
    # can be read directly. This drops the random tie-break and the int() call
    # on a length-1 array, which is deprecated since NumPy 1.25.
    best_next_value = np.max(Q[action])
    Q[current_state, action] = R[current_state, action] + gamma * best_next_value

In [28]:
update_q_matrix(initial_state, action)

In [29]:
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [30]:
# Train the Q-table: 10000 independent one-step updates, each starting from a
# uniformly random state and taking a uniformly random allowed action.
for i in range(10000):
    # randint's upper bound is exclusive, so this covers every row of Q.
    current_state = np.random.randint(Q.shape[0])
    available_action = available_actions(current_state)
    action = next_action(available_action)
    update_q_matrix(current_state, action)

In [31]:
Q

matrix([[  0.,   0.,   0.,   0., 400.,   0.],
        [  0.,   0.,   0., 320.,   0., 500.],
        [  0.,   0.,   0., 320.,   0.,   0.],
        [  0., 400., 256.,   0., 400.,   0.],
        [320.,   0.,   0., 320.,   0., 500.],
        [  0., 400.,   0.,   0., 400., 500.]])

In [32]:
# normalize trained Q matrix
(Q / np.max(Q)) * 100

matrix([[  0. ,   0. ,   0. ,   0. ,  80. ,   0. ],
        [  0. ,   0. ,   0. ,  64. ,   0. , 100. ],
        [  0. ,   0. ,   0. ,  64. ,   0. ,   0. ],
        [  0. ,  80. ,  51.2,   0. ,  80. ,   0. ],
        [ 64. ,   0. ,   0. ,  64. ,   0. , 100. ],
        [  0. ,  80. ,   0. ,   0. ,  80. , 100. ]])

In [36]:
# Testing the model. Goal state=5
# Follow the greedy policy: from each state, move to the column holding the
# largest Q-value, until the goal state is reached.
# NOTE(review): this presumes Q has already been trained; with a poorly trained
# table the walk may cycle instead of terminating.
goal_state = 5
current_state = 2 # place an agent in any room
steps = []
while current_state != goal_state:
    # Column indices that tie for the best Q-value in the current row.
    next_step_index = np.where(Q[current_state] == np.max(Q[current_state]))[1]
    if next_step_index.shape[0] > 1:
        # Break ties at random. Draw a scalar rather than a length-1 array so
        # int() never receives an ndarray (deprecated since NumPy 1.25).
        next_step_index = int(np.random.choice(next_step_index))
    else:
        # Index the single candidate explicitly instead of int(ndarray).
        next_step_index = int(next_step_index[0])
    steps.append(next_step_index)
    # The chosen action index is also the state the agent moves to.
    current_state = next_step_index
print('Best policy returned: ', steps)

Best policy returned:  [3, 4, 5]


In [4]:
# ! pip install gym
import gym

In [5]:
# Run one MountainCar episode while always sending the same action.
env = gym.make('MountainCar-v0')
env.reset()
# Constant action used on every step (presumably "push right" - confirm
# against the Gym MountainCar documentation).
action = 2
done = False
while True:
    # env.step is unpacked as a 4-tuple here; the last item is named `info`
    # in the Taxi cells of this notebook.
    next_state, reward, done, err = env.step(action)
    #env.render()
    if done:
        break

In [6]:
# Now apply the formula to calculate Q values
# Q(State, Action) = R(State, Action) + Gamma * Max[Q(next state, all actions)]

In [8]:
# Build the Q-table
# Number of buckets per observation dimension: 20 for every entry of the
# observation vector (its length is taken from observation_space.high).
DESCRETE_OBSERVATION_SPACE_SIZE = [20 for _ in env.observation_space.high]

In [9]:
DESCRETE_OBSERVATION_SPACE_SIZE

[20, 20]

In [11]:
# Width of a single bucket along each observation dimension: the full range
# (high - low) divided element-wise by the bucket count for that dimension.
descrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DESCRETE_OBSERVATION_SPACE_SIZE

In [12]:
descrete_os_win_size

array([0.09 , 0.007])

In [13]:
# Create Q-table
# Actions --> 0, 1, 2
import numpy as np
# One Q-value per (bucket along each observation dimension, action), drawn
# uniformly from [-1, 0).
q_table = np.random.uniform(
    low=-1,
    high=0,
    size=tuple(DESCRETE_OBSERVATION_SPACE_SIZE) + (env.action_space.n,),
)

In [14]:
q_table.shape

(20, 20, 3)

In [1]:
import gym
import random
# Create the Taxi-v3 environment and keep the object exposed by `.env`
# (presumably the raw environment without Gym's wrapper, e.g. no step limit -
# TODO confirm against the installed Gym version).
streets = gym.make("Taxi-v3").env
# Draw the current grid.
streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |B: |
+---------+



In [2]:
# Encode a specific configuration into a single integer state id.
# NOTE(review): the arguments look like (taxi row, taxi column, passenger
# location index, destination index); this matches the render below (taxi at
# row 2 / column 3) - confirm against the Taxi-v3 docs.
initial_state = streets.encode(2,3,2,0)
# Force the environment into that state and draw it.
streets.s = initial_state
streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [3]:
import numpy as np
# Tabular Q-learning for the taxi: one row per state, one column per action.
q_table = np.zeros((streets.observation_space.n, streets.action_space.n))
learning_rate = 0.1      # weight given to the newly observed estimate
discount_factor = 0.6    # weight given to the best value of the next state
exploration = 0.1        # probability of taking a random action

for taxi_run in range(10000):
    state = streets.reset()
    done = False

    while not done:
        # Epsilon-greedy choice: exploit the table unless the draw falls
        # below the exploration rate.
        random_val = random.uniform(0, 1)
        if random_val >= exploration:
            action = np.argmax(q_table[state])  # use action with highest q-value
        else:
            action = streets.action_space.sample()  # explore a random action

        next_state, reward, done, info = streets.step(action)

        # Blend the old estimate with the reward plus the discounted best
        # value reachable from the next state.
        prev_q = q_table[state, action]
        next_max_q = q_table[next_state].max()
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q

        state = next_state

In [None]:
from IPython.display import clear_output
from time import sleep
# Replay ten trips using the learned table greedily, redrawing the grid in
# place after every move.
for tripnum in range(1, 11):
    state = streets.reset()
    done = False
    while True:
        if done:
            break
        # Always take the best-known action for the current state.
        action = q_table[state].argmax()
        next_state, reward, done, info = streets.step(action)
        # Replace the previous frame instead of appending a new one.
        clear_output(wait=True)
        print('Trip number: ' + str(tripnum))
        print(streets.render(mode='ansi'))
        sleep(0.5)
        state = next_state
    # Pause on the final frame before the next trip starts.
    sleep(2)

Trip number: 5
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

