# Install Required Packages

In [1]:
# %%capture
!pip install gym
!pip install imageio-ffmpeg
!pip install pyglet




# Import Required Packages

In [1]:
# from gym import wrappers
import numpy as np
import gym
import time

In [2]:
iteration_max = 10_000      # number of training episodes run by the outer learning loop
test_max = 2_000            # cap on environment steps attempted within one episode

min_learning_rate = 3e-2    # floor below which the decaying learning rate never drops
eps = 0.2                   # probability of taking a random action instead of the greedy one

In [3]:
def obs_to_state(obs):
    """Discretise an observation into a tuple of three bin indices.

    Components 0, 1 and 2 of ``obs`` are binned with ``np.digitize`` against
    the module-level edge arrays ``Sample_cos_theta``, ``Sample_sin_theta``
    and ``Sample_theta_dot`` respectively (these must be defined before the
    first call).

    Returns:
        tuple[int, int, int]: one bin index per component, usable directly
        as an index into the Q-table.
    """
    bin_edges = (Sample_cos_theta, Sample_sin_theta, Sample_theta_dot)
    return tuple(
        int(np.digitize(obs[position], edges))
        for position, edges in enumerate(bin_edges)
    )

In [4]:
if __name__ == '__main__':
    # Train a tabular Q-learning agent on Pendulum-v1.
    #
    # The observation is discretised by obs_to_state() using the bin-edge
    # arrays built below, and the torque is restricted to the discrete values
    # in Sample_out. Each episode's total reward is appended to `scores`.

    env = gym.make('Pendulum-v1')
    # Seed both the environment and NumPy so runs are repeatable.
    env.seed(0)
    np.random.seed(0)

    print('----- Start Learning -----')

    # Bin edges for each observation component. The lowest edge is dropped
    # ([1:]) so that np.digitize returns indices in 0..len(edges), i.e.
    # len(edges) + 1 distinct bins per component.
    Sample_cos_theta = np.around(np.arange(env.observation_space.low[0], env.observation_space.high[0], 0.1), 1)[1:]
    Sample_sin_theta = Sample_cos_theta
    Sample_theta_dot = np.around(np.arange(env.observation_space.low[2], env.observation_space.high[2], 1), 0)[1:]

    # Discrete set of torques the agent may choose from.
    Sample_out = np.around(np.arange(-2, 2.2, 0.2), 1)

    # One Q-value per (cos bin, sin bin, theta_dot bin, action). Each state
    # axis is sized from its own edge array rather than reusing the cos axis.
    q_state_table = np.zeros((
        len(Sample_cos_theta) + 1,
        len(Sample_sin_theta) + 1,
        len(Sample_theta_dot) + 1,
        len(Sample_out),
    ))

    scores = []

    for i in range(iteration_max):
        obs = env.reset()
        new_state = obs_to_state(obs)
        total_reward = 0

        # Learning rate decays by a factor 0.85 every 100 episodes, floored
        # at min_learning_rate.
        alpha = max(min_learning_rate, 0.85 ** (i // 100))

        for j in range(test_max):
            current_state = new_state

            # Epsilon-greedy selection: explore with probability eps,
            # otherwise take the best known action for the current state.
            if np.random.random() < eps:
                action_index = np.random.randint(len(Sample_out))
            else:
                action_index = np.argmax(q_state_table[current_state])

            # Map the action index to its torque value.
            action = Sample_out[action_index]
            obs, reward, done, _ = env.step([action])
            total_reward += reward

            new_state = obs_to_state(obs)

            # Q-learning update. NOTE: the bootstrap term is undiscounted
            # (equivalent to a discount factor of 1).
            td_target = reward + np.max(q_state_table[new_state])
            q_state_table[current_state][action_index] += alpha * (
                td_target - q_state_table[current_state][action_index]
            )

            if done:
                break

        if i % 1000 == 0:
            print('Iteration %d -- alpha = %f Total reward = %d.' %(i, alpha, total_reward))

        # Record every episode's return (inside the loop, not just the last one).
        scores.append(total_reward)


----- Start Learning -----
Iteration 0 -- alpha = 1.000000 Total reward = -1517.
Iteration 1000 -- alpha = 0.196874 Total reward = -1219.
Iteration 2000 -- alpha = 0.038760 Total reward = -128.
Iteration 3000 -- alpha = 0.030000 Total reward = -1383.
Iteration 4000 -- alpha = 0.030000 Total reward = -1305.
Iteration 5000 -- alpha = 0.030000 Total reward = -870.
Iteration 6000 -- alpha = 0.030000 Total reward = -964.
Iteration 7000 -- alpha = 0.030000 Total reward = -899.
Iteration 8000 -- alpha = 0.030000 Total reward = -1163.
Iteration 9000 -- alpha = 0.030000 Total reward = -367.


In [5]:
    # Roll out one greedy episode with the learned Q-table while the Monitor
    # wrapper records results to ./Pendulum-v1_result, collecting the rendered
    # frames in `frames`.
    frames = []
    env = gym.wrappers.Monitor(env, "./Pendulum-v1_result", force=True)
    try:
        obs = env.reset()
        while True:
            # 'rgb_array' is the render mode that returns the frame as a
            # pixel array, so `frames` holds actual images.
            frames.append(env.render(mode='rgb_array'))
            state = obs_to_state(obs)
            # Always act greedily here: no exploration during evaluation.
            action_idx = np.argmax(q_state_table[state])
            obs, reward, done, _ = env.step([Sample_out[action_idx]])  # conversion index to value
            if done:
                print("done")
                # NOTE(review): presumably a short pause before closing the
                # viewer/recorder -- confirm whether it is still needed.
                time.sleep(1)
                break
    finally:
        # Close the environment even if the rollout raises, so the recorder
        # and render window are released.
        env.close()

done
