In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

In [6]:
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames, filename="movie_cartpole.mp4", interval=50):
    """Animate a sequence of image frames, save the movie, and show it inline.

    Despite the function name, the file written is whatever ``filename``
    says (an ``.mp4`` by default).

    Args:
        frames: Non-empty sequence of image arrays accepted by ``plt.imshow``.
            The figure is sized from ``frames[0].shape``.
        filename: Path handed to ``anim.save``. Defaults to the previously
            hard-coded ``"movie_cartpole.mp4"``.
        interval: Delay between frames in milliseconds, forwarded to
            ``FuncAnimation``. Defaults to the previously hard-coded 50.

    Raises:
        ValueError: If ``frames`` is empty (there is no first frame to size
            the figure from).
    """
    if len(frames) == 0:
        raise ValueError("frames must contain at least one image")

    height, width = frames[0].shape[0], frames[0].shape[1]
    # figsize is given in inches; dividing by the dpi (72) makes one image
    # pixel correspond to one figure pixel.
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis("off")

    def animate(i):
        # Swap the displayed image for the i-th captured frame.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=interval)

    anim.save(filename)
    display(display_animation(anim, default_mode="loop"))

In [None]:
# Random movement in CartPole: drive the cart with uniformly random actions
# and capture one RGB frame per step so the run can be animated afterwards.

frames = []
env = gym.make("CartPole-v0")
observation = env.reset()

for step in range(0, 200):
    frames.append(env.render(mode="rgb_array"))
    action = np.random.choice(2)
    observation, reward, done, info = env.step(action)
    if done:
        # Stepping an environment after it reported done is undefined in gym,
        # so start a fresh episode and keep collecting frames.
        observation = env.reset()

In [None]:
# Save the frames captured during the random run as a movie and show the animation inline.
display_frames_as_gif(frames)

In [None]:
# Digitize state: create the environment whose continuous observation is discretized below.

ENV = "CartPole-v0"  # gym environment id passed to gym.make
NUM_DIZITIZED = 6  # number of discrete levels used per observation variable

env = gym.make(ENV)
observation = env.reset()

In [None]:
# Inspect the initial observation returned by env.reset().
observation

In [None]:
def bins(clip_min, clip_max, num):
    """Return the interior edges that split [clip_min, clip_max] into ``num`` equal bins.

    ``num + 1`` evenly spaced points are generated and the two end points are
    dropped, leaving ``num - 1`` thresholds suitable for ``np.digitize``.
    """
    edges = np.linspace(clip_min, clip_max, num + 1)
    interior = edges[1:-1]
    return interior

In [None]:
def digitize_state(observation):
    """Convert a continuous CartPole observation into one discrete state index.

    Each of the four variables is binned with ``np.digitize`` against the
    thresholds produced by ``bins`` and the resulting levels are combined as
    the digits of a base-``NUM_DIZITIZED`` number, the first variable being
    the least significant digit.

    Args:
        observation: Iterable of exactly four values, unpacked as
            (cart_pos, cart_v, pole_angle, pole_v).

    Returns:
        The combined integer state index.

    Raises:
        ValueError: If ``observation`` does not unpack into four values.
    """
    # Fixed: this line previously read the misspelled name `observaton`,
    # so every call failed with NameError.
    cart_pos, cart_v, pole_angle, pole_v = observation
    digitized = [
        np.digitize(cart_pos, bins=bins(-2.4, 2.4, NUM_DIZITIZED)),
        np.digitize(cart_v, bins=bins(-3.0, 3.0, NUM_DIZITIZED)),
        np.digitize(pole_angle, bins=bins(-0.5, 0.5, NUM_DIZITIZED)),
        np.digitize(pole_v, bins=bins(-2.0, 2.0, NUM_DIZITIZED))
    ]
    return sum(level * (NUM_DIZITIZED**place) for place, level in enumerate(digitized))

In [7]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

# Settings shared by the Agent, Brain and Environment classes below.
ENV = "CartPole-v0"  # gym environment id passed to gym.make
NUM_DIZITIZED = 6  # number of discrete levels used per observation variable
GAMMA = 0.99  # discount factor applied to the best next-state Q-value
ETA = 0.5  # learning rate (step size) of the Q-table update
MAX_STEPS = 200  # upper bound on the number of steps in one episode
NUM_EPISODES = 1000  # upper bound on the number of training episodes

In [8]:
class Agent:
    """Thin facade over ``Brain``: the environment talks to the agent, the agent delegates."""

    def __init__(self, num_states, num_actions):
        """Create the underlying ``Brain`` with the given state/action counts."""
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        """Forward one observed transition to the brain's Q-table update."""
        transition = (observation, action, reward, observation_next)
        self.brain.update_Q_table(*transition)

    def get_action(self, observation, step):
        """Return the action chosen by the brain for ``observation``.

        ``step`` is passed through unchanged as the second argument of
        ``Brain.decide_action`` (which names that parameter ``episode``).
        """
        return self.brain.decide_action(observation, step)

In [9]:
class Brain:
    """Tabular Q-learning core: discretizes observations and owns the Q-table."""

    def __init__(self, num_states, num_actions):
        """Allocate a Q-table filled with uniform random values in [0, 1).

        Args:
            num_states: Number of observation variables; the table has
                ``NUM_DIZITIZED ** num_states`` rows.
            num_actions: Number of discrete actions (table columns).
        """
        self.num_actions = num_actions

        table_shape = (NUM_DIZITIZED**num_states, num_actions)
        self.q_table = np.random.uniform(low=0, high=1, size=table_shape)

    def bins(self, clip_min, clip_max, num):
        """Return the ``num - 1`` interior edges splitting [clip_min, clip_max] into ``num`` bins."""
        edges = np.linspace(clip_min, clip_max, num + 1)
        return edges[1:-1]

    def digitize_state(self, observation):
        """Map a four-value observation to a single Q-table row index.

        Every variable is binned into ``NUM_DIZITIZED`` levels; the levels are
        combined as digits of a base-``NUM_DIZITIZED`` number with the first
        variable as the least significant digit.
        """
        cart_pos, cart_v, pole_angle, pole_v = observation
        # (value, lower clip, upper clip) for each observation variable.
        clipped = (
            (cart_pos, -2.4, 2.4),
            (cart_v, -3.0, 3.0),
            (pole_angle, -0.5, 0.5),
            (pole_v, -2.0, 2.0),
        )
        index = 0
        for place, (value, low, high) in enumerate(clipped):
            level = np.digitize(value, bins=self.bins(low, high, NUM_DIZITIZED))
            index += level * (NUM_DIZITIZED**place)
        return index

    def update_Q_table(self, observation, action, reward, observation_next):
        """Apply one Q-learning update for the transition that was just observed.

        Q(s, a) <- Q(s, a) + ETA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))

        NOTE(review): the bootstrap term is applied for every transition,
        including ones that ended the episode -- confirm this is intended.
        """
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        best_next = max(self.q_table[state_next])
        current = self.q_table[state, action]
        target = reward + GAMMA * best_next
        self.q_table[state, action] = current + ETA * (target - current)

    def decide_action(self, observation, episode):
        """Pick an action epsilon-greedily, with epsilon = 0.5 / (episode + 1)."""
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))

        draw = np.random.uniform(0, 1)
        if epsilon <= draw:
            # Exploit: take the action with the highest Q-value in this state.
            return np.argmax(self.q_table[state])
        # Explore: take a uniformly random action.
        return np.random.choice(self.num_actions)

In [16]:
class Environment:
    """Owns the gym environment and the agent, and drives the training loop."""

    def __init__(self):
        """Create the gym environment named by ``ENV`` and an agent sized to it."""
        self.env = gym.make(ENV)
        observation_dim = self.env.observation_space.shape[0]
        action_count = self.env.action_space.n
        self.agent = Agent(observation_dim, action_count)

    def run(self):
        """Train for up to ``NUM_EPISODES`` episodes of at most ``MAX_STEPS`` steps.

        Rewards are shaped here (the environment's own reward is discarded):
        0 for a non-terminal step, -1 when the episode ends before step index
        195, +1 when it ends at step index 195 or later. After 10 consecutive
        +1 endings, one more episode is run with frame capture, the frames are
        handed to ``display_frames_as_gif`` and training stops.
        """
        success_streak = 0
        record_final = False
        captured = []

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()

            for step in range(MAX_STEPS):
                # Frames are only collected during the final showcase episode.
                if record_final:
                    captured.append(self.env.render(mode="rgb_array"))

                action = self.agent.get_action(observation, episode)
                observation_next, _, done, _ = self.env.step(action)

                if not done:
                    reward = 0
                elif step < 195:
                    # Ended early: penalize and restart the success streak.
                    reward = -1
                    success_streak = 0
                else:
                    reward = 1
                    success_streak += 1

                self.agent.update_Q_function(observation, action, reward, observation_next)
                observation = observation_next

                if done:
                    print("{0} Episode: Finished after {1} time steps".format(episode, step + 1))
                    break

            if record_final:
                display_frames_as_gif(captured)
                break

            if success_streak >= 10:
                print("great!")
                record_final = True

In [17]:
# Build the environment/agent pair and run the Q-learning training loop.
cartpole_env = Environment()
cartpole_env.run()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0 Episode: Finished after 15 time steps
1 Episode: Finished after 11 time steps
2 Episode: Finished after 10 time steps
3 Episode: Finished after 24 time steps
4 Episode: Finished after 9 time steps
5 Episode: Finished after 17 time steps
6 Episode: Finished after 10 time steps
7 Episode: Finished after 9 time steps
8 Episode: Finished after 8 time steps
9 Episode: Finished after 11 time steps
10 Episode: Finished after 10 time steps
11 Episode: Finished after 10 time steps
12 Episode: Finished after 10 time steps
13 Episode: Finished after 9 time steps
14 Episode: Finished after 10 time steps
15 Episode: Finished after 9 time steps
16 Episode: Finished after 10 time steps
17 Episode: Finished after 10 time steps
18 Episode: Finished after 10 time steps
19 Episode: Finished after 9 time steps
20 Episode: Finished after 10 time steps
21 Episode: Finished after 10 time steps
22 Epi

288 Episode: Finished after 10 time steps
289 Episode: Finished after 10 time steps
290 Episode: Finished after 9 time steps
291 Episode: Finished after 10 time steps
292 Episode: Finished after 10 time steps
293 Episode: Finished after 10 time steps
294 Episode: Finished after 11 time steps
295 Episode: Finished after 9 time steps
296 Episode: Finished after 11 time steps
297 Episode: Finished after 9 time steps
298 Episode: Finished after 9 time steps
299 Episode: Finished after 9 time steps
300 Episode: Finished after 10 time steps
301 Episode: Finished after 10 time steps
302 Episode: Finished after 9 time steps
303 Episode: Finished after 10 time steps
304 Episode: Finished after 9 time steps
305 Episode: Finished after 9 time steps
306 Episode: Finished after 8 time steps
307 Episode: Finished after 10 time steps
308 Episode: Finished after 10 time steps
309 Episode: Finished after 10 time steps
310 Episode: Finished after 9 time steps
311 Episode: Finished after 10 time steps
31

546 Episode: Finished after 10 time steps
547 Episode: Finished after 10 time steps
548 Episode: Finished after 8 time steps
549 Episode: Finished after 9 time steps
550 Episode: Finished after 9 time steps
551 Episode: Finished after 9 time steps
552 Episode: Finished after 10 time steps
553 Episode: Finished after 11 time steps
554 Episode: Finished after 10 time steps
555 Episode: Finished after 9 time steps
556 Episode: Finished after 10 time steps
557 Episode: Finished after 10 time steps
558 Episode: Finished after 10 time steps
559 Episode: Finished after 9 time steps
560 Episode: Finished after 10 time steps
561 Episode: Finished after 10 time steps
562 Episode: Finished after 11 time steps
563 Episode: Finished after 10 time steps
564 Episode: Finished after 9 time steps
565 Episode: Finished after 9 time steps
566 Episode: Finished after 9 time steps
567 Episode: Finished after 8 time steps
568 Episode: Finished after 9 time steps
569 Episode: Finished after 9 time steps
570 

775 Episode: Finished after 9 time steps
776 Episode: Finished after 9 time steps
777 Episode: Finished after 9 time steps
778 Episode: Finished after 10 time steps
779 Episode: Finished after 10 time steps
780 Episode: Finished after 9 time steps
781 Episode: Finished after 10 time steps
782 Episode: Finished after 9 time steps
783 Episode: Finished after 9 time steps
784 Episode: Finished after 8 time steps
785 Episode: Finished after 9 time steps
786 Episode: Finished after 10 time steps
787 Episode: Finished after 11 time steps
788 Episode: Finished after 10 time steps
789 Episode: Finished after 9 time steps
790 Episode: Finished after 10 time steps
791 Episode: Finished after 8 time steps
792 Episode: Finished after 10 time steps
793 Episode: Finished after 10 time steps
794 Episode: Finished after 9 time steps
795 Episode: Finished after 10 time steps
796 Episode: Finished after 8 time steps
797 Episode: Finished after 9 time steps
798 Episode: Finished after 9 time steps
799 Ep