In [None]:

import gym
import time
# Drive CartPole with uniformly random actions for a fixed number of steps,
# rendering every frame.
env = gym.make('CartPole-v1')
try:
    obs = env.reset()
    for _ in range(1000):
        obs, reward, done, info = env.step(env.action_space.sample())
        env.render()
        time.sleep(0.001)  # short pause between rendered frames
        if done:
            # Episode finished: start a new one and keep its first observation.
            obs = env.reset()
finally:
    # Close the env even when stepping or rendering raised.
    env.close()

In [None]:
# List every environment registered in Gym
from gym import envs
# Collect the id of every registered environment spec, then print the ids in
# sorted order, each prefixed with its zero-based position.
env_names = [spec.id for spec in envs.registry.all()]
for i, name in enumerate(sorted(env_names)):
    print(i, name)

In [None]:
import gym
import atari_py
# Play a fixed number of CartPole episodes with random actions, rendering each
# step and reporting how long every episode lasted.
env = gym.make("CartPole-v0")
MAX_NUM_EPISODES = 13
MAX_STEPS_PER_EPISODE = 500
try:
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        for step in range(MAX_STEPS_PER_EPISODE):
            env.render()
            action = env.action_space.sample()
            # Send the action to the environment and receive the next state,
            # the reward and whether the episode is done.
            next_state, reward, done, info = env.step(action)
            obs = next_state
            # Truthiness check: an identity test against True would miss
            # truthy flags that are not the Python bool singleton.
            if done:
                print("\n Episode #{} ended in {} steps.".format(episode, step+1))
                break
finally:
    # Close the env even when an episode raised part-way through.
    env.close()

In [None]:
import gym
import numpy as np
# --- Training schedule ---
MAX_NUM_EPISODES = 50000
STEPS_PER_EPISODE = 200
# Step budget that the epsilon decay below is scaled against.
max_num_steps = MAX_NUM_EPISODES * STEPS_PER_EPISODE

# --- Q-learning update ---
ALPHA = 0.05  # learning rate
GAMMA = 0.98  # discount factor
NUM_DISCRETE_BINS = 30  # bins per observation dimension

# --- Exploration schedule ---
EPSILON_MIN = 0.005  # epsilon is no longer decayed once it is not above this
EPSILON_DECAY = 500 * EPSILON_MIN / max_num_steps  # subtracted from epsilon per action

class Q_Learner(object):
    """Tabular Q-learning agent over a uniformly discretized observation space.

    Every observation dimension is cut into ``NUM_DISCRETE_BINS`` equal-width
    bins spanning ``observation_space.low`` to ``observation_space.high``.
    The Q-table is allocated with exactly two observation axes followed by one
    action axis.
    """

    def __init__(self, env):
        """Size the Q-table from ``env``'s spaces and set the hyperparameters.

        Args:
            env: Environment exposing ``observation_space`` (with ``shape``,
                ``high`` and ``low``) and ``action_space.n``.
        """
        self.obs_shape = env.observation_space.shape
        self.obs_high = env.observation_space.high
        self.obs_low = env.observation_space.low
        self.obs_bins = NUM_DISCRETE_BINS
        self.bin_width = (self.obs_high - self.obs_low) / self.obs_bins
        self.action_shape = env.action_space.n  # number of discrete actions
        # Shape is (obs_bins + 1) x (obs_bins + 1) x n_actions; the extra slot
        # per axis holds observations sitting at the upper bound.
        self.Q = np.zeros((self.obs_bins + 1, self.obs_bins + 1, self.action_shape))
        self.alpha = ALPHA  # Learning rate
        self.gamma = GAMMA  # Discount factor
        self.epsilon = 1.0  # Exploration probability, decayed in get_action

    def discretize(self, obs):
        """Map a continuous observation to a tuple of integer bin indices."""
        return tuple(((obs - self.obs_low) / self.bin_width).astype(int))

    def get_action(self, obs):
        """Pick an action for ``obs`` with an epsilon-greedy rule.

        Epsilon is reduced by ``EPSILON_DECAY`` on each call while it is above
        ``EPSILON_MIN``. An action is returned on every call, including after
        epsilon has reached its floor.

        Returns:
            The greedy action index with probability ``1 - epsilon``,
            otherwise a uniformly random action index.
        """
        discretized_obs = self.discretize(obs)
        # Only the decay is conditional on the floor; the selection below must
        # run on every call, otherwise no action is returned once decay stops.
        if self.epsilon > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        if np.random.random() > self.epsilon:
            return np.argmax(self.Q[discretized_obs])
        # Explore: uniform draw over the action indices 0 .. action_shape - 1.
        return np.random.choice(self.action_shape)

    def learn(self, obs, action, reward, next_obs):
        """Apply one Q-learning update for the transition that was observed.

        Args:
            obs: Observation the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_obs: Observation reached after the action.
        """
        discretized_obs = self.discretize(obs)
        discretized_next_obs = self.discretize(next_obs)
        # Bootstrapped target: reward plus discounted best value of next state.
        td_target = reward + self.gamma * np.max(self.Q[discretized_next_obs])
        td_error = td_target - self.Q[discretized_obs][action]
        self.Q[discretized_obs][action] += self.alpha * td_error
        
def train(agent, env, num_episodes=None, verbose=False):
    """Run Q-learning episodes on ``env`` and return the greedy policy table.

    Args:
        agent: Learner exposing ``get_action(obs)``,
            ``learn(obs, action, reward, next_obs)``, a 3-D ``Q`` array and
            an ``epsilon`` attribute (the latter only read when ``verbose``).
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, done, info)``.
        num_episodes: Number of episodes to run. ``None`` (the default) uses
            the module-level ``MAX_NUM_EPISODES``.
        verbose: When true, print one progress line per finished episode.

    Returns:
        ``np.argmax(agent.Q, axis=2)``: for each pair of observation bins, the
        index of the action with the highest Q-value.
    """
    if num_episodes is None:
        num_episodes = MAX_NUM_EPISODES
    best_reward = -float('inf')
    for episode in range(num_episodes):
        done = False
        obs = env.reset()
        total_reward = 0.0
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            total_reward += reward
        # Compare complete episode returns only; checking inside the step loop
        # would record partial sums instead of the best finished episode.
        if total_reward > best_reward:
            best_reward = total_reward
        if verbose:
            print("Episode#:{} reward:{} best_reward:{} eps:{}".format(episode, total_reward, best_reward, agent.epsilon))
    # Return the trained policy
    return np.argmax(agent.Q, axis=2)
               
def test(agent, env, policy):
        done = False
        obs = env.reset()
        total_reward = 0.0
        while not done:
            action = policy[agent.discretize(obs)]
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
        return total_reward
    
if __name__ == "__main__":
    # Train a Q-learning agent on MountainCar, then replay the learned policy
    # under the Monitor wrapper.
    env = gym.make('MountainCar-v0')
    try:
        agent = Q_Learner(env)
        learned_policy = train(agent, env)
        # Use the Gym Monitor wrapper to evaluate the agent and record video
        # NOTE(review): absolute, machine-specific output directory; change it
        # before running on another machine.
        gym_monitor_path = "/home/yero/Escritorio/gym"
        env = gym.wrappers.Monitor(env, gym_monitor_path, force=True)
        for _ in range(1000):
            test(agent, env, learned_policy)
    finally:
        # `env` is the Monitor wrapper once evaluation has started, otherwise
        # the raw environment; either way it is closed even after a failure.
        env.close()
