# Import `gym` first

In [1]:
import gym

# To set up a Reinforcement Learning problem in `gym`, call `gym.make()` with the name of the problem
- Returns an environment.

In [2]:
# Build the environment registered under the id "CartPole-v0"; the later cells all operate on `env`.
env = gym.make("CartPole-v0")

# To initialize the problem, call `env.reset()`
- Returns the initial observation of the Agent once the environment is initialized.
- 1st number: cart position (initialized to nearly the center)
- 2nd number: cart velocity (initialized to nearly zero; hardly moving)
- 3rd number: pole angle with the vertical (initialized to nearly vertical)
- 4th number: pole velocity at tip (initialized to nearly zero; hardly swinging)

In [3]:
# Start a fresh episode and keep the initial observation that `env.reset()` returns.
observation = env.reset()
# Show the four state values (cart position, cart velocity, pole angle, pole tip velocity).
print(observation)

[ 0.03619636  0.03696681 -0.02245762 -0.04815029]


# To take an action, call `env.step()` with the action as argument
- The first element returned by `env.step()` is the new environment state (the next observation).
- The second element returned by `env.step()` is called the reward. The reward is a judgement of the environment state and the action at that time step.
- The third element is called `done`. It is a `bool` that denotes whether the new `observation` is a terminal state.
    - It is `True` if it is a terminal state.
    - It is `False` if it is not a terminal state.
- The fourth element is a dictionary of auxiliary diagnostic information; it is not used in this notebook, so it is assigned to `_`.

In [4]:
# Advance the environment by one time step using action 0 and unpack the
# returned 4-tuple; the last element is not needed here and is discarded.
observation, reward, done, _ = env.step(0)

# Report the three values of interest, one per line.
print(
    "observation is {}".format(observation),
    "reward is {}".format(reward),
    "done is {}".format(done),
    sep="\n",
)

observation is [ 0.0369357  -0.15782604 -0.02342063  0.23736332]
reward is 1.0
done is False


# Some problems are inherently "episodic"
- Terminal states: states after which no further actions are allowed.
- Episode: the period from the first observation (`env.reset()`) to a terminal state.

# Terminal states in CartPole-v0
1. Terminal state 1: when the pole angle exceeds 12 degrees from the vertical (in either direction) for the first time ("Game over")
2. Terminal state 2: when the cart reaches the bounds of the X axis for the first time ("Game over")
3. Terminal state 3: when the episode reaches 200 time steps without hitting either of the above ("You win the game")

# The learning goal (balancing the pole without crashing for 200 time steps) is equivalent to maximizing the total reward accumulated over an episode

# In any episodic RL problem, the first step is to express the goal as the maximization of some reward function over an episode

# Taking actions for many episodes

In [8]:
# Play several complete episodes, applying action 0 at every time step and
# drawing each resulting state.
# Tip: to slow the animation down, `import time` and call `time.sleep(0.1)`
# right after `env.render()`.
num_episodes = 5
for _ in range(num_episodes):
    # Every episode starts from a freshly reset environment.
    observation = env.reset()
    done = False
    # Keep stepping until the environment reports a terminal state; the
    # terminal state itself is still rendered before the loop exits.
    while not done:
        observation, reward, done, _ = env.step(0)
        env.render()

# To close the visual representation, call `env.close()`

In [None]:
# Shut the environment down; this also closes the visual representation opened by `env.render()`.
env.close()