# OpenAI GYM CartPole-v1 problem

The description of the CartPole-v1 as given on the OpenAI gym website -

"""

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

"""

This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson.

We train an agent to solve OpenAI Gym's CartPole-v0 environment. The implementation uses the most recent version of the PyTorch framework for building deep learning models.

https://medium.com/@thechrisyoon/deriving-policy-gradients-and-implementing-reinforce-f887949bd63

- Actor Critic Methods (A2C) - OpenAI Gym CartPole-v0 
https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f

- Deep Deterministic Policy Gradients - OpenAI Gym  Pendulum-v0

https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b

In [1]:
import os
import gym
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

import matplotlib.pyplot as plt

In [2]:
class LinearDeepQNetwork(nn.Module):
    """Fully connected Q-network with one hidden layer of 128 units.

    The module also owns its Adam optimizer and MSE loss and moves itself to
    the first CUDA device when one is available, otherwise to the CPU.
    """

    def __init__(self, lr, n_actions, input_dims):
        super().__init__()

        # `input_dims` is unpacked, so it must be an iterable such as (4,).
        self.fc1 = nn.Linear(*input_dims, 128)
        self.fc2 = nn.Linear(128, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        if T.cuda.is_available():
            device_name = 'cuda:0'
        else:
            device_name = 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return one raw (un-activated) value per action for `state`."""
        hidden = T.relu(self.fc1(state))
        # The output layer stays linear because the values are regressed.
        return self.fc2(hidden)

In [3]:
class Agent():
    '''
    Epsilon-greedy Q-learning agent.

    The agent can choose actions, learn from a single transition at a time
    (there is no replay memory) and linearly decrement its exploration rate
    epsilon after every learning step.
    '''
    def __init__(self, input_dims, n_actions, lr=0.0001, gamma=0.99, epsilon=1.0, eps_dec=1e-5,
                eps_min=0.01):
        """
        Args:
            input_dims: iterable with the observation size, unpacked into the
                first linear layer of the Q-network.
            n_actions (int): number of discrete actions.
            lr (float): learning rate handed to the Q-network's optimizer.
            gamma (float): discount factor applied to the next-state value.
            epsilon (float): initial probability of taking a random action.
            eps_dec (float): amount subtracted from epsilon per learning step.
            eps_min (float): lower bound for epsilon.
        """
        self.lr = lr
        self.input_dims = input_dims
        self.n_actions  = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.action_space = list(range(self.n_actions))

        # Q-value function for the Agent
        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)

    def choose_action(self, observation):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.random() > self.epsilon:
            state = T.tensor(observation, dtype=T.float).to(self.Q.device)
            # Only the argmax is needed here, so no autograd graph is built.
            with T.no_grad():
                actions = self.Q.forward(state)
            # .item() converts the index tensor to a plain Python int
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def decrement_epsilon(self): # linear annealing
        """Subtract eps_dec from epsilon without ever going below eps_min."""
        # Clamp after subtracting: the previous "subtract while above the
        # minimum" form could step past eps_min on the final decrement.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self, state, action, reward, state_):
        """Run one gradient step on the transition (state, action, reward, state_).

        The target is reward + gamma * max_a Q(state_, a); epsilon is
        decremented afterwards.
        """
        self.Q.optimizer.zero_grad()
        states  = T.tensor(state, dtype=T.float).to(self.Q.device)
        actions = T.tensor(action).to(self.Q.device)
        # Explicit float dtype keeps the target in the network's precision
        # even when the reward arrives as a NumPy float64 scalar.
        rewards = T.tensor(reward, dtype=T.float).to(self.Q.device)
        states_ = T.tensor(state_, dtype=T.float).to(self.Q.device)

        # predicted value of the action actually taken in the current state
        q_pred = self.Q.forward(states)[actions]

        # value of the best action in the resulting state
        # NOTE(review): q_next is not detached, so gradients also flow through
        # the bootstrap target; there is also no terminal-state handling.
        # Confirm both are intended.
        q_next = self.Q.forward(states_).max()

        # target that the prediction is moved towards
        q_target = rewards + self.gamma * q_next

        loss = self.Q.loss(q_target, q_pred)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()

In [4]:
# Create the CartPole-v1 environment used by the random-agent cells below.
env = gym.make('CartPole-v1')

  result = entry_point.load(False)


In [None]:
## Cartpole random

In [None]:
# Running totals for the single episode played in the next cell.
total_reward = 0.0
total_steps = 0

obs = env.reset() # start a new episode and keep its initial observation
print(obs) # show the initial observation

In [None]:
# Play one episode with randomly sampled actions and accumulate the
# module-level totals until the environment reports the episode as done.
done = False
while not done:
    # sample an action from the action space and execute it
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)

    total_steps += 1
    total_reward += reward

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

### Random actionwrapper

In [None]:
import random


class RandomActionWrapper(gym.ActionWrapper):
    """Action wrapper that replaces the requested action with a random one.

    With probability `epsilon` the action passed in is discarded and an
    action sampled from the wrapped environment's action space is returned
    instead; otherwise the original action is passed through unchanged.
    """
    def __init__(self, env, epsilon=0.1):
        super(RandomActionWrapper, self).__init__(env)
        self.epsilon = epsilon

    def action(self, action):
        # `random` is imported in this cell because no earlier cell does so.
        if random.random() < self.epsilon:
            print("Random action taken!")
            return self.env.action_space.sample()
        # else original action taken
        return action

In [None]:
# Wrap a CartPole-v0 environment so that, with the wrapper's default
# epsilon, some of the requested actions are replaced by random ones.
env_cartpole = gym.make("CartPole-v0")
env = RandomActionWrapper(env_cartpole)

In [None]:
# Reset the running totals and start a new episode on the wrapped environment.
total_reward = 0.0
total_steps = 0

obs = env.reset()
print(obs)

In [None]:
# Play one episode that always requests action 0; the wrapper around `env`
# may substitute a random action. Totals accumulate until the episode ends.
done = False
while not done:
    # select predefined action (move left)
    action = 0
    obs, reward, done, _ = env.step(action)

    total_steps += 1
    total_reward += reward

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

### Cartpole random monitor

In [None]:
# Wrap CartPole-v0 in gym's Monitor wrapper, which writes into "log_recording".
env = gym.make("CartPole-v0")
env = gym.wrappers.Monitor(env, "log_recording", force=True) # output folder is "log_recording"

In [None]:
# Start an episode on the monitored environment and show its first observation.
obs = env.reset()
print(obs)

In [None]:
# Reset the episode counters first: this section never re-initialises them,
# so the totals left over from the previous episode would otherwise be
# added to the numbers reported below.
total_reward = 0.0
total_steps = 0

# run through one episode on the monitored environment
while True:
    # select random action from action space
    action = env.action_space.sample()

    obs, reward, done, _ = env.step(action)

    total_reward += reward
    total_steps += 1

    if done:
        break

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

## CartPole solved with Cross-Entropy

In [None]:
# Default configuration of the cross-entropy method.
HIDDEN_SIZE = 128  # hidden units of the network
BATCH_SIZE = 16    # episodes collected per training batch
PERCENTILE = 70 # reward percentile used as the elite cut-off: episodes below it are dropped (roughly the top 30% are kept)

In [None]:
class Net(nn.Module):
    """One-hidden-layer perceptron mapping an observation to raw action scores.

    No softmax is applied here; the output is one unnormalised score per
    action.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.net(x)
        return scores

In [None]:
from collections import namedtuple

# A single episode: its total undiscounted reward plus the list of
# EpisodeStep records collected while playing it.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step of an episode: the observation the agent saw and the action it
# took. Steps from elite episodes are used as training data.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

In [None]:
# Generates batches with episodes
def iterate_batches(env, net, batch_size):
    """Endlessly play episodes with `net` and yield them in batches.

    Args:
        env: environment providing `reset()` and `step(action)`.
        net: network returning one raw score per action for a batch of
            observations.
        batch_size (int): number of episodes per yielded batch.

    Yields:
        list of `Episode`: exactly `batch_size` finished episodes. The
        generator never terminates on its own; the caller stops iterating.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []

    obs = env.reset()

    # softmax turns the network's raw scores into action probabilities
    sm = nn.Softmax(dim=1)

    while True:
        # The file imports torch under the alias `T`; the bare name `torch`
        # is not bound until a much later cell.
        obs_v = T.FloatTensor([obs])
        # pass the current observation to the net and convert its output to
        # a probability distribution over actions
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        # sample the action from that distribution
        action = np.random.choice(len(act_probs), p=act_probs)

        next_obs, reward, is_done, _ = env.step(action)

        episode_reward += reward

        # remember the observation that led to the action, not its result
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:

            batch.append(Episode(reward=episode_reward, steps=episode_steps))

            episode_reward = 0.0
            episode_steps = []

            next_obs = env.reset()

            # hand the batch to the caller once it holds enough episodes
            if len(batch) == batch_size:
                yield batch
                batch = []
        # the observation obtained from the environment becomes the current one
        obs = next_obs
        
def filter_batch(batch ,percentile):
    '''
    Core of the cross-entropy method: keep only the elite episodes.

    From the given batch of episodes and the percentile value a boundary
    reward is calculated; episodes whose total reward is below that boundary
    are dropped and the steps of the remaining ones become training data.

    Args:
        batch: iterable of episodes exposing `.reward` and `.steps`, where
            every step exposes `.observation` and `.action`.
        percentile: percentile (0-100) of the rewards used as the boundary.

    Returns:
        tuple: (observations as a float tensor, actions as a long tensor,
        reward boundary, mean reward of the whole batch).
    '''
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile) # boundary reward
    reward_mean = float(np.mean(rewards)) # mean reward, used for monitoring

    train_obs = []
    train_act = []

    for example in batch:
        # skip episodes whose total reward is below the boundary
        if example.reward < reward_bound:
            continue
        # observations and actions from an elite episode
        train_obs.extend(step.observation for step in example.steps)
        train_act.extend(step.action for step in example.steps)

    # The file imports torch under the alias `T`; the bare name `torch` is
    # not bound until a much later cell.
    train_obs_v = T.FloatTensor(train_obs)
    train_act_v = T.LongTensor(train_act)

    # reward boundary and reward mean are only used to monitor the agent
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [None]:
# Environment used to train the cross-entropy agent.
env = gym.make("CartPole-v0")

In [None]:
To check the agent in action we enable Monitor to create videos recorded at different training steps.

In [None]:
# Wrap the training environment in gym's Monitor, writing into the "mon" directory.
env = gym.wrappers.Monitor(env, directory="mon", force=True)

In [None]:
# Network input and output sizes, taken from the environment's spaces.
obs_size  = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
# one-hidden-layer neural network with ReLU and HIDDEN_SIZE hidden neurons
net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss() # takes the raw scores (logits) and applies log-softmax itself, so the net outputs no probabilities
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

In [None]:
# No visible cell imports SummaryWriter, so it is imported here. PyTorch's
# bundled TensorBoard writer is used; it needs the `tensorboard` package.
from torch.utils.tensorboard import SummaryWriter

# Writer that receives the loss and reward scalars logged by the training loop.
writer = SummaryWriter(comment="-cartpole")

The training of the neural network and the generation of the episodes are performed at the same time. They are not completely in parallel, but every time the loop accumulates enough episodes (16), it passes control to the function supposed to train the network using the gradient descent. The network will have different, slightly better behavior, hopefully.

We do not need to worry about synchronization, because training and data gathering are performed in the same thread of execution, but we do need to understand the constant jumps between training the network and using it to generate episodes.

In [None]:
# Training loop: iterate over batches of episodes and filter the elite
# episodes of every batch. The result is the observations and taken actions
# to train on, plus the reward boundary used for filtering and the mean
# reward of the batch.
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    # pass the observations to the network to obtain its action scores
    action_scores_v = net(obs_v)
    # the objective computes the cross-entropy between the network output
    # and the actions the agent actually took in the elite episodes
    loss_v = objective(action_scores_v, acts_v)

    loss_v.backward()
    optimizer.step()

    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))

    # record the learning progress: loss, reward boundary and mean reward
    # of the batch, indexed by iteration number
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean",  reward_m, iter_no)

    # stop once the mean reward of the batch exceeds 199
    if reward_m > 199:
        print("Solved!")
        break

# Close the writer exactly once, after training. Inside the loop it was
# closed after every iteration and never reached on the `break` path.
writer.close()

env.close()
env.env.close()

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier

%matplotlib inline

In [None]:
from IPython.display import clear_output

In [None]:
#if you see "<classname> has no attribute .env", remove .env or update gym
# Use the `.env` attribute of the created environment.
# NOTE(review): presumably this unwraps gym's outer wrapper (e.g. the step limit) - confirm.
env = gym.make("CartPole-v0").env

In [None]:
# Reset the environment; the returned initial observation is the cell's value.
env.reset()

In [None]:
# Reset again to get another initial observation.
env.reset()

In [None]:
#plt.imshow(env.render("rgb_array"))

In [None]:
# Create the agent: a scikit-learn MLP classifier with two hidden layers of
# 20 units that is refitted incrementally on elite sessions.
agent = MLPClassifier(hidden_layer_sizes=(20,20),
                      activation='tanh',
                      warm_start=True, # keep progress between .fit(...) calls
                      max_iter=1 # make only 1 iteration on each .fit(...)
                     )

In [None]:
# Placeholder features: a single reset observation repeated n_actions times.
X_train = [env.reset()]*n_actions

In [None]:
# Placeholder labels: every action index from 0 to n_actions - 1 exactly once.
y_train = list(range(n_actions))

In [None]:
# Initialize the agent with the state dimension and the number of actions by
# fitting it once on the placeholder data built above.
#agent.fit([env.reset()]*n_actions, list(range(n_actions)));
agent.fit(X_train, y_train)

In [None]:
def generate_session(t_max=1000):
    """Play one session with the global `agent` on the global `env`.

    Args:
        t_max (int): maximum number of steps before the session is cut off.

    Returns:
        tuple: (states, actions, total_reward) where `states[i]` is the state
        in which `actions[i]` was taken and `total_reward` is the sum of all
        rewards received.
    """
    states, actions = [], []
    total_reward = 0

    s = env.reset()

    for _ in range(t_max):
        # a vector of action probabilities in the current state
        probs = agent.predict_proba([s])[0]

        # Sample an action index with those probabilities. The number of
        # actions follows the length of `probs` instead of a hard-coded 2.
        a = np.random.choice(len(probs), p=probs)

        new_s, r, done, _info = env.step(a)

        # record the transition
        states.append(s)
        actions.append(a)
        total_reward += r

        s = new_s
        if done:
            break
    return states, actions, total_reward

In [None]:
#epsilon = 0.5
#for i in range(1000):
#    session_rewards = [generate_session(epsilon=epsilon, train=True) for _ in range(100)]
#    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(i, np.mean(session_rewards), epsilon))
    
#    epsilon *= 0.99
#    assert epsilon >= 1e-4, "Make sure epsilon is always nonzero during training"
    
#    if np.mean(session_rewards) > 300:
#        print ("You Win!")
#        break

### Cross-entropy method (CEM) steps

Deep CEM uses exactly the same strategy as the regular CEM. The only difference is that now each observation is not a number but a float32 vector.

In [None]:
def select_elites(states_batch,actions_batch,rewards_batch,percentile=50):
    """
    Select states and actions from sessions whose reward reaches the percentile.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of total rewards, rewards_batch[session_i]
    :param percentile: percentile (0-100) of rewards_batch used as threshold

    :returns: elite_states, elite_actions, both flat lists of states and the
        respective actions from elite sessions, in their original order
        (sorted by session number and timestep within the session).

    A session is elite when its reward is >= the threshold. States are not
    assumed to be integers.
    """
    reward_threshold = np.percentile(rewards_batch, percentile)

    # Decide once per session whether it is elite, then flatten both batches
    # with the same decisions.
    keep_session = [reward >= reward_threshold for reward in rewards_batch]

    def _flatten_elite(per_session_items):
        flat = []
        for keep, items in zip(keep_session, per_session_items):
            if keep:
                flat.extend(items)
        return flat

    elite_states = _flatten_elite(states_batch)
    elite_actions = _flatten_elite(actions_batch)

    return elite_states, elite_actions

In [None]:
## Training loop

Generate sessions, select N best and fit to those.

In [None]:
def show_progress(batch_rewards, log, percentile, reward_range=(-990, +10)):
    """
    Display training progress as two charts and a one-line summary.

    Args:
        batch_rewards: total rewards of the sessions in the current batch.
        log (list): history of [mean_reward, threshold] pairs; the values for
            the current batch are appended to it in place.
        percentile: percentile (0-100) of the rewards used as threshold.
        reward_range: (low, high) range of the reward histogram.

    Returns:
        Whatever `plt.show()` returns.
    """
    mean_reward = np.mean(batch_rewards)
    # computed once and reused for the log, the message and the marker line
    threshold = np.percentile(batch_rewards, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward, threshold))
    plt.figure(figsize=[8,4])

    # left: history of mean rewards and thresholds over all logged batches
    mean_history, threshold_history = zip(*log)
    plt.subplot(1,2,1)
    plt.plot(mean_history, label='Mean rewards')
    plt.plot(threshold_history, label='Reward thresholds')
    plt.legend()
    plt.grid()

    # right: reward histogram of the current batch with the threshold marked
    plt.subplot(1,2,2)
    plt.hist(batch_rewards, range=reward_range);
    plt.vlines([threshold], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    return plt.show()

In [None]:
n_sessions = 100  # sessions generated per training iteration
percentile = 70   # reward percentile used to select elite sessions

In [None]:
%%time
log = []

for i in range(100):
    # generate a list of n_sessions new sessions
    sessions = [generate_session() for _ in range(n_sessions)]

    # Split the sessions into per-session states, actions and rewards.
    # They stay plain tuples: sessions differ in length, and converting such
    # ragged data with np.array raises ValueError on recent NumPy versions.
    batch_states, batch_actions, batch_rewards = zip(*sessions)

    # select the elite states and actions of this batch
    elite_states, elite_actions = select_elites(batch_states,
                                                batch_actions,
                                                batch_rewards,
                                                percentile=percentile)

    # fit the agent to predict elite_actions (y) from elite_states (X)
    agent.fit(elite_states, elite_actions)

    show_progress(batch_rewards, log, percentile, reward_range=[0,np.max(batch_rewards)])

    # training deliberately continues; the message only informs the user
    if np.mean(batch_rewards)> 190:
        print("You Win! You may stop training now via KeyboardInterrupt.")

In [None]:
#record sessions
import gym.wrappers

# Rebind the global `env` to a monitored CartPole-v0 writing into "videos",
# so that generate_session(), which reads the global `env`, plays on it.
env = gym.wrappers.Monitor(gym.make("CartPole-v0"), directory="videos", force=True)
sessions = [generate_session() for _ in range(100)]

In [None]:
# Close the monitored environment.
env.close()

## Show video

In [None]:
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order, so sort them to make the
# selection below deterministic.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) # last name in sorted order; try other indices for other videos

## Solving CartPole-V1 using REINFORCE Algorithm

In [None]:
import os
import gym
import numpy as np

# MXNET
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn
import mxnet as mx

In [None]:
class REINFORCE(object):
    """REINFORCE policy-gradient agent with an MXNet Gluon policy network."""

    def __init__(self, env, lr=1e-3, seed=42):
        """
        REINFORCE algorithm implementation.

        Args:
            env (Gym environment) : the environment the agent is trained on.
            lr (float) : the learning rate used to update the neural network.
            seed (int) : the random seed passed to the environment.
        """

        self.env = env
        self.lr = lr
        self.seed = seed
        self.env.seed(self.seed)

        print('Random seed: {} '.format(seed))

        self.build_network()


    def build_network(self, hidden_size=20):
        """
        Build the neural network and set up the trainer.

        Args:
            hidden_size (int) : the size of the hidden layers in the neural network.
        """

        self.policy_net = nn.Sequential()
        self.policy_net.add(nn.Dense(hidden_size, activation="relu"),
                            nn.Dense(hidden_size, activation="relu"),
                            nn.Dense(self.env.action_space.n))
        self.policy_net.initialize(init=init.Xavier())

        self.trainer = gluon.Trainer(self.policy_net.collect_params(), 'adam', {'learning_rate': self.lr})


    def update(self, lr_coeff=0.999):
        """
        Perform an update on the data collected during one episode, then
        reduce the learning rate as a way to improve convergence.

        Args:
            lr_coeff (float) : the coefficient with which we multiply the current learning rate.
        """

        returns    = self.get_returns()
        batch_size = len(self.actions)

        with autograd.record():
            # The last stored state is dropped: it is the state reached after
            # the final action, so no action was taken in it.
            all_actions = nd.softmax(self.policy_net(nd.array(self.states[:-1])))

            # negative log-probability of each taken action, weighted by its return
            loss = - nd.log(all_actions[np.array(range(batch_size)), np.array(self.actions)]) * returns

        loss.backward()

        self.trainer.step(batch_size)
        self.trainer.set_learning_rate(self.trainer.learning_rate * lr_coeff)


    def predict(self,  state):
        """
        Output the probabilities for all actions and choose stochastically one of them.

        Args:
            state (array of floats) : the state for which we want to select an action.
        Returns:
            action (int) : the selected action given the state.
        """

        actions = nd.softmax(self.policy_net(nd.array([state]))).asnumpy()[0]

        return np.random.choice(len(actions), p=actions)


    def get_returns(self, discount_factor=0.99):
        """
        Calculate the return for every state. This is defined as the discounted
        sum of rewards after visiting the state.

        Args:
            discount_factor (float) : determines how much we care about distant
                                        rewards (1.0) vs immediate rewards (0.).
        Returns:
            normalized_returns (array of float) : the returns, from which the mean is
                                                 subtracted to reduce the variance.
        """
        returns=[]
        curr_sum = 0.
        # accumulate from the last reward backwards, then restore time order
        for r in reversed(self.rewards):
            curr_sum = r + discount_factor*curr_sum
            returns.append(curr_sum)

        returns.reverse()
        normalized_returns = nd.array(returns) - nd.mean(nd.array(returns))

        return normalized_returns


    def setup_saving(self):
        """
        Prepare the location where results are stored.

        Creates the directory `res` below the current working directory when
        it does not exist yet.

        Returns:
            save_file (str) : path of the CSV file for this seed.
            stats (list) : a new empty list that collects the statistics.
        """

        directory= os.getcwd() + '/res/'
        # exist_ok avoids the race between checking for and creating the directory
        os.makedirs(directory, exist_ok=True)

        save_file = "{}cartpole_seed{}.csv".format(directory,self.seed)

        return save_file, []


    def initialize_episode(self):
        """
        Initializes the variables total_rewards, rewards, actions and states, and
        resets the environment.

        Returns:
            state (array of float) : the first state of the episode.
        """

        self.rewards,self.actions,self.states = [],[],[]
        self.total_rewards = 0.

        state = self.env.reset()
        self.states.append(state)

        return state


    def add_to_trajectory(self, action, next_state, reward):
        """
        Stores in memory the action, next_state and reward. This will later be used for updates.

        Args:
            action (int) : the selected action in the current state.
            next_state (array of floats) : the next state returned by the environment after selecting the action.
            reward (float) : the reward received after selecting the action.
        Returns:
            next_state (array of float) : the next state returned by the environment after selecting the action.
        """

        self.total_rewards += reward

        self.rewards.append(reward)
        self.actions.append(action)
        self.states.append(next_state)

        return next_state


    def fit(self, num_episodes=1000, save_every=5):
        """
        Implements the training loop.

        Args:
            num_episodes (int) : the number of episodes we train the agent.
            save_every (int) : the rate at which we save the results, which will be used for visualization.
        """

        save_file, stats = self.setup_saving()

        for i_episode in range(num_episodes):
            if i_episode % save_every == 0 and i_episode != 0:
                np.savetxt(save_file,stats,delimiter=',')

            state = self.initialize_episode()
            done=False
            t=0

            while not done:
                t+=1
                action = self.predict(state)
                next_state, reward, done, _ = self.env.step(action)
                state = self.add_to_trajectory(action, next_state, reward)
                if i_episode%50 ==0:self.env.render()

            print("\rEpisode {} Total Rewards {} ".format(i_episode, self.total_rewards) )
            # the statistic stored per episode is its length in steps
            stats.append(t)
            self.update()

        # The periodic save above runs before the current episode is appended,
        # so the final episodes (or everything, for short runs) would never
        # reach the file without this last save.
        if stats:
            np.savetxt(save_file,stats,delimiter=',')

In [None]:
# Environment on which the REINFORCE agent is trained.
env = gym.make("CartPole-v1")

In [None]:
# Train with the default learning rate, seed, episode count and save interval.
REINFORCE(env).fit()

## Agent taking random actions

In [None]:
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [None]:
# Record the random agent into "./gym-results". The wrapper is reached via
# `gym.wrappers` because no bare `wrappers` name is ever imported.
env = gym.wrappers.Monitor(env, "./gym-results", force=True)
observation = env.reset()

In [None]:
for episode in range(200):
    for timestep in range(100):
        env.render()

        print(observation)

        # Sample the action space to get a random action, of which there are
        # only two: move left or move right.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)  # take a random action

        if done:
            print("Completed after {} timesteps.".format(timestep + 1))
            # Start a fresh episode; without this every later "episode" would
            # keep stepping an environment that has already finished.
            # NOTE(review): the reset happens only on `done` because the
            # Monitor wrapper is expected to reject resetting an unfinished
            # episode - confirm against the installed gym version.
            observation = env.reset()
            break

In [None]:
# Close the monitored environment.
env.close() 

In [None]:
import io
import base64
from IPython.display import HTML

# Read the first recorded video. The file is opened read-only and closed
# again by the context manager instead of leaking the handle.
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()

# Embed the video inline as a base64 data URI.
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

What we want to do is train an agent to find a good policy for the CartPole problem. Specifically, we want our agent to learn an (ideally optimal) policy that takes the four observation values and decides which action to take (i.e. move right or move left), given the values the agent is observing at any given time.

In [None]:
Reading material:

- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-1/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-2/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-3/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-4/

In [None]:
## Cartpole Environment Reinforcement Learning

In [None]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
torch.manual_seed(0) 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from pyvirtualdisplay import Display

# Start a non-visible virtual display of 1400x900.
display = Display(visible=0, size=(1400, 900))
display.start()

# True when matplotlib is using an inline (notebook) backend.
is_ipython = 'inline' in plt.get_backend()

# NOTE(review): this import rebinds the name `display`, so the handle to the
# virtual display started above is lost. Later cells call display.display(...)
# and display.clear_output(...), i.e. they expect the IPython module.
if is_ipython:
    from IPython import display

plt.ion()

#### Watch a Random Agent

It is useful to see how well the agent performs before we train it. We can run the code cell below to watch how well the agent does if it just pushes the cart randomly at each timestep. We can think of it as flipping a fair coin when deciding whether to push the cart to the left or to the right.

We run the code cell multiple times, to check the score for different episodes (or game rounds). It likely won't get a score above 30, and this is to be expected! Later we'll train the agent to consistently get a score larger than 195!

In [None]:
# Create the CartPole-v0 environment and seed it with 0.
env = gym.make('CartPole-v0')
env.seed(0)

In [None]:
# Show the environment's observation and action spaces.
print('observation space:', env.observation_space)
print('action space:', env.action_space)

In [None]:
# Start an episode and draw its first rendered frame; `img` is updated with
# new frames by the loop in the next cell.
state = env.reset()
img = plt.imshow(env.render(mode='rgb_array'))

In [None]:
# Watch a random agent for at most 1000 steps: every step samples an action,
# redraws the current frame in the notebook and then applies the action.
for t in range(1000):
    action = env.action_space.sample()

    frame = env.render(mode='rgb_array')
    img.set_data(frame)
    plt.axis('off')

    current_figure = plt.gcf()
    display.display(current_figure)
    display.clear_output(wait=True)

    state, reward, done, _ = env.step(action)
    if not done:
        continue

    # the episode is over: report the number of steps survived
    print('Score: ', t+1)
    break

env.close()

#### Define the Architecture of the Policy

We will define a neural network that encodes the agent's stochastic policy.

CartPole environment details:

- The agent has two possible actions: it can either push the cart to the left or to the right.
- The state at each timestep always has four numbers, corresponding to the position and velocity of the cart, along with the pole angle and velocity.
- The network that you will define takes the environment state as input. It returns as output the probability that the agent should select each possible action.

For example, when the agent observes a new state, it passes the state as input to the network. The network returns two numbers, corresponding to the probability that the agent will select each action. So, for instance, if the network returns [0.9, 0.1], the agent pushes the cart to the left with 90% probability, and otherwise pushes the cart to the right. Then the agent samples from the action space using these probabilities - say it ends up selecting the action that pushes the cart to the left. After selecting this action, it sends the action to the environment and receives a reward and next state. This next state is then fed as input to the network, and so on.

When we initialize the neural network, all of the weights are random. Our agent's goal then will be to figure out the appropriate weights in the neural network, so that for each state, the network always outputs probabilities that encode a good game-playing strategy, and help the agent get a high score!

In [None]:
class Policy(nn.Module):
    """Two-layer network that maps an environment state to action probabilities."""

    def __init__(self, s_size=4, h_size=16, a_size=2):
        """Neural network that encodes the policy.

        Params
        ======
            s_size (int): dimension of each state (also size of input layer)
            h_size (int): size of hidden layer
            a_size (int): number of potential actions (also size of output layer)
        """
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return the action probabilities for the given state(s).

        Params
        ======
            x (torch.Tensor): float tensor whose last dimension is `s_size`;
                may be a single state or a batch of states

        Returns
        =======
            torch.Tensor: same leading shape as `x`, last dimension `a_size`,
                with the entries along that dimension summing to 1
        """
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)

        # Normalise over the last (action) axis. For the batched 2-D input
        # built by act() this is the same as dim=1, and it also accepts an
        # unbatched 1-D state.
        return F.softmax(logits, dim=-1)

    def act(self, state):
        """Sample an action for a single state from the current policy.

        Params
        ======
            state (numpy.ndarray): one environment observation of length `s_size`

        Returns
        =======
            tuple: (action index as a Python int,
                    log-probability of that action as a tensor of shape (1,),
                    still attached to the autograd graph)
        """
        # Use the device the weights actually live on instead of relying on a
        # module-level `device` global, so the class works wherever it has
        # been moved with `.to(...)`.
        param_device = next(self.parameters()).device
        batch = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
        probs = self.forward(batch).cpu()
        dist = Categorical(probs)
        action = dist.sample()

        return action.item(), dist.log_prob(action)

### Train the Agent with Reinforcement Learning

We'll use the REINFORCE algorithm, also known as Monte Carlo Policy Gradients, to train the weights of the neural network while the agent is playing the game.

For now, run the training on the network. The OpenAI Gym considers the environment as "solved", if the average score over 100 episodes is at least 195.0.

In [None]:
# Create the policy network, move it to the selected device, and attach an
# Adam optimizer (learning rate 0.01) to its weights.
policy = Policy()
policy = policy.to(device)
optimizer = optim.Adam(policy.parameters(), lr=0.01)

In [None]:
def reinforce(n_episodes=1000, max_t=200, gamma=1.0, print_every=100):
    """PyTorch implementation of the REINFORCE algorithm.

    Uses the module-level `env`, `policy` and `optimizer`. After every episode
    the policy is updated once, weighting each step's log-probability by the
    discounted return of the whole episode.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 episodes)

    Returns
    =======
        list: total (undiscounted) reward of every episode that was played
    """
    window = 100  # number of most recent episodes the average score is taken over
    scores_deque = deque(maxlen=window)
    scores = []
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _step in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode: sum_k gamma^k * r_k.
        R = sum(gamma**k * r for k, r in enumerate(rewards))

        # An episode with no steps (max_t == 0) has nothing to learn from, and
        # torch.cat would raise on an empty list, so skip the update then.
        if saved_log_probs:
            policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        average_score = np.mean(scores_deque)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
        # Only declare the environment solved once a full window of episodes
        # has been recorded; otherwise the reported episode count
        # (i_episode - window) would be zero or negative.
        if len(scores_deque) == window and average_score >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-window, average_score))
            break

    return scores

In [None]:
### Plot the Scores
# Train the policy with the default settings; reinforce() returns the list of per-episode scores.
scores = reinforce()
Run the code cell below to plot the scores that were received by the agent.

In [None]:
# Plot the score of every training episode against its 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
episode_numbers = np.arange(1, len(scores)+1)
plt.plot(episode_numbers, scores)
plt.xlabel('Episode #')
plt.ylabel('Score')
plt.show()

In [None]:
### Watch a Smart Agent
Finally, we can watch our smart agent! Doesn't it do much better than the random agent?

In [None]:
# Create a new CartPole-v0 environment instance for the rendering cells that follow.
env = gym.make('CartPole-v0')

In [None]:
# Start a new episode and draw its first frame; `img` is the image artist
# that the rendering loop updates in place via set_data().
state = env.reset()
first_frame = env.render(mode='rgb_array')
img = plt.imshow(first_frame)

In [None]:
# Roll out one episode (at most 1000 steps) with actions sampled from the
# policy network, redrawing the rendered frame on every step.
for t in range(1000):
    # Only the action is needed here; the log-probability is discarded.
    action, _ = policy.act(state)

    # Push the current frame into the existing image artist.
    current_frame = env.render(mode='rgb_array')
    img.set_data(current_frame)
    plt.axis('off')

    # Show the figure, then mark the output to be replaced by the next frame.
    display.display(plt.gcf())
    display.clear_output(wait=True)

    state, reward, done, _ = env.step(action)
    if not done:
        continue

    # Episode finished: the score is the number of steps taken.
    print('Score: ', t+1)
    break

env.close()

### Comment

We are now in a position to train our agent using more advanced reinforcement learning methods.

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_approx_qlearning.ipynb


https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/sarsa.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/qlearning.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_mcts.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_reinforce.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_vi.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/crossentropy_method.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/bandits.ipynb

## LinearDeepQNetwork

In [5]:
# Number of episodes to play, plus per-episode logs of the score and of the
# agent's epsilon (both filled in by the training loop).
n_games = 10000
scores, eps_history = [], []

In [6]:
# Build the agent from the environment's observation shape and its number of discrete actions.
agent = Agent(input_dims=env.observation_space.shape,
              n_actions=env.action_space.n)

In [7]:
# iterate over the number of games
# Play n_games episodes; the agent learns from every single transition.
for i in range(n_games):
    done = False
    score = 0
    obs = env.reset()

    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        score += reward
        # update the agent from the (state, action, reward, new state) transition
        agent.learn(obs, action, reward, obs_)
        # the new state becomes the current state for the next step
        obs = obs_

    # log the finished episode: its score and the agent's current epsilon
    scores.append(score)
    eps_history.append(agent.epsilon)

    # report progress only on every 100th episode
    if i % 100 != 0:
        continue
    avg_score = np.mean(scores[-100:])  # mean over (up to) the last 100 episodes
    print('episode ', i, 'score %.1f avg score %.1f epsilon  %.2f' % (score, avg_score, agent.epsilon))

episode  0 score 23.0 avg score 23.0 epsilon  1.00
episode  100 score 90.0 avg score 24.8 epsilon  0.97
episode  200 score 13.0 avg score 21.8 epsilon  0.95
episode  300 score 46.0 avg score 23.3 epsilon  0.93
episode  400 score 29.0 avg score 22.3 epsilon  0.91
episode  500 score 11.0 avg score 22.0 epsilon  0.89
episode  600 score 25.0 avg score 23.9 epsilon  0.86
episode  700 score 15.0 avg score 21.7 epsilon  0.84
episode  800 score 24.0 avg score 22.9 epsilon  0.82
episode  900 score 15.0 avg score 24.3 epsilon  0.79
episode  1000 score 33.0 avg score 28.4 epsilon  0.76
episode  1100 score 68.0 avg score 22.3 epsilon  0.74
episode  1200 score 20.0 avg score 23.4 epsilon  0.72
episode  1300 score 20.0 avg score 24.8 epsilon  0.69
episode  1400 score 16.0 avg score 25.0 epsilon  0.67
episode  1500 score 24.0 avg score 25.3 epsilon  0.64
episode  1600 score 12.0 avg score 28.2 epsilon  0.62
episode  1700 score 35.0 avg score 28.3 epsilon  0.59
episode  1800 score 34.0 avg score 32.0 

In [8]:
# plot learning curve
# NOTE(review): the recorded output of this cell is a NameError, so
# plot_learning_curve must be defined or imported before this cell runs.
filename = 'cartpole_naive_dqn.png'
# Use one x value per score actually recorded rather than per planned game,
# so a training run that was cut short still gives x and scores equal length.
x = list(range(1, len(scores) + 1))
plot_learning_curve(x, scores, eps_history, filename)

NameError: name 'plot_learning_curve' is not defined