In [1]:
# import gym
# import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline

# # Imports specifically so we can render outputs in Jupyter.
# from JSAnimation.IPython_display import display_animation
# from matplotlib import animation
# from IPython.display import display


# def display_frames_as_gif(frames):
#     """
#     Displays a list of frames as a gif, with controls
#     """
#     #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
#     patch = plt.imshow(frames[0])
#     plt.axis('off')

#     def animate(i):
#         patch.set_data(frames[i])

#     anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
#     display(display_animation(anim, default_mode='loop'))

In [2]:
# env = gym.make('MountainCarContinuous-v0')

# # Run a demo of the environment
# observation = env.reset()
# cum_reward = 0
# frames = []
# for t in range(5000):
#     # Render into buffer. 
#     frames.append(env.render(mode = 'rgb_array'))
#     action = env.action_space.sample()
#     observation, reward, done, info = env.step(action)
#     if done:
#         break
# env.render(close=True)
# display_frames_as_gif(frames)

## 1. Import

In [3]:
import gym
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## 2. Environment Intro
Ref: https://gym.openai.com/envs/MountainCarContinuous-v0/
- Description: A car is on a one-dimensional track, positioned between two "mountains". The goal is to drive up the mountain on the right; however, the car's engine is not strong enough to scale the mountain in a single pass. Therefore, the only way to succeed is to drive back and forth to build up momentum. Here, the reward is greater if you spend less energy to reach the goal

- Observation: **position** and **velocity**
- Action: **force [-1.0..1.0]**
- Reward:
---
    done = bool(position >= self.goal_position)
    reward = 0
    if done:
        reward = 100.0
    reward-= math.pow(action[0],2)*0.1
---

In [4]:
env = gym.make('MountainCarContinuous-v0')
env.seed(1)

[2019-05-31 09:36:36,490] Making new env: MountainCarContinuous-v0


[1]

In [5]:
obs = env.reset()
print(obs)

[-0.43852191  0.        ]


In [6]:
env.action_space

Box(1,)

In [7]:
for _ in range(10):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        env.reset()
    print(action, observation, reward, done, info)

[0.09762701] [-0.43900664 -0.00048473] -0.0009531032662651797 False {}
[0.43037873] [-0.43947346 -0.00046682] -0.018522585359905353 False {}
[0.20552675] [-0.44025625 -0.00078279] -0.004224124584656844 False {}
[0.08976637] [-0.44152296 -0.00126672] -0.0008058000463731728 False {}
[-0.1526904] [-0.44362808 -0.00210512] -0.002331435865593162 False {}
[0.29178823] [-0.44588956 -0.00226148] -0.008514036891002495 False {}
[-0.12482558] [-0.44891584 -0.00302627] -0.0015581424791871108 False {}
[0.783546] [-0.45132224 -0.0024064 ] -0.0613944336567182 False {}
[0.92732552] [-0.45287549 -0.00155325] -0.08599326219017395 False {}
[-0.23311696] [-0.45530486 -0.00242938] -0.005434351813456613 False {}


## 3. Policy Gradient - REINFORCE

### 3-1 Policy Network

In [8]:
class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(2,8)
        self.fc2 = nn.Linear(8, 4)
        self.fc3 = nn.Linear(4, 1)
        self.fc3_ = nn.Linear(4, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = self.fc3(x)
        sigma_ = self.fc3_(x)
        return mu, sigma_

### 3-2 Train

In [9]:
policy = PolicyNetwork()
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
max_episodes = 1000
gamma = 0.9
reward_all = []
for episode in range(max_episodes):
    rewards = []
    log_probs = []
    state = env.reset()
    while True:
        mu, sigma_ = policy(torch.FloatTensor([state]))
        sigma_ = F.softplus(sigma_) # sigma_ > 0
        action = mu + sigma_.sqrt() * torch.randn(1) # sampling action from normal distribution
        prob = (-1 * (action - mu).pow(2)/(2 * sigma_)).exp() / (2 * sigma_ * math.pi).sqrt()
        log_prob = prob.log()
        action = F.tanh(action)
        state, reward, done, _ = env.step([action.item()])
        rewards.append(reward)
        log_probs.append(log_prob)
#         print(action, reward)
        if done:
            Return = torch.zeros(1,1)
            for reward, log_prob in zip(reversed(rewards), reversed(log_probs)):
                Return = reward + gamma * Return
                loss = - log_prob * Return
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print("Episode: %d, reward %.1f" % (episode, np.sum(rewards)))
            reward_all.append(np.sum(rewards))
            rewards = []
            log_probs = []
            break



Episode: 0, reward -38.7
Episode: 1, reward -73.2
Episode: 2, reward -92.2
Episode: 3, reward -95.1
Episode: 4, reward -97.0
Episode: 5, reward 24.5
Episode: 6, reward -96.8
Episode: 7, reward 27.3
Episode: 8, reward -97.3
Episode: 9, reward -98.5
Episode: 10, reward -97.5
Episode: 11, reward 60.9
Episode: 12, reward 13.1
Episode: 13, reward 13.6
Episode: 14, reward -98.1
Episode: 15, reward -98.1
Episode: 16, reward -98.1
Episode: 17, reward -97.8
Episode: 18, reward 34.2
Episode: 19, reward -98.7
Episode: 20, reward -99.0
Episode: 21, reward -98.9
Episode: 22, reward -99.1
Episode: 23, reward -99.2
Episode: 24, reward 11.1
Episode: 25, reward -99.3
Episode: 26, reward -99.0
Episode: 27, reward -99.2
Episode: 28, reward 18.1
Episode: 29, reward -99.4
Episode: 30, reward -99.4
Episode: 31, reward -99.5
Episode: 32, reward 46.2
Episode: 33, reward 26.4
Episode: 34, reward 30.3
Episode: 35, reward -99.3
Episode: 36, reward -99.3
Episode: 37, reward -99.4
Episode: 38, reward -99.5
Episode

Episode: 313, reward -99.9
Episode: 314, reward -99.9
Episode: 315, reward -99.9
Episode: 316, reward 18.9
Episode: 317, reward -99.8
Episode: 318, reward 31.6
Episode: 319, reward -99.9
Episode: 320, reward -99.8
Episode: 321, reward -99.8
Episode: 322, reward -99.9
Episode: 323, reward -99.8
Episode: 324, reward -99.9
Episode: 325, reward -99.9
Episode: 326, reward 34.6
Episode: 327, reward -99.9
Episode: 328, reward -99.9
Episode: 329, reward 16.5
Episode: 330, reward -99.9
Episode: 331, reward 24.0
Episode: 332, reward -99.9
Episode: 333, reward -99.8
Episode: 334, reward -99.9
Episode: 335, reward -99.9
Episode: 336, reward 43.0
Episode: 337, reward -99.9
Episode: 338, reward 24.6
Episode: 339, reward 24.7
Episode: 340, reward 1.6
Episode: 341, reward -99.9
Episode: 342, reward -99.9
Episode: 343, reward -99.9
Episode: 344, reward -99.9
Episode: 345, reward -99.8
Episode: 346, reward -99.9
Episode: 347, reward -99.9
Episode: 348, reward -99.9
Episode: 349, reward 72.1
Episode: 350

KeyboardInterrupt: 

## 4. Advantage Actor-Critic

In [15]:
class StateValueNetwork(nn.Module):
    def __init__(self):
        super(StateValueNetwork, self).__init__()
        self.fc1 = nn.Linear(2, 8)
        self.fc2 = nn.Linear(8, 4)
        self.fc3 = nn.Linear(4, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [17]:
policy_network = PolicyNetwork()
state_value_network = StateValueNetwork()
optimizer_policy = optim.Adam(policy_network.parameters(), lr=1e-3)
optimizer_state_value = optim.Adam(state_value_network.parameters(), lr=1e-4)
max_episodes = 1000
num_steps = 200
gamma = 0.9
reward_all = []
for episode in range(max_episodes):
    rewards = []
    log_probs = []
    state_values = []
    state = env.reset()
    step = 0
    while True:
        mu, sigma_ = policy(torch.FloatTensor([state]))
        sigma_ = F.softplus(sigma_) # sigma_ > 0
        action = mu + sigma_.sqrt() * torch.randn(1) # sampling action from normal distribution
        prob = (-1 * (action - mu).pow(2)/(2 * sigma_)).exp() / (2 * sigma_ * math.pi).sqrt()
        log_prob = prob.log()
        action = F.tanh(action)
        state_value = state_value_network(torch.FloatTensor([state]))
        state, reward, done, _ = env.step([action.item()])
        rewards.append(reward)
        log_probs.append(log_prob)
        state_values.append(state_value)
        step += 1
        
        if step >= num_steps or done:
            Return = torch.zeros(1,1)
            if done:
                next_state_value = torch.zeros(1,1)
            else:
                mu, sigma_ = policy(torch.FloatTensor([state]))
                sigma_ = F.softplus(sigma_) # sigma_ > 0
                action = mu + sigma_.sqrt() * torch.randn(1)
                action = F.tanh(action)
                state, reward, done, _ = env.step([action.item()])
                next_state_value = state_value_network(torch.FloatTensor([state]))
            for reward, log_prob, state_value in zip(reversed(rewards), reversed(log_probs), reversed(state_values)):
                Return = reward + gamma * Return
                advantage = Return + (gamma ** step) * next_state_value - state_value
                policy_loss = - log_prob * advantage
                
                optimizer_state_value.zero_grad()
                optimizer_policy.zero_grad()
                state_value_loss = advantage.pow(2) ## Advantage * Advantage ??? Advantage doesn't work!
                state_value_loss.backward(retain_graph=True)
                policy_loss.backward(retain_graph=True)
                optimizer_state_value.step()
                optimizer_policy.step()
        if done:
            print("Episode: %d, reward %.1f" % (episode, np.sum(rewards)))
            reward_all.append(np.sum(rewards))
            rewards = []
            log_probs = []
            break

KeyboardInterrupt: 