# Welcome!
Below, we will learn to implement and train a policy to play atari-pong, using only the pixels as input. We will use convolutional neural nets, multiprocessing, and pytorch to implement and train our policy. Let's get started!

In [1]:
!pip -q install ../python

In [2]:
# install package for displaying animation
!pip install JSAnimation

# custom utilies for displaying animation, collecting rollouts and more
import pong_utils

%matplotlib inline

# check which device is being used. 
# I recommend disabling gpu until you've made sure that the code runs
device = pong_utils.device
print("using device: ",device)

Collecting JSAnimation
  Downloading https://files.pythonhosted.org/packages/3c/e6/a93a578400c38a43af8b4271334ed2444b42d65580f1d6721c9fe32e9fd8/JSAnimation-0.1.tar.gz
Building wheels for collected packages: JSAnimation
  Running setup.py bdist_wheel for JSAnimation ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/3c/c2/b2/b444dffc3eed9c78139288d301c4009a42c0dd061d3b62cead
Successfully built JSAnimation
Installing collected packages: JSAnimation
Successfully installed JSAnimation-0.1
using device:  cpu


In [3]:
# render ai gym environment
import gym
import time

# PongDeterministic does not contain random frameskip
# so is faster to train than the vanilla Pong-v4 environment
env = gym.make('PongDeterministic-v4')

print("List of available actions: ", env.unwrapped.get_action_meanings())

# we will only use the actions 'RIGHTFIRE' = 4 and 'LEFTFIRE" = 5
# the 'FIRE' part ensures that the game starts again after losing a life
# the actions are hard-coded in pong_utils.py

List of available actions:  ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


# Preprocessing
To speed up training, we can simplify the input by cropping the images and use every other pixel



In [5]:
from unityagents import UnityEnvironment
import numpy as np

# select this option to load version 1 (with a single agent) of the environment
#env = UnityEnvironment(file_name='/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

# select this option to load version 2 (with 20 agents) of the environment
env = UnityEnvironment(file_name='/data/Reacher_Linux_NoVis/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [6]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [7]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
   1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   5.75471878e+00  -1.00000000e+00
   5.55726624e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
  -1.68164849e-01]


In [8]:
env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
print("state \n", states)
#scores = np.zeros(num_agents)                          # initialize the score (for each agent)
#while True:
actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
print("action \n",actions)
actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
print("action \n",actions)
env_info = env.step(actions)[brain_name]           # send all actions to tne environment
next_states = env_info.vector_observations         # get next state (for each agent)
print("next_state \n", next_states)
rewards = env_info.rewards                         # get reward (for each agent)
print("reward \n",rewards)
dones = env_info.local_done                        # see if episode finished
print("done \n",dones)
#scores += env_info.rewards                         # update the score (for each agent)
#states = next_states                               # roll over states to next time step
#if np.any(dones):                                  # exit loop if episode finished
    #break

print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

state 
 [[  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
   -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
    1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   7.90150833e+00  -1.00000000e+00
    1.25147629e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
   -5.22214413e-01]
 [  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
   -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
    1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000

NameError: name 'scores' is not defined

In [16]:
nrm = lambda n: (n - frame.min())/(frame.max() - frame.min())

In [12]:
frame = np.asarray([states, next_states])
frame[0].min(), frame[0].max(), frame[1].min(), frame[1].max()

(2, 20, 33)

In [21]:
frame = nrm(frame)
frame[0].min(), frame[0].max(), frame[1].min(), frame[1].max()

(0.0, 0.99892033150665305, 2.3206143884873791e-05, 1.0)

In [2]:
import matplotlib
import matplotlib.pyplot as plt

# show what a preprocessed image looks like
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations
actions = np.random.randn(num_agents, action_size)
actions = np.clip(actions, -1, 1)
env_info = env.step(actions)[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done

frame = np.asarray([states, next_states])
frame = nrm(frame)

#_, _, _, _ = env.step(0)
# get a frame after 20 steps
#for _ in range(20):
#    frame, _, _, _ = env.step(1)

plt.subplot(1,2,1)
plt.imshow(frame[0])
plt.title('frame[0]')

plt.subplot(1,2,2)
plt.imshow(frame[1])
plt.title('frame[1]')

# 80 x 80 black and white image
#plt.imshow(pong_utils.preprocess_single(frame), cmap='Greys')

plt.show()



NameError: name 'env' is not defined

# Policy

## Exercise 1: Implement your policy
 
Here, we define our policy. The input is the stack of two different frames (which captures the movement), and the output is a number $P_{\rm right}$, the probability of moving left. Note that $P_{\rm left}= 1-P_{\rm right}$

In [None]:
output_volume(26, 6, 3, 0), 7*7*16, 80*80*2, 784/7

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):

    def __init__(self, state_size=33, insz=64, hidsz=32, action_size=4):
        super(Policy, self).__init__()
        #fully connected layers
        self.fc_in = nn.Linear(state_size, insz)
        self.fc_hid = nn.Linear(insz, hidsz)
        self.fc_out = nn.Linear(hidsz, action_size)
        # Sigmoid to 
        self.sig = nn.Sigmoid()
        
    def forward(self, x):
        ## Expecting pre-normed input
        x = F.relu(self.fc_in(x))
        x = F.relu(self.fc_hid(x))
        return self.sig(self.fc_out(x))

In [27]:
# run your own policy!
policy=Policy().to(device)
#policy = torch.load('PPO_X.policy')

# Solution policy:
#policy=pong_utils.Policy().to(device)

# we use the adam optimizer with learning rate 2e-4
# optim.SGD is also possible
import torch.optim as optim
optimizer = optim.Adam(policy.parameters(), lr=2e-4)

# Game visualization
pong_utils contain a play function given the environment and a policy. An optional preprocess function can be supplied. Here we define a function that plays a game and shows learning progress

In [None]:
#pong_utils.play(env, policy, time=200) 

# try to add the option "preprocess=pong_utils.preprocess_single"
# to see what the agent sees

# Function Definitions
Here you will define key functions for training. 

## Exercise 2: write your own function for training
(what I call scalar function is the same as policy_loss up to a negative sign)

### PPO
Later on, you'll implement the PPO algorithm as well, and the scalar function is given by
$\frac{1}{T}\sum^T_t \min\left\{R_{t}^{\rm future}\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)},R_{t}^{\rm future}{\rm clip}_{\epsilon}\!\left(\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}\right)\right\}$

the ${\rm clip}_\epsilon$ function is implemented in pytorch as ```torch.clamp(ratio, 1-epsilon, 1+epsilon)```

In [28]:
discount=0.99
rewards=np.random.rand(20)
print(rewards)
discount = discount**np.arange(20)
rewards = np.asarray(rewards)*discount[:,np.newaxis]
print(rewards)

[ 0.91514327  0.49966716  0.53018033  0.48734718  0.97820993  0.6655542
  0.01117901  0.29033895  0.45037304  0.53798875  0.9311027   0.28555206
  0.14401175  0.80429878  0.99469852  0.57344748  0.53125864  0.98108037
  0.10695085  0.56549681]
[[ 0.91514327  0.49966716  0.53018033  0.48734718  0.97820993  0.6655542
   0.01117901  0.29033895  0.45037304  0.53798875  0.9311027   0.28555206
   0.14401175  0.80429878  0.99469852  0.57344748  0.53125864  0.98108037
   0.10695085  0.56549681]
 [ 0.90599184  0.49467049  0.52487853  0.48247371  0.96842783  0.65889866
   0.01106722  0.28743556  0.44586931  0.53260886  0.92179167  0.28269654
   0.14257164  0.7962558   0.98475153  0.567713    0.52594606  0.97126957
   0.10588134  0.55984184]
 [ 0.89693192  0.48972379  0.51962975  0.47764897  0.95874355  0.65230967
   0.01095655  0.2845612   0.44141062  0.52728277  0.91257375  0.27986957
   0.14114592  0.78829324  0.97490402  0.56203587  0.5206866   0.96155688
   0.10482253  0.55424342]
 [ 0.88796

In [29]:
rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
print(rewards_future)

[[  1.66641240e+01   9.09859240e+00   9.65421606e+00   8.87425409e+00
    1.78125241e+01   1.21192802e+01   2.03562002e-01   5.28687080e+00
    8.20098062e+00   9.79640187e+00   1.69547342e+01   5.19970489e+00
    2.62235414e+00   1.46457228e+01   1.81127700e+01   1.04420807e+01
    9.67385131e+00   1.78647930e+01   1.94750079e+00   1.02973046e+01]
 [  1.57489808e+01   8.59892523e+00   9.12403573e+00   8.38690691e+00
    1.68343142e+01   1.14537260e+01   1.92382993e-01   4.99653185e+00
    7.75060758e+00   9.25841313e+00   1.60236315e+01   4.91415283e+00
    2.47834239e+00   1.38414241e+01   1.71180714e+01   9.86863325e+00
    9.14259267e+00   1.68837126e+01   1.84054994e+00   9.73180779e+00]
 [  1.48429889e+01   8.10425474e+00   8.59915720e+00   7.90443320e+00
    1.58658864e+01   1.07948274e+01   1.81315774e-01   4.70909629e+00
    7.30473827e+00   8.72580427e+00   1.51018398e+01   4.63145629e+00
    2.33577075e+00   1.30451683e+01   1.61333199e+01   9.30092025e+00
    8.61664661e+00

In [None]:
def clipped_surrogate(policy, old_probs, states, actions, rewards, 
                      discount = 0.995, epsilon=0.1, beta=0.01):
    ## WRITE YOUR OWN CODE HERE # from plot_utils
    
    #####################?
    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards)*discount[:,np.newaxis]
    
    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (rewards_future - mean[:,np.newaxis])/std[:,np.newaxis]
    ######################?
    
    # move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)

    # ratio for clipping
    ratio = new_probs/old_probs

    # clipped function
    clip = torch.clamp(ratio, 1-epsilon, 1+epsilon)
    clipped_surrogate = torch.min(ratio*rewards, clip*rewards)
    
    # include a regularization term
    # this steers new_policy towards 0.5
    # prevents policy to become exactly 0 or 1 helps exploration
    # add in 1.e-10 to avoid log(0) which gives nan
    entropy = -(new_probs*torch.log(old_probs+1.e-10)+(1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))

    # this returns an average of all the entries of the tensor
    # effective computing L_sur^clip / T
    # averaged over time-step and number of trajectories
    # this is desirable because we have normalized our rewards
    return torch.mean(clipped_surrogate + beta*entropy)


# Training
We are now ready to train our policy!
WARNING: make sure to turn on GPU, which also enables multicore processing. It may take up to 45 minutes even with GPU enabled, otherwise it will take much longer!

In [None]:
from parallelEnv import parallelEnv
import numpy as np

# widget bar to display progress
!pip install progressbar
import progressbar as pb

#envs = parallelEnv('PongDeterministic-v4', n=12, seed=1234)

discount_rate = .99  #
epsilon = 0.067  # 0.1
beta = .01     # .01
tmax = 1000    # training loop max = ? 
SGD_epoch = 5  #

# episodes = number episode batches to collect?
# WARNING: running through all 800 episodes will take 30-45 minutes
episodes = 200  # 800


In [None]:

def train(policy, envs, episodes, tmax, SGD_epoch, gamma=discount_rate, epsilon=epsilon, beta=beta):

    # keep track of progress
    mean_rewards = []

    # keep track of how long training takes
    widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
    timer = pb.ProgressBar(widgets=widget, maxval=episodes).start()

    for e in range(episodes):

        # collect trajectories
        old_probs, states, actions, rewards = pong_utils.collect_trajectories(envs, policy, tmax=tmax)

        total_rewards = np.sum(rewards, axis=0)

        # gradient ascent step
        for _ in range(SGD_epoch):

            # uncomment to utilize your own clipped function!
            L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

            #L = -pong_utils.clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            del L

        # the clipping parameter reduces as time goes on
        epsilon*=.999

        # the regulation term also reduces
        # this reduces exploration in later runs
        beta*=.995

        # get the average reward of the parallel environments
        mean_rewards.append(np.mean(total_rewards))

        # display some progress every 20 iterations
        if (e+1)%20 ==0 :
            print("Episode: {0:d}, Avg. score: {1:f}".format(e+1,np.mean(total_rewards)))
            print(total_rewards)

        # update progress widget bar
        timer.update(e+1)

    timer.finish()
    return mean_rewards

In [None]:
mean_rewards = train(policy, envs, episodes, tmax, SGD_epoch, gamma=discount_rate, epsilon=epsilon, beta=beta)


In [None]:
# Hyperparameter Control 
envs = parallelEnv('PongDeterministic-v4', n=16, seed=1234)
discount_rate = .985  #
epsilon = 0.075  #
beta = .01     #
tmax = 800    # 1000 training loop max = ? 
SGD_epoch = 8  #
episodes = 100  #800

mean_rewards += train(policy, envs, episodes, tmax, SGD_epoch, gamma=discount_rate, epsilon=epsilon, beta=beta)

In [None]:
plt.plot(mean_rewards)

In [None]:
pong_utils.play(env, policy, time=1200) 

In [None]:
# save your policy!
torch.save(policy, 'PPO_X.policy')

# load policy if needed
#policy = torch.load('PPO_X.policy')
#pong_utils.play(env, policy, time=1000) 

# try and test out the solution 
# make sure GPU is enabled, otherwise loading will fail
# (the PPO verion can win more often than not)!
#
#policy_solution = torch.load('PPO_solution.policy')
#pong_utils.play(env, policy_solution, time=1000) 