In [1]:
import pickle
import gym
import time
import numpy as np 
import random
import torch
from run_test import *

Logging to /tmp/openai-2018-11-15-21-58-45-340634


Set up the code for running trajectories with Pong.


In [2]:
# Build a single Pong environment and the PPO2 agent that will be rolled out in it.
env_id = "PongNoFrameskip-v4"
env_type = "atari"

# Positional arguments: env id, env type, number of envs, seed.
# The clip_rewards and episode_life wrapper options are both switched off.
env = make_vec_env(
    env_id,
    env_type,
    1,
    0,
    wrapper_kwargs=dict(clip_rewards=False, episode_life=False),
)

# Stack 4 frames per observation.
env = VecFrameStack(env, 4)
agent = PPO2Agent(env, env_type)
# Alternative baseline agent: RandomAgent(env.action_space)

In [3]:

def normalize_state(obs, obs_space=None):
    """Linearly rescale an observation from its space bounds to [-1, 1].

    Args:
        obs: array-like observation with values between the space's ``low``
            and ``high`` bounds.
        obs_space: object exposing ``low`` and ``high`` arrays. Defaults to
            the notebook-level ``env.observation_space``.

    Returns:
        A float32 ndarray in which ``low`` maps to -1 and ``high`` maps to +1.
    """
    if obs_space is None:
        obs_space = env.observation_space
    # Compute in float32: the stored frames are always cast to float32 before
    # entering the network, so float64 would only double the memory used by
    # the recorded trajectories.
    lows = np.asarray(obs_space.low, dtype=np.float32)
    span = np.asarray(obs_space.high, dtype=np.float32) - lows
    frame = np.asarray(obs, dtype=np.float32)
    return 2.0 * (frame - lows) / span - 1.0


In [5]:
# Roll out one episode per saved policy checkpoint and record, for each one,
# the list of normalized observations (a "demonstration") and the episode return.
# The pairwise training later treats a higher list index as the better
# trajectory, so the checkpoint order here defines the ranking.
checkpoints = ['00100','00200','00300','00400','03600']

demonstrations = []     # one list of normalized observations per checkpoint
learning_returns = []   # accumulated env reward of the rollout for each checkpoint
for checkpoint in checkpoints:
    
    model_path = "./models/pong/checkpoints/" + checkpoint
    
    # Swap the checkpoint's weights into the shared agent.
    agent.load_policy(model_path, env)
    episode_count = 1
    done = False
    traj = []
    r = 0
    # NOTE(review): traj is created once per checkpoint, not per episode, and
    # only the last episode's acc_reward is stored below; this is consistent
    # only while episode_count == 1.
    for i in range(episode_count):
        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            # The agent receives the previous step's reward and done flag.
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            #print(ob.shape)
            # Store the observation that follows the action, rescaled by normalize_state.
            traj.append(normalize_state(ob))
            steps += 1
            acc_reward += r
            if done:
                print(steps,acc_reward)
                break
                
    demonstrations.append(traj)
    learning_returns.append(acc_reward)
    


# NOTE(review): env is closed here, but a later cell passes env to
# run_episode / evaluate_policy — confirm the environment is recreated first.
env.close()


    
    
print(learning_returns)



865 [-20.]
1001 [-20.]
1716 [-18.]
2529 [-11.]
2730 [21.]
[array([-20.], dtype=float32), array([-20.], dtype=float32), array([-18.], dtype=float32), array([-11.], dtype=float32), array([21.], dtype=float32)]


Below are some helper functions for getting argmax action from TileCoding value function and normalizing observations to range [0,1].

Normalize observations?

Now I want to build a neural network to predict the reward the learner is trying to optimize. The inputs are 84x84x4 grayscale images. I'm going to try and use the NIPS architecture from DeepMind.

In [None]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# On a CUDA machine this prints a CUDA device.
print(device)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.conv1 = nn.Conv2d(4, 16, 8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, 4, stride=2)
        self.fc1 = nn.Linear(2592, 64)
        self.fc2 = nn.Linear(64, 1)
        
        
    def cum_return(self, traj):
        '''calculate cumulative return of trajectory'''
        sum_rewards = 0
        for x in traj:
            x = x.permute(0,3,1,2)
            #x = x.view(-1,4,84,84)
            #compute forward pass of reward network
            x = F.leaky_relu(self.conv1(x))
            x = F.leaky_relu(self.conv2(x))
            x = x.view(-1, 2592)
            x = F.leaky_relu(self.fc1(x))
            r = torch.tanh(self.fc2(x))
            sum_rewards += r
        ##    y = self.scalar(torch.ones(1))
        ##    sum_rewards += y
        #print(sum_rewards)
        return sum_rewards
        
            
    
    def forward(self, traj_i, traj_j):
        '''compute cumulative return for each trajectory and return logits'''
        #print([self.cum_return(traj_i), self.cum_return(traj_j)])
        return torch.cat([self.cum_return(traj_i), self.cum_return(traj_j)])
    
class SmallNet(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.conv1 = nn.Conv2d(4, 8, 8, stride=4)
        self.conv2 = nn.Conv2d(8, 16, 4, stride=2)
        self.fc1 = nn.Linear(9*9*16, 1)
        #self.fc2 = nn.Linear(256, 1)
        
        
    def cum_return(self, traj):
        '''calculate cumulative return of trajectory'''
        sum_rewards = 0
        for x in traj:
            x = x.permute(0,3,1,2)
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = x.view(-1, 9*9*16)
            r = torch.tanh(self.fc1(x))
            
            sum_rewards += r
        ##    y = self.scalar(torch.ones(1))
        ##    sum_rewards += y
        #print(sum_rewards)
        return sum_rewards
        
            
    
    def forward(self, traj_i, traj_j):
        '''compute cumulative return for each trajectory and return logits'''
        #print([self.cum_return(traj_i), self.cum_return(traj_j)])
        return torch.cat([self.cum_return(traj_i), self.cum_return(traj_j)])
    


Now we train the network. I'm just going to train on one pair of trajectories at a time for now. This could be adapted to use minibatches to get better gradients.

In [None]:
def learn_reward(reward_network, optimizer, trajectories, num_iter, log_every=20):
    """Train a reward network on pairwise preferences between ranked trajectories.

    ``trajectories`` must be ordered from worst to best. Each iteration samples
    two indices ``j < i``, feeds ``(traj_i, traj_j)`` to the network and applies
    a cross-entropy loss whose target class is 0, i.e. the first logit (the
    later, better trajectory) should be the larger one.

    Args:
        reward_network: module whose call ``reward_network(traj_i, traj_j)``
            returns the two trajectories' logits.
        optimizer: optimizer over ``reward_network``'s parameters.
        trajectories: sequence of trajectories, each convertible by ``np.array``.
        num_iter: number of sampled pairs / gradient steps.
        log_every: size of the window over which the mean loss is printed.

    Raises:
        ValueError: if training is requested with fewer than two trajectories,
            or if ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be at least 1")
    if num_iter > 0 and len(trajectories) < 2:
        raise ValueError("learn_reward needs at least two trajectories to form a pair")

    # Put the data wherever the network lives instead of relying on a global.
    first_param = next(iter(reward_network.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loss_criterion = nn.CrossEntropyLoss()
    # The target never changes: class 0 is traj_i, the preferred trajectory.
    # Built directly as int64, which CrossEntropyLoss requires on every platform.
    labels = torch.zeros((1, 1), dtype=torch.long, device=device)

    window_loss = 0.0
    for epoch in range(num_iter):
        # Pick two random trajectories such that i > j; j leaves room for at
        # least one later trajectory.
        j = np.random.randint(len(trajectories) - 1)
        i = np.random.randint(j + 1, len(trajectories))
        traj_i = torch.from_numpy(np.array(trajectories[i])).float().to(device)
        traj_j = torch.from_numpy(np.array(trajectories[j])).float().to(device)

        optimizer.zero_grad()

        # Forward + backward + optimize. Calling the module (not .forward)
        # keeps registered hooks working.
        outputs = reward_network(traj_i, traj_j).unsqueeze(0)
        loss = loss_criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss over the window that was actually accumulated.
        window_loss += loss.item()
        if epoch % log_every == log_every - 1:
            print(epoch, window_loss / log_every)
            window_loss = 0.0
    print("finished training")



        
    

Now we create a reward network and optimize it using the training data.

In [None]:
import torch.optim as optim

# Fresh reward network on the selected device, trained for 100 sampled pairs.
reward = Net().to(device)
optimizer = optim.Adam(reward.parameters(), lr=0.0001)
learn_reward(reward, optimizer, demonstrations, num_iter=100)

Print out the learned return for all demos. It should be roughly increasing.

In [None]:
# Predicted cumulative return of every demonstration, in checkpoint order.
with torch.no_grad():
    predicted_returns = []
    for traj in demonstrations:
        traj_tensor = torch.from_numpy(np.array(traj)).float().to(device)
        predicted_returns.append(reward.cum_return(traj_tensor))
    print(torch.cat(predicted_returns))

Look at predicted reward over last demo. It's all +1 since demos are monotonically increasing. Maybe need to truncate demos to fixed length?

In [None]:
%matplotlib inline
import matplotlib.pylab as plt

#for last demo

cnt = 0
with torch.no_grad():
    for d in demonstrations:
        rewards = []
        print(cnt)
        for s in d:
            r = reward.cum_return(torch.from_numpy(np.array([s])).float().to(device)).item()
            rewards.append(r)
        plt.figure(cnt)
        plt.plot(rewards)
        plt.xlabel("time")
        plt.ylabel("reward")
        plt.title("true return = {}".format(learning_returns[cnt]))
        cnt += 1
#plt.savefig("learned_mcar_return.png")
plt.show()


Try fixing every demo to the length of the first demo... This doesn't really seem to work. Maybe use a smaller neural net? There's got to be something different in the different demos, right?

In [None]:
# Truncate every demonstration to its last H frames, where H is the length of
# the first demonstration, then train a fresh network on the truncated demos.
H = len(demonstrations[0])
# Clamp the slice start at 0: for a demo shorter than H, len(d) - H is
# negative and would be read as an index from the end, silently dropping the
# demo's leading frames. With the clamp such a demo is kept whole.
demos_fh = [d[max(len(d) - H, 0):] for d in demonstrations]
reward_fh = Net()
reward_fh.to(device)
import torch.optim as optim
optimizer = optim.Adam(reward_fh.parameters(), lr = 0.0001)
learn_reward(reward_fh, optimizer, demos_fh, 100)

In [None]:
# Predicted cumulative return of every fixed-horizon demonstration.
with torch.no_grad():
    predicted_returns_fh = []
    for traj in demos_fh:
        traj_tensor = torch.from_numpy(np.array(traj)).float().to(device)
        predicted_returns_fh.append(reward_fh.cum_return(traj_tensor))
    print(torch.cat(predicted_returns_fh))

In [None]:
# Check the truncated lengths.
for d in demos_fh:
    print(len(d))

# One figure per truncated demonstration: per-frame reward predicted by reward_fh.
with torch.no_grad():
    for cnt, d in enumerate(demos_fh):
        print(cnt)
        # Wrap each frame in a length-1 trajectory so cum_return scores it alone.
        rewards = [
            reward_fh.cum_return(torch.from_numpy(np.array([s])).float().to(device)).item()
            for s in d
        ]
        plt.figure(cnt)
        plt.plot(rewards)
        plt.xlabel("time")
        plt.ylabel("reward")
        plt.title("true return = {}".format(learning_returns[cnt]))
plt.show()

Okay. So now we want to optimize a policy using the learned reward to see how well it can perform if we run RL to convergence on it.

In [None]:
class NNetReward:
    """Adapter that exposes a reward network as a per-state reward function."""

    def __init__(self, nnet):
        """Store the network; it must provide ``cum_return(trajectory_tensor)``."""
        self.nnet = nnet

    def _nnet_device(self):
        """Return the device of the network's first parameter, or None if it has none."""
        params = getattr(self.nnet, "parameters", None)
        if not callable(params):
            return None
        first = next(iter(params()), None)
        return None if first is None else first.device

    def get_reward(self, state):
        """Return the network's predicted reward for a single state as a Python float.

        The state is wrapped in a length-1 trajectory, converted to a float
        tensor and moved to the network's device before being scored.
        """
        batch = torch.from_numpy(np.array([state])).float()
        # The network may have been moved to a GPU; a CPU tensor would then
        # fail with a device mismatch.
        target = self._nnet_device()
        if target is not None:
            batch = batch.to(target)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.nnet.cum_return(batch).item()
        

In [None]:
# Run SARSA with tile coding, using the learned network as the reward function.
# NOTE(review): env was closed at the end of the rollout cell; confirm it is
# recreated before this cell is run.
num_episodes = 1500

numOfTilings = 8
alpha = 0.5
n = 1

nn_reward = NNetReward(reward)


# use optimistic initial value, so it's ok to set epsilon to 0
EPSILON = 0.0
discount = 0.999 #using high discount factor

apprenticeVFunction = ValueFunction(alpha, numOfTilings)
for i in range(num_episodes):
    r, s, steps = run_episode(env, apprenticeVFunction, n, False, EPSILON, max_time = 2000, reward_fn = nn_reward)
    # Report progress after the episode has run, so the printed step count
    # belongs to episode i rather than to the previous episode.
    if i % 100 == 99:
        print(i, steps)

print("done training")
    


In [None]:
# Evaluate the policy learned from the network reward and summarise its returns.
returns = evaluate_policy(env, 200, apprenticeVFunction)
best, worst, average = np.max(returns), np.min(returns), np.mean(returns)
print("best, worst, average", best, worst, average)

In [None]:
# Same summary for the demonstrations' recorded environment returns.
print("evaluate demonstrations")
demo_returns = learning_returns

demo_best = np.max(demo_returns)
demo_worst = np.min(demo_returns)
demo_average = np.mean(demo_returns)
print("best worst, average", demo_best, demo_worst, demo_average)