# Imitation Learning:

The goal of this notebook is to experiment with imitation learning. In the first part of the assignment, the goal is to set up behavior cloning within the OpenAI Gym benchmark suite. The required libraries are:

TensorFlow: I used Keras (running on the TensorFlow backend) instead of the raw TensorFlow API.

OpenAI Gym

MuJoCo


In [12]:
import gym
import load_policy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tensorflow as tf
import tf_util
import tqdm

In [13]:
# --- Task selection ---------------------------------------------------------
task_name = 'Humanoid-v1'
envname = task_name  # alias read by the viewer helpers and the comparison cell
num_rollouts = 6

# --- File locations (all derived from the task name) ------------------------
their_data_path = "data/" + task_name + "-their.p"
cached_data_path = their_data_path  # the cached data is the expert rollout file
our_data_path = "data/" + task_name + "-our.p"
expert_policy_file = "experts/" + task_name + ".pkl"

# --- Environment ------------------------------------------------------------
env = gym.make(task_name)
# Per-episode step cap, read from the environment spec tags.
max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

# --- Rendering switches -----------------------------------------------------
render_them = True   # render while the expert policy is rolled out
render_us = False    # render while the trained model is rolled out

# --- Neural net params ------------------------------------------------------
learning_rate = 1e-3
epochs = 50

[2017-02-18 21:13:19,591] Making new env: Humanoid-v1


In [14]:
def one_data_table_stats(data):
    """Summarise one rollout dataset as a small pandas Series.

    Parameters
    ----------
    data : mapping
        Must provide ``'returns'`` and ``'steps'`` entries that support
        ``.mean()``, ``.std()``, ``.max()`` and elementwise division
        (the notebook passes NumPy arrays).

    Returns
    -------
    pandas.Series
        ``'mean reward'`` and ``'std reward'`` of the returns, plus
        ``'pct full rollout'``: the mean of every rollout's step count
        divided by the largest step count found in this dataset.
    """
    episode_returns = data['returns']
    episode_steps = data['steps']
    # Length of each rollout relative to the longest rollout in this dataset.
    relative_length = episode_steps / episode_steps.max()

    summary = {}
    summary['mean reward'] = episode_returns.mean()
    summary['std reward'] = episode_returns.std()
    summary['pct full rollout'] = relative_length.mean()
    return pd.Series(summary)

def view_data(data, rollouts):
    """Replay recorded actions in a fresh environment, rendering every step.

    Parameters
    ----------
    data : dict
        Rollout data with an ``'actions'`` array indexed as ``[step, :, :]``.
        When a ``'steps'`` entry is present it is read as the number of
        recorded steps of each rollout, in recording order, and is used to
        locate where each rollout starts inside ``'actions'``.
    rollouts : int
        Number of recorded rollouts to replay. If ``'steps'`` lists fewer
        rollouts than requested, only the recorded ones are replayed.

    Notes
    -----
    Reads the notebook globals ``envname`` and ``max_steps``. Nothing is
    returned; the function only renders and prints progress.
    """
    print("Total rollouts from data:", rollouts)

    if 'steps' in data:
        episode_lengths = [int(n) for n in data['steps']]
    else:
        # No per-rollout lengths recorded: fall back to a fixed stride of
        # max_steps actions per rollout.
        episode_lengths = [max_steps] * rollouts

    env = gym.make(envname)
    # Index of the first recorded action of the current rollout. Advancing by
    # the real rollout length keeps rollouts aligned even when some of them
    # ended before max_steps.
    episode_start = 0
    for i, episode_length in enumerate(episode_lengths[:rollouts]):
        print("start rollout", i)
        env.reset()
        for t in range(episode_length):
            env.render()
            action = data['actions'][episode_start + t, :, :]
            _, _, done, _ = env.step(action)
            if t + 1 >= max_steps:
                print("Max timestep reached")
                break
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
        else:
            # Ran out of recorded actions before the environment signalled done.
            print("Recorded actions exhausted after {} timesteps".format(episode_length))
        episode_start += episode_length
                
def view_model(model, rollouts):
    """Run a trained model in a fresh environment, rendering every step.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch)``; it is called with the current
        observation expanded by a leading axis (``obs[None, :]``) and its
        result is passed straight to ``env.step``.
    rollouts : int
        Number of episodes to run.

    Notes
    -----
    Reads the notebook globals ``envname`` and ``max_steps``. Nothing is
    returned; the function only renders and prints progress.
    """
    env = gym.make(envname)
    print ("Total rollouts from model: ", rollouts)
    for i in range(rollouts):
        print ("Start rollout ", i)
        obs = env.reset()
        # Each rollout is capped at 2000 iterations regardless of max_steps.
        for t in range(2000):
            env.render()
            action = model.predict(obs[None, :])
            # Reward and info are not needed for viewing.
            obs, _, done, _ = env.step(action)
            if t + 1 >= max_steps:
                print("Max timestep reached")
                break
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break


# Collect rollout data from the expert policy
The data is used as training data.

In [5]:
# Load the expert policy and roll it out `num_rollouts` times, recording every
# observation/action pair together with per-rollout return and step count.
policy_fn=load_policy.load_policy(expert_policy_file)
print('loaded and built')

with tf.Session() as sess:
    # Actually run a variable initializer. The cell previously only referenced
    # `tf.variables_initializer` without calling it, which does nothing.
    sess.run(tf.global_variables_initializer())
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    print ("Total rollouts for building policy: ", num_rollouts)
    returns = []
    observations = []
    actions = []
    steps_numbers = []

    for i in tqdm.tqdm(range(num_rollouts)):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            # NOTE(review): policy_fn presumably needs the enclosing default
            # session to evaluate - confirm against load_policy/tf_util.
            action = policy_fn(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if render_them:
                env.render()
            # Stop at the environment's step cap even if `done` was not raised.
            if steps >= max_steps:
                break
        steps_numbers.append(steps)
        returns.append(totalr)

    expert_data = {'observations': np.array(observations),
                   'actions': np.array(actions),
                   'returns': np.array(returns),
                   'steps': np.array(steps_numbers)}

# The context manager closes the file, so the pickle is flushed before any
# later cell reads it back.
with open(their_data_path, 'wb') as expert_file:
    pickle.dump(expert_data, expert_file)

    

    
    

  0%|          | 0/6 [00:00<?, ?it/s]

obs (1, 376) (1, 376)
loaded and built
Total rollouts for building policy:  6


100%|██████████| 6/6 [01:40<00:00, 16.67s/it]


# Load the expert policy data

In [5]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook. The context manager closes the file handle after reading.
with open(cached_data_path, 'rb') as cached_file:
    data = pickle.load(cached_file)

In [6]:
##Alternative model designs:
def baseline_model():
    """Build and compile a one-hidden-layer regression network.

    The hidden layer has half as many units as there are inputs (ReLU); the
    output layer is linear with ``num_outputs`` units. Compiled with MSE loss,
    the Adam optimizer and an MAE metric.

    Reads the notebook globals ``num_inputs`` and ``num_outputs``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # Floor division keeps the layer width an int; `/` yields a float on
    # Python 3, which is not a valid unit count.
    model.add(Dense(num_inputs // 2, input_dim=num_inputs, init='normal', activation='relu'))
    model.add(Dense(num_outputs, init='normal'))

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
def regularized_model():
    """Build and compile a two-hidden-layer network with L2 penalties.

    Both hidden layers have 64 ReLU units; the output layer is linear with
    ``num_outputs`` units. Every layer carries an L2 weight, bias and
    activity regularizer, each with coefficient 0.01. Compiled with MSE loss
    and the Adam optimizer.

    Reads the notebook globals ``num_inputs`` and ``num_outputs``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    from keras.regularizers import l2, activity_l2

    def penalties():
        # Build fresh regularizer objects on every call so that no instance
        # is shared between layers.
        return dict(W_regularizer=l2(0.01),
                    activity_regularizer=activity_l2(0.01),
                    b_regularizer=l2(0.01))

    model = Sequential()
    model.add(Dense(64, input_dim=num_inputs, init='normal', activation='relu', **penalties()))
    # NOTE(review): input_dim on a non-first layer looks redundant - confirm.
    model.add(Dense(64, input_dim=num_inputs, init='normal', activation='relu', **penalties()))
    model.add(Dense(num_outputs, init='normal', **penalties()))

    model.compile(loss='mse', optimizer='adam')
    return model
def wide_model():
    """Build and compile a network with a single 128-unit ReLU hidden layer.

    The output layer is linear with ``num_outputs`` units. Compiled with MSE
    loss and the Adam optimizer.

    Reads the notebook globals ``num_inputs`` and ``num_outputs``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    hidden_layer = Dense(128, input_dim=num_inputs, init='normal', activation='relu')
    output_layer = Dense(num_outputs, init='normal')

    model = Sequential([hidden_layer, output_layer])
    model.compile(loss='mse', optimizer='adam')
    return model

def awesome_model():
    """Build and compile a network that standardises its input first.

    A ``Lambda`` layer subtracts ``mean`` and divides by ``std``, followed by
    two 64-unit tanh layers and a linear output layer of ``actions_dim``
    units. Compiled with MSE loss (also reported as a metric) and an Adam
    optimizer using ``learning_rate``.

    Reads the notebook globals ``mean``, ``std``, ``observations_dim``,
    ``actions_dim`` and ``learning_rate``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Lambda(lambda x: (x - mean) / std,
                     batch_input_shape=(None, observations_dim)))
    for _ in range(2):
        model.add(Dense(64, activation='tanh'))
    model.add(Dense(actions_dim))

    model.compile(optimizer=Adam(lr=learning_rate), loss='mse', metrics=['mse'])
    return model

# Training the model

In [7]:
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, Lambda
from keras.optimizers import Adam

# Per-feature statistics of the expert observations; awesome_model's Lambda
# layer reads `mean` and `std` from the notebook namespace.
expert_observations = data['observations']
mean = np.mean(expert_observations, axis=0)
std = np.std(expert_observations, axis=0) + 1e-6  # epsilon avoids a zero divisor

# Network input/output sizes come from the environment's spaces.
observations_dim = env.observation_space.shape[0]
actions_dim = env.action_space.shape[0]
num_inputs, num_outputs = observations_dim, actions_dim

model = wide_model()

# Flatten the recorded actions to one row per sample, then shuffle the
# observation/action pairs together.
expert_actions = data['actions'].reshape(-1, actions_dim)
x, y = shuffle(expert_observations, expert_actions)

model.fit(
    x, y,
    validation_split=0.1,
    batch_size=256,
    nb_epoch=epochs,
    verbose=2
)

Using TensorFlow backend.


Train on 5400 samples, validate on 600 samples
Epoch 1/50
0s - loss: 78.1870 - val_loss: 19.1168
Epoch 2/50
0s - loss: 8.2728 - val_loss: 4.2103
Epoch 3/50
0s - loss: 2.9558 - val_loss: 2.1356
Epoch 4/50
0s - loss: 1.8701 - val_loss: 1.6242
Epoch 5/50
0s - loss: 1.4319 - val_loss: 1.2755
Epoch 6/50
0s - loss: 1.1748 - val_loss: 1.0865
Epoch 7/50
0s - loss: 1.0000 - val_loss: 0.9490
Epoch 8/50
0s - loss: 0.8879 - val_loss: 0.8501
Epoch 9/50
0s - loss: 0.8084 - val_loss: 0.7742
Epoch 10/50
0s - loss: 0.7127 - val_loss: 0.7020
Epoch 11/50
0s - loss: 0.6428 - val_loss: 0.6579
Epoch 12/50
0s - loss: 0.5967 - val_loss: 0.5972
Epoch 13/50
0s - loss: 0.5441 - val_loss: 0.5623
Epoch 14/50
0s - loss: 0.5137 - val_loss: 0.5342
Epoch 15/50
0s - loss: 0.4762 - val_loss: 0.4951
Epoch 16/50
0s - loss: 0.4536 - val_loss: 0.4643
Epoch 17/50
0s - loss: 0.4225 - val_loss: 0.4510
Epoch 18/50
0s - loss: 0.4041 - val_loss: 0.4285
Epoch 19/50
0s - loss: 0.3828 - val_loss: 0.4119
Epoch 20/50
0s - loss: 0.3644

<keras.callbacks.History at 0x7fe5b9f624e0>

# Run the trained model and save the rollout data

In [8]:
# Roll out the trained model `num_rollouts` times, recording every
# observation/action pair together with per-rollout return and step count.
returns = []
observations = []
actions = []
steps_numbers = []

for i in tqdm.tqdm(range(num_rollouts)):
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
        action = model.predict(obs[None, :])
        observations.append(obs)
        actions.append(action)
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if render_us:
            env.render()
        # Stop at the environment's step cap even if `done` was not raised.
        if steps >= max_steps:
            break
    steps_numbers.append(steps)
    returns.append(totalr)

our_net_data = {'observations': np.array(observations),
                'actions': np.array(actions),
                'returns': np.array(returns),
                'steps': np.array(steps_numbers)}

# The context manager closes the file, so the pickle is flushed before any
# later cell reads it back.
with open(our_data_path, 'wb') as our_file:
    pickle.dump(our_net_data, our_file)

100%|██████████| 6/6 [00:04<00:00,  1.42it/s]


# Compare the performance of the two models

In [9]:
# Reload both rollout files and summarise them side by side.
# Context managers close the file handles once the data has been read.
with open(their_data_path, 'rb') as their_file:
    their = pickle.load(their_file)
with open(our_data_path, 'rb') as our_file:
    our = pickle.load(our_file)

df=pd.DataFrame({
    'expert': one_data_table_stats(their),
    'imitation': one_data_table_stats(our)
})

# Title fixed: spelling corrected and a separator added before the env name.
print('Comparison of expert policy vs. imitation on ' + envname)
df

the comparsion of expert_policyHumanoid-v1


Unnamed: 0,expert,imitation
mean reward,10418.305422,186.716306
pct full rollout,1.0,0.757862
std reward,72.307472,36.59224


# Animate the learned policy

In [10]:
# Replay five rollouts of the recorded actions held in `data`.
# NOTE(review): the section heading says "learned policy", but this call
# replays recorded data; `view_model(model, 5)` would show the trained
# network instead - confirm which one is intended.
view_data(data, rollouts=5)

[2017-02-18 19:56:16,831] Making new env: Humanoid-v1


Total rollouts from data: 5
start rollout 0
Episode finished after 103 timesteps
start rollout 1
Episode finished after 90 timesteps
start rollout 2
Episode finished after 88 timesteps
start rollout 3
Episode finished after 103 timesteps
start rollout 4
Episode finished after 90 timesteps
