In [1]:
from maml_env import HalfCheetahDirecBulletEnv
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.specs import array_spec
import tensorflow_probability as tfp
from tf_agents.policies import greedy_policy
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import actor_policy
import tensorflow.keras.losses as kls
from tf_agents.metrics import tf_metrics
keras_backend.set_floatx('float32')
import pandas as pd

import random
import sys
import time

import numpy as np
import matplotlib.pyplot as plt
from tf_agents.networks import network

In [2]:
class Tasks:
    """Pool of task configurations that can be sampled with replacement."""

    def __init__(self, *task_configs):
        # Keep the configurations in the order they were supplied.
        self.tasks = list(task_configs)

    def sample_tasks(self, batch_size):
        """Return a list of ``batch_size`` configurations drawn via ``random.choices``."""
        pool = self.tasks
        return random.choices(pool, k=batch_size)

In [3]:
class policyNet(keras.Model):
    """Small dense network: two 40-unit ReLU layers followed by a 6-unit tanh layer."""

    def __init__(self):
        super().__init__()
        hidden_units = 40
        self.hidden1 = keras.layers.Dense(
            hidden_units, activation="relu", input_shape=(1,), name="Inner Net Input")
        self.hidden2 = keras.layers.Dense(
            hidden_units, activation="relu", name="Inner Net Hidden")
        self.out = keras.layers.Dense(6, activation="tanh", name="Inner Net Ouput")

    def call(self, x):
        """Feed ``x`` through ``hidden1``, ``hidden2`` and ``out`` in that order."""
        return self.out(self.hidden2(self.hidden1(x)))

In [4]:
class ActionNet(network.Network):
    """Feed-forward network that maps observations to tanh-squashed action values.

    The stack is Dense(26, relu) -> Dense(30, relu) -> Dense(n, tanh), where ``n`` is
    the number of elements in ``output_tensor_spec``. The width of the last layer is
    taken from the constructor argument (not from a module-level variable), so callers
    must pass the action spec as ``output_tensor_spec``.

    Args:
        input_tensor_spec: Spec of the observations fed to the network.
        output_tensor_spec: Spec of the produced actions; its shape determines both
            the output layer width and the final reshape in ``call``.
    """

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super(ActionNet, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(), name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        self._sub_layers = [
            tf.keras.layers.Dense(26, activation=tf.nn.relu),
            tf.keras.layers.Dense(30, activation=tf.nn.relu),
            # Size the output from the spec this network was constructed with.
            tf.keras.layers.Dense(
                output_tensor_spec.shape.num_elements(), activation=tf.nn.tanh),
        ]

    def call(self, observations, step_type=(), network_state=()):
        """Return ``(actions, network_state)`` for a batch of observations.

        ``step_type`` is accepted but not used; ``network_state`` is returned unchanged.
        """
        output = tf.cast(observations, dtype=tf.float32)

        for layer in self._sub_layers:
            output = layer(output)

        # Restore the per-sample action shape declared by the output spec.
        actions = tf.reshape(output, [-1] + self._output_tensor_spec.shape.as_list())
        return actions, network_state

class ValueNet(network.Network):
    """Feed-forward critic: Dense(26, relu) -> Dense(30, relu) -> Dense(1, linear).

    Args:
        input_tensor_spec: Spec of the observations fed to the network.
        output_tensor_spec: Spec whose shape is used to reshape the network output.
    """

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super(ValueNet, self).__init__(
            input_tensor_spec=input_tensor_spec, state_spec=(), name='ValueNet')
        self._output_tensor_spec = output_tensor_spec
        # Layers are created in the same order they are applied in ``call``.
        relu_layers = [
            tf.keras.layers.Dense(units, activation=tf.nn.relu) for units in (26, 30)
        ]
        self._sub_layers = relu_layers + [tf.keras.layers.Dense(1, activation=None)]

    def call(self, observations, step_type=(), network_state=()):
        """Return ``(values, network_state)``; ``network_state`` is passed through as-is."""
        hidden = tf.cast(observations, dtype=tf.float32)
        for dense in self._sub_layers:
            hidden = dense(hidden)

        target_shape = [-1] + self._output_tensor_spec.shape.as_list()
        return tf.reshape(hidden, target_shape), network_state

In [5]:
class ActionDistributionNet(ActionNet):
    """ActionNet variant whose output is a diagonal Gaussian centred on the action means."""

    def call(self, observations):
        """Return ``(distribution, network_state)`` for a batch of observations.

        The distribution is a ``MultivariateNormalDiag`` whose location is the parent
        network's output and whose scale is fixed to one in every dimension.
        """
        action_means, network_state = super(ActionDistributionNet, self).call(
                observations)

        # Fixed unit standard deviation. A plain tensor is used instead of allocating a
        # fresh tf.Variable on every forward pass: such a variable was never tracked or
        # trained, and creating variables on repeated calls fails under tf.function.
        action_std = tf.ones_like(action_means)
        return tfp.distributions.MultivariateNormalDiag(action_means, action_std), network_state

In [6]:
class agent():
    """Actor-critic agent bundling a Gaussian policy, a value network and their optimizers.

    Attributes set in ``__init__``:
        policy: ``ActionDistributionNet`` built from the input and action specs.
        critic: ``ValueNet`` built from the input and value specs.
        a_opt / c_opt: Adam optimizers (learning rate 0.01) for policy and critic.
        clip_pram: PPO clipping parameter (0.2).
    """

    def __init__(self, input_tensor_spec, action_spec, value_specs):
        self.input_tensor_spec = input_tensor_spec
        self.action_spec = action_spec
        self.value_specs = value_specs
        self.policy = ActionDistributionNet(self.input_tensor_spec, self.action_spec)
        self.critic = ValueNet(self.input_tensor_spec, self.value_specs)
        self.a_opt = keras.optimizers.Adam(learning_rate=0.01)
        self.c_opt = keras.optimizers.Adam(learning_rate=0.01)
        self.clip_pram = 0.2

    def act(self, state):
        """Sample an action for ``state`` and min-max rescale it into [-1, 1].

        The rescaling uses the minimum and maximum over the sampled tensor itself, so
        the smallest component maps to -1 and the largest to +1.
        """
        dist, _ = self.policy(state)
        action = dist.sample()
        low = tf.reduce_min(action)
        high = tf.reduce_max(action)
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def learn_pg(self, states, actions, rewards):
        """Apply one policy-gradient step and return the loss.

        The loss is ``-sum_t rewards[t] * log_prob(actions[t])``.

        Args:
            states: Batch of observations accepted by the policy network.
            actions: One action per state; flattened to ``(N, action_dim)``.
            rewards: One scalar weight per state.
        """
        # Rewards are flattened to (N,) so they align element-wise with the (N,)
        # log-probabilities; a (N, 1) shape would broadcast to an N x N product.
        rewards = tf.cast(tf.reshape(rewards, (len(rewards),)), tf.float32)
        actions = tf.cast(tf.reshape(actions, (len(actions), -1)), tf.float32)
        with tf.GradientTape() as tape:
            dist, _ = self.policy(states)
            logps = dist.log_prob(actions)
            loss = tf.math.negative(tf.reduce_sum(tf.math.multiply(rewards, logps)))

        grads = tape.gradient(loss, self.policy.trainable_variables)
        self.a_opt.apply_gradients(zip(grads, self.policy.trainable_variables))
        return loss

    def preprocess_ppo(self, states, actions, rewards, values, gamma=1.0, lmbda=1.0):
        """Compute bootstrapped returns and normalized advantages for a PPO update.

        Args:
            states: Sequence of observations (converted to a float32 array).
            actions: Sequence of actions (converted to a float32 array).
            rewards: Sequence of per-step rewards of length N.
            values: Sequence of value estimates; ``values[i]`` is the estimate for
                step ``i`` and ``values[i + 1]`` the estimate for its successor, so at
                least N + 1 entries are read.
            gamma: Discount factor (default 1.0, i.e. undiscounted).
            lmbda: GAE smoothing factor (default 1.0).

        Returns:
            ``(states, actions, returns, adv)`` as float32 arrays, where ``adv`` is
            ``returns - values[:N]`` standardized to zero mean and unit variance.

        NOTE(review): the indexing treats the data as one contiguous trajectory;
        concatenating several trajectories with their own initial value entry would
        misalign ``values`` and ``rewards`` -- confirm before using K > 1.
        """
        n_steps = len(rewards)
        running = 0
        returns = []
        for i in reversed(range(n_steps)):
            delta = rewards[i] + gamma * values[i + 1] - values[i]
            running = delta + gamma * lmbda * running
            returns.append(running + values[i])
        returns.reverse()

        returns = np.array(returns, dtype=np.float32)
        # Baselines are taken in forward order so that they line up with the
        # (already re-reversed) returns.
        baselines = np.array(values[:n_steps], dtype=np.float32)
        adv = returns - baselines
        adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
        states = np.array(states, dtype=np.float32)
        actions = np.array(actions, dtype=np.float32)
        return states, actions, returns, adv

    def learn_ppo(self, states, actions, adv, old_probs, discnt_rewards):
        """Run one clipped-surrogate PPO update on the policy and the critic.

        Args:
            states: Batch of N observations.
            actions: The N actions taken; flattened to ``(N, action_dim)``.
            adv: N advantage estimates.
            old_probs: N log-probabilities of ``actions`` recorded at sampling time.
            discnt_rewards: N regression targets for the critic.

        Returns:
            ``(a_loss, c_loss)`` as scalar tensors.
        """
        discnt_rewards = tf.cast(
            tf.reshape(discnt_rewards, (len(discnt_rewards),)), tf.float32)
        adv = tf.cast(tf.reshape(adv, (len(adv),)), tf.float32)
        old_log_probs = tf.cast(tf.reshape(old_probs, (len(old_probs),)), tf.float32)
        # Flatten to (N, action_dim) so log_prob yields one value per sample.
        actions = tf.cast(tf.reshape(actions, (len(actions), -1)), tf.float32)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            dist, _ = self.policy(states)
            log_probs = dist.log_prob(actions)
            # Both quantities are log-probabilities, so the probability ratio is the
            # exponential of their difference. It is built from the freshly computed
            # log_probs so that the surrogate carries gradients to the policy.
            ratio = tf.exp(log_probs - old_log_probs)
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram)
            surrogate = tf.reduce_mean(
                tf.math.minimum(ratio * adv, clipped_ratio * adv))
            # Entropy bonus; with the policy's fixed unit scale this does not depend
            # on the network weights and therefore contributes no gradient.
            entropy = tf.reduce_mean(dist.entropy())

            v, _ = self.critic(states)
            # Flatten so targets and predictions align element-wise.
            v = tf.reshape(v, (len(v),))
            c_loss = 0.5 * tf.reduce_mean(tf.square(discnt_rewards - v))
            a_loss = tf.math.negative(surrogate - c_loss + 0.001 * entropy)

        grads1 = tape1.gradient(a_loss, self.policy.trainable_variables)
        grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.policy.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
        return a_loss, c_loss

    def actor_loss_ppo(self, probs, actions, adv, old_probs, closs):
        """Return the clipped-surrogate actor loss computed from probabilities.

        ``probs`` and ``old_probs`` are divided directly to form the ratio and
        ``probs`` is passed through ``log`` for the entropy term, so both are treated
        as probabilities here (not log-probabilities). ``actions`` is accepted but
        not used.
        """
        probability = probs
        entropy = tf.reduce_mean(
            tf.math.negative(tf.math.multiply(probability, tf.math.log(probability))))
        lower, upper = 1.0 - self.clip_pram, 1.0 + self.clip_pram
        sur1 = []
        sur2 = []

        for pb, t, op in zip(probability, adv, old_probs):
            t = tf.constant(t)
            op = tf.constant(op)
            ratio = tf.math.divide(pb, op)
            sur1.append(tf.math.multiply(ratio, t))
            sur2.append(tf.math.multiply(tf.clip_by_value(ratio, lower, upper), t))

        sr1 = tf.stack(sur1)
        sr2 = tf.stack(sur2)

        loss = tf.math.negative(
            tf.reduce_mean(tf.math.minimum(sr1, sr2)) - closs + 0.001 * entropy)
        return loss

In [7]:
def copy_actor(policy, x, input_tensor_spec,action_spec):
    '''Build a new ActionDistributionNet carrying the same weights as ``policy``.

    Args:
        policy: network whose weights are copied.
        x: An example input. It is pushed through both networks so that
            their variables exist before weights are read and assigned.
        input_tensor_spec: input spec used to construct the new network.
        action_spec: output spec used to construct the new network.
    Returns:
        The newly constructed network with ``policy``'s weights.
    '''
    example = tf.convert_to_tensor(x)
    # Forward pass on the source so its weights are created.
    policy(example)

    clone = ActionDistributionNet(input_tensor_spec, action_spec)
    # The clone also needs a forward pass before set_weights can be used.
    clone(example)
    clone.set_weights(policy.get_weights())
    return clone

def copy_critic(critic, x, input_tensor_spec,value_spec):
    '''Build a new ValueNet carrying the same weights as ``critic``.

    Args:
        critic: network whose weights are copied.
        x: An example input. It is pushed through both networks so that
            their variables exist before weights are read and assigned.
        input_tensor_spec: input spec used to construct the new network.
        value_spec: output spec used to construct the new network.
    Returns:
        The newly constructed network with ``critic``'s weights.
    '''
    example = tf.convert_to_tensor(x)
    # Forward pass on the source so its weights are created.
    critic(example)

    clone = ValueNet(input_tensor_spec, value_spec)
    # The clone also needs a forward pass before set_weights can be used.
    clone(example)
    clone.set_weights(critic.get_weights())
    return clone

In [8]:
# Meta-training loop: in every meta-iteration a batch of tasks is sampled; for each task a
# copy of the outer policy is (optionally) adapted with policy-gradient steps, rolled out,
# and used for a PPO update. The last task's updated networks seed the next meta-iteration.
tasks = Tasks(("Forward", True), ("Backward", False))
# sample_tasks returns a list of configs; take its single element before unpacking so that
# env_args actually contains the task's constructor arguments.
task_config = tasks.sample_tasks(1)[0]
task_name, env_args = task_config[0], task_config[1:]
env = HalfCheetahDirecBulletEnv(*env_args)
input_tensor_spec = tensor_spec.TensorSpec((26,), tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((6,),tf.float32,minimum=-1,maximum=1)
value_spec = tensor_spec.TensorSpec((1,),tf.float32)
# Instantiate initial agent with random policy theta
outer_agent = agent(input_tensor_spec,action_spec,value_spec)
# Update temp outer agent with trajectories sampled from each updated policy in env. t_i
temp_outer_agent = agent(input_tensor_spec,action_spec,value_spec)
K = 1  # trajectories sampled per rollout
HORIZON = 200  # environment steps per trajectory
meta_iterations = 100
num_adapt_steps=0
# Sample tasks from task distribution to train policy on
avg_meta_return=[]
meta_iter_policies=[]
meta_iter_critics=[]
task_list = []
for meta_iter in range(meta_iterations):
    # Update actual outer agent policy with temp outer agent after learning with theta`_i from each task
    if meta_iter_policies:
        outer_agent.critic= meta_iter_critics[-1]
        outer_agent.policy=meta_iter_policies[-1]
    avg_task_return = []
    for task_config in tasks.sample_tasks(20):
        task_name, env_args = task_config[0], task_config[1:]
        task_list.append(task_name)
        env = HalfCheetahDirecBulletEnv(*env_args)
        # Create a copy of the model to sample trajectories with new policy theta'
        actor_copy = copy_actor(outer_agent.policy,env.reset().reshape(1,26),
                                            input_tensor_spec,action_spec)
        # Create the inner agent to update the policy using the copy of the model made above
        inner_agent = agent(input_tensor_spec,action_spec,value_spec)
        inner_agent.policy=actor_copy
        # One policy-gradient adaptation step per requested adapt step. (The previous
        # `==1 | ==2` test parsed as a chained comparison and was never true.)
        # NOTE(review): every step samples from outer_agent, as before; confirm whether
        # later steps should instead sample from the already-adapted inner_agent.
        adapt_losses = []
        for _ in range(num_adapt_steps):
            Ss = []
            As = []
            Rs = []
            for _ in range(K):
                state = env.reset()
                for _ in range(HORIZON):
                    Ss.append(state)
                    action = outer_agent.act(state.reshape(1,26))
                    As.append(action)
                    state, reward, done, _ = env.step(action.numpy().flatten())
                    Rs.append(reward)
            # Calculate the loss and update the inner agent
            adapt_losses.append(
                inner_agent.learn_pg(np.array(Ss,dtype=np.float32),As,np.array(Rs,dtype=np.float32)))
        # Sample trajectories using new policy theta' with inner agent
        avg_return = []
        new_Ss = []
        new_As = []
        new_Rs = []
        values = []
        probs = []
        for _ in range(K):
            state = env.reset()
            init_value,_ = inner_agent.critic(state.reshape(1,26))
            values.append(init_value.numpy()[0])
            # Sample K trajectories from updated policy on task_i
            for _ in range(HORIZON):
                obs = state.reshape(1,26)
                new_Ss.append(state)
                action = inner_agent.act(obs)
                new_As.append(action)
                # Record the log-probability under the distribution for the state the
                # action was taken in, before the environment advances.
                actor_dist,_ = inner_agent.policy(obs)
                probs.append(actor_dist.log_prob(action))
                state, reward, done, _ = env.step(action.numpy().flatten())
                new_Rs.append(reward)
                value,_ = inner_agent.critic(state.reshape(1,26))
                values.append(value.numpy()[0])
            avg_return.append(sum(new_Rs[-HORIZON:]))
        states, actions,returns, adv  = inner_agent.preprocess_ppo(new_Ss, new_As, new_Rs, values)

        # Update temp outer agent with trajectories sampled from each updated policy in env. t_i
        temp_outer_agent = agent(input_tensor_spec,action_spec,value_spec)
        temp_outer_agent.policy=actor_copy
        temp_outer_agent.critic=(copy_critic(outer_agent.critic,env.reset().reshape(1,26),
                                              input_tensor_spec,value_spec))
        # The critic is regressed on the returns computed above rather than raw rewards.
        actor_loss,critic_loss = temp_outer_agent.learn_ppo(states,new_As,adv,probs,returns)
        avg_task_return.append(sum(avg_return)/len(avg_return))
    # Store copies of the updated networks; they become the outer agent next iteration.
    meta_iter_critics.append(copy_critic(temp_outer_agent.critic,env.reset().reshape(1,26),input_tensor_spec,value_spec))
    # The policy copy must be built with the action spec (not the value spec).
    meta_iter_policies.append(copy_actor(temp_outer_agent.policy,env.reset().reshape(1,26),input_tensor_spec,action_spec))
    avg_meta_return.append(sum(avg_task_return)/len(avg_task_return))
    print("Meta Iteration Complete")




Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration Complete
Meta Iteration C

KeyboardInterrupt: 

In [None]:
def main(args):
    """Skeleton of the meta-training entry point.

    Iterates ``args.meta_iteration`` times; in each iteration it samples
    ``args.meta_batch_size`` task configs, builds the environment for each and
    resets it. Adaptation, rollout and meta-optimization are still placeholders.
    """
    task_pool = Tasks(("Forward", True), ("Backward", False))
    input_tensor_spec = tensor_spec.TensorSpec((26,), tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec((6,), tf.float32, minimum=-1, maximum=1)
    # Outer loop
    for meta_iter in range(args.meta_iteration):
        sampled_configs = task_pool.sample_tasks(args.meta_batch_size)
        for task_config in sampled_configs:
            # Inner loop
            task_name = task_config[0]
            env = HalfCheetahDirecBulletEnv(*task_config[1:])
            init = env.reset()
            # Adaptation

            # Run adapted policy

        # Meta Optimization

In [None]:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("--meta_iteration", default=500, type=int)
    parser.add_argument("--meta_batch_size", default=40, type=int)
    parser.add_argument("--horizon", "-H", default=200, type=int)
    parser.add_argument("--num_adapt_steps", default=1, type=int)

    # parse_known_args ignores arguments this parser does not define. Inside a notebook
    # kernel sys.argv carries Jupyter's own "-f <connection file>" pair, which would make
    # parse_args() abort with "unrecognized arguments".
    args, _unknown_args = parser.parse_known_args()

    main(args)