In [1]:
%load_ext autoreload
%autoreload 2
%aimport -jax
%aimport -jaxlib

In [2]:
import stanza.envs as envs
import stanza.policies as policies
import optax
import jax
import jax.numpy as jnp
from jax.random import PRNGKey
from stanza import Partial
from stanza.rl.ppo import PPO
from stanza.train import Trainer
from stanza.rl import EpisodicEnvironment, ACPolicy
from stanza.rl.nets import MLPActorCritic
from stanza.util.rich import StatisticsTable, ConsoleDisplay, LoopProgress
from stanza.solver.ilqr import iLQRSolver
from stanza.util.random import PRNGSequence


In [3]:
from stanza.util.logging import logger
from stanza.policies.mpc import MPC
from stanza.data.trajectory import Timestep
from stanza.data import Data
env = envs.create("pendulum")
# will automatically reset when done
# or when 1000 timesteps have been reached
solver_t = iLQRSolver()
expert_policy=MPC(
            # Sample action
            action_sample=env.sample_action(PRNGKey(42)),
            cost_fn=env.cost, 
            model_fn=env.step,
            horizon_length=50,
            solver=solver_t,
            receed=False
        )

def rollout_mpc(key: PRNGKey):
    # An MPC policy
    rollout = policies.rollout(
        model=env.step,
        state0=env.reset(key),
        length=50,
        policy=expert_policy
    )
    #turns from Python/jax Data class into Stanza Dataset
    return Data.from_pytree(Timestep(rollout.states,rollout.actions))
    
    #logger.info(f'MPC Rollout with {solver} solver results')
    #logger.info('states: {}', rollout.states)
    #logger.info('actions: {}', rollout.actions)
    #cost = env.cost(rollout.states, rollout.actions)
    #logger.info('cost: {}', cost)


num_trajs = 100
def batch_roll(rng_key, num_t):
    roll_fun = jax.vmap(rollout_mpc)
    rng_keys = jax.random.split(rng_key,num_t)
    return roll_fun(rng_keys)

expert_data = Data.from_pytree(batch_roll(PRNGKey(42), num_trajs))

In [4]:
from stanza.goal_conditioned.roll_in_sampler import roll_in_sampler
from stanza.envs import Environment
from stanza.goal_conditioned import GCState, StartEndGoal
import chex
from stanza.data.trajectory import Timestep

action_noiser = None
process_noiser = None



def gsa_sampler(key: PRNGKey, traj_data = expert_data,  env : Environment = env, encode_start = False, 
                delta_t_max = 3, delta_t_min = 8,
                roll_len_min = 3, roll_len_max = 8):

    
    #chex.assert_scalar_non_negative(roll_len_min)

    rng = PRNGSequence(key)
    rand_traj = traj_data.sample(next(rng))
    traj_len = rand_traj.length

    delta_t = jax.random.randint(next(rng), (), minval = delta_t_min,maxval = delta_t_max)
    delta_t = jax.lax.cond(delta_t <= traj_len - 1, lambda x: x, lambda x: traj_len - 1, operand = delta_t) 

    start_t = jax.random.randint(next(rng), (), minval = 1,
                                 maxval = traj_len - delta_t)
    
    roll_len = jax.random.randint(next(rng), (), minval = roll_len_min,maxval = roll_len_max)
    
    roll_len = jax.lax.cond(roll_len < start_t + 1, lambda x: x, lambda x: start_t, operand = roll_len)
    
    

    start_state, start_action =  roll_in_sampler(traj = rand_traj,
                    target_time = start_t,
                    noise_rng_key = next(rng), 
                    roll_len = roll_len, 
                    env = env, 
                    env_rng_key = next(rng),
                    action_noiser = action_noiser, 
                    process_noiser = process_noiser )
        
    end_state = rand_traj.get(start_t + delta_t).observation

    if encode_start:
        goal = StartEndGoal(start_state = start_state, 
                            end_state = end_state)
    else:
        goal = StartEndGoal(start_state = None, end_state = end_state)
    return Timestep(observation = GCState(goal = goal, env_state = start_state), 
                    action = start_action)

gs_sampler = (lambda key: gsa_sampler(key, encode_start = False).observation)
my_gc_state = gs_sampler(PRNGKey(42))
print(my_gc_state)


GCState(goal=StartEndGoal(start_state=None, end_state=State(angle=Array(3.1154735, dtype=float32), vel=Array(0.03439788, dtype=float32))), env_state=State(angle=Array(2.9252484, dtype=float32), vel=Array(0.22980095, dtype=float32)))


In [5]:
import math


def goal_reward(state, next_state, end_state):
        angle_diff = next_state.angle - state.angle
        vel_diff = next_state.vel - state.vel
        angle_rew = 32 * angle_diff * jnp.sign(end_state.angle - next_state.angle)
        vel_rew = vel_diff * jnp.sign(end_state.vel-next_state.vel)
        return angle_rew + vel_rew

def cost_to_goal( x, u, x_goal):
        x = jnp.stack((x.angle, x.vel), -1)
        x_goal = jnp.stack((x_goal.angle, x_goal.vel), -1)
        diff = (x - x_goal)
        x_cost = jnp.sum(diff[:-1]**2)
        xf_cost = jnp.sum(diff[-1]**2)
        if u == None:
            u_cost = 0
        else:
            u_cost = jnp.sum(u**2)
        return 5*xf_cost + 2*x_cost + u_cost

def gc_reward(gc_state, action, next_state ):
    env_state, goal = gc_state.env_state, gc_state.goal
    end_state = goal.end_state
    
    return goal_reward(env_state,next_state,end_state)
    #return 3 - (1 * cost_to_goal(env_state, action, end_state))

def g_done(gc_state):
        x = gc_state.env_state
        x_goal = gc_state.goal.end_state
        return (cost_to_goal(x =x,u=None,x_goal = x_goal) < .03*.03)



In [6]:
from stanza.goal_conditioned import GCEnvironment
gc_pendulum_env = GCEnvironment(env = env, gs_sampler = gs_sampler,
                            gc_reward = gc_reward, g_done = g_done)

In [7]:
#Set up net and env
ep_env = EpisodicEnvironment(gc_pendulum_env, 1000)
from stanza.rl.nets import transform_ac_to_mean
from stanza.goal_conditioned.bilevel_policy import make_trivial_bi_policy

net = MLPActorCritic(
    ep_env.sample_action(PRNGKey(0))
)
init_params = net.init(PRNGKey(42),
    ep_env.observe(ep_env.sample_state(PRNGKey(0))))

net.apply(init_params,gs_sampler(PRNGKey(0)))


ac_apply = Partial(net.apply, init_params)
policy = ACPolicy(ac_apply)
bipo = make_trivial_bi_policy(policy)
print("policy_made")

start_state = ep_env.reset(PRNGKey(42))
m_key = PRNGKey(31231)
p_key = PRNGKey(43232)
print("rolling out policy")
roll_len = 5

def print_roll_info(r):
    print(r.actions)
    print(r.final_policy_state)
    print(r.info)
    print(jax.vmap(ep_env.observe)(r.states))


r = policies.rollout(ep_env.step, 
    start_state , policy, 
    model_rng_key=m_key,
    policy_rng_key=p_key,
    observe=ep_env.observe,
    length=roll_len)
print_roll_info(r)

print("rolling out bipo")
r = policies.rollout(ep_env.step, 
    start_state , bipo, 
    model_rng_key=m_key,
    policy_rng_key=p_key,
    observe=ep_env.observe,
    length=roll_len)
print("done rollout")
print_roll_info(r)

#net.apply(init_params, ep_env.sample_state(PRNGKey(0)))
#actor_apply = transform_ac_to_mean(net.apply)
#actor_apply(init_params,ep_env.sample_state(PRNGKey(0)))


policy_made
rolling out policy
[ 1.5575948  -1.1119164   0.08042021  2.0939438 ]
-3.0967412
{'log_prob': Array([-2.1214223 , -1.5447118 , -0.92164135, -3.0967412 ], dtype=float32), 'value': Array([-0.00346179,  0.00905811, -0.00111795, -0.0009095 ], dtype=float32)}
GCState(goal=StartEndGoal(start_state=None, end_state=State(angle=Array([3.1154735, 3.1154735, 3.1154735, 3.1154735, 3.1154735], dtype=float32), vel=Array([0.03439788, 0.03439788, 0.03439788, 0.03439788, 0.03439788],      dtype=float32))), env_state=State(angle=Array([2.9252484, 2.9712086, 3.0781543, 3.1388757, 3.2019532], dtype=float32), vel=Array([0.22980095, 0.5347287 , 0.3036069 , 0.31538695, 0.7309675 ],      dtype=float32)))
rolling out bipo
done rollout
[ 1.5575948  -1.1119164   0.08042021  2.0939438 ]
BLPolicyState(state_low_level=None, state_high_level=Array(-3.0967412, dtype=float32), chunk_time=Array(1, dtype=int32, weak_type=True), current_goal=Array(2.0939438, dtype=float32), info_high_level={'log_prob': Array(-

In [12]:
# BC Pretraining    
from stanza.rl.bc import BCState, BCTrainer


rng_bc = PRNGKey(41)
# note, use > 256 data_points!!
num_bc_data = 500
gc_data = jax.vmap(gsa_sampler)(jax.random.split(PRNGKey(40),num_bc_data))
gc_data = Data.from_pytree(gc_data)
actor_apply = transform_ac_to_mean(net.apply)


display = ConsoleDisplay()
display.add("train", StatisticsTable(), interval=100)
display.add("train", LoopProgress(), interval=100)

with display as w:
    trainer = BCTrainer()
    result =  trainer.train(ac_apply=actor_apply, 
                                ac_params = init_params, dataset=gc_data,
                                rng_key = rng_bc,
                                max_iterations=10000,
                                 hooks=[w.train])
new_params = result.fn_params


Output()

TrainResults(fn_params=FrozenDict({
    params: {
        Dense_0: {
            bias: Array([-0.00529622, -0.02614935, -0.00268743, -0.00767602, -0.01769547,
                   -0.03497847,  0.02488812, -0.01852584,  0.00417277, -0.02757289,
                   -0.04600565, -0.03372218,  0.03583556,  0.02342569,  0.02991934,
                   -0.03089732,  0.05261647,  0.00715047, -0.01816924,  0.00032106,
                    0.0626414 ,  0.02999147,  0.00610645,  0.06164348,  0.00040018,
                    0.00685393, -0.01364732, -0.02067262, -0.0252315 , -0.06259558,
                   -0.00088347,  0.01339869, -0.01825848, -0.0205423 ,  0.00681141,
                    0.0789051 ,  0.02233529, -0.02617844,  0.00399305,  0.02117206,
                   -0.03204448,  0.01063614, -0.00918867, -0.07030746, -0.0187936 ,
                   -0.04023917, -0.01474619,  0.00965733, -0.04031759,  0.02910583,
                   -0.02005335,  0.03104168, -0.01303418, -0.00025136,  0.04854416,
 

In [13]:
# RL Training 

display = ConsoleDisplay()
display.add("ppo", StatisticsTable(), interval=1)
display.add("ppo", LoopProgress("RL"), interval=1)

ppo = PPO(
    trainer = Trainer(
        optimizer=optax.chain(
            optax.clip_by_global_norm(0.5),
            optax.adam(3e-4, eps=1e-5)
        )
    )
)

with display as dh:
    trained_params = ppo.train(
        PRNGKey(42),
        ep_env, net.apply,
        new_params,
        rl_hooks=[dh.ppo]
    )

ac_apply = Partial(net.apply, trained_params.fn_params)
policy = ACPolicy(ac_apply)

r = policies.rollout(ep_env.step, 
    ep_env.reset(PRNGKey(42)), policy, 
    model_rng_key=PRNGKey(31231),
    policy_rng_key=PRNGKey(43232),
    observe=ep_env.observe,
    length=200)

print(jax.vmap(ep_env.observe)(r.states))

Output()

NameError: name 'params' is not defined