In [1]:
import sys, os
import time
import numpy as np
import gym
import torch
import torch.nn as nn
from torch import Tensor
import matplotlib.pyplot as plt
import configargparse
from tabulate import tabulate
import torch.multiprocessing as mp

%load_ext autoreload
%autoreload 2

from dm_control import suite
from dm_control import viewer

import foundation as fd
from foundation import util
from foundation import models
from foundation import rl
from foundation import envs
from foundation import train

from rlhw_backend import *

In [2]:
for i, task in enumerate(suite.ALL_TASKS):
    print(i,*task)

0 acrobot swingup
1 acrobot swingup_sparse
2 ball_in_cup catch
3 cartpole balance
4 cartpole balance_sparse
5 cartpole swingup
6 cartpole swingup_sparse
7 cartpole two_poles
8 cartpole three_poles
9 cheetah run
10 finger spin
11 finger turn_easy
12 finger turn_hard
13 fish upright
14 fish swim
15 hopper stand
16 hopper hop
17 humanoid stand
18 humanoid walk
19 humanoid run
20 humanoid run_pure_state
21 humanoid_CMU stand
22 humanoid_CMU run
23 lqr lqr_2_1
24 lqr lqr_6_2
25 manipulator bring_ball
26 manipulator bring_peg
27 manipulator insert_ball
28 manipulator insert_peg
29 pendulum swingup
30 point_mass easy
31 point_mass hard
32 reacher easy
33 reacher hard
34 stacker stack_2
35 stacker stack_4
36 swimmer swimmer6
37 swimmer swimmer15
38 walker stand
39 walker walk
40 walker run


In [3]:
parser = train.setup_rl_options()
args = parser.parse_args(['--config', '../config/ppo.yaml'])
print(args.__dict__.keys())

dict_keys(['config', 'name', 'save_root', 'log_date', 'log_tb', 'log_txt', 'save_freq', 'agent', 'clip', 'policy', 'model', 'baseline', 'env', 'device', 'seed', 'budget_steps', 'steps_per_itr', 'tau', 'epochs', 'batch_size', 'norm_adv', 'optim_type', 'lr', 'weight_decay', 'momentum', 'step_size', 'discount', 'subsample', 'gae_lambda', 'nonlin', 'hidden', 'min_log_std', 'b_hidden', 'b_scale_max', 'b_epochs', 'b_batch_size', 'b_nonlin', 'b_optim_type', 'b_lr', 'b_weight_decay', 'b_momentum', 'b_nesterov', 'b_time_order', 'b_obs_order'])


In [4]:
# manually changing args

args.name = 'test-ppo-nb-cp'

args.log_tb = True

args.env = suite.ALL_TASKS[3]


In [5]:
now = time.strftime("%y-%m-%d-%H%M%S")
if args.log_date:
    args.name = os.path.join(args.name, now)
args.save_dir = os.path.join(args.save_root, args.name)
print('Save dir: {}'.format(args.save_dir))
if args.log_tb or args.log_txt or args.save_freq is not None:
    util.create_dir(args.save_dir)
    print('Logging/Saving in {} (tb={},txt={})'.format(args.save_dir, args.log_tb, args.log_txt))
logger = util.Logger(args.save_dir, tensorboard=args.log_tb, txt=args.log_txt)

if args.seed is None:
    args.seed = util.get_random_seed()
    print('Generating random seed: {}'.format(args.seed))

Save dir: results/test-ppo-nb-cp\19-03-09-020705
Logging/Saving in results/test-ppo-nb-cp\19-03-09-020705 (tb=True,txt=False)
Generating random seed: -1702636919


In [6]:
torch.manual_seed(args.seed)
print('Using {}'.format(args.device))

env = envs.Pytorch_DMC_Env(*args.env, seed=args.seed, device=args.device)

args.state_dim, args.action_dim = env.obs_dim, env.act_dim
print('Env name={} (obs={}, act={})'.format(env._env.task, args.state_dim, args.action_dim))

n_batch = args.budget_steps / args.steps_per_itr

Using cpu
Env name=<dm_control.suite.cartpole.Balance object at 0x0000018315E429E8> (obs=5, act=1)


In [7]:
if 'mlp' in args.baseline:

    baseline_model = NormalizedMLP(args.state_dim, 1, norm='norm' in args.baseline,
                                   hidden_dims=args.b_hidden, nonlin=args.b_nonlin)

    baseline_model.optim = util.get_optimizer(args.b_optim_type, baseline_model.parameters(), lr=args.b_lr, weight_decay=args.b_weight_decay)
    baseline_model.scheduler = torch.optim.lr_scheduler.LambdaLR(
                baseline_model.optim, lambda x: (n_batch - x) / n_batch, -1)

    # print(baseline_model.optim)
    # quit()

    #assert args.baseline == 'norm-mlp'
    baseline = rl.Deep_Baseline(baseline_model, scale_max=args.b_scale_max,
                        batch_size=args.b_batch_size, epochs_per_step=args.b_epochs, )

elif args.baseline == 'lin':
    baseline = rl.Linear_Baseline(state_dim=args.state_dim, value_dim=1)
else:
    raise Exception('unknown baseline: {}'.format(args.baseline))

In [8]:
assert args.policy == 'normal'
assert args.model == 'norm-mlp'
policy_model = NormalizedMLP(args.state_dim, 2 * args.action_dim, hidden_dims=args.hidden, nonlin=args.nonlin)
policy = rl.NormalPolicy(policy_model, )

assert args.agent == 'ppoclip'

agent = rl.PPOClip(policy=policy, baseline=baseline, clip=args.clip, normalize_adv=args.norm_adv,
            optim_type=args.optim_type, lr=args.lr, scheduler_lin=n_batch, weight_decay=args.b_weight_decay,
            batch_size=args.batch_size, epochs_per_step=args.epochs,
            ).to(args.device)

print(agent)
print('Agent has {} parameters'.format(util.count_parameters(agent)))

gen = fd.data.Generator(env, agent, step_limit=args.budget_steps,
                step_threshold=args.steps_per_itr, drop_last_state=True)

itr = 0


PPOClip(
  (policy): NormalPolicy(
    (model): NormalizedMLP(
      (criterion): MSELoss()
      (norm): RunningNormalization()
      (net): Sequential(
        (0): Linear(in_features=5, out_features=8, bias=True)
        (1): PReLU(num_parameters=1)
        (2): Linear(in_features=8, out_features=8, bias=True)
        (3): PReLU(num_parameters=1)
        (4): Linear(in_features=8, out_features=2, bias=True)
      )
    )
  )
  (baseline): Deep_Baseline(
    (model): NormalizedMLP(
      (criterion): MSELoss()
      (norm): RunningNormalization()
      (net): Sequential(
        (0): Linear(in_features=5, out_features=8, bias=True)
        (1): PReLU(num_parameters=1)
        (2): Linear(in_features=8, out_features=8, bias=True)
        (3): PReLU(num_parameters=1)
        (4): Linear(in_features=8, out_features=1, bias=True)
      )
    )
  )
)
Agent has 271 parameters


In [9]:
N = 5

In [15]:
stats = train.run_rl_training(gen, agent, args=args, logger=logger, save_freq=None, 
                              num_iter=N, continue_gen_stats=True)
path = train.save_checkpoint({
            'agent_state_dict': agent.state_dict(),
            'stats': stats,
            'args': args,
            'steps': gen.steps_generated(),
            'episodes': gen.episodes_generated(),
        }, args.save_dir, epoch=itr)
itr += N

  self.val = torch.tensor(val).float()


[ 03-09-19 02:12:12 ] 42000/1000000 (ep=42) : last=273.001 max=401.993 - 377.244 
[ 03-09-19 02:12:21 ] 45000/1000000 (ep=45) : last=199.005 max=401.993 - 343.674 


KeyboardInterrupt: 

In [11]:
env.view(agent)