In [1]:
import sys, os, time
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
# install
# pytorch - pip install torch
# gym - pip install gym

# gym[atari] - pip install gym[atari]

from collections import deque
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch import distributions
from torch.distributions import Categorical
import foundation as fd
import foundation.util as util
from itertools import islice
from tabulate import tabulate

import gym

from light import *

In [21]:
# Experiment configuration. Every tunable is stored as an attribute on a single
# `args` object so the cells below only ever read settings from one place.
args = util.NS()

# --- run bookkeeping / logging ---
# NOTE(review): the run name says 'a3c' while args.agent below is 'ppo'; results
# are written under this name, so confirm it matches the experiment being run.
args.name = 'test-a3c'
args.save_dir = 'results'

args.num_iter = 100     # outer iterations of the training loop
args.num_train = 10     # count passed to each 'train' run_iteration call
args.num_eval = 5       # count passed to each 'eval' run_iteration call
args.logdate = True     # append a timestamp sub-directory to the save path
args.tblog = True       # forwarded to util.Logger(tensorboard=...)
args.txtlog = False     # forwarded to util.Logger(txt=...)
args.small_print = True # one-line progress print instead of a table

# --- agent / environment selection ---
args.agent = 'ppo'#'vpg'#'ddpg'#'vpg'
args.env = 'InvertedPendulum-v2'#'CartPole-v1' # 'LunarLander-v2'

# --- policy network ---
args.policy = 'full'    # 'full' selects Full_Gaussian_Policy for continuous actions
args.hidden_dims = []#[6,4]
args.nonlin = 'prelu'
args.discount = 0.99
args.epsilon = 0.01     # passed as epsilon= to the DQN Q-function policy
args.tau = 0.001        # passed as tau= to DQN / DDPG
args.use_replica = True # passed as use_replica= to DQN
args.actor_steps = 1    # passed as actor_steps= to DDPG

# --- critic network (a3c value function / ddpg Q-function) ---
args.critic_hidden_dims = [4,4]
args.critic_nonlin = 'prelu'

# --- replay buffer (dqn / ddpg) ---
args.buffer_max_episodes = 20
# The 'dqn' branch builds utils.Replay_Buffer(max_transition_size=args.buffer_max_transitions),
# but this setting was never defined, so selecting 'dqn' could not work.
# The value is a placeholder chosen to exceed buffer_min_start - TODO tune.
args.buffer_max_transitions = 20000
args.buffer_min_start = 1000
args.buffer_batch_size = 128

# --- PPO specific ---
args.cpi_clip = 0.3     # passed as clipping= to PPO
args.target_kl = None
args.kl_weight = 1.
args.agent_epochs = 5
args.agent_batch_size = 32

# Prefer the GPU when torch can see one.
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- optimiser ---
args.optim = 'adam'
args.lr = 1e-4
args.weight_decay = 1e-4

print('Using device: {}'.format(args.device))

Using device: cpu


In [22]:
# Build the output directory for this run (optionally time-stamped) and attach a logger to it.
now = time.strftime("%b-%d-%Y-%H%M%S")
run_subdir = now if args.logdate else ''
args.save = os.path.join(args.save_dir, args.name, run_subdir)
util.create_dir(args.save)
logger = util.Logger(args.save, tensorboard=args.tblog, txt=args.txtlog)
print('Saving to {}'.format(args.save))

# Create the environment and record the dimensions the policy networks need.
env = utils.Pytorch_Gym_Env(args.env, device=args.device)
args.state_dim = env.observation_space.shape[0]
args.discrete_action_space = isinstance(env.action_space, gym.spaces.Discrete)
if args.discrete_action_space:
    # Discrete spaces expose the number of actions as `.n`.
    args.action_dim = env.action_space.n
    action_kind = 'discrete'
else:
    # Otherwise the first entry of the action space's shape is used.
    args.action_dim = env.action_space.shape[0]
    action_kind = 'continuous'
print('Env: {} - state-dim={}, action-dim={} ({})'.format(args.env, args.state_dim, args.action_dim, action_kind))

Saving to results/test-a3c/Dec-14-2018-012718
Env: Pendulum-v0 - state-dim=3, action-dim=1 (continuous)


In [23]:
# Keyword arguments shared by every policy network and every agent's optimiser.
net_kwargs = dict(hidden_dims=args.hidden_dims, nonlin=args.nonlin)
optim_kwargs = dict(optim_type=args.optim, lr=args.lr, weight_decay=args.weight_decay)

# --- policy: DQN gets a Q-function; everything else gets a stochastic policy ---
if args.agent == 'dqn':
    policy = policies.ActionOut_QFunction(args.state_dim, args.action_dim,
                                          epsilon=args.epsilon, **net_kwargs)
else:
    if args.discrete_action_space:
        policy_cls = policies.Discrete_Policy
    elif args.policy == 'full':
        policy_cls = policies.Full_Gaussian_Policy
    else:
        policy_cls = policies.Gaussian_Policy
    policy = policy_cls(args.state_dim, args.action_dim, **net_kwargs)

# --- agent: wrap the policy in the learning algorithm named by args.agent ---
if args.agent == 'vpg':
    agent = agents.REINFORCE(policy, discount=args.discount, **optim_kwargs)
elif args.agent == 'ppo':
    agent = agents.PPO(policy, discount=args.discount,
                       clipping=args.cpi_clip, target_kl=args.target_kl, kl_weight=args.kl_weight,
                       epochs=args.agent_epochs, batch_size=args.agent_batch_size,
                       **optim_kwargs)
elif args.agent == 'a3c':
    value_fn = policies.ValueFunction(args.state_dim,
                                      hidden_dims=args.critic_hidden_dims, nonlin=args.critic_nonlin)
    agent = agents.A3C(actor=policy, critic=value_fn, discount=args.discount, **optim_kwargs)
elif args.agent == 'dqn':
    buffer = utils.Replay_Buffer(max_transition_size=args.buffer_max_transitions, device=args.device)
    agent = agents.DQN(policy, discount=args.discount, buffer=buffer,
                       batch_size=args.buffer_batch_size, min_buffer_size=args.buffer_min_start,
                       tau=args.tau, use_replica=args.use_replica,
                       **optim_kwargs)
elif args.agent == 'ddpg':
    # DDPG is only set up for continuous action spaces.
    assert not args.discrete_action_space
    qnet = policies.QFunction(args.state_dim, args.action_dim,
                              hidden_dims=args.critic_hidden_dims, nonlin=args.critic_nonlin)
    policy = policies.ActorCritic(policy, qnet)
    buffer = utils.Replay_Buffer(max_episode_size=args.buffer_max_episodes, device=args.device)
    agent = agents.DDPG(policy, discount=args.discount, actor_steps=args.actor_steps, buffer=buffer,
                        min_buffer_size=args.buffer_min_start, batch_size=args.buffer_batch_size,
                        tau=args.tau, **optim_kwargs)
else:
    raise Exception('Unknown agent: {}'.format(args.agent))

# Move the agent to the target device and create the rollout / bookkeeping helpers.
agent.to(args.device)
gen = utils.Generator(env, agent)
score = utils.Score(tau=0.01)
stats = util.StatsMeter('score', 'rewards-train', 'rewards-eval')
stats.shallow_join(agent.stats)
total_episodes = 0
agent

REINFORCE(
  (policy): Full_Gaussian_Policy(
    (net): Sequential(
      (0): Linear(in_features=3, out_features=2, bias=True)
    )
  )
  (baseline): Linear(in_features=3, out_features=1, bias=True)
)

In [24]:
# Main loop: alternate a block of training rollouts with a block of evaluation
# rollouts, fold the results into `stats`, log them, and print progress.
for itr in range(args.num_iter):
    #agent.model.epsilon = epsilon * epsilon_decay ** (total_episodes / epsilon_decay_episodes)
    #print('** Iteration {}/{} **'.format(itr+1, num_iter))
    train_rewards = utils.run_iteration('train', args.num_train, agent, gen)
    eval_rewards = utils.run_iteration('eval', args.num_eval, agent, gen)
    # Only training rollouts advance the episode counter used as the logging step.
    total_episodes += args.num_train
    score.update_all(eval_rewards)
    stats.update('score', score.val)
    stats.update('rewards-train', train_rewards.mean())
    stats.update('rewards-eval', eval_rewards.mean())
    
    vals = stats.vals()
    logger.update(vals, step=total_episodes)
    if args.small_print:
        print('Ep {}: train={:.3f}, eval={:.3f}, score={:.3f}'.format(total_episodes, vals['rewards-train'], vals['rewards-eval'], vals['score']))
    else:
        print('Episode: {}'.format(total_episodes))
        # tabulate() reads a dict as {column: iterable of cells}; the values here are
        # scalars (they are formatted with {:.3f} above), so passing `vals` directly
        # raises TypeError. Feed (name, value) rows instead.
        # Assumes `vals` is dict-like with .items() - TODO confirm against util.StatsMeter.
        print(tabulate(vals.items()))
    
    # save model (not implemented: nothing is checkpointed here)
print('Done')

Ep 10: train=-1300.205, eval=-1255.676, score=-1027.552
Ep 20: train=-1134.677, eval=-1419.308, score=-1046.678
Ep 30: train=-1216.518, eval=-1290.932, score=-1058.646
Ep 40: train=-1131.098, eval=-1218.439, score=-1066.419
Ep 50: train=-1275.410, eval=-1422.644, score=-1084.021
Ep 60: train=-1136.201, eval=-1467.669, score=-1102.764
Ep 70: train=-1323.634, eval=-1153.983, score=-1105.246
Ep 80: train=-1220.293, eval=-1221.794, score=-1110.921
Ep 90: train=-1133.004, eval=-1268.235, score=-1118.750
Ep 100: train=-1208.555, eval=-1235.151, score=-1124.492
Ep 110: train=-1155.352, eval=-1178.252, score=-1127.134
Ep 120: train=-1225.893, eval=-1156.927, score=-1128.540
Ep 130: train=-1374.182, eval=-1473.363, score=-1145.452
Ep 140: train=-1336.741, eval=-1311.535, score=-1153.649
Ep 150: train=-1183.891, eval=-1273.542, score=-1159.605
Ep 160: train=-1249.083, eval=-1204.392, score=-1161.794
Ep 170: train=-1157.868, eval=-1325.797, score=-1169.764
Ep 180: train=-1365.605, eval=-1241.430,

KeyboardInterrupt: 

In [26]:
# Run one 'eval' iteration with rendering switched on; the value returned by
# run_iteration is echoed as the cell output.
utils.run_iteration('eval', 1, agent, gen, render=True)

tensor([-1722.5884], device='cuda:0')

In [32]:
# Call the agent's train() method (presumably the nn.Module-style switch to training mode - TODO confirm).
agent.train()
pass  # final statement is not an expression, so the cell does not echo train()'s return value

In [33]:
# Reset the environment and keep what reset() returns in `s` for the probes in the next cells.
s = env.reset()

In [39]:
# Call the agent on `s` and display the result (presumably the chosen action - TODO confirm).
agent(s)

tensor([1.2589], device='cuda:0')

In [35]:
# Ask the actor for its distribution object at `s` and display its loc / scale attributes.
# NOTE(review): this needs an agent that exposes `.actor`; in the construction cell only
# the 'a3c' branch passes actor=..., so confirm which agent is live when running this.
d = agent.actor.get_pi(s)
d.loc, d.scale

(tensor([0.2463], device='cuda:0', grad_fn=<ExpandBackward>),
 tensor([0.9736], device='cuda:0', grad_fn=<ExpandBackward>))

In [25]:
# Draw one sample from the distribution `d` obtained above.
d.sample()

tensor([-0.3435], device='cuda:0')

In [20]:
# Time how long it takes to pull 10 items from `g` and concatenate them.
# NOTE(review): `g` is not defined by any cell above this one (stale kernel state);
# each item is presumably a (states, actions, rewards) triple of tensors - TODO confirm.
start = time.time()
out = islice(g, 10)  # lazy: nothing is pulled from `g` until zip() consumes it below
# Transpose the list of triples into three tuples and concatenate each with torch.cat.
states, actions, rewards = map(torch.cat, zip(*out))
print(time.time() - start)
states.size(), actions.size(), rewards.size()

Created iterator
0.15731143951416016


(torch.Size([191, 4]), torch.Size([191]), torch.Size([191]))

In [4]:
# define hyperparameters
env = Pytorch_Gym_Env('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [164]:
# Build a DQN agent for the CartPole environment and reset the episode counter.
# NOTE(review): this positional (state_dim, action_dim, ...) signature differs from the
# agents.DQN(policy, ...) call used in the earlier construction cell - confirm which API is current.
#agent = DDPG(state_dim, action_dim, max_buffer_size=1000, min_buffer_size=50)
total_episodes = 0
epsilon = 0.01
agent = DQN(state_dim, action_dim, 
            max_buffer_size=1000, min_buffer_size=200, batch_size=128, use_replica=False,
            lr=1e-3, tau=0.001, weight_decay=1e-3, epsilon=epsilon)
agent

DQN(
  (model): QNet(
    (net): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
      (2): Linear(in_features=8, out_features=2, bias=True)
    )
  )
  (target_model): QNet(
    (net): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
      (2): Linear(in_features=8, out_features=2, bias=True)
    )
  )
  (criterion): SmoothL1Loss()
)

In [165]:
# create objects
# if True:
#     device = 'cuda'
#     agent.to(device)
#     env.to(device)

In [166]:
# Constants for an exponential epsilon schedule. In the visible cells they are only
# referenced by the commented-out `agent.model.epsilon = ...` line in the training loops.
epsilon_decay_episodes = 1000
epsilon_decay = 0.5

In [None]:
# Training loop for the DQN agent: per iteration, run `num_train` training calls and
# `num_eval` evaluation calls through run_episodes, then print the returned reward/loss.
# NOTE(review): run_episodes is not defined in any visible cell - presumably from `from light import *`.
num_iter = 20
num_train = 50
num_eval = 5
for itr in range(num_iter):
    #agent.model.epsilon = epsilon * epsilon_decay ** (total_episodes / epsilon_decay_episodes)
    #print('** Iteration {}/{} **'.format(itr+1, num_iter))
    train_reward, train_loss = run_episodes('train', num_train, agent, env)
    eval_reward, _ = run_episodes('eval', num_eval, agent, env)  # second return value is discarded for eval
    total_episodes += num_train  # only training runs advance the counter
    print('Ep:{}: reward={:.3f}, loss={:.3f}, eval={:.3f}'.format(total_episodes, train_reward, train_loss, eval_reward))
    
    # save model (not implemented: nothing is checkpointed here)
print('Done')

Ep:50: reward=9.800, loss=0.075, eval=23.400
Ep:100: reward=12.160, loss=0.082, eval=14.400
Ep:150: reward=24.320, loss=0.047, eval=9.800
Ep:200: reward=24.220, loss=0.033, eval=42.600
Ep:250: reward=39.800, loss=0.015, eval=34.400
Ep:300: reward=30.880, loss=0.016, eval=17.800
Ep:350: reward=27.180, loss=0.017, eval=75.000
Ep:400: reward=28.700, loss=0.021, eval=17.000
Ep:450: reward=33.120, loss=0.016, eval=14.200
Ep:500: reward=34.120, loss=0.013, eval=57.600
Ep:550: reward=50.880, loss=0.011, eval=68.400
Ep:600: reward=41.140, loss=0.013, eval=10.000
Ep:650: reward=38.120, loss=0.014, eval=43.400
Ep:700: reward=46.040, loss=0.013, eval=34.400
Ep:750: reward=51.880, loss=0.013, eval=11.800
Ep:800: reward=51.860, loss=0.011, eval=93.800
Ep:850: reward=40.580, loss=0.012, eval=57.200


In [None]:
# Display the agent's critic if it has one. The original cell ended in a dangling
# `agent.critic.` (a leftover tab-completion), which is a SyntaxError and would stop
# a Run-All. getattr with a default also tolerates agents that have no `critic` attribute.
getattr(agent, 'critic', None)

In [60]:
# Run one 'eval' iteration with rendering on and display the returned value.
# NOTE(review): called here as a bare name, whereas earlier cells use utils.run_iteration.
run_iteration('eval', 1, agent, gen, render=True)

75.9

In [None]:
# Quick check of util.solve: generate targets from a random linear layer, pass the
# (input, target) pair to util.solve, and display the size of the result's `weight`.
N, D, M = 10, 4, 1  # used below as: rows in x, nn.Linear in_features, nn.Linear out_features

f = nn.Linear(D, M)
x = torch.randn(N,D)
y = f(x).detach()  # detach so the targets carry no autograd history
# NOTE(review): this rebinds the global `g`, which an earlier cell iterates over with islice.
g = util.solve(x,y)
g.weight.size()

In [11]:
# Create the raw gym 'Pong-v4' environment (rebinds the global `env`).
env = gym.make('Pong-v4')

In [12]:
# Reset the environment and display the shape of the returned observation.
env.reset().shape

(210, 160, 3)

In [2]:
# Create the raw gym 'LunarLander-v2' environment (rebinds the global `env`).
env = gym.make('LunarLander-v2')

In [3]:
# Reset the environment and display the returned initial observation.
env.reset()

array([-0.00551643,  1.4003555 , -0.55877346, -0.46954992,  0.00639898,
        0.12657054,  0.        ,  0.        ], dtype=float32)

In [11]:
# Render the current environment state once; render()'s return value is echoed by the cell.
env.render()

True

In [22]:
# Reset whichever environment `env` currently refers to and display the observation.
# NOTE(review): the execution count is out of order here, so `env` depends on which cell ran last.
env.reset()

array([192,   0,   0,   0, 110,  38,   0,   7,  63,   1,  60,  59,   0,
         0,   0,  62, 255,   0, 255, 253,   0,   8,   0,  24, 128,  32,
         1,  86, 247,  86, 247,  86, 247, 134, 243, 245, 243, 240, 240,
       242, 242,  32,  32,  64,  64,  64, 188,  65, 189,   0,   8, 109,
        37,  37,  60,   0,   0,   0,   0, 109, 109,  37,  37, 192, 192,
       192, 192,   1, 192, 202, 247, 202, 247, 202, 247, 202, 247,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,  54, 236, 242, 121, 240], dtype=uint8)

In [23]:

# Random-action rollout with rendering: step the environment with sampled actions
# for at most 1000 steps and stop as soon as `done` is reported.
for _ in range(1000):
    env.render()
    # Only the `done` flag of the step result is used; the other returns are discarded.
    _,_,done,_ = env.step(env.action_space.sample())
    plt.pause(0.02)  # short pause between frames
    if done:
        print('stop')
        break