-
Notifications
You must be signed in to change notification settings - Fork 148
/
train.py
135 lines (111 loc) · 5.95 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from future.builtins import (super)
import gym
import tensorflow as tf
import numpy as np
from gym import spaces
import deepdrive
import config as c
from agents.common import get_throttle
from agents.dagger.agent import Agent
from agents.dagger.net import MOBILENET_V2_NAME
from sim.driving_style import DrivingStyle
from sim.action import Action
from util.experience_buffer import ExperienceBuffer
from vendor.openai.baselines.ppo2.run_deepdrive import train
class BootstrapRLGymEnv(gym.Wrapper):
    """Env wrapper that swaps the pixel observation for the dagger agent's
    pretrained-net activations (or the agent's raw action in the simple PPO
    test), so downstream RL trains on a compact feature vector.

    Bootstrap is probably a bad name here due to its overloaded use in RL where
    bootstrapping historically refers to learning with value based or TD methods.
    """

    def __init__(self, env, dagger_agent, driving_style=DrivingStyle.NORMAL):
        """
        :param env: the underlying deepdrive gym environment to wrap
        :param dagger_agent: pretrained agent whose `act()` supplies the
            activations used as observations (its `net` must expose
            `num_last_hidden` and `num_targets`)
        :param driving_style: DrivingStyle controlling, e.g., STEER_ONLY mode
        """
        super(BootstrapRLGymEnv, self).__init__(env)
        self.dagger_agent = dagger_agent
        self.driving_style = driving_style
        # Most recent raw observation from the sim; used by the STEER_ONLY
        # throttle heuristic in step(). None until the first step of an episode.
        self.previous_obz = None
        self.experience_buffer = ExperienceBuffer()
        self.simple_test = c.SIMPLE_PPO

        # One thing we need to do here is to make each action a bi-modal gaussian
        # to avoid averaging 50/50 decisions, i.e. half the time we veer left, half
        # the time we veer right - but on average this is go straight and can run
        # us into an obstacle. Right now the DiagGaussianPd is just adding up
        # errors which would not be the right thing to do for a bi-modal gaussian.
        # Also, DiagGaussianPd assumes steering and throttle are independent which
        # is not the case (steering at higher speeds causes more acceleration,
        # a = v**2/r), so that may be a problem as well.
        if self.simple_test:
            shape = (5,)
        else:
            # TODO: Add prior 200ms, 500ms, 1s and 2s mobilenet activations, along
            #  with speed (1), acceleration (x,y,z), previous output
            #  (steer, throttle, handbrake), and other stats we get from obz,
            #  possibly faded over self.experience_buffer.fade_length frames.
            shape = (dagger_agent.net.num_last_hidden + dagger_agent.net.num_targets,)
        self.observation_space = spaces.Box(low=np.finfo(np.float32).min,
                                            high=np.finfo(np.float32).max,
                                            shape=shape,
                                            dtype=np.float32)

    def step(self, action):
        if self.driving_style == DrivingStyle.STEER_ONLY and self.previous_obz is not None:
            # Simplifying by only controlling steering. Otherwise, we need to
            # shape rewards so that initial acceleration is not disincentivized
            # by the gforce penalty.
            action[Action.THROTTLE_INDEX] = get_throttle(
                actual_speed=self.previous_obz['speed'],
                target_speed=(8 * 100))
        obz, reward, done, info = self.env.step(action)
        if 'score' in info and 'episode_time' in info['score']:
            self.experience_buffer.maybe_add(obz, info['score']['episode_time'])
        self.previous_obz = obz
        # Replace the pixel observation with the dagger net's output so the RL
        # policy sees activations rather than raw images.
        action, net_out = self.dagger_agent.act(obz, reward, done)
        if net_out is None:
            obz = None
        elif self.simple_test:
            obz = np.array([np.squeeze(a) for a in action])
        else:
            # Concatenate last-hidden activations with target predictions to
            # match the observation_space shape declared in __init__.
            obz = np.concatenate((np.squeeze(net_out[0]), np.squeeze(net_out[1])))
        return obz, reward, done, info

    def reset(self):
        # Fix: clear the previous episode's observation so the STEER_ONLY
        # throttle heuristic doesn't act on a stale speed reading after reset.
        self.previous_obz = None
        return self.env.reset()
def run(env_id, bootstrap_net_path,
        resume_dir=None, experiment=None, camera_rigs=None, render=False, fps=c.DEFAULT_FPS,
        should_record=False, is_discrete=False, agent_name=MOBILENET_V2_NAME, is_sync=True,
        driving_style=DrivingStyle.NORMAL, is_remote_client=False, eval_only=False):
    """Start the sim, load the pretrained dagger agent, and train PPO on top of
    its activations via the BootstrapRLGymEnv wrapper.

    Two separate TF graphs/sessions are used: sess_1 hosts the frozen dagger
    net that produces observations; sess_2 hosts the PPO learner.

    :param env_id: gym environment id passed to deepdrive.start
    :param bootstrap_net_path: checkpoint path for the pretrained dagger net
    :param resume_dir: NOTE(review): accepted but not used in this body —
        presumably consumed elsewhere; confirm before relying on it
    :param experiment: experiment name forwarded to deepdrive.start
    :param camera_rigs: camera configuration forwarded to deepdrive.start
    :param render: whether the sim should render
    :param fps: simulation frames per second
    :param should_record: whether the dagger agent records data
    :param is_discrete: whether PPO uses a discrete action space
    :param agent_name: NOTE(review): accepted but not used — the Agent below is
        hard-coded to MOBILENET_V2_NAME; confirm intent
    :param is_sync: run the sim in synchronous stepping mode
    :param driving_style: DrivingStyle forwarded to the sim and wrapper
    :param is_remote_client: connect to a remote sim instance
    :param eval_only: evaluate instead of train in PPO
    """
    tf_config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.4,
            # leave room for the game,
            # NOTE: debugging python, i.e. with PyCharm can cause OOM errors, where running will not
            allow_growth=True
        ),
    )
    # Graph 1: the frozen dagger (imitation) net used only for inference.
    g_1 = tf.Graph()
    with g_1.as_default():
        sess_1 = tf.Session(config=tf_config)
        with sess_1.as_default():
            dagger_gym_env = deepdrive.start(experiment=experiment, env_id=env_id, cameras=camera_rigs, render=render, fps=fps,
                                             combine_box_action_spaces=True, is_sync=is_sync,
                                             driving_style=driving_style, is_remote_client=is_remote_client)
            dagger_agent = Agent(sess_1, should_record_recovery_from_random_actions=False, should_record=should_record,
                                 net_path=bootstrap_net_path, output_last_hidden=True, net_name=MOBILENET_V2_NAME)

    # Graph 2: the PPO learner, isolated from the dagger net's graph.
    g_2 = tf.Graph()
    with g_2.as_default():
        sess_2 = tf.Session(config=tf_config)
        with sess_2.as_default():
            # Wrap step so we get the pretrained layer activations rather than pixels for our observation
            bootstrap_gym_env = BootstrapRLGymEnv(dagger_gym_env, dagger_agent, driving_style)
            # SIMPLE_PPO shrinks the net and batch for quick smoke testing.
            if c.SIMPLE_PPO:
                minibatch_steps = 16
                mlp_width = 5
            else:
                minibatch_steps = 80
                mlp_width = 64
            train(bootstrap_gym_env, seed=c.RNG_SEED, sess=sess_2, is_discrete=is_discrete,
                  minibatch_steps=minibatch_steps, mlp_width=mlp_width, eval_only=eval_only)
#
# action = deepdrive.action()
# while not done:
# observation, reward, done, info = gym_env.step(action)