/
__init__.py
234 lines (190 loc) · 9.09 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""Interface for reinforcement learning."""
try:
from deepchem.rl.a2c import A2C # noqa: F401
from deepchem.rl.ppo import PPO # noqa: F401
except ModuleNotFoundError:
pass
class Environment(object):
    """An environment in which an actor performs actions to accomplish a task.

    An environment has a current state, which is represented as either a single
    NumPy array, or optionally a list of NumPy arrays.  When an action is taken,
    that causes the state to be updated.  The environment also computes a reward
    for each action, and reports when the task has been terminated (meaning that
    no more actions may be taken).

    Two types of actions are supported.  For environments with discrete action
    spaces, the action is an integer specifying the index of the action to
    perform (out of a fixed list of possible actions).  For environments with
    continuous action spaces, the action is a NumPy array.

    Environment objects should be written to support pickle and deepcopy
    operations.  Many algorithms involve creating multiple copies of the
    Environment, possibly running in different processes or even on different
    computers.
    """

    def __init__(self,
                 state_shape,
                 n_actions=None,
                 state_dtype=None,
                 action_shape=None):
        """Subclasses should call the superclass constructor in addition to doing their own initialization.

        A value should be provided for either n_actions (for discrete action
        spaces) or action_shape (for continuous action spaces), but not both.

        Parameters
        ----------
        state_shape: tuple or list of tuples
            the shape(s) of the array(s) making up the state
        n_actions: int
            the number of discrete actions that can be performed.  If the action
            space is continuous, this should be None.
        state_dtype: dtype or list of dtypes
            the type(s) of the array(s) making up the state.  If this is None,
            all arrays are assumed to be float32.
        action_shape: tuple
            the shape of the array describing an action.  If the action space
            is discrete, this should be None.
        """
        self._state_shape = state_shape
        self._n_actions = n_actions
        self._action_shape = action_shape
        # Both stay None until a subclass's reset() populates them.
        self._state = None
        self._terminated = None
        if state_dtype is None:
            # Assume all arrays are float32.
            import numpy
            try:
                from collections.abc import Sequence as SequenceCollection
            except ImportError:
                # Fallback for old interpreters that lack collections.abc.
                from collections import Sequence as SequenceCollection
            # A state made of several arrays is described by a sequence of
            # shapes, so its first element is itself a sequence.  The length
            # check keeps an empty shape (a scalar state) from raising
            # IndexError; it is treated as a single array.
            if len(state_shape) > 0 and isinstance(state_shape[0],
                                                   SequenceCollection):
                self._state_dtype = [numpy.float32] * len(state_shape)
            else:
                self._state_dtype = numpy.float32
        else:
            self._state_dtype = state_dtype

    @property
    def state(self):
        """The current state of the environment, represented as either a NumPy array or list of arrays.

        If reset() has not yet been called at least once, this is undefined.
        """
        return self._state

    @property
    def terminated(self):
        """Whether the task has reached its end.

        If reset() has not yet been called at least once, this is undefined.
        """
        return self._terminated

    @property
    def state_shape(self):
        """The shape of the arrays that describe a state.

        If the state is a single array, this returns a tuple giving the shape of
        that array.  If the state is a list of arrays, this returns a list of
        tuples where each tuple is the shape of one array.
        """
        return self._state_shape

    @property
    def state_dtype(self):
        """The dtypes of the arrays that describe a state.

        If the state is a single array, this returns the dtype of that array.
        If the state is a list of arrays, this returns a list containing the
        dtypes of the arrays.
        """
        return self._state_dtype

    @property
    def n_actions(self):
        """The number of possible actions that can be performed in this Environment.

        If the environment uses a continuous action space, this returns None.
        """
        return self._n_actions

    @property
    def action_shape(self):
        """The expected shape of NumPy arrays representing actions.

        If the environment uses a discrete action space, this returns None.
        """
        return self._action_shape

    def reset(self):
        """Initialize the environment in preparation for doing calculations with it.

        This must be called before calling step() or querying the state.  You
        can call it again later to reset the environment back to its original
        state.

        Raises
        ------
        NotImplementedError
            always, in this base class; subclasses must override it.
        """
        raise NotImplementedError("Subclasses must implement this")

    def step(self, action):
        """Take a time step by performing an action.

        This causes the "state" and "terminated" properties to be updated.

        Parameters
        ----------
        action: object
            an object describing the action to take

        Returns
        -------
        the reward earned by taking the action, represented as a floating point
        number (higher values are better)

        Raises
        ------
        NotImplementedError
            always, in this base class; subclasses must override it.
        """
        raise NotImplementedError("Subclasses must implement this")
class GymEnvironment(Environment):
    """This is a convenience class for working with environments from OpenAI Gym.

    Both Gym calling conventions are accepted: the older one, where reset()
    returns the observation and step() returns four values, and the newer one
    (Gym >= 0.26 / Gymnasium), where reset() returns ``(observation, info)`` and
    step() returns five values.
    """

    def __init__(self, name):
        """Create an Environment wrapping the OpenAI Gym environment with a specified name.

        Parameters
        ----------
        name: str
            the identifier passed to ``gym.make()``.  It is also stored so that
            the environment can be recreated when it is deep-copied.
        """
        # Imported lazily so the module can be imported without gym installed.
        import gym
        self.env = gym.make(name)
        self.name = name
        space = self.env.action_space
        observation_shape = self.env.observation_space.shape
        if hasattr(space, 'n'):
            # The action space exposes a count `n`: treat it as discrete.
            super(GymEnvironment, self).__init__(observation_shape, space.n)
        else:
            # Otherwise describe actions by the shape of the action array.
            super(GymEnvironment, self).__init__(observation_shape,
                                                 action_shape=space.shape)

    def reset(self):
        """Reset the wrapped Gym environment and store its initial observation."""
        result = self.env.reset()
        # Newer Gym versions return (observation, info) with info being a dict;
        # older versions return the observation alone.
        if (isinstance(result, tuple) and len(result) == 2 and
                isinstance(result[1], dict)):
            result = result[0]
        self._state = result
        self._terminated = False

    def step(self, action):
        """Perform an action in the wrapped Gym environment.

        Parameters
        ----------
        action: object
            the action to pass to the Gym environment's step()

        Returns
        -------
        the reward reported by the Gym environment for this step
        """
        result = self.env.step(action)
        if len(result) == 5:
            # Newer API: (obs, reward, terminated, truncated, info).  Either
            # flag means no further actions may be taken in this episode.
            self._state, reward, terminated, truncated, _ = result
            self._terminated = bool(terminated or truncated)
        else:
            # Older API: (obs, reward, done, info).
            self._state, reward, self._terminated, _ = result
        return reward

    def __deepcopy__(self, memo):
        """Return a new GymEnvironment created from the stored name.

        The copy is a freshly constructed environment; the current state of
        this instance is not carried over.
        """
        return GymEnvironment(self.name)
class Policy(object):
    """A policy for taking actions within an environment.

    A policy is defined by a tf.keras.Model that takes the current state as
    input and performs the necessary calculations.  There are many algorithms
    for reinforcement learning, and they differ in what values they require a
    policy to compute.  That makes it impossible to define a single interface
    allowing any policy to be optimized with any algorithm.  Instead, this
    interface just tries to be as flexible and generic as possible.  Each
    algorithm must document what values it expects the model to output.

    Special handling is needed for models that include recurrent layers.  In
    that case, the model has its own internal state which the learning algorithm
    must be able to specify and query.  To support this, the Policy must do
    three things:

    1. The Model must take additional inputs that specify the initial states of
       all its recurrent layers.  These will be appended to the list of arrays
       specifying the environment state.
    2. The Model must also return the final states of all its recurrent layers
       as outputs.
    3. The constructor argument rnn_initial_states must be specified to define
       the states to use for the Model's recurrent layers at the start of a new
       rollout.

    Policy objects should be written to support pickling.  Many algorithms
    involve creating multiple copies of the Policy, possibly running in
    different processes or even on different computers.
    """

    def __init__(self, output_names, rnn_initial_states=None):
        """Subclasses should call the superclass constructor in addition to doing
        their own initialization.

        Parameters
        ----------
        output_names: list of strings
            the names of the Model's outputs, in order.  It is up to each
            reinforcement learning algorithm to document what outputs it expects
            policies to compute.  Outputs that return the final states of
            recurrent layers should have the name 'rnn_state'.
        rnn_initial_states: list of NumPy arrays
            the initial states of the Model's recurrent layers at the start of a
            new rollout.  If omitted or None, an empty list is used.
        """
        self.output_names = output_names
        # Create a fresh list per instance.  A literal [] default would be one
        # shared object, so mutating it on one Policy would affect all others.
        if rnn_initial_states is None:
            rnn_initial_states = []
        self.rnn_initial_states = rnn_initial_states

    def create_model(self, **kwargs):
        """Construct and return a tf.keras.Model that computes the policy.

        The inputs to the model consist of the arrays representing the current
        state of the environment, followed by the initial states for all
        recurrent layers.  Depending on the algorithm being used, other inputs
        might get passed as well.  It is up to each algorithm to document that.

        Raises
        ------
        NotImplementedError
            always, in this base class; subclasses must override it.
        """
        raise NotImplementedError("Subclasses must implement this")