Arena task, self-play, eval, model saving

cswinter · Oct 13, 2019 · 7f5d65a · 7f5d65a
1 parent 21011a1
commit 7f5d65a
Show file tree

Hide file tree

Showing 6 changed files with 238 additions and 55 deletions.
diff --git a/codecraft.py b/codecraft.py
@@ -8,17 +8,24 @@
 RETRIES = 100
 
 
-def create_game(game_length: int = None, action_delay: int = 0) -> int:
+def create_game(game_length: int = None, action_delay: int = 0, self_play: bool = False, custom_map=None) -> int:
+    if custom_map is None:
+        custom_map = ''  # no custom map given: an empty string is posted as the JSON body
     try:
+        scripted_opponent = 'false' if self_play else 'true'  # self-play switches the scripted opponent off
         if game_length:
-            response = requests.post(f'http://localhost:9000/start-game?maxTicks={game_length}&actionDelay={action_delay}').json()
+            response = requests.post(f'http://localhost:9000/start-game'
+                                     f'?maxTicks={game_length}'
+                                     f'&actionDelay={action_delay}'
+                                     f'&scriptedOpponent={scripted_opponent}',
+                                     json=custom_map).json()  # NOTE(review): only this branch sends scriptedOpponent and the map
         else:
             response = requests.post(f'http://localhost:9000/start-game?actionDelay={action_delay}').json()
         return int(response['id'])
     except requests.exceptions.ConnectionError:
         logging.info(f"Connection error on create_game, retrying")
         time.sleep(1)
-        return create_game()
+        return create_game(game_length, action_delay, self_play, custom_map)  # retry must keep the requested map
 
 
 def act(game_id: int, action):
@@ -38,15 +45,15 @@ def act(game_id: int, action):
 
 def act_batch(actions, disable_harvest: bool = False):
     payload = {}
-    for (game_id, move, turn, buildSpec, harvest) in actions:
+    for (game_id, player_id, move, turn, buildSpec, harvest) in actions:
         action = {
             "buildDrone": buildSpec,
             "move": move,
             "harvest": not disable_harvest,#harvest,
             "transfer": False,
             "turn": turn,
         }
-        payload[game_id] = action
+        payload[f'{game_id}.{player_id}'] = action
 
     retries = 100
     while retries > 0:
@@ -62,13 +69,13 @@ def act_batch(actions, disable_harvest: bool = False):
             time.sleep(1)
 
 
-def observe(game_id: int):
+def observe(game_id: int, player_id: int = 0):  # fetches the observation JSON for one player of one game; defaults to player 0
     try:
-        return requests.get(f'http://localhost:9000/observation?gameID={game_id}&playerID=0').json()
+        return requests.get(f'http://localhost:9000/observation?gameID={game_id}&playerID={player_id}').json()
     except requests.exceptions.ConnectionError:
-        logging.info(f"Connection error on observe({game_id}), retrying")
+        logging.info(f"Connection error on observe({game_id}.{player_id}), retrying")
         time.sleep(1)
-        return observe(game_id)
+        return observe(game_id, player_id)  # retry for the same player after the one-second pause; retries are unbounded
 
 
 def observe_batch(game_ids):

diff --git a/gym_codecraft/envs/codecraft_vec_env.py b/gym_codecraft/envs/codecraft_vec_env.py
@@ -8,10 +8,49 @@
 import codecraft
 
 
+def map_arena_tiny():  # builds a fresh map dict (1000 x 1000) holding one drone per player
+    return {
+        'mapWidth': 1000,
+        'mapHeight': 1000,
+        'player1Drones': [  # a single drone with one storage module and one constructor
+            {
+                'xPos': np.random.randint(-450, 450),  # coordinates are re-drawn on every call
+                'yPos': np.random.randint(-450, 450),
+                'resources': 0,
+                'storageModules': 1,
+                'missileBatteries': 0,
+                'constructors': 1,
+                'engines': 0,
+                'shieldGenerators': 0,
+            }
+        ],
+        'player2Drones': [  # a single drone with one missile battery and three shield generators
+            {
+                'xPos': np.random.randint(-450, 450),  # drawn independently of player 1's position
+                'yPos': np.random.randint(-450, 450),
+                'resources': 0,
+                'storageModules': 0,
+                'missileBatteries': 1,
+                'constructors': 0,
+                'engines': 0,
+                'shieldGenerators': 3,
+            }
+        ]
+    }
+
+
 class CodeCraftVecEnv(VecEnv):
-    def __init__(self, num_envs, game_length, objective, action_delay):
+    def __init__(self, num_envs, num_self_play, objective, action_delay, stagger=True):
+        assert(num_envs >= 2 * num_self_play)
         self.objective = objective
         self.action_delay = action_delay
+        self.num_self_play = num_self_play
+        self.stagger = stagger
+        self.game_length = 3 * 60 * 60
+        self.custom_map = lambda: None
+        if objective == Objective.ARENA_TINY:
+            self.game_length = 1 * 60 * 60
+            self.custom_map = map_arena_tiny
 
         observations_low = []
         observations_high = []
@@ -47,25 +86,35 @@ def __init__(self, num_envs, game_length, objective, action_delay):
         self.eplen = []
         self.eprew = []
         self.score = []
-        self.game_length = game_length
 
     def reset(self):
         self.games = []
         self.eplen = []
         self.score = []
-        for i in range(self.num_envs):
+        self.eprew = []  # cleared like games/eplen/score so a repeated reset() leaves no stale rewards
+        for i in range(self.num_envs - self.num_self_play):
             # spread out initial game lengths to stagger start times
-            game_id = codecraft.create_game(self.game_length * (i + 1) // self.num_envs, self.action_delay)
+            # the first num_self_play games are self-play and fill two slots (player 0 and player 1) each
+            self_play = i < self.num_self_play
+            game_length = self.game_length * (i + 1) // (self.num_envs - self.num_self_play) if self.stagger else self.game_length
+            game_id = codecraft.create_game(
+                game_length, self.action_delay,
+                self_play, self.custom_map())
             # print("Starting game:", game_id)
-            self.games.append(game_id)
+            self.games.append((game_id, 0))  # entries are (game_id, player_id) pairs
             self.eplen.append(1)
             self.eprew.append(0)
             self.score.append(None)
+            if self_play:
+                self.games.append((game_id, 1))  # second seat of the same game
+                self.eplen.append(1)
+                self.eprew.append(0)
+                self.score.append(None)
         return self.observe()[0]
 
     def step_async(self, actions):
         game_actions = []
-        for (game_id, action) in zip(self.games, actions):
+        for ((game_id, player_id), action) in zip(self.games, actions):
             # 0-5: turn/movement (4 is no turn, no movement)
             # 6: build [0,1,0,0,0] drone (if minerals > 5)
             # 7: harvest
@@ -83,7 +132,7 @@ def step_async(self, actions):
                 build = [[0, 1, 0, 0, 0]]
             if action == 7:
                 harvest = True
-            game_actions.append((game_id, move, turn, build, harvest))
+            game_actions.append((game_id, player_id, move, turn, build, harvest))
 
         codecraft.act_batch(game_actions, disable_harvest=self.objective == Objective.DISTANCE_TO_CRYSTAL)
 
@@ -96,14 +145,18 @@ def observe(self):
         infos = []
         obs = codecraft.observe_batch_raw(self.games)
         global_features = 1
-        nonobs_features = 2
-        dstride = 7
+        nonobs_features = 3
+        dstride = 13
         mstride = 4
-        stride = global_features + dstride + 10 * mstride
+        stride = global_features + dstride + 10 * mstride + 10 * dstride
         for i in range(self.num_envs):
             x = obs[stride * i + global_features + 0]
             y = obs[stride * i + global_features + 1]
-            if self.objective == Objective.ALLIED_WEALTH:
+            if self.objective == Objective.ARENA_TINY:
+                allied_score = obs[stride * self.num_envs + i * nonobs_features + 1]
+                enemy_score = obs[stride * self.num_envs + i * nonobs_features + 2]
+                score = 2 * allied_score / (allied_score + enemy_score + 1e-8)
+            elif self.objective == Objective.ALLIED_WEALTH:
                 score = obs[stride * self.num_envs + i * nonobs_features + 1] * 0.1
             elif self.objective == Objective.DISTANCE_TO_ORIGIN:
                 score = -dist(x, y, 0.0, 0.0)
@@ -131,16 +184,24 @@ def observe(self):
             self.score[i] = score
 
             if obs[stride * self.num_envs + i * nonobs_features] > 0:
-                game_id = codecraft.create_game(self.game_length, self.action_delay)
-                self.games[i] = game_id
-                observation = codecraft.observe(game_id)
+                (game_id, pid) = self.games[i]
+                if pid == 0:
+                    self_play = i // 2 < self.num_self_play
+                    game_id = codecraft.create_game(self.game_length,
+                                                    self.action_delay,
+                                                    self_play,
+                                                    self.custom_map())
+                    self.games[i] = (game_id, 0)
+                    if self_play:
+                        self.games[i + 1] = (game_id, 1)
+                observation = codecraft.observe(game_id, pid)
                 # TODO
                 # obs[stride * i:stride * (i + 1)] = codecraft.observation_to_np(observation)
 
                 dones.append(1.0)
-                infos.append({'episode': {'r': self.eprew[i], 'l': self.eplen[i]}})
+                infos.append({'episode': {'r': self.eprew[i], 'l': self.eplen[i], 'index': i}})
                 self.eplen[i] = 1
-                self.eprew[i] = reward
+                self.eprew[i] = 0
                 self.score[i] = None
             else:
                 self.eplen[i] += 1
@@ -161,13 +222,13 @@ def close(self):
         while running > 0:
             game_actions = []
             active_games = []
-            for game_id in self.games:
+            for (game_id, player_id) in self.games:
                 if not done[game_id]:
-                    active_games.append(game_id)
-                    game_actions.append((game_id, False, 0, [], False))
+                    active_games.append((game_id, player_id))
+                    game_actions.append((game_id, player_id, False, 0, [], False))
             codecraft.act_batch(game_actions)
             obs = codecraft.observe_batch(active_games)
-            for o, game_id in zip(obs, active_games):
+            for o, (game_id, _) in zip(obs, active_games):
                 if o['winner']:
                     done[game_id] = True
                     running -= 1
@@ -178,6 +239,7 @@ class Objective(Enum):
     DISTANCE_TO_CRYSTAL = 'DISTANCE_TO_CRYSTAL'
     DISTANCE_TO_ORIGIN = 'DISTANCE_TO_ORIGIN'
     DISTANCE_TO_1000_500 = 'DISTANCE_TO_1000_500'
+    ARENA_TINY = 'ARENA_TINY'
 
 
 def dist2(x1, y1, x2, y2):

diff --git a/hyper_params.py b/hyper_params.py
@@ -23,10 +23,16 @@ def __init__(self):
         self.zero_init_vf = True    # Set all initial weights for value function head to zero
         self.small_init_pi = False  # Set initial weights for policy head to small values and biases to zero
 
+        # Eval
+        self.eval_envs = 64
+        self.eval_timesteps = 360
+        self.eval_frequency = 1e5
+
         # RL
         self.steps = 10e6           # Total number of timesteps
+        self.num_envs = 64          # Number of environments
+        self.num_self_play = 32     # Number of self-play environments (each provides two environments)
         self.seq_rosteps = 256      # Number of sequential steps per rollout
-        self.rosteps = 256 * 64     # Number of total rollout steps
         self.gamma = 0.99           # Discount factor
         self.lamb = 0.95            # Generalized advantage estimation parameter lambda
         self.norm_advs = True       # Normalize advantage values
@@ -35,9 +41,10 @@ def __init__(self):
         self.cliprange = 0.2        # PPO cliprange
         self.clip_vf = False        # Use clipped value function objective
 
+        self.rosteps = self.num_envs * self.seq_rosteps
+
         # Task
-        self.objective = envs.Objective.ALLIED_WEALTH
-        self.game_length = 3 * 60 * 60
+        self.objective = envs.Objective.ARENA_TINY
         self.action_delay = 0
 
     def args_parser(self) -> argparse.ArgumentParser: