In [1]:
%matplotlib notebook

import math
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from mpl_toolkits.mplot3d import Axes3D  
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from simulation_view import SimulationView

In [8]:
def get_simulation_view():
    """Create a fresh ``SimulationView`` with randomised starting values.

    Two independent draws from ``random.random()`` are mapped linearly into
    the half-open interval [5000, 15000) and passed as the first constructor
    argument; the second argument is always ``[0, 0]``.

    Returns:
        A new ``SimulationView`` instance.
    """
    unit_draws = np.array([random.random() for _ in range(2)])
    # presumably the initial entity counts -- confirm against SimulationView
    start_values = 5000 + 10000 * unit_draws
    # Fixed-start alternative left disabled:
    #     return SimulationView([5000, 10000], [0, 0])
    return SimulationView(start_values, [0, 0])


class Environment(gym.Env):
    """Gym environment wrapping a ``SimulationView`` with two entity counts.

    Observation:
        The two values of ``simulation_view.simulation.entityCount``, returned
        as a float32 array so that they match the declared observation space.
    Action:
        A single value in [0, 1]. It is multiplied by 1e5 and passed as the
        first component of the vector given to ``SimulationView.step``; the
        second component is always 0.
    Reward:
        ``100 / (100 + sum(|obs - previous_obs|))``, i.e. a value in (0, 1]
        that is largest when the counts do not change between two steps.
    Episode end:
        Any entity count drops below 1000, or the step counter exceeds 100.
        The counter is compared *before* it is incremented, so an episode
        lasts at most 102 steps.
        NOTE(review): confirm whether a cap of exactly 100 steps was intended.

    Keyword Args:
        draw_plot: optional callable ``draw_plot(x, y)`` invoked from
            ``render`` with the per-episode history of both counts.
        draw_plot_freq: optional positive int; ``render`` only draws when the
            step counter is a non-zero multiple of this value.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, **kwargs):
        super().__init__()
        # Was (-inf, -inf), which declares an empty range; the unbounded
        # range stays valid for every reward formula kept in ``step``.
        self.reward_range = (-math.inf, math.inf)
        self.action_space = gym.spaces.Box(np.array([0]), np.array([1]), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0, high=1e5, shape=(2,), dtype=np.float32)
        self.simulation_view = get_simulation_view()
        self.simulation_step = 0
        # Defined up front so the attribute exists before the first step().
        self.last_action = None
        self.plot_x = np.array([])
        self.plot_y = np.array([])
        self.draw_plot = kwargs.get('draw_plot')
        self.draw_plot_freq = kwargs.get('draw_plot_freq')

    def step(self, action):
        """Advance the simulation by one step.

        Args:
            action: array-like whose first element is the control in [0, 1].

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is the float32 pair of entity counts, ``reward`` is a Python
            float, ``done`` is a bool and ``info`` is an empty dict.
        """
        self.last_action = action
        old_obs = list(self.simulation_view.simulation.entityCount)
        self.simulation_view.step([action[0] * 1e5, 0])
        obs = np.array(self.simulation_view.simulation.entityCount)
        self.plot_x = np.append(self.plot_x, obs[0])
        self.plot_y = np.append(self.plot_y, obs[1])

        # Alternative reward formulas, left disabled:
        # reward = 1e5 - np.abs(self.simulation_view.simulation.entityCount[0] - self.simulation_view.simulation.entityCount[1])
        # reward = self.simulation_step
        # reward = 1
        # reward = 1 - min(abs(1 - np.average(np.array(obs) / np.array(old_obs))), 1)
        # reward = 2000 - np.sum(np.abs(obs - old_obs))
        # reward = 1 if action[0] > 0.4 and action[0] < 0.6 else 0
        # reward = -1 if np.any(np.array(self.simulation_view.simulation.entityCount) < 100) else 1
        # reward = 10000.0 - abs(5000 - obs[0]) - abs(5000 - obs[1])
        # reward = (30000.0 - obs[0] - obs[1]) / 30000.0
        # Active reward: computed in the simulation's native precision.
        reward = float(100 / (100 + np.sum(np.abs(obs - old_obs))))

        done = bool(np.any(obs < 1000)) or bool(self.simulation_step > 100)
        # Disabled extra termination condition:
        # or bool(np.sum(np.abs(obs - old_obs)) > (1000 - self.simulation_step) * 2)

        self.simulation_step += 1

        # Cast only on the way out so the observation dtype matches the space.
        return obs.astype(np.float32), reward, done, {}

    def reset(self):
        """Start a new episode with a fresh simulation and empty plot history.

        Returns:
            The initial float32 observation (pair of entity counts).
        """
        self.simulation_view = get_simulation_view()
        self.simulation_step = 0
        self.plot_x = np.array([])
        self.plot_y = np.array([])
        return np.array(self.simulation_view.simulation.entityCount, dtype=np.float32)

    def render(self, mode='human', close=False):
        """Forward the episode history to ``draw_plot`` every ``draw_plot_freq`` steps.

        Does nothing when the environment was created without a plot callback
        or without a positive frequency (previously this raised a TypeError).
        """
        if self.draw_plot is None or not self.draw_plot_freq:
            return
        if self.simulation_step != 0 and self.simulation_step % self.draw_plot_freq == 0:
            self.draw_plot(self.plot_x, self.plot_y)
        
        
# Run stable-baselines3's environment checker on a default-constructed
# Environment (no plot callback) before any training is started.
check_env(Environment())

In [3]:
def get_empty_model():
    """Build an untrained PPO agent attached to a new ``Environment``.

    The agent uses the 'MlpPolicy' policy, a clip range of 0.1, verbose
    logging, and writes TensorBoard logs under ``./logs/``.

    Returns:
        The freshly constructed ``PPO`` model.
    """
    return PPO(
        'MlpPolicy',
        Environment(),
        clip_range=0.1,
        verbose=1,
        tensorboard_log="./logs/",
    )


def learn_model(steps=100000, **kwargs):
    """Train a new PPO model, optionally with periodic rendered evaluation.

    Args:
        steps: total number of timesteps passed to ``model.learn``.
        **kwargs:
            evaluate: when truthy, an ``EvalCallback`` is attached that
                evaluates on a separate ``Environment`` with rendering on and
                saves under ``./logs/``; a figure is opened to show the
                evaluation trajectory.
            eval_freq: evaluation frequency handed to ``EvalCallback``
                (defaults to 10000).

    Returns:
        The trained model.
    """
    callback = None
    if kwargs.get('evaluate'):
        figure, axes = plt.subplots(1, 1)
        axes.plot([0, 20000], [0, 20000])

        def redraw(xs, ys):
            # Reuse the single line created above as the trajectory curve.
            axes.lines[0].set_xdata(xs)
            axes.lines[0].set_ydata(ys)
            figure.canvas.draw()

        eval_env = Environment(draw_plot=redraw, draw_plot_freq=10)
        callback = EvalCallback(
            eval_env,
            best_model_save_path='./logs/',
            log_path='./logs/',
            eval_freq=kwargs.get('eval_freq', 10000),
            render=True,
        )

    agent = get_empty_model()
    agent.learn(total_timesteps=steps, callback=callback)
    return agent


def evaluate_model(model, **kwargs):
    """Evaluate ``model`` on a new ``Environment`` while plotting its trajectory.

    Args:
        model: a trained model accepted by ``evaluate_policy``.
        **kwargs:
            draw_plot_freq: redraw the trajectory every this many environment
                steps (defaults to 10).

    Returns:
        Whatever ``evaluate_policy`` returns (with its default arguments, the
        mean and standard deviation of the episode reward). Previously the
        result was discarded and the function returned ``None``.
    """
    fig, ax = plt.subplots(1, 1)
    ax.plot([0, 20000], [0, 20000])

    def draw_plot(x, y):
        # Overwrite the placeholder line with the episode's count history.
        ax.lines[0].set_xdata(x)
        ax.lines[0].set_ydata(y)
        fig.canvas.draw()

    env = Environment(draw_plot=draw_plot, draw_plot_freq=kwargs.get('draw_plot_freq', 10))
    return evaluate_policy(model, env, render=True)
    

def draw_model_surface(model):
    """Plot the policy's deterministic action over a grid of observations.

    Both observation components are swept over ``np.arange(5000, 10000, 100)``
    and the first action component predicted for every grid point is drawn as
    a 3D surface.

    Args:
        model: object exposing ``predict(observation, deterministic=True)``
            that returns ``(actions, state)``.
    """
    axis_values = np.arange(5000, 10000, 100)
    X, Y = np.meshgrid(axis_values, axis_values)

    # One batched predict() call replaces one call per grid point.
    observations = np.column_stack((np.ravel(X), np.ravel(Y)))
    actions, _ = model.predict(observations, deterministic=True)
    Z = np.asarray(actions)[:, 0].reshape(X.shape)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z)
    ax.set_xlabel('entityCount[0]')
    ax.set_ylabel('entityCount[1]')
    ax.set_zlabel('action')
    plt.show()

In [10]:
# Train for 100000 timesteps with the rendered evaluation callback firing
# every 1000 steps.
# NOTE(review): the captured output of this cell ends in KeyboardInterrupt,
# so in that run the assignment below never completed; later cells then
# depend on a `model` obtained some other way (e.g. the PPO.load cell).
model = learn_model(100000, evaluate=True, eval_freq=1000)

<IPython.core.display.Javascript object>

Using cuda device
Wrapping the env in a DummyVecEnv.
Logging to ./logs/PPO_5
Eval num_timesteps=1000, episode_reward=0.29 +/- 0.07
Episode length: 7.00 +/- 1.10
New best mean reward!
Eval num_timesteps=2000, episode_reward=0.44 +/- 0.10
Episode length: 8.00 +/- 1.10
New best mean reward!
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8        |
|    mean_reward     | 0.436    |
| time/              |          |
|    fps             | 491      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Eval num_timesteps=3000, episode_reward=1.92 +/- 0.15
Episode length: 15.80 +/- 0.98
New best mean reward!
Eval num_timesteps=4000, episode_reward=1.94 +/- 0.10
Episode length: 16.40 +/- 1.20
New best mean reward!
-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 16.4          |
|    mean_reward         

Eval num_timesteps=17000, episode_reward=21.42 +/- 11.42
Episode length: 102.00 +/- 0.00
Eval num_timesteps=18000, episode_reward=23.41 +/- 7.67
Episode length: 102.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 102         |
|    mean_reward          | 23.4        |
| time/                   |             |
|    fps                  | 207         |
|    iterations           | 9           |
|    time_elapsed         | 89          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.000845441 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.36       |
|    explained_variance   | -27.8       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.302       |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.00586    |
|    std                 

KeyboardInterrupt: 

In [None]:
# Load a saved PPO model from logs/best_model.zip -- presumably the checkpoint
# written by the EvalCallback in learn_model (best_model_save_path='./logs/').
# NOTE(review): this cell has no execution count; run it before the cells
# below when training was interrupted.
model = PPO.load("logs/best_model.zip")

In [11]:
# Evaluate the current `model` on a fresh Environment with live plotting.
evaluate_model(model)

<IPython.core.display.Javascript object>

In [12]:
# Visualise the action the current `model` picks over a grid of observations.
draw_model_surface(model)

<IPython.core.display.Javascript object>