In [1]:
import webotsgym as wg

from webotsgym.config import WebotConfig
from webotsgym.environment import WebotsEnv
from webotsgym.evaluate import Evaluate, EvaluateMats, EvaluatePJ0
from webotsgym.action import DiscreteAction, ContinuousAction
from webotsgym.observation import Observation

import numpy as np
import matplotlib.pyplot as plt

import gym
import stable_baselines
from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.policies import MlpPolicy

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Args:
        x: Point(s) at which the curve is evaluated; passed to ``np.exp``
            after scaling by ``-lambda_``.
        N0: Value of the curve at ``x == 0``.
        lambda_: Decay rate; larger values make the curve fall off faster.

    Returns:
        ``N0 * np.exp(-lambda_ * x)``.
    """
    attenuation = np.exp(-lambda_ * x)
    return N0 * attenuation

class MyObs(Observation):
    """Thin ``Observation`` subclass that also stores the environment on itself."""

    def __init__(self, env):
        """Initialise the base ``Observation`` with ``env`` and keep a reference to it."""
        super().__init__(env)
        # Assigned after the base initialiser, so this value is the one that sticks.
        self.env = env

class MyEval(Evaluate):
    """Reward shaping and episode termination for reaching the target at rest.

    The target counts as reached when the distance returned by
    ``env.get_target_distance(False)`` is below ``_TARGET_RADIUS``, the
    absolute value of ``env.state.speed`` is below ``_MAX_STOP_SPEED`` and
    ``env.state.touching`` is falsy.
    """

    # Success thresholds (units are whatever the environment reports -- TODO confirm).
    _TARGET_RADIUS = 0.1
    _MAX_STOP_SPEED = 0.05

    # Reward terms.
    _SUCCESS_REWARD = 10000   # returned for the step on which the target is reached
    _STEP_PENALTY = 1         # charged on every non-terminal step
    _TOUCH_PENALTY = 5        # extra charge while ``state.touching`` is truthy
    _SHAPING_SCALE = 2500     # peak of the distance/speed shaping bonus
    _DISTANCE_DECAY = 40      # decay rate of the bonus with distance
    _SPEED_DECAY = 10         # decay rate of the bonus with absolute speed

    # Episode is aborted once the accumulated reward falls below this value.
    _MIN_TOTAL_REWARD = -10000

    def __init__(self, env, config: WebotConfig = None):
        """Set up the evaluator.

        Args:
            env: Environment whose ``state``, ``total_reward`` and
                ``get_target_distance`` are read by this evaluator.
            config: Configuration forwarded to ``Evaluate.__init__``. When
                omitted, a fresh ``WebotConfig`` is created per instance
                instead of sharing one object built at definition time.
        """
        if config is None:
            config = WebotConfig()
        super(MyEval, self).__init__(env, config)
        # Per-step bounds actually produced by calc_reward: the worst case is
        # the step penalty plus the touch penalty, the best is the success reward.
        self.reward_range = (
            -(self._STEP_PENALTY + self._TOUCH_PENALTY),
            self._SUCCESS_REWARD,
        )

    def _target_reached(self, distance_abs=None):
        """Return whether the robot is within the target radius, slow and not touching."""
        if distance_abs is None:
            # presumably ``False`` selects the un-normalised distance -- verify
            # against ``WebotsEnv.get_target_distance``.
            distance_abs = self.env.get_target_distance(False)
        state = self.env.state
        return (
            distance_abs < self._TARGET_RADIUS
            and abs(state.speed) < self._MAX_STOP_SPEED
            # Truthiness test, matching how the touch penalty is applied.
            and not state.touching
        )

    def calc_reward(self):
        """Compute the reward for the current environment state.

        Returns:
            ``_SUCCESS_REWARD`` if the target is reached. Otherwise the step
            penalty plus a bonus that decays exponentially with both the
            distance to the target and the absolute speed, minus
            ``_TOUCH_PENALTY`` while ``state.touching`` is truthy.
        """
        distance_abs = self.env.get_target_distance(False)
        if self._target_reached(distance_abs):
            return self._SUCCESS_REWARD

        state = self.env.state
        reward = -self._STEP_PENALTY
        reward += (
            self._SHAPING_SCALE
            * exponential_decay(distance_abs, lambda_=self._DISTANCE_DECAY)
            * exponential_decay(abs(state.speed), lambda_=self._SPEED_DECAY)
        )
        if state.touching:
            reward -= self._TOUCH_PENALTY
        return reward

    def check_done(self):
        """Report whether the episode should end.

        Returns:
            ``True`` when ``env.total_reward`` has dropped below
            ``_MIN_TOTAL_REWARD`` or the target has been reached (a short
            message is printed in either case), ``False`` otherwise.
        """
        if self.env.total_reward < self._MIN_TOTAL_REWARD:
            print("reward boundary")
            return True
        if self._target_reached():
            print("target reached")
            return True
        return False
    

# Training world settings. The attribute assignments are kept in their
# original order in case WebotConfig derives values from earlier ones.
config = WebotConfig()
config.fast_simulation = True      # simulator reports "USE FAST MODE" when set
config.reset_env_after = 200000
config.num_obstacles = 0
config.world_size = 2
config.world_scaling = 0.5

# Continuous actions using the "steering" direction type, non-relative.
action_class = ContinuousAction(direction_type="steering", relative=False)

# Evaluator and observation are passed as classes, not instances.
env = WebotsEnv(
    train=True,
    action_class=action_class,
    evaluate_class=MyEval,
    observation_class=MyObs,
    config=config,
)



Accepting on Port:  10201
sending: env
USE FAST MODE


In [3]:
# File stem used when the model is saved under models/.
model_name = "Webots_find_target_small"
# Nominal training budget.
time_steps = 499999

# PPO1 agent using the library's registered "MlpPolicy", bound to the Webots env.
model = PPO1(policy="MlpPolicy", env=env)





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
# Train in fixed-size rounds and checkpoint after each one, so an interrupted
# run loses at most one round. The number of rounds is derived from the
# `time_steps` budget instead of looping forever, which lets the notebook
# finish under Restart & Run All.
STEPS_PER_ROUND = 50000
num_rounds = -(-time_steps // STEPS_PER_ROUND)  # ceiling division
for _ in range(num_rounds):
    model.learn(total_timesteps=STEPS_PER_ROUND, log_interval=10000)
    model.save("models/" + model_name)

# model = PPO1.load("models/{}".format('DQN_WebotFakeMini_TRPO_pj1_nReward2_200000'))
# env = MyEnv()
# obs = env.reset()

# env.render()
# done = False
# max_num_steps = 100
# time = 0

sending: env
USE FAST MODE
Reward ( 250 )	 -0.9999498087560058
Reward ( 500 )	 -0.9999966987649441
Reward ( 750 )	 -5.999999983761102
Reward ( 1000 )	 -5.999999992247886
Reward ( 1250 )	 -5.999999977475733
Reward ( 1500 )	 -0.9999987438624036
Reward ( 1750 )	 -5.999999965798946
Reward ( 2000 )	 -5.999999996824686
Reward ( 2250 )	 -5.999999997876353
Reward ( 2500 )	 -5.999999998165003
reward boundary
sending: env
USE FAST MODE
Reward ( 2750 )	 -0.9999896346677061
Reward ( 3000 )	 -0.9999909493328931
Reward ( 3250 )	 -5.999999939795154
Reward ( 3500 )	 -5.999999994291154
Reward ( 3750 )	 -0.9999559225493873
Reward ( 4000 )	 -5.961438101721102
Reward ( 4250 )	 -5.866573715230784
Reward ( 4500 )	 -0.9899875771747109
Reward ( 4750 )	 0.08484045402007778
Reward ( 5000 )	 -4.706538243769086
Reward ( 5250 )	 6.869443255375924
Reward ( 5500 )	 -4.003439276071207
Reward ( 5750 )	 -5.116717364220389
Reward ( 6000 )	 -4.924271660116464
Reward ( 6250 )	 -5.529368830724751
reward boundary
sending: e

In [None]:
# Restore the agent saved by the training cell.
model = PPO1.load(load_path="models/Webots_find_target_small")

def exponential_decay(x, N0=1, lambda_=5):
    """Evaluate the exponential decay curve ``N0 * exp(-lambda_ * x)``.

    Args:
        x: Point(s) at which the curve is evaluated; passed to ``np.exp``
            after scaling by ``-lambda_``.
        N0: Value of the curve at ``x == 0``.
        lambda_: Decay rate; larger values make the curve fall off faster.

    Returns:
        ``N0 * np.exp(-lambda_ * x)``.
    """
    attenuation = np.exp(-lambda_ * x)
    return N0 * attenuation

class MyObs(Observation):
    """Thin ``Observation`` subclass that also stores the environment on itself."""

    def __init__(self, env):
        """Initialise the base ``Observation`` with ``env`` and keep a reference to it."""
        super().__init__(env)
        # Assigned after the base initialiser, so this value is the one that sticks.
        self.env = env

class MyEval(Evaluate):
    """Reward shaping and episode termination for reaching the target at rest.

    The target counts as reached when the distance returned by
    ``env.get_target_distance(False)`` is below ``_TARGET_RADIUS``, the
    absolute value of ``env.state.speed`` is below ``_MAX_STOP_SPEED`` and
    ``env.state.touching`` is falsy.
    """

    # Success thresholds (units are whatever the environment reports -- TODO confirm).
    _TARGET_RADIUS = 0.1
    _MAX_STOP_SPEED = 0.05

    # Reward terms.
    _SUCCESS_REWARD = 10000   # returned for the step on which the target is reached
    _STEP_PENALTY = 1         # charged on every non-terminal step
    _TOUCH_PENALTY = 5        # extra charge while ``state.touching`` is truthy
    _SHAPING_SCALE = 2500     # peak of the distance/speed shaping bonus
    _DISTANCE_DECAY = 40      # decay rate of the bonus with distance
    _SPEED_DECAY = 10         # decay rate of the bonus with absolute speed

    # Episode is aborted once the accumulated reward falls below this value.
    _MIN_TOTAL_REWARD = -10000

    def __init__(self, env, config: WebotConfig = None):
        """Set up the evaluator.

        Args:
            env: Environment whose ``state``, ``total_reward`` and
                ``get_target_distance`` are read by this evaluator.
            config: Configuration forwarded to ``Evaluate.__init__``. When
                omitted, a fresh ``WebotConfig`` is created per instance
                instead of sharing one object built at definition time.
        """
        if config is None:
            config = WebotConfig()
        super(MyEval, self).__init__(env, config)
        # Per-step bounds actually produced by calc_reward: the worst case is
        # the step penalty plus the touch penalty, the best is the success reward.
        self.reward_range = (
            -(self._STEP_PENALTY + self._TOUCH_PENALTY),
            self._SUCCESS_REWARD,
        )

    def _target_reached(self, distance_abs=None):
        """Return whether the robot is within the target radius, slow and not touching."""
        if distance_abs is None:
            # presumably ``False`` selects the un-normalised distance -- verify
            # against ``WebotsEnv.get_target_distance``.
            distance_abs = self.env.get_target_distance(False)
        state = self.env.state
        return (
            distance_abs < self._TARGET_RADIUS
            and abs(state.speed) < self._MAX_STOP_SPEED
            # Truthiness test, matching how the touch penalty is applied.
            and not state.touching
        )

    def calc_reward(self):
        """Compute the reward for the current environment state.

        Returns:
            ``_SUCCESS_REWARD`` if the target is reached. Otherwise the step
            penalty plus a bonus that decays exponentially with both the
            distance to the target and the absolute speed, minus
            ``_TOUCH_PENALTY`` while ``state.touching`` is truthy.
        """
        distance_abs = self.env.get_target_distance(False)
        if self._target_reached(distance_abs):
            return self._SUCCESS_REWARD

        state = self.env.state
        reward = -self._STEP_PENALTY
        reward += (
            self._SHAPING_SCALE
            * exponential_decay(distance_abs, lambda_=self._DISTANCE_DECAY)
            * exponential_decay(abs(state.speed), lambda_=self._SPEED_DECAY)
        )
        if state.touching:
            reward -= self._TOUCH_PENALTY
        return reward

    def check_done(self):
        """Report whether the episode should end.

        Returns:
            ``True`` when ``env.total_reward`` has dropped below
            ``_MIN_TOTAL_REWARD`` or the target has been reached (a short
            message is printed in either case), ``False`` otherwise.
        """
        if self.env.total_reward < self._MIN_TOTAL_REWARD:
            print("reward boundary")
            return True
        if self._target_reached():
            print("target reached")
            return True
        return False
    

# Evaluation world settings: same values as the training cell except that
# fast simulation is switched off. The attribute assignments are kept in their
# original order in case WebotConfig derives values from earlier ones.
config = WebotConfig()
config.fast_simulation = False
config.reset_env_after = 200000
config.num_obstacles = 0
config.world_size = 2
config.world_scaling = 0.5

# Continuous actions using the "steering" direction type, non-relative.
action_class = ContinuousAction(direction_type="steering", relative=False)

# NOTE(review): train=True is carried over from the training cell although
# this cell only runs inference -- confirm that is intended.
env = WebotsEnv(
    train=True,
    action_class=action_class,
    evaluate_class=MyEval,
    observation_class=MyObs,
    config=config,
)

# Roll out the loaded policy for one episode, capped at 100000 steps.
obs = env.reset()

for _ in range(100000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    # Truthiness test rather than `is True`: an identity comparison never
    # fires for a numpy bool or any other truthy flag, which would keep the
    # loop running past the end of the episode.
    if done:
        break