## Imports

### RL

In [None]:
pip list

In [None]:
pip uninstall gym -y

In [None]:
pip install gym

In [None]:
pip uninstall tensorflow -y

In [None]:
pip install tensorflow==1.15.2

In [None]:
from tensorflow.python.compiler import tensorrt as trt

In [None]:
import stable_baselines
stable_baselines.__version__

In [None]:
pip uninstall gym -y

In [None]:
pip install gym==0.20.0

In [None]:
import os

import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines import DDPG, TD3
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.callbacks import BaseCallback

# Import the ml_monitor library
import ml_monitor
import time

### Callback function

In [None]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    Progress values are also pushed to the notebook-level ``my_monitor``
    object, which therefore has to exist before training starts.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Values above 0 print progress messages.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        # Any real reward beats -inf, so the first check with data always saves
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Flag the run as "training" on the monitor and create the save folder."""
        my_monitor.monitor("testing_switch",0)
        my_monitor.monitor("training_switch",1)
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Every ``check_freq`` calls, compare the recent mean training reward
        with the best one seen so far and save the model when it improved.

        :return: (bool) Always True, so training is never aborted by this callback.
        """
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # No episode has finished yet: there is no reward to average, and
            # comparing an undefined mean_reward would raise a NameError.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))
        # Metrics are reported regardless of verbosity, like best_mean_reward below
        my_monitor.monitor("num_timesteps",self.num_timesteps)
        my_monitor.monitor("mean_reward",mean_reward)

        # New best model, you could save the agent here
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            my_monitor.monitor("best_mean_reward",self.best_mean_reward)
            # Example for saving best model
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True

In [None]:
# Create log dir (no error if it already exists).
# This folder is shared by the Monitor wrapper, the training callback
# and the plotting cells further down.
log_dir = "machine-learning/logs/"
os.makedirs(log_dir, exist_ok=True)

In [None]:
# Create and wrap the environment
env = gym.make('CarRacing-v0')
# Logs will be saved in log_dir/monitor.csv
# (the callback and the plotting helpers read their rewards from that folder)
env = Monitor(env, log_dir)

### Train model

In [None]:
# Add some param noise for exploration (handed to DDPG when the model is built)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Create the callback: check every 1000 steps
# It reads the Monitor logs from log_dir and saves the best model below it.
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

In [None]:
# The callback looks up the global name `my_monitor`, so this cell has to be
# executed before model.learn() is called.
# NOTE(review): start() presumably launches the ml_monitor backend - confirm
# against the ml_monitor documentation.
my_monitor = ml_monitor.Monitor()
my_monitor.start()

In [None]:
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=False)
# Train the agent for 1e6 timesteps; the callback periodically checks the
# training reward and saves the best model seen so far.
model.learn(total_timesteps=int(1e6), callback=callback)

In [None]:
# Make sure the target folder exists first: a missing folder would make the
# save fail after the whole training run has completed.
os.makedirs("machine-learning/models", exist_ok=True)
model.save("machine-learning/models/CarRacing1e5")

# Testing model

## Fast import

In [None]:
import os
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
import ml_monitor

from stable_baselines import DDPG, TD3
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.callbacks import BaseCallback

## Loading model

In [None]:
# Drop the in-memory model before reloading it from disk. After a fresh
# kernel start (the "Fast import" path) no model exists yet, so a missing
# name is not an error here.
try:
    del model
except NameError:
    pass

In [None]:
# Fresh environment for testing; unlike the training one it is not wrapped
# in Monitor, so test episodes are not appended to the training logs.
env = gym.make('CarRacing-v0')
# Reload the agent saved at the end of the training section
model = DDPG.load("machine-learning/models/CarRacing1e5")

## Agent environment

In [None]:
# Monitor object used by the test loop to report episode numbers and scores.
# NOTE(review): start() presumably launches the ml_monitor backend - confirm
# against the ml_monitor documentation.
my_monitor = ml_monitor.Monitor()
my_monitor.start()

In [None]:
# Enjoy trained agent
# Flag the run as "testing" (and no longer "training") on the monitor
my_monitor.monitor("testing_switch",1)
my_monitor.monitor("training_switch",0)
episodes = 100
try:
    # A separate loop variable keeps `episodes` as the total episode count
    for episode in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0

        # Play one full episode, accumulating the reward of every step
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            score += reward
        my_monitor.monitor("testing_episodes", episode)
        my_monitor.monitor("testing_score", score)
        print('Episode:{} Score:{}'.format(episode, score))
        # Pause between episodes
        time.sleep(3)
finally:
    # Always release the render window, even if the loop is interrupted
    env.close()

In [None]:
# Stand-alone close, handy to release the environment by hand
# if the test cell above was interrupted.
env.close()

# Plotting results

In [None]:
from stable_baselines import results_plotter

# Helper from the library
# The limit matches the 1e6 training timesteps so the whole run is plotted,
# and the title names the environment actually trained on (CarRacing-v0).
results_plotter.plot_results([log_dir], 1e6, results_plotter.X_TIMESTEPS, "DDPG CarRacing")

In [None]:
def moving_average(values, window):
    """
    Compute the running mean of ``values`` over a sliding window.

    :param values: (numpy array) samples to smooth
    :param window: (int) number of consecutive samples averaged together
    :return: (numpy array) the averaged samples ('valid' convolution mode)
    """
    # Uniform kernel: every sample inside the window gets the same weight
    kernel = np.ones(window) / window
    smoothed = np.convolve(values, kernel, mode='valid')
    return smoothed


def plot_results(log_folder, title='Learning Curve', window=50):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    :param window: (int) size of the smoothing window, in episodes; it is
      reduced automatically when fewer episodes have been logged
    :raises ValueError: if the logs do not contain any finished episode
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    if len(y) == 0:
        raise ValueError("No finished episode found in {}".format(log_folder))

    # With a kernel longer than the data, np.convolve in 'valid' mode returns
    # an array longer than the data, so x and y could no longer be aligned.
    window = max(1, min(window, len(y)))
    y = moving_average(y, window=window)
    # Truncate x
    x = x[len(x) - len(y):]

    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()


In [None]:
# Plot the smoothed learning curve from the Monitor logs stored in log_dir
plot_results(log_dir)