In [None]:

import numpy as np
import os

import matplotlib.pyplot as plt
import numpy as np
import gymnasium
from datetime import date
import pathlib


# Set before the heavier ML libraries are imported in later cells.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# error (e.g. when torch and numpy ship their own copies) — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
# Settings passed to LinearSISOEnv as its ``env_config``.
# NOTE(review): the per-key remarks are inferred from the key names only —
# confirm against the gym_control package.
env_cfg = {
    "action_space": [-1, 1],    # presumably [low, high] bounds of the action
    "obs_space": [-10, 10],     # presumably [low, high] bounds of the observation
    "num": [1],                 # presumably transfer-function numerator coefficients
    "den": [1, 10, 20],         # presumably transfer-function denominator coefficients
    "x_0": [0],
    "dt": 0.01,
    "y_0": 0,
    "t_0": 0,
    "t_end": 5,
    "y_ref": 1,
}

## The library is installed via `pip install -e .`

In [None]:
from gymnasium.envs.registration import register
import gym_control

# One id shared by the registration and the lookup so the two cannot drift apart.
ENV_ID = "gym_control/LinearSISOEnv-v0"

# Register the environment class under ENV_ID with a 3000-step episode limit.
register(id=ENV_ID, entry_point="gym_control.envs:LinearSISOEnv", max_episode_steps=3000)

# Instantiate it with the notebook configuration and show it as the cell output.
env = gymnasium.make(ENV_ID, env_config=env_cfg)
env

In [None]:
from gym_control.envs import LinearSISOEnv
from ray.tune.registry import register_env


def env_creator(env_config):
    """Create a ``LinearSISOEnv`` for RLlib.

    Args:
        env_config: Configuration mapping supplied by the caller (RLlib passes
            the algorithm's ``env_config``). When it is empty or ``None`` the
            notebook-level ``env_cfg`` is used instead.

    Returns:
        A new ``LinearSISOEnv`` instance.
    """
    # Fall back to the notebook defaults so that using the registered name
    # without an explicit env_config still works.
    return LinearSISOEnv(env_config or env_cfg)


# Register the creator itself, so an env_config passed by the caller is used
# instead of being silently replaced by the global env_cfg.
register_env("LinearSISOEnv", env_creator)

### Initialize Ray

In [None]:
import ray
from ray.tune.logger import pretty_print

# ignore_reinit_error makes this cell safe to re-run: without it a second
# ray.init() in the same kernel raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

## Plot Reward

In [None]:
def plot_reward(train_result: dict):
    """Plot the episode rewards stored in a training result.

    Args:
        train_result: Mapping whose ``["hist_stats"]["episode_reward"]`` entry
            is a sequence of per-episode rewards (the notebook passes the
            value returned by ``algo.train()``).
    """
    episode_rewards = train_result["hist_stats"]["episode_reward"]
    # Episodes are numbered from 1 on the x axis.
    episode_numbers = np.arange(1, len(episode_rewards) + 1)

    plt.figure(1)
    plt.title("Rewards")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.plot(episode_numbers, episode_rewards, '-x')


## Train with PPO Algorithm

In [None]:
from ray.rllib.algorithms import ppo
from ray.rllib.algorithms.ppo import PPOConfig

# Shortcut alternative, kept for reference: build PPO straight from the env class.
# algo = ppo.PPO(env=LinearSISOEnv,config={"env_config":env_cfg})
# result = algo.train()

# Builder-style setup: each call below adjusts one group of settings.
ppo_builder = (
    PPOConfig()
    .rollouts(num_rollout_workers=0)
    .resources(num_gpus=1)
    .environment(env=LinearSISOEnv, env_config=env_cfg)
    .training()
)
algo = ppo_builder.build()

for i in range(4):
    result = algo.train()
    print(pretty_print(result))

    # Only even-numbered iterations (0 and 2) write a checkpoint.
    if i % 2:
        continue
    checkpoint_dir = algo.save()
    print(f"Checkpoint saved in directory {checkpoint_dir}")

# Plot the episode rewards recorded in the last training result.
plot_reward(result)


## Train with DDPG Algorithm

In [None]:
from ray.rllib.algorithms import ddpg

# Build DDPG directly from the environment class and the notebook configuration.
algo_ddpg = ddpg.DDPG(config={"env_config": env_cfg}, env=LinearSISOEnv)

# Run one training iteration and plot the episode rewards it recorded.
result_ddpg = algo_ddpg.train()
plot_reward(result_ddpg)

In [None]:
# `datetime.now()` needs the datetime class; the notebook's first cell only
# imports `date`, so import it here to keep this cell runnable.
from datetime import datetime

from ray.rllib.algorithms.ddpg import DDPGConfig

algo = (
    DDPGConfig()
    .rollouts(num_rollout_workers=0)
    .resources(num_gpus=1)
    .environment(env=LinearSISOEnv,env_config=env_cfg)
    .training()
    .build()
)


# Timestamped checkpoint folder: ray_results/DDPG_<day>_<month>_<year>_<hour>_<minute>_<second>
algo_name = "DDPG"
now = datetime.now()
checkpoint_dir = f"{algo_name}_{now.day}_{now.month}_{now.year}_{now.hour}_{now.minute}_{now.second}"
checkpoint_dir = str(pathlib.PurePath("ray_results",checkpoint_dir))

# -inf guarantees the first finite mean reward is recorded as the best so far.
best_rew = float("-inf")
eps_rewards = []
best_ckpt = ""
for i in range(10):
    result = algo.train()
    mean_reward = result["episode_reward_mean"]
    print(mean_reward)
    eps_rewards.append(mean_reward)
    # Save a checkpoint only when the mean episode reward improves.
    if mean_reward > best_rew:
        best_rew = mean_reward
        ckpt_dir = algo.save(checkpoint_dir=checkpoint_dir)
        best_ckpt = ckpt_dir
        print(f"Best checkpoint saved in directory {ckpt_dir}")
    # Stop early once the mean reward reaches the -1.1 target.
    if mean_reward >= -1.1:
        break

algo.stop()

In [None]:
import torch

# Reset the environment and wrap the first observation as a tensor.
obs, info = env.reset()
obs_tensor = torch.from_numpy(obs)

# Two ways of moving the observation to the GPU; only the last expression is
# displayed as the cell output.
obs_tensor.float().to("cuda")
obs_tensor.cuda(0)

In [None]:
# Fetch the policy from `algo` and display its model as the cell output.
# NOTE(review): the DDPG training cell calls algo.stop() before this point —
# confirm get_policy() is still valid on a stopped algorithm.
policy = algo.get_policy()
policy.model

In [None]:
from ray.rllib.algorithms.algorithm import Algorithm

# NOTE(review): this overrides the `best_ckpt` produced by the training cell
# with a checkpoint from an earlier run; point it at your own run if needed.
# The path is built with pathlib so it is not tied to Windows separators.
best_ckpt = str(pathlib.PurePath("ray_results", "DDPG_10_7_2023_1_54_48", "checkpoint_000003"))
algo_trained = Algorithm.from_checkpoint(best_ckpt)

obs, info = env.reset()

# Roll the restored policy out for at most 100 steps, printing each observation.
for step in range(100):
    action = algo_trained.compute_single_action(obs)
    obs, reward, terminated, truncated, info = env.step(action)
    print(obs)
    # Do not keep stepping an environment whose episode has already ended.
    if terminated or truncated:
        break

### Training with Ray Tune

In [None]:
from ray import air, tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.air.config import CheckpointConfig
from ray.rllib.algorithms.ddpg import DDPGConfig

# Assemble the DDPG configuration as a single chain of setter calls.
config = (
    DDPGConfig()
    .training(lr=0.1)
    .environment(env=LinearSISOEnv, env_config=env_cfg)
    .rollouts(num_rollout_workers=1)
    .framework("torch")
    .resources(num_gpus=1)
)

# Only referenced by the commented-out tune_config below.
scheduler = AsyncHyperBandScheduler(time_attr="training_iteration")

# Checkpoint every iteration and once more when the run ends.
checkpoint_config = CheckpointConfig(checkpoint_frequency=1, checkpoint_at_end=True)
run_config = air.RunConfig(
    stop={"episode_reward_mean": -1.1},
    checkpoint_config=checkpoint_config,
)

tuner = tune.Tuner(
    "DDPG",
    run_config=run_config,
    param_space=config,
    # # For 3 iteration if any progress does not happen in the metric, stop exp
    # tune_config=tune.TuneConfig(
    #         scheduler=scheduler, num_samples=3, metric="episode_reward_mean", mode="max"),
)

results_ddpg_tuner = tuner.fit()



In [None]:
# Pick the Tune result with the highest episode_reward_mean.
best_result = results_ddpg_tuner.get_best_result(metric="episode_reward_mean", mode="max")
# Take the checkpoint attached to that result and display it as the cell output.
best_checkpoint = best_result.checkpoint
best_checkpoint

### Test Trained Policy

In [None]:
from ray.rllib.algorithms.algorithm import Algorithm

# Restore from the best checkpoint found by the Tune run (`best_checkpoint`)
# rather than a machine-specific absolute path, so the notebook runs elsewhere.
# To load a checkpoint from an earlier run, pass its path instead, e.g.:
# ddpg_trained = Algorithm.from_checkpoint("C:\\Users\\Furka\\ray_results\\DDPG\\DDPG_LinearSISOEnv_65c58_00000_0_2023-07-09_23-51-37\\checkpoint_000012")
ddpg_trained = Algorithm.from_checkpoint(best_checkpoint)
ddpg_trained