# Лабораторная работа 2

### Импорт зависимостей

In [None]:
import gymnasium as gym
from stable_baselines3 import SAC, DDPG
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import numpy as np

### Создание среды

In [None]:
# Shared BipedalWalker environment; 'rgb_array' rendering is requested so that
# env.render() can be used to collect frames for the animation helper.
env = gym.make("BipedalWalker-v3", render_mode="rgb_array")

In [None]:
# Keep handles to the environment's observation and action spaces.
observation_space, action_space = env.observation_space, env.action_space

### Функция для визуализации результатов работы моделей

In [None]:
def visualize(model, env, step_limit=1000):
    """Roll out one episode with ``model`` and return it as an HTML animation.

    The policy is queried deterministically, the frame rendered after every
    step is collected, and the frames are played back through a matplotlib
    ``FuncAnimation`` converted to JavaScript/HTML.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``. ``render()`` is
            assumed to return an image array (i.e. the environment was
            created with ``render_mode='rgb_array'``).
        step_limit: Maximum number of steps to record. At least one step is
            always taken.

    Returns:
        An ``IPython.display.HTML`` object wrapping the animation.
    """
    obs, _info = env.reset()

    frames = []
    step_num = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, _reward, terminated, truncated, _ = env.step(action)

        frames.append(env.render())
        step_num += 1

        # An episode is over on either flag; the environment must not be
        # stepped again until it is reset.
        if terminated or truncated or step_num >= step_limit:
            break

    dpi = 72
    interval = 50  # ms between frames

    # Size the figure so that one image pixel maps to one figure pixel.
    height, width = frames[0].shape[:2]
    fig = plt.figure(figsize=(width / dpi, height / dpi), dpi=dpi)
    patch = plt.imshow(frames[0])
    # Must be a call: assigning to ``plt.axis`` would overwrite the pyplot
    # function instead of hiding the axes.
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    ani = animation.FuncAnimation(fig, animate, frames=len(frames), interval=interval)
    html = HTML(ani.to_jshtml())
    # Release the figure once the frames are baked into the HTML so figures do
    # not pile up across calls or show up as an extra static image.
    plt.close(fig)
    return html

### Функция оценки модели

In [None]:
def evaluate(model, env, step_limit=1000):
    """Run one deterministic episode and return the mean reward per step.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        step_limit: Maximum number of steps to run. At least one step is
            always taken.

    Returns:
        The sum of the rewards collected during the episode divided by the
        number of steps taken.
    """
    obs, _info = env.reset()

    total_reward = 0
    step_num = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)

        total_reward += reward
        step_num += 1

        # Stop on either end-of-episode flag: rewards returned after the
        # episode has ended must not be mixed into the average.
        if terminated or truncated or step_num >= step_limit:
            break

    return total_reward / step_num

### DDPG: шум исследования (аналог Epsilon-Greedy), sigma = 0.15

In [None]:
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# DDPG run with Gaussian exploration noise: zero mean and a standard
# deviation of 0.15 on every action dimension.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.15),
)

model = DDPG("MlpPolicy", env, verbose=1, action_noise=action_noise)
model.learn(log_interval=1, total_timesteps=50_000)
# NOTE(review): the file name says "01" while sigma is 0.15 — confirm which
# one is intended before relying on the saved artifact's name.
model.save("ddpg_01")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### DDPG: шум исследования (аналог Epsilon-Greedy), sigma = 0.05

In [None]:
# DDPG run with Gaussian exploration noise: zero mean and a standard
# deviation of 0.05 on every action dimension.
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.05),
)

model = DDPG("MlpPolicy", env, verbose=1, action_noise=action_noise)
model.learn(log_interval=1, total_timesteps=50_000)
model.save("ddpg_005")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### DDPG: шум исследования (аналог Epsilon-Greedy), sigma = 0.4

In [None]:
# DDPG run with Gaussian exploration noise: zero mean and a standard
# deviation of 0.4 on every action dimension.
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.4),
)

model = DDPG("MlpPolicy", env, verbose=1, action_noise=action_noise)
model.learn(log_interval=1, total_timesteps=50_000)
# NOTE(review): the file name says "03" while sigma is 0.4 — confirm which
# one is intended before relying on the saved artifact's name.
model.save("ddpg_03")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### SAC (lr 1e-2)

In [None]:
# SAC run with learning rate 1e-2: train, save, report the per-step mean
# reward of one evaluation episode, then show the rollout animation.
model = SAC("MlpPolicy", env, learning_rate=1e-2, verbose=1)
model.learn(log_interval=1, total_timesteps=50_000)
model.save("sac_1e-2")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### SAC (lr 1e-3)

In [None]:
# SAC run with learning rate 1e-3: train, save, report the per-step mean
# reward of one evaluation episode, then show the rollout animation.
model = SAC("MlpPolicy", env, learning_rate=1e-3, verbose=1)
model.learn(log_interval=1, total_timesteps=50_000)
model.save("sac_1e-3")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### SAC (lr 1e-4)

In [None]:
# SAC run with learning rate 1e-4: train, save, report the per-step mean
# reward of one evaluation episode, then show the rollout animation.
model = SAC("MlpPolicy", env, learning_rate=1e-4, verbose=1)
model.learn(log_interval=1, total_timesteps=50_000)
model.save("sac_1e-4")
print("Mean reward:", evaluate(model, env))
visualize(model, env)

### SAC (lr 1e-5)

In [None]:
# SAC run with learning rate 1e-5: train, save, report the per-step mean
# reward of one evaluation episode, then show the rollout animation.
model = SAC("MlpPolicy", env, learning_rate=1e-5, verbose=1)
model.learn(log_interval=1, total_timesteps=50_000)
model.save("sac_1e-5")
print("Mean reward:", evaluate(model, env))
visualize(model, env)