In [1]:
!pip install numpy==1.23.5

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
xarray-einstats 0.9.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
blosc2 3.5.1 requir

In [22]:
import os
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt
import imageio

# --- Настройки ---
ENV_NAME = "Pendulum-v1"
GAMMA = 0.99
TAU = 0.005
ACTOR_LR = 1e-4
CRITIC_LR = 1e-3
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 128
EPISODES = 300
EXPL_NOISE = 0.1
VIDEO_INTERVAL = 50
SAVE_DIR = "./ddpg_videos"
os.makedirs(SAVE_DIR, exist_ok=True)

device = torch.device("cpu")

# --- Сеть Актор ---
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 400), nn.ReLU(),
            nn.Linear(400, 300), nn.ReLU(),
            nn.Linear(300, action_dim), nn.Tanh()
        )
        self.max_action = max_action

    def forward(self, state):
        return self.net(state) * self.max_action

# --- Сеть Критик ---
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, 400), nn.ReLU(),
            nn.Linear(400, 300), nn.ReLU(),
            nn.Linear(300, 1)
        )

    def forward(self, state, action):
        return self.net(torch.cat([state, action], 1))

# --- Буфер повторов ---
class ReplayBuffer:
    def __init__(self, max_size=BUFFER_SIZE):
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (
            torch.tensor(state, dtype=torch.float32),
            torch.tensor(action, dtype=torch.float32),
            torch.tensor(reward, dtype=torch.float32).unsqueeze(1),
            torch.tensor(next_state, dtype=torch.float32),
            torch.tensor(done, dtype=torch.float32).unsqueeze(1)
        )

    def __len__(self):
        return len(self.buffer)

# --- Копирование весов с коэффициентом TAU ---
def soft_update(target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - TAU) + param.data * TAU)

# --- Видео ---
def record_video(env, actor, episode, max_action):
    frames = []
    state, _ = env.reset()
    done = False
    step = 0
    max_steps = 300  # ~8 секунд при fps=30

    while not done and step < max_steps:
        frame = env.render(mode="rgb_array")
        frames.append(frame)
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = actor(state_tensor).cpu().numpy()[0]
        state, _, done, _, _ = env.step(action)
        step += 1

    filename = os.path.join(SAVE_DIR, f"ddpg_ep_{episode}.mp4")
    imageio.mimsave(filename, frames, fps=30)
    print(f"Saved video: {filename}")

# --- Тренировка DDPG ---
def train():
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = Actor(state_dim, action_dim, max_action).to(device)
    actor_target = Actor(state_dim, action_dim, max_action).to(device)
    actor_target.load_state_dict(actor.state_dict())

    critic = Critic(state_dim, action_dim).to(device)
    critic_target = Critic(state_dim, action_dim).to(device)
    critic_target.load_state_dict(critic.state_dict())

    actor_optimizer = optim.Adam(actor.parameters(), lr=ACTOR_LR)
    critic_optimizer = optim.Adam(critic.parameters(), lr=CRITIC_LR)

    replay_buffer = ReplayBuffer()
    rewards_history = []

    for episode in range(EPISODES):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                action = actor(state_tensor).cpu().numpy()[0]
            action = (action + np.random.normal(0, EXPL_NOISE, size=action_dim)).clip(-max_action, max_action)

            next_state, reward, terminated, truncated = env.step(action)

            done = terminated or truncated
            print(done)
            # Добавляем данные в буфер
            replay_buffer.add((state, action, reward, next_state, float(done[0])))



            state = next_state
            episode_reward += reward

            if len(replay_buffer) > BATCH_SIZE:
                s, a, r, s2, d = replay_buffer.sample(BATCH_SIZE)

                with torch.no_grad():
                    target_q = critic_target(s2, actor_target(s2))
                    target = r + (1 - d) * GAMMA * target_q

                current_q = critic(s, a)
                critic_loss = nn.MSELoss()(current_q, target)

                critic_optimizer.zero_grad()
                critic_loss.backward()
                critic_optimizer.step()

                actor_loss = -critic(s, actor(s)).mean()

                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                soft_update(actor_target, actor)
                soft_update(critic_target, critic)

        rewards_history.append(episode_reward)
        if episode % 10 == 0:
            print(f"Episode {episode}, Reward: {episode_reward:.1f}")

        if episode % VIDEO_INTERVAL == 0:
            record_video(env, actor, episode, max_action)

    env.close()
    plt.plot(rewards_history)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("DDPG on " + ENV_NAME)
    plt.grid()
    plt.show()

if __name__ == "__main__":
    train()


{}


  deprecation(
  deprecation(


KeyError: 0

In [35]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Нейросети Actor и Critic ===
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x)) * self.max_action
        return x


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, x, u):
        x = F.relu(self.fc1(torch.cat([x, u], 1)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# === Буфер опыта ===
class ReplayBuffer:
    def __init__(self, max_size=1e6):
        self.buffer = deque(maxlen=int(max_size))

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        return (
            torch.FloatTensor(np.array(states)).to(device),
            torch.FloatTensor(np.array(actions)).to(device),
            torch.FloatTensor(np.array(rewards)).unsqueeze(1).to(device),
            torch.FloatTensor(np.array(next_states)).to(device),
            torch.FloatTensor(np.array(dones)).unsqueeze(1).to(device)
        )

    def __len__(self):
        return len(self.buffer)


# === DDPG Агент ===
class DDPGAgent:
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.noise_std = 0.2
        self.gamma = 0.99
        self.tau = 0.005

    def select_action(self, state, noise=True):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        with torch.no_grad():
            action = self.actor(state).cpu().numpy().flatten()
        if noise:
            action = (action + np.random.normal(0, self.noise_std, size=action.shape)).clip(-self.max_action, self.max_action)
        return action

    def update(self, replay_buffer, batch_size=64):
        if len(replay_buffer) < batch_size:
            return

        # Сэмплируем батч
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

        # Вычисляем целевое Q значение
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            target_Q = self.critic_target(next_states, next_actions)
            target_Q = rewards + (1 - dones) * self.gamma * target_Q

        # Обучаем Critic
        current_Q = self.critic(states, actions)
        critic_loss = F.mse_loss(current_Q, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Обучаем Actor
        actor_loss = -self.critic(states, self.actor(states)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Обновляем целевые сети
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, filename):
        torch.save(self.actor.state_dict(), f"{filename}_actor.pth")
        torch.save(self.critic.state_dict(), f"{filename}_critic.pth")

    def load(self, filename):
        self.actor.load_state_dict(torch.load(f"{filename}_actor.pth", map_location=device))
        self.critic.load_state_dict(torch.load(f"{filename}_critic.pth", map_location=device))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())


# === Функция тестирования ===
def test_agent(env, agent, episodes=3):
    for _ in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = agent.select_action(state, noise=False)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state
        print(f"Test episode reward: {total_reward}")


# === Обучение ===
# === Обучение ===
def train_ddpg():
    env_name = 'Pendulum-v1'
    env = gym.make(env_name, render_mode="human")
    eval_env = gym.make(env_name, render_mode="human")

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    agent = DDPGAgent(state_dim, action_dim, max_action)
    replay_buffer = ReplayBuffer()

    total_timesteps = 0
    max_timesteps = 100000
    start_timesteps = 10000

    state = env.reset()
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    while total_timesteps < max_timesteps:
        episode_timesteps += 1
        total_timesteps += 1

        # Выбираем действие
        if total_timesteps < start_timesteps:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)

        # Делаем шаг в среде
        next_state, reward, terminated, truncated = env.step(action)
        done = terminated or truncated

        # Сохраняем в буфер
        replay_buffer.add((state, action, reward, next_state, (done)))
        state = next_state
        episode_reward += reward

        # Если эпизод закончен
        if done:
            print(f"Total T: {total_timesteps} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.2f}")
            state, _ = env.reset()
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Обновляем политику
        agent.update(replay_buffer)

        # Тестирование каждые 20 эпизодов
        if (episode_num % 20 == 0) and (episode_num > 0):
            test_agent(eval_env, agent)


if __name__ == "__main__":
    train_ddpg()

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.