In [24]:
# System libraries used by gym (rendering, video capture, native builds).
# The Python 2-era names python-numpy / python-dev / python-opengl are not
# packaged on Ubuntu 22.04 (jammy, the release shown in this cell's output);
# a single unknown name makes the whole `apt install` fail, so the python3-*
# equivalents are used instead. `apt-get` is the script-stable front end.
!apt-get update -qq
!apt-get install -y -qq python3-numpy python3-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python3-opengl libboost-all-dev libsdl2-dev swig

# %pip installs into the environment of the running kernel (plain `!pip3`
# may target a different interpreter). `pyglet` is the windowing library
# used by gym's classic renderer; `piglet` is an unrelated template engine.
%pip install -q pyvirtualdisplay pyglet gym torch torchvision
%pip install -q "gym[atari]"

[33m0% [Working][0m            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,668 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,731 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 11.4 MB in 2s (6,130 kB/s)
Reading package lists... Done
Building dependenc

In [38]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Q-network definition
class DQN(nn.Module):
    """Fully connected Q-value approximator with two hidden ReLU layers.

    Args:
        input_size: number of features in one observation vector.
        output_size: number of values produced per observation
            (one per discrete action when used as a Q-network).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 24
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Map a batch of observations to raw (unactivated) output values."""
        # Both hidden layers share the same linear -> ReLU pattern.
        for hidden_layer in (self.fc1, self.fc2):
            x = hidden_layer(x).relu()
        # The output layer stays linear.
        return self.fc3(x)

# Hyperparameters
EPISODES = 1_000        # number of training episodes
GAMMA = 0.99            # discount factor applied to the bootstrapped next-state value
LEARNING_RATE = 1e-3    # step size passed to the Adam optimizer
MEMORY_SIZE = 2_000     # maximum number of transitions kept in the replay buffer
BATCH_SIZE = 64         # transitions sampled per optimization step
EPSILON_START = 1.0     # initial exploration probability
EPSILON_END = 0.01      # lower bound on the exploration probability
EPSILON_DECAY = 0.995   # multiplicative epsilon decay applied after each episode

# Environment and network setup
env = gym.make('CartPole-v1')
# The network takes one input per observation feature and emits one value per action.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

model = DQN(input_size=input_size, output_size=output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Replay buffer: once MEMORY_SIZE entries are stored, the oldest ones are dropped.
memory = deque([], maxlen=MEMORY_SIZE)

def get_action(state, epsilon):
    """Choose an action epsilon-greedily.

    With probability ``epsilon`` a random action is drawn from the global
    ``env``'s action space; otherwise the action with the highest value
    predicted by the global ``model`` for ``state`` is returned.

    Args:
        state: a single observation (array-like of floats).
        epsilon: exploration probability.

    Returns:
        The selected action.
    """
    explore = random.random() < epsilon
    if not explore:
        # Greedy branch: no gradients are needed for action selection.
        with torch.no_grad():
            # Add a leading batch dimension of size 1 before the forward pass.
            observation = torch.FloatTensor(state).unsqueeze(0)
            return model(observation).argmax().item()
    return env.action_space.sample()

def replay(memory):
    """Run one optimization step on a random minibatch drawn from ``memory``.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transitions.
    Each transition is a ``(state, action, reward, next_state, done)`` tuple.
    Uses the global ``model``, ``criterion`` and ``optimizer``.

    Args:
        memory: sequence of stored transitions to sample from.
    """
    if BATCH_SIZE > len(memory):
        return

    transitions = random.sample(memory, BATCH_SIZE)
    # Transpose the list of transitions into one tuple per field.
    state_col, action_col, reward_col, next_state_col, done_col = zip(*transitions)

    # Stack observations into float32 arrays first, then wrap them as tensors.
    state_batch = torch.FloatTensor(np.array(state_col, dtype=np.float32))
    next_state_batch = torch.FloatTensor(np.array(next_state_col, dtype=np.float32))
    # gather() needs an index column, hence the extra trailing dimension.
    action_index = torch.LongTensor(action_col).unsqueeze(1)
    reward_batch = torch.FloatTensor(reward_col)
    done_mask = torch.FloatTensor(done_col)

    # Value predicted for the action that was actually taken in each transition.
    predicted_q = model(state_batch).gather(1, action_index)
    # Best next-state value, detached so no gradient flows through the target.
    best_next_q, _ = model(next_state_batch).max(1)
    best_next_q = best_next_q.detach()
    # The (1 - done) factor removes the bootstrap term for finished transitions.
    td_target = reward_batch + (GAMMA * best_next_q * (1 - done_mask))

    loss = criterion(predicted_q.squeeze(), td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main training loop.
# Runs EPISODES episodes with an epsilon-greedy policy, storing every
# transition in `memory` and calling `replay` after each environment step.
# Both gym step/reset conventions are supported:
#   * gym >= 0.26: reset() -> (obs, info); step() -> (obs, reward, terminated, truncated, info)
#   * older gym:   reset() -> obs;         step() -> (obs, reward, done, info)
epsilon = EPSILON_START

try:
    for episode in range(EPISODES):
        reset_result = env.reset()
        # Newer gym returns (observation, info); older releases return the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        total_reward = 0

        while not done:
            action = get_action(state, epsilon)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, info = step_result
            else:
                next_state, reward, done, info = step_result
                # The legacy TimeLimit wrapper reports a step-limit cut-off through this info key.
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = done and not truncated
            # The episode ends on either signal ...
            done = terminated or truncated

            # Sanity-check the observation sizes before they reach the network.
            assert len(state) == input_size, f"State has incorrect shape: {state}"
            assert len(next_state) == input_size, f"Next state has incorrect shape: {next_state}"

            total_reward += reward
            # ... but only a true terminal state is stored as "done": replay() drops the
            # bootstrap term for done transitions, which would be wrong for a time-limit cut-off.
            memory.append((state, action, reward, next_state, bool(terminated)))
            state = next_state

            replay(memory)

        # Decay exploration once per episode, never below EPSILON_END.
        epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
        print(f'Episode {episode + 1}/{EPISODES}, Total Reward: {total_reward}')
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

Episode 1/1000, Total Reward: 57.0
Episode 2/1000, Total Reward: 21.0
Episode 3/1000, Total Reward: 10.0
Episode 4/1000, Total Reward: 14.0
Episode 5/1000, Total Reward: 21.0
Episode 6/1000, Total Reward: 15.0
Episode 7/1000, Total Reward: 31.0
Episode 8/1000, Total Reward: 12.0
Episode 9/1000, Total Reward: 14.0
Episode 10/1000, Total Reward: 14.0
Episode 11/1000, Total Reward: 27.0
Episode 12/1000, Total Reward: 10.0
Episode 13/1000, Total Reward: 34.0
Episode 14/1000, Total Reward: 17.0
Episode 15/1000, Total Reward: 25.0
Episode 16/1000, Total Reward: 73.0
Episode 17/1000, Total Reward: 29.0
Episode 18/1000, Total Reward: 12.0
Episode 19/1000, Total Reward: 17.0
Episode 20/1000, Total Reward: 9.0
Episode 21/1000, Total Reward: 10.0
Episode 22/1000, Total Reward: 12.0
Episode 23/1000, Total Reward: 21.0
Episode 24/1000, Total Reward: 39.0
Episode 25/1000, Total Reward: 36.0
Episode 26/1000, Total Reward: 25.0
Episode 27/1000, Total Reward: 21.0
Episode 28/1000, Total Reward: 74.0
Ep