In [1]:
pip install gym torch

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting gym-notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827620 sha256=d4967a9fbba2d83c76aa4f12418e24a5b2fbe82d89d226b39bd3e189b95d396a
  Stored in directory: /Users/educarrascovidal/Library/Caches/pip/wheels/1c/77/9e/9af5470201a0b0543937933ee99ba884cd237d2faefe8f4d37
Successfully built gym
Installing collected packages: gym-notices, gym
Successfully installed gym-0.26.2 gym-notices-0.0.8
Note: you may need to rest

In [3]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
from itertools import count

# Definir la red neuronal para el aproximador de función Q
class DQN(nn.Module):
    def __init__(self, obs_space, action_space):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(obs_space, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_space)
        )

    def forward(self, x):
        return self.fc(x)

# Función para seleccionar una acción con política epsilon-greedy
def select_action(state, epsilon, action_space, model):
    if random.random() > epsilon:
        with torch.no_grad():
            return model(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(action_space)]], dtype=torch.long)

# Entrenamiento del DQN
def train_dqn(env, model, optimizer, memory, batch_size, gamma):
    if len(memory) < batch_size:
        return
    transitions = random.sample(memory, batch_size)
    batch_state, batch_action, batch_next_state, batch_reward, batch_done = zip(*transitions)

    batch_state = torch.cat(batch_state)
    batch_action = torch.cat(batch_action)
    batch_reward = torch.cat(batch_reward)
    batch_next_state = torch.cat(batch_next_state)
    batch_done = torch.tensor(batch_done, dtype=torch.bool)

    current_q_values = model(batch_state).gather(1, batch_action)
    max_next_q_values = model(batch_next_state).max(1)[0].detach()
    expected_q_values = batch_reward + (gamma * max_next_q_values * ~batch_done)

    loss = nn.MSELoss()(current_q_values.squeeze(), expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Configuraciones iniciales
env = gym.make('CartPole-v1')
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

model = DQN(obs_space, action_space)
optimizer = optim.Adam(model.parameters())
memory = deque(maxlen=10000)
epsilon = 0.1
gamma = 0.99
batch_size = 32
num_episodes = 500

# Bucle de entrenamiento
for i_episode in range(num_episodes):
    state = torch.tensor([env.reset().tolist()], dtype=torch.float)
    for t in count():
        action = select_action(state, epsilon, action_space, model)
        next_state, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], dtype=torch.float)
        next_state = torch.tensor([next_state.tolist()], dtype=torch.float) if not done else None

        memory.append((state, action, next_state, reward, done))
        state = next_state if next_state is not None else torch.tensor([env.reset().tolist()], dtype=torch.float)

        train_dqn(env, model, optimizer, memory, batch_size, gamma)

        if done:
            break

    print(f"Episodio {i_episode + 1} completado")
env.close()

AttributeError: 'tuple' object has no attribute 'tolist'