In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

def one_hot_state(state, n_states):
    vec = np.zeros(n_states)
    vec[state] = 1
    return vec

# Constants
REPLAY_BUFFER_SIZE = 10000
MIN_REPLAY_BUFFER_SIZE = 1000
BATCH_SIZE = 32

# Initialize replay buffer
replay_buffer = deque(maxlen=10000)

def train_dqn(env, model, episodes=500, learning_rate=0.001, discount_factor=0.95, exploration_prob=1.0, exploration_decay=0.995, min_exploration=0.05):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    n_states = env.observation_space.n

    for episode in range(episodes):
        state = one_hot_state(env.reset()[0], n_states)
        terminated = False
        truncated = False

        while not terminated and not truncated:
            with torch.no_grad():
                state_tensor = torch.tensor([state], dtype=torch.float32)
                if np.random.uniform(0, 1) < exploration_prob:
                    action = env.action_space.sample()  # Explore
                else:
                    q_values = model(state_tensor)
                    action = torch.argmax(q_values).item()  # Exploit

            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = one_hot_state(next_state, n_states)

            # Store experience in replay buffer
            replay_buffer.append((state, action, reward, next_state, terminated))
            if len(replay_buffer) > REPLAY_BUFFER_SIZE:
                replay_buffer.pop(0)  # Remove oldest experience if buffer is full

            # Train using experience replay
            if len(replay_buffer) >= MIN_REPLAY_BUFFER_SIZE:
                batch = sample(replay_buffer, BATCH_SIZE)
                states, actions, rewards, next_states, dones = zip(*batch)

                states = torch.tensor(states, dtype=torch.float32)
                actions = torch.tensor(actions, dtype=torch.long)
                rewards = torch.tensor(rewards, dtype=torch.float32)
                next_states = torch.tensor(next_states, dtype=torch.float32)
                dones = torch.tensor(dones, dtype=torch.float32)

                q_values = model(states)
                q_values = q_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)

                with torch.no_grad():
                    next_q_values = model(next_states)
                    next_q_values = torch.max(next_q_values, dim=1)[0]
                    targets = rewards + (1 - dones) * discount_factor * next_q_values

                loss = loss_fn(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            state = next_state

        exploration_prob = max(min_exploration, exploration_prob * exploration_decay)

        if episode % 10 == 0:
            print(f"Episode {episode} finished.")


# Train
env = gym.make('Taxi-v3')
input_dim = env.observation_space.n  # Adjusted to match the number of states
output_dim = env.action_space.n
print(f"input_dim: {input_dim}, output_dim: {output_dim}")
model = DQN(input_dim, output_dim)
train_dqn(env, model)

#save model
torch.save(model, 'dqn-model1.pt')



input_dim: 500, output_dim: 6
Episode 0 finished.


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1000, 5) + inhomogeneous part.

### Imports and Setup

In [1]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

### Model Definition

In [2]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

### Training

In [6]:
def train_dqn(env, model, episodes=5000, learning_rate=0.001, discount_factor=0.95, exploration_prob=1.0, exploration_decay=0.995, min_exploration=0.05):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    for episode in range(episodes):
        state = env.reset()[0]
        state = np.array([state])
        terminated = False
        truncated = False

        while not terminated and not truncated:
            with torch.no_grad():
                if np.random.uniform(0, 1) < exploration_prob:
                    action = env.action_space.sample()  # Explore
                else:
                    q_values = model(torch.tensor([state], dtype=torch.float32))
                    print("Debug - q_values shape:", q_values.shape)
                    print("Debug - q_values:", q_values)
                    action = torch.argmax(q_values).item()  # Exploit

            next_state, reward, terminated, truncated, _ = env.step(action)
            print("Debug - next_state:", next_state)
            print("Debug - state:", state)
            print("Debug - action:", action)

            # Update model
            target = reward
            if not terminated and not truncated:
                with torch.no_grad():
                    target = reward + discount_factor * torch.max(model(torch.tensor([next_state], dtype=torch.float32)))
                    print("Debug - target:", target)

            q_values = model(torch.tensor([state], dtype=torch.float32))
           
            loss = loss_fn(q_values[0][action], torch.tensor([target], dtype=torch.float32))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state

        exploration_prob = max(min_exploration, exploration_prob * exploration_decay)

        if episode % 100 == 0:
            print(f"Episode {episode} finished.")

# Train
env = gym.make('Taxi-v3')
print("Observation space:", env.observation_space.n)
input_dim = 1
output_dim = env.action_space.n
model = DQN(input_dim, output_dim)
train_dqn(env, model)


Observation space: 500
Debug - next_state: 134
Debug - state: [134]
Debug - action: 4
Debug - target: tensor(108.1784)
Debug - next_state: 134
Debug - state: 134
Debug - action: 4
Debug - target: tensor(108.0502)


IndexError: index 4 is out of bounds for dimension 0 with size 0