In [6]:
import gym
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

import numpy as np

In [2]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [3]:
# Create the environment
env = gym.make("CartPole-v1")
states = env.observation_space.shape[0]
actions = env.action_space.n

# Initialize the model
model = DQN(states, actions)

In [8]:
class DQNAgent:
    def __init__(self, model, actions, memory_size=50000, batch_size=64, gamma=0.99, lr=1e-3):
        self.model = model
        self.target_model = DQN(states, actions)
        self.target_model.load_state_dict(model.state_dict())
        self.memory = []
        self.actions = actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def get_action(self, state):
        state = np.array(state).reshape(1, -1) # converting to 2d array
        state = torch.FloatTensor(state).unsqueeze(0)
        q_values = self.model(state)
        return q_values.argmax().item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.memory_size:
            self.memory.pop(0)

    def update_model(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        state, action, reward, next_state, done = zip(*batch)
        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        current_q = self.model(state).gather(1, action.unsqueeze(1))
        max_next_q = self.target_model(next_state).max(1)[0].detach()
        expected_q = reward + (1 - done) * self.gamma * max_next_q

        loss = self.loss_fn(current_q, expected_q.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())


In [9]:
def train_agent(agent, env, episodes=1000):
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward, next_state, done)
            state = next_state
            agent.update_model()
        if episode % 10 == 0:
            agent.update_target_model()

# Initialize the agent
agent = DQNAgent(model, actions)

# Train the agent
train_agent(agent, env, episodes=1000)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.