In [8]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

# Define the network architecture
class DQNNetwork(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQNNetwork, self).__init__()
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_actions)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Define the replay buffer
class ReplayBuffer():
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return torch.FloatTensor(state), torch.LongTensor(action), \
               torch.FloatTensor(reward), torch.FloatTensor(next_state), \
               torch.FloatTensor(done)

# Define the agent
class DQNAgent():
    def __init__(self, state_shape, num_actions, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, lr=0.001, capacity=100000):
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.lr = lr
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQNNetwork(state_shape, num_actions).to(self.device)
        self.target_net = DQNNetwork(state_shape, num_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.replay_buffer = ReplayBuffer(capacity)

    def act(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device)
            q_values = self.policy_net(state)
            return q_values.argmax().item()

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def train(self, batch_size):
        if len(self.replay_buffer.buffer) < batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)

        q_values = self.policy_net(states)
        state_action_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        next_state_values = self.target_net(next_states).max(1)[0]
        next_state_values[dones] = 0
        expected_state_action_values = (next_state_values * self.gamma) + rewards

        loss = nn.MSELoss()(state_action_values, expected_state_action_values.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, env, num_episodes=1000, max_steps=1000, batch_size=64):
        episode_rewards = []
        for episode in range(num_episodes):
            state = env.reset()
            done = False
            total_reward = 0
            for step in range(max_steps):
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                self.train(batch_size)
                if done:
                    break
            self.update_target_network()
            episode_rewards.append(total_reward)
            print("Episode: {}, total reward: {}, epsilon: {:.2f}".format(episode+1, total_reward, self.epsilon))

        return episode_rewards

# Create the environment
env = gym.make("CarRacing-v2")

# Initialize the agent
state_shape = env.observation_space.shape[0]
num_actions = env.action_space.shape[0] # get size of action space as a numpy array
agent = DQNAgent(state_shape, num_actions)

# Train the agent
episode_rewards = agent.learn(env)

# Plot the learning curve
import matplotlib.pyplot as plt
plt.plot(episode_rewards)
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.show()

TypeError: 'int' object is not subscriptable