In [1]:
!pip install gym torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# PyTorch DQN for CartPole
This notebook implements a Deep Q-Network (DQN) in PyTorch and trains it so that a CartPole agent learns to balance the pole like a pro!

## DQN Code for CartPole
A simple PyTorch DQN to train a CartPole agent. Let’s make it learn!

**Plan**: Build a neural net, use experience replay, and train with rewards.

In [8]:
import numpy as np
# Compatibility shim: restore the `np.bool8` alias when this NumPy build lacks it.
# NOTE(review): presumably needed because the installed gym release still
# references `np.bool8` — confirm against the gym/NumPy versions in use.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque


# Set up CartPole
# Replay buffer for (state, action, reward, next_state, done) transitions;
# bounded, so appending to a full buffer drops the oldest entry.
memory = deque(maxlen=2000)

# Environment created with the step API that returns five values
# (observation, reward, terminated, truncated, info).
env = gym.make("CartPole-v1", new_step_api=True)

# Problem dimensions, read from the environment's spaces.
action_size = env.action_space.n  # 2
state_size = env.observation_space.shape[0]  # 4

# DQN Neural Network
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Args:
        input_dim: Number of state features. When omitted, falls back to the
            module-level ``state_size``.
        output_dim: Number of discrete actions. When omitted, falls back to the
            module-level ``action_size``.
        hidden_dim: Width of each of the two hidden layers.
    """

    def __init__(self, input_dim=None, output_dim=None, hidden_dim=24):
        super().__init__()
        # Resolve the defaults at call time so a plain `DQN()` keeps using the
        # notebook globals, while explicit sizes make the class reusable.
        if input_dim is None:
            input_dim = state_size
        if output_dim is None:
            output_dim = action_size

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw (unactivated) Q-values for a batch of states.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and last dimension
            ``output_dim``.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
gamma = 0.99           # discount factor applied to the bootstrapped next-state value
epsilon = 1.0          # initial exploration probability for epsilon-greedy
epsilon_min = 0.01     # lower bound on the exploration probability
epsilon_decay = 0.995  # multiplicative decay applied to epsilon after every episode
learning_rate = 0.001  # Adam step size
batch_size = 32        # transitions sampled from replay memory per update
episodes = 100         # number of training episodes

# Reproducibility: seed every random source this cell can reach before the
# network is created, so weight initialisation, replay sampling, and
# exploration draws repeat across runs. Environment resets are not seeded here.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
env.action_space.seed(seed)

# Initialize network and optimizer
model = DQN()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop
# One episode per iteration: act epsilon-greedily, store each transition in
# replay memory, and run one minibatch update per environment step once the
# memory holds more than `batch_size` transitions.
for episode in range(episodes):
    # `reset()` may return either the observation or an `(observation, info)` tuple.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    state = torch.FloatTensor(state).unsqueeze(0)  # add a leading batch dimension
    total_reward = 0
    done = False

    while not done:
        # Choose action (epsilon-greedy)
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = model(state).argmax().item()

        # Take action (new Gym API returns 5 values)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated  # either one ends the episode loop
        next_state = torch.FloatTensor(next_state).unsqueeze(0)
        total_reward += reward

        # Save experience. Only true termination is stored as the terminal flag:
        # a time-limit truncation ends the episode but the next state still has
        # value, so the target must keep bootstrapping from it.
        memory.append((state, action, reward, next_state, float(terminated)))
        state = next_state

        # Train if enough memory
        if len(memory) > batch_size:
            batch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, terminals = zip(*batch)

            states = torch.cat(states)
            actions = torch.LongTensor(actions)
            rewards = torch.FloatTensor(rewards)
            next_states = torch.cat(next_states)
            terminals = torch.FloatTensor(terminals)

            # Q(s, a) for the actions that were actually taken.
            q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # The TD target is treated as a constant: computing it without
            # autograd keeps gradients from flowing through the bootstrap term.
            with torch.no_grad():
                next_q_values = model(next_states).max(1)[0]
                targets = rewards + gamma * next_q_values * (1 - terminals)

            # Update network
            loss = criterion(q_values, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decay epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode+1}, Reward: {total_reward}")

env.close()


Episode 1, Reward: 38.0
Episode 2, Reward: 30.0
Episode 3, Reward: 20.0
Episode 4, Reward: 28.0
Episode 5, Reward: 17.0
Episode 6, Reward: 19.0
Episode 7, Reward: 13.0
Episode 8, Reward: 25.0
Episode 9, Reward: 40.0
Episode 10, Reward: 11.0
Episode 11, Reward: 22.0
Episode 12, Reward: 15.0
Episode 13, Reward: 19.0
Episode 14, Reward: 22.0
Episode 15, Reward: 29.0
Episode 16, Reward: 21.0
Episode 17, Reward: 15.0
Episode 18, Reward: 41.0
Episode 19, Reward: 16.0
Episode 20, Reward: 26.0
Episode 21, Reward: 58.0
Episode 22, Reward: 29.0
Episode 23, Reward: 18.0
Episode 24, Reward: 33.0
Episode 25, Reward: 31.0
Episode 26, Reward: 28.0
Episode 27, Reward: 17.0
Episode 28, Reward: 31.0
Episode 29, Reward: 47.0
Episode 30, Reward: 28.0
Episode 31, Reward: 12.0
Episode 32, Reward: 17.0
Episode 33, Reward: 12.0
Episode 34, Reward: 22.0
Episode 35, Reward: 20.0
Episode 36, Reward: 19.0
Episode 37, Reward: 23.0
Episode 38, Reward: 15.0
Episode 39, Reward: 20.0
Episode 40, Reward: 20.0
Episode 4

## Testing the DQN
Let’s see how well the agent balances the pole after training!

In [12]:
# Greedy evaluation: run one episode choosing the highest-valued action at every
# step (no exploration) and report the total reward.
env = gym.make('CartPole-v1', render_mode='human')
try:
    # `reset()` may return either the observation or an `(observation, info)`
    # tuple, so accept both, as the training cell does.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = torch.FloatTensor(state).unsqueeze(0)
    total_reward = 0
    done = False

    while not done:
        env.render()  # See the action (may not show in Colab)
        with torch.no_grad():
            action = model(state).argmax().item()

        # `step()` returns 4 values under the legacy API and 5 under the newer
        # one; handle both so this cell does not depend on the Gym version.
        step_result = env.step(action)
        if len(step_result) == 5:
            state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            state, reward, done, _ = step_result

        state = torch.FloatTensor(state).unsqueeze(0)
        total_reward += reward

    print(f"Test Reward: {total_reward}")
finally:
    # Always release the render window, even if the rollout raises.
    env.close()

Test Reward: 161.0


## What I Learned
- **DQN Power**: A neural network can approximate Q-values over CartPole’s continuous 4-dimensional state, which a Q-table cannot enumerate without discretizing it.
- **PyTorch Vibe**: Used PyTorch to build and train the network, like in `ai_libraries.ipynb`.
- **RL Connection**: Combined Q-learning (`qlearning_frozenlake.ipynb`) with neural nets (`dqn_intro.md`).
- **Results**: Episode rewards went from ~20 to 100+ over 100 training episodes, and the greedy test run scored 161—the agent learned to balance!
- **Next Up**: Tweak the network (more layers?) or try my thermostat agent with a DQN.