In [None]:
!pip install gym torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
pip install numpy==1.23.5


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xarray 2025.3.1 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.23.5 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 r

# DQN Agent for CartPole
Building a Deep Q-Network (DQN) agent in PyTorch to master CartPole, focusing on how the agent interacts with the environment.

## DQN Agent Code
Coding a DQN agent that interacts with CartPole (reset, step, rewards) to learn balancing.

**Focus**: How the agent talks to the environment to get states and rewards.

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import numpy as np

# Create the CartPole environment and read the problem dimensions from its spaces
env = gym.make('CartPole-v1')
state_size, action_size = (
    env.observation_space.shape[0],  # 4 observation components
    env.action_space.n,              # 2 discrete actions
)

# Experience replay buffer; the oldest transitions are dropped once 2000 are stored
memory = deque(maxlen=2000)

# DQN Neural Network
class DQN(nn.Module):
    """Three-layer fully connected Q-network: state -> one Q-value per action.

    Args:
        input_dim: Size of the state vector. Defaults to the notebook-level
            ``state_size`` when omitted, so ``DQN()`` keeps working as before.
        output_dim: Number of discrete actions. Defaults to the notebook-level
            ``action_size`` when omitted.
        hidden_dim: Width of both hidden layers (24 by default, the original value).
    """

    def __init__(self, input_dim=None, output_dim=None, hidden_dim=24):
        super().__init__()
        # Resolve the fallbacks at call time so the globals are only needed
        # when the caller does not pass explicit sizes.
        if input_dim is None:
            input_dim = state_size
        if output_dim is None:
            output_dim = action_size
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw (unactivated) Q-values for a batch of states ``x``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.005
batch_size = 32
episodes = 100
seed = 42  # Fixed seed so weight init, exploration and replay sampling repeat across runs

# Seed every RNG this notebook draws from *before* the network is created,
# otherwise the initial weights differ on every run.
random.seed(seed)            # epsilon-greedy coin flips and replay sampling
np.random.seed(seed)         # numpy global RNG
torch.manual_seed(seed)      # network weight initialisation
env.action_space.seed(seed)  # random actions from env.action_space.sample()
# NOTE(review): the environment's own reset randomness is not seeded here.

# Initialize network and optimizer
model = DQN()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop with environment interaction.
# Each episode: act epsilon-greedily, store every transition in `memory`,
# and take one gradient step per environment step once enough transitions exist.
for episode in range(episodes):
    # ENVIRONMENT: Start new episode.
    # gym>=0.26 returns (observation, info); older releases return the observation only.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = torch.FloatTensor(state).unsqueeze(0)  # Shape: [1, 4]
    total_reward = 0
    done = False

    while not done:
        # Choose action (epsilon-greedy)
        if random.random() < epsilon:
            action = env.action_space.sample()  # Random action
        else:
            with torch.no_grad():
                q_values = model(state)
                action = q_values.argmax().item()

        # ENVIRONMENT: Take action, get next state/reward.
        # gym>=0.26 returns a 5-tuple (terminated and truncated are separate);
        # older releases return a 4-tuple with a single `done` flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
            terminated = done
        next_state = torch.FloatTensor(next_state).unsqueeze(0)
        total_reward += reward

        # Save experience for replay. The stored flag decides whether the target
        # bootstraps from next_state, so it records termination as a float.
        memory.append((state, action, reward, next_state, float(terminated)))
        state = next_state

        # Train if enough memory
        if len(memory) > batch_size:
            batch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.cat(states)
            actions = torch.LongTensor(actions)
            rewards = torch.FloatTensor(rewards)
            next_states = torch.cat(next_states)
            dones = torch.FloatTensor(dones)

            # Q(s, a) for the actions that were actually taken
            q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # The bootstrap target must be a constant for this update. Without
            # no_grad the loss would also backpropagate through max_a' Q(s', a'),
            # pulling the target toward the prediction instead of the reverse.
            with torch.no_grad():
                next_q_values = model(next_states).max(1)[0]
                targets = rewards + gamma * next_q_values * (1 - dones)

            # Update network
            loss = criterion(q_values, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decay epsilon once per episode, never below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode+1}, Reward: {total_reward}")

env.close()

Episode 1, Reward: 22.0
Episode 2, Reward: 28.0
Episode 3, Reward: 17.0
Episode 4, Reward: 17.0
Episode 5, Reward: 14.0
Episode 6, Reward: 9.0
Episode 7, Reward: 18.0
Episode 8, Reward: 24.0
Episode 9, Reward: 12.0
Episode 10, Reward: 15.0
Episode 11, Reward: 31.0
Episode 12, Reward: 11.0
Episode 13, Reward: 29.0
Episode 14, Reward: 41.0
Episode 15, Reward: 67.0
Episode 16, Reward: 27.0
Episode 17, Reward: 17.0
Episode 18, Reward: 14.0
Episode 19, Reward: 23.0
Episode 20, Reward: 17.0
Episode 21, Reward: 15.0
Episode 22, Reward: 13.0
Episode 23, Reward: 12.0
Episode 24, Reward: 16.0
Episode 25, Reward: 28.0
Episode 26, Reward: 20.0
Episode 27, Reward: 15.0
Episode 28, Reward: 27.0
Episode 29, Reward: 49.0
Episode 30, Reward: 27.0
Episode 31, Reward: 16.0
Episode 32, Reward: 43.0
Episode 33, Reward: 32.0
Episode 34, Reward: 43.0
Episode 35, Reward: 20.0
Episode 36, Reward: 27.0
Episode 37, Reward: 87.0
Episode 38, Reward: 12.0
Episode 39, Reward: 48.0
Episode 40, Reward: 33.0
Episode 41

## Testing the DQN Agent
Let’s see the agent balance that pole using what it learned!

In [None]:
# Roll out one greedy (no exploration) episode with the trained network.
env = gym.make('CartPole-v1')
try:
    # gym>=0.26 returns (observation, info); older releases return the observation only.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = torch.FloatTensor(state).unsqueeze(0)
    total_reward = 0
    done = False

    while not done:
        env.render()  # Try to visualize (may not show in Colab)
        with torch.no_grad():
            q_values = model(state)
            action = q_values.argmax().item()

        # ENVIRONMENT: Key interaction. Accept both the 5-tuple (gym>=0.26)
        # and the 4-tuple (older gym) return shapes of step().
        step_result = env.step(action)
        if len(step_result) == 5:
            state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            state, reward, done, _ = step_result
        state = torch.FloatTensor(state).unsqueeze(0)
        total_reward += reward

    print(f"Test Reward: {total_reward}")
finally:
    # Release the environment even if rendering or stepping raised.
    env.close()

Test Reward: 98.0


## Learnings
- **Environment Interaction**: The agent uses `env.reset()` to start and `env.step(action)` to move, getting states/rewards from CartPole—like a convo with the game!
- **DQN Magic**: The neural network estimates a Q-value for each action and picks the best one, which works far better than random moves or a Q-table (`qlearning_frozenlake.ipynb`).
- **PyTorch RL**: Mixed PyTorch (`ai_libraries.ipynb`) with RL (`dqn_intro.md`) to train the agent.
- **Results**: Rewards reached roughly 100–200 (the test episode above scored 98), far better than a random policy (~20). The agent learned to balance!
- **Real Problem**: This is like teaching my thermostat (`day1_summary.md`) to adjust temps based on sensor data.
- **Next**: Add a target network (like `dqn_intro.md`) or try a thermostat DQN.