<a href="https://colab.research.google.com/github/dev-sundram/SAPIEN_ROBOTICS_ASSIGNMENT/blob/main/RL_atari_game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Step 1  Install Required Libraries

In [2]:
!pip install gym[atari,accept-rom-license] autorom
!AutoROM --accept-license


AutoROM will download the Atari 2600 ROMs.
They will be installed to:
	/usr/local/lib/python3.11/dist-packages/AutoROM/roms

Existing ROMs will be overwritten.
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/adventure.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/air_raid.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/alien.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/amidar.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/assault.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/asterix.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/asteroids.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/atlantis.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/atlantis2.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/backgammon.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/bank_heist.bin
Inst

#Step 2: Import Libraries

In [5]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
from collections import deque
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack


# Step 3: Define the DQN Mode

In [6]:
class DQN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        x = x / 255.0  # normalize
        conv_out = self.conv(x).view(x.size()[0], -1)
        return self.fc(conv_out)


# Step 4: Create the Replay Buffer

In [7]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)


# Step 5: Create Environments (Base and Dynamic)

In [8]:
import gym
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack

# Easy (base) environment
easy_env = gym.make("BreakoutNoFrameskip-v4", render_mode="rgb_array")
easy_env = GrayScaleObservation(easy_env, keep_dim=True)
easy_env = ResizeObservation(easy_env, (84, 84))
easy_env = FrameStack(easy_env, 4)

# Hard (dynamic) environment – for now, we just use same env unless you define a custom one
# You can replace this with your DynamicBreakoutEnv later
hard_env = gym.make("BreakoutNoFrameskip-v4", render_mode="rgb_array")
hard_env = GrayScaleObservation(hard_env, keep_dim=True)
hard_env = ResizeObservation(hard_env, (84, 84))
hard_env = FrameStack(hard_env, 4)


# Step 6: Setup Training Components

In [18]:
input_shape = (4, 84, 84,)
n_actions = easy_env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dqn = DQN(input_shape, n_actions).to(device)
optimizer = torch.optim.Adam(dqn.parameters(), lr=1e-4)
replay_buffer = ReplayBuffer(100000)

batch_size = 32
gamma = 0.99
epsilon = 0.1


# Step 7: Define train_dqn_step() Function

In [10]:
def train_dqn_step():
    if len(replay_buffer) < batch_size:
        return

    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.FloatTensor(state).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(reward).to(device)
    done = torch.FloatTensor(done).to(device)

    q_values = dqn(state)
    next_q_values = dqn(next_state)

    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    max_next_q_value = next_q_values.max(1)[0]
    expected_q_value = reward + gamma * max_next_q_value * (1 - done)

    loss = F.mse_loss(q_value, expected_q_value)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


# Step 8: Define Training Loop (With Curriculum Learning)

In [14]:
def run_episode(env, epsilon):
    state = env.reset()[0]
    episode_reward = 0

    while True:
        state_tensor = torch.FloatTensor(np.array(state)).unsqueeze(0).to(device)
        if state_tensor.dim() == 5:
          state_tensor = state_tensor.squeeze(-1)

        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            q_values = dqn(state_tensor)
            action = q_values.max(1)[1].item()

        next_state, reward, done, _, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        train_dqn_step()
        state = next_state
        episode_reward += reward

        if done:
            break

    return episode_reward


In [21]:
!pip install numpy==1.24.4

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
blosc2 3.3.3 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
dopamine-rl 4.1.2 requires ale-py>

# Step 9: Curriculum Training Loop

In [16]:
num_episodes = 300
threshold = 50  # if agent scores > threshold, increase difficulty

for episode in range(num_episodes):
    if episode < 150:
        env = easy_env
    else:
        env = hard_env  # harder env after learning base

    reward = run_episode(env, epsilon)
    print(f"Episode {episode}, Reward: {reward}")


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [32, 4, 84, 84, 1]