In [1]:
pip install numpy==1.26.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install gym-super-mario-bros==7.4.0

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install tensordict==0.3.0

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install torchrl==0.3.0

Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import random

import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

from nes_py.wrappers import JoypadSpace

# Super Mario environment for OpenAI Gym
import gym_super_mario_bros

from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

from sklearn.metrics import accuracy_score



Initializing Environment

In [7]:
if gym.__version__ < '0.26':
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
else:
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='human', apply_api_compatibility=True)

# env = JoypadSpace(env, [["right"], ["right", "A"]])
# env = JoypadSpace(env, [[], ["right"], ["right", "A"]])
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = JoypadSpace(env, SIMPLE_MOVEMENT)
print("Action space:", SIMPLE_MOVEMENT)





env.reset()
next_state, reward, done, trunc, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

  logger.warn(
  logger.warn(


Action space: [['NOOP'], ['right'], ['right', 'A'], ['right', 'B'], ['right', 'A', 'B'], ['A'], ['left']]
(240, 256, 3),
 0.0,
 False,
 {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'y_pos': 79}


  if not isinstance(terminated, (bool, np.bool8)):


Using wrappers to preprocess environment data to give our agent only information it needs: making the structure of the environment a 3D array: [4, 84, 84]

In [8]:
class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        for i in range(self._skip):
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, trunk, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def permute_orientation(self, observation):
        # permute [H, W, C] array to [C, H, W] tensor
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        observation = self.permute_orientation(observation)
        transform = T.Grayscale()
        observation = transform(observation)
        return observation


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transforms = T.Compose(
            [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)]
        )
        observation = transforms(observation).squeeze(0)
        return observation


# env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
if gym.__version__ < '0.26':
    env = FrameStack(env, num_stack=4, new_step_api=True)
else:
    env = FrameStack(env, num_stack=4)

#### Agent: Mario  
Goals  
- Makes optimal action policy based on current state  
- Learns a better action policy over time
- Q learning agent

#### Scripted Approach

In [9]:
def scripted_expert_policy(obs, step_count):
    if 0 <= (step_count % 20) < 17:  
        return 2  # ['right', 'A']
    return 1  # ['right']


In [10]:
def collect_scripted_expert_data(env, save_path="scripted_demo.pkl", episodes=5):
    import pickle
    import numpy as np

    MAX_STEPS = 1000

    with open(save_path, "wb") as f:
        for ep in range(episodes):
            state = env.reset()[0]
            done = False
            step_count = 0
            print(f"Episode {ep+1}")

            while not done and step_count < MAX_STEPS:

                action = scripted_expert_policy(state, step_count)
                pickle.dump((np.array(state), action), f)

                state, _, done, _, _ = env.step(action)
                step_count += 1

            print(f"Episode {ep+1} done in {step_count} steps.")
    

In [11]:
collect_scripted_expert_data(env, save_path="scripted_demo.pk1", episodes=5)

Episode 1
Episode 1 done in 1000 steps.
Episode 2
Episode 2 done in 1000 steps.
Episode 3
Episode 3 done in 1000 steps.
Episode 4
Episode 4 done in 1000 steps.
Episode 5
Episode 5 done in 1000 steps.


In [14]:
class FrameSequenceDataset(Dataset):
    def __init__(self, path, seq_len=8):
        import pickle

        frames = []
        actions = []

        with open(path, "rb") as f:
            while True:
                try:
                    state, action = pickle.load(f)
                    frames.append(torch.tensor(state, dtype=torch.float32) / 255.0)
                    actions.append(action)
                except EOFError:
                    break

        # sequences
        self.sequences = []
        self.labels = []
        for i in range(len(frames) - seq_len):
            sequence = torch.stack(frames[i:i+seq_len]).unsqueeze(1)  
            label = actions[i + seq_len - 1]
            self.sequences.append(sequence)
            self.labels.append(label)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], torch.tensor(self.labels[idx], dtype=torch.long)


In [15]:
dataset = FrameSequenceDataset("scripted_demo.pk1", seq_len=8)

In [16]:
def evaluate_scripted_agent(env, episodes=5, render=False):
    rewards = []
    distances = []
    steps_survived = []

    for ep in range(episodes):
        state = env.reset()[0]
        done = False
        total_reward = 0
        step_count = 0

        while not done:
            if render:
                env.render()

            action = scripted_expert_policy(state, step_count)
            state, reward, done, _, info = env.step(action)
            total_reward += reward
            step_count += 1

        rewards.append(total_reward)
        distances.append(info.get('x_pos', 0))
        steps_survived.append(step_count)

        print(f"[Scripted] Ep {ep+1}: Reward={total_reward}, Distance={distances[-1]}, Steps={step_count}")

    print(f"[Scripted] Avg Reward: {sum(rewards)/len(rewards):.2f}")
    print(f"[Scripted] Avg Distance: {sum(distances)/len(distances):.2f}")
    print(f"[Scripted] Avg Steps Survived: {sum(steps_survived)/len(steps_survived):.2f}")
    
    return rewards, distances, steps_survived


In [17]:
evaluate_scripted_agent(env, episodes=1, render=True)

  logger.warn(


[Scripted] Ep 1: Reward=3005.0, Distance=3161, Steps=2334
[Scripted] Avg Reward: 3005.00
[Scripted] Avg Distance: 3161.00
[Scripted] Avg Steps Survived: 2334.00


([3005.0], [3161], [2334])

#### Imitation Approach

In [10]:
import pandas as pd
from torch.utils.data import random_split

class ExpertDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = Path(root_dir)
        self.csv_files = list(self.root_dir.glob("*.csv"))
        self.transform = transform
        self.data = []
        
        for csv_file in self.csv_files:
            df = pd.read_csv(csv_file)
            for _, row in df.iterrows():
                img_path = self.root_dir / row['state_path']
                self.data.append((img_path, row['action']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, action = self.data[idx]
        image = Image.open(img_path).convert('L')  
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(action, dtype=torch.long)
      

In [11]:
!pip install keyboard 



In [11]:
import pygame
import numpy as np
from PIL import Image

def collect_expert_demos(save_dir="expert_demos"):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v3", 
                                   render_mode='human',
                                   apply_api_compatibility=True)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    
    pygame.init()
    screen = pygame.display.set_mode((256, 240))
    
    save_path = Path(save_dir)
    save_path.mkdir(exist_ok=True)
    demo_num = len(list(save_path.glob("*.csv")))
    csv_path = save_path / f"demo_{demo_num}.csv"
    df = pd.DataFrame(columns=["state_path", "action"])

    state = env.reset()
    clock = pygame.time.Clock()
    running = True
    action = 0
    
    print("Use Arrow Keys + Space to control!")
    print("→: Right | ←: Left | ↑: Jump | Q: Quit")

    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
        
        keys = pygame.key.get_pressed()
        
        if keys[pygame.K_RIGHT] and keys[pygame.K_UP]:
            action = 2  # Jump right
        elif keys[pygame.K_RIGHT]:
            action = 1  # Right
        elif keys[pygame.K_LEFT]:
            action = 6  # Left
        elif keys[pygame.K_UP]:
            action = 5  # Jump
        elif keys[pygame.K_q]:
            break
        else:
            action = 0
            
        next_state, reward, done, _, info = env.step(action)
        
        img_path = save_path / f"frame_{demo_num}_{len(df)}.png"
        Image.fromarray(next_state).save(img_path)
        df.loc[len(df)] = [img_path.name, action]
        
        if done:
            state = env.reset()
        
        clock.tick(60)  

    pygame.quit()
    env.close()
    df.to_csv(csv_path, index=False)
    print(f"Saved {len(df)} frames!")

!pip install pygame gym-super-mario-bros==7.4.0



In [12]:
collect_expert_demos()

  logger.warn(


Use Arrow Keys + Space to control!
→: Right | ←: Left | ↑: Jump | Q: Quit


  if not isinstance(terminated, (bool, np.bool8)):


Saved 18369 frames!


In [17]:
import torch
from torch.utils.data import DataLoader, random_split
import torch.optim as optim

class MarioNet(torch.nn.Module):
    def __init__(self, input_shape, num_actions):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, 8, stride=4),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, 4, stride=2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 64, 3, stride=1),
            torch.nn.ReLU(),
            torch.nn.Flatten(),
            torch.nn.Linear(3136, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, num_actions)
        )
        
    def forward(self, x):
        return self.net(x)

def train():
    BATCH_SIZE = 32
    LR = 0.00025
    EPOCHS = 10
    
    transform = T.Compose([
        T.Grayscale(),
        T.Resize((84, 84)),
        T.ToTensor(),
    ])
    
    dataset = ExpertDataset("expert_demos", transform=transform)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_set, val_set = random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)

    # initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MarioNet(input_shape=(1, 84, 84), num_actions=env.action_space.n)
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = torch.nn.CrossEntropyLoss()

    # train
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        for states, actions in train_loader:
            states = states.to(device)
            actions = actions.to(device)
            
            optimizer.zero_grad()
            outputs = model(states)
            loss = criterion(outputs, actions)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0.0
        correct = 0
        with torch.no_grad():
            for states, actions in val_loader:
                states = states.to(device)
                actions = actions.to(device)
                
                outputs = model(states)
                loss = criterion(outputs, actions)
                val_loss += loss.item()
                
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == actions).sum().item()
        
        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"Val Loss: {val_loss/len(val_loader):.4f}")
        print(f"Val Acc: {correct/len(val_set):.4f}\n")
    
    torch.save(model.state_dict(), "mario_net.pt")
    return model

In [26]:
def evaluate(model, episodes=5):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v3", 
                                   render_mode='human',
                                   apply_api_compatibility=True)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # metrics
    total_rewards = []
    total_distances = []
    steps_survived = []
    
    model.eval()
    
    for episode in range(episodes):
        raw_state = env.reset()
        state = raw_state[0] if isinstance(raw_state, tuple) else raw_state
        
        state_tensor = torch.tensor(state).float()
        if len(state_tensor.shape) == 2:  
            state_tensor = state_tensor.unsqueeze(0).unsqueeze(0)  
        elif len(state_tensor.shape) == 3:
            state_tensor = state_tensor.unsqueeze(0)  
        state_tensor = state_tensor.to(device)
        
        done = False
        episode_reward = 0
        episode_steps = 0
        max_x = 0
        
        while not done:
            with torch.no_grad():
                action = model(state_tensor).argmax().item()
            
            # step response
            step_response = env.step(action)
            next_state = step_response[0]
            reward = step_response[1]
            done = step_response[2]
            info = step_response[4]  
            
            # next state
            next_state_tensor = torch.tensor(next_state).float()
            if len(next_state_tensor.shape) == 2:
                next_state_tensor = next_state_tensor.unsqueeze(0).unsqueeze(0)
            elif len(next_state_tensor.shape) == 3:
                next_state_tensor = next_state_tensor.unsqueeze(0)
            next_state_tensor = next_state_tensor.to(device)
            
            # metrics
            episode_reward += reward
            episode_steps += 1
            max_x = max(max_x, info['x_pos'])
            env.render()
            
            state_tensor = next_state_tensor
        
        # results
        total_rewards.append(episode_reward)
        total_distances.append(max_x)
        steps_survived.append(episode_steps)
        
        print(f"Episode {episode+1}")
        print(f"  Reward: {episode_reward} | Distance: {max_x} | Steps: {episode_steps}")
    
    env.close()
    
    avg_reward = sum(total_rewards)/len(total_rewards)
    avg_distance = sum(total_distances)/len(total_distances)
    avg_steps = sum(steps_survived)/len(steps_survived)
    
    print("\nFinal Evaluation Metrics:")
    print(f"[Imitation Agent] Avg Reward: {avg_reward:.2f}")
    print(f"[Imitation Agent] Avg Distance: {avg_distance:.2f}")
    print(f"[Imitation Agent] Avg Steps Survived: {avg_steps:.2f}")
    
    return {
        'reward': avg_reward,
        'distance': avg_distance,
        'steps': avg_steps
    }

In [23]:
trained_model = train()
print(trained_model)

Epoch 1/10
Train Loss: 1.0098
Val Loss: 0.8786
Val Acc: 0.6662

Epoch 2/10
Train Loss: 0.8199
Val Loss: 0.7864
Val Acc: 0.7070

Epoch 3/10
Train Loss: 0.7585
Val Loss: 0.7408
Val Acc: 0.7226

Epoch 4/10
Train Loss: 0.6940
Val Loss: 0.6776
Val Acc: 0.7361

Epoch 5/10
Train Loss: 0.6435
Val Loss: 0.6292
Val Acc: 0.7535

Epoch 6/10
Train Loss: 0.5969
Val Loss: 0.5875
Val Acc: 0.7753

Epoch 7/10
Train Loss: 0.5544
Val Loss: 0.5494
Val Acc: 0.7869

Epoch 8/10
Train Loss: 0.5235
Val Loss: 0.5231
Val Acc: 0.8034

Epoch 9/10
Train Loss: 0.4906
Val Loss: 0.4939
Val Acc: 0.8170

Epoch 10/10
Train Loss: 0.4613
Val Loss: 0.4726
Val Acc: 0.8245

MarioNet(
  (net): Sequential(
    (0): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3136, out_features=512, bias=True)

In [27]:
evaluate(trained_model)

  state_tensor = torch.tensor(state).float()
  if not isinstance(terminated, (bool, np.bool8)):
  next_state_tensor = torch.tensor(next_state).float()
  logger.warn(


Episode 1
  Reward: -415.0 | Distance: 40 | Steps: 9623
Episode 2
  Reward: -415.0 | Distance: 40 | Steps: 9623
Episode 3
  Reward: -415.0 | Distance: 40 | Steps: 9623
Episode 4
  Reward: -415.0 | Distance: 40 | Steps: 9623
Episode 5
  Reward: -415.0 | Distance: 40 | Steps: 9623

Final Evaluation Metrics:
[Imitation Agent] Avg Reward: -415.00
[Imitation Agent] Avg Distance: 40.00
[Imitation Agent] Avg Steps Survived: 9623.00


{'reward': -415.0, 'distance': 40.0, 'steps': 9623.0}

##### Using CNN + LSTM approach

In [33]:
class SequentialExpertDataset(Dataset):
    def __init__(self, expert_data):
        self.sequences = []
        self.labels = []

        for state, action in expert_data:
            full_stack = torch.tensor(np.array(state), dtype=torch.float32) / 255.0  
            seq = full_stack.unsqueeze(1)  
            self.sequences.append(seq)     
            self.labels.append(action)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], torch.tensor(self.labels[idx], dtype=torch.long)


In [32]:
class ImitationCNNLSTM(nn.Module):
    def __init__(self, num_actions=2, cnn_output_size=512, hidden_size=256, lstm_layers=1):
        super(ImitationCNNLSTM, self).__init__()
        
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=8, stride=4),  
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, cnn_output_size),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(input_size=cnn_output_size, hidden_size=hidden_size, num_layers=lstm_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        """
        x shape: [batch_size, seq_len, 1, 84, 84]
        """
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)        
        features = self.cnn(x)             
        features = features.view(B, T, -1) 

        lstm_out, _ = self.lstm(features) 
        final_output = lstm_out[:, -1, :] 

        return self.fc(final_output)      


##### Splitting into train and test sets

In [34]:
expert_data = torch.load("expert_demo.pt")

train_data, test_data = train_test_split(expert_data, test_size=0.2, random_state=42)

train_dataset = SequentialExpertDataset(train_data)
test_dataset = SequentialExpertDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_lstm = ImitationCNNLSTM(num_actions=2).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

##### Training 

In [39]:
num_epochs = 10
for epoch in range(num_epochs):
    model_lstm.train()
    total_loss = 0

    for batch_idx, (sequences, actions) in enumerate(train_loader):
        sequences = sequences.to(device)  
        actions = actions.to(device)      

        # forward
        logits = model(sequences)         
        loss = loss_fn(logits, actions)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}] Completed - Average Loss: {avg_loss:.4f}")

torch.save(model_lstm.state_dict(), "cnn_lstm_imitation_model.pth")
print("Model saved as cnn_lstm_imitation_model.pth")

Epoch [1/10], Batch [1/5], Loss: 0.0016
Epoch [1] Completed - Average Loss: 0.0010
Epoch [2/10], Batch [1/5], Loss: 0.0004
Epoch [2] Completed - Average Loss: 0.0003
Epoch [3/10], Batch [1/5], Loss: 0.0002
Epoch [3] Completed - Average Loss: 0.0001
Epoch [4/10], Batch [1/5], Loss: 0.0001
Epoch [4] Completed - Average Loss: 0.0001
Epoch [5/10], Batch [1/5], Loss: 0.0001
Epoch [5] Completed - Average Loss: 0.0001
Epoch [6/10], Batch [1/5], Loss: 0.0000
Epoch [6] Completed - Average Loss: 0.0000
Epoch [7/10], Batch [1/5], Loss: 0.0000
Epoch [7] Completed - Average Loss: 0.0000
Epoch [8/10], Batch [1/5], Loss: 0.0000
Epoch [8] Completed - Average Loss: 0.0000
Epoch [9/10], Batch [1/5], Loss: 0.0000
Epoch [9] Completed - Average Loss: 0.0000
Epoch [10/10], Batch [1/5], Loss: 0.0000
Epoch [10] Completed - Average Loss: 0.0000
Model saved as cnn_lstm_imitation_model.pth


#### Evaluation

##### Evaluating accuracy of imitation model

In [28]:
from sklearn.model_selection import train_test_split

expert_data = torch.load("expert_demo.pt")
train_data, test_data = train_test_split(expert_data, test_size=0.2, random_state=42)

train_dataset = ExpertDataset(train_data)
test_dataset = ExpertDataset(test_data)

In [41]:
def evaluate_imitation_model(model, env, episodes=5, device="cpu", render=False):
    model.eval()
    rewards = []
    distances = []
    steps_survived = []

    for ep in range(episodes):
        state = env.reset()[0]
        done = False
        total_reward = 0
        step_count = 0

        while not done:
            if render:
                env.render()

            state_tensor = torch.tensor(np.array(state), dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(device)
            state_tensor = state_tensor / 255.0


            # predicting the action
            with torch.no_grad():
                logits = model(state_tensor)
                action = logits.argmax(dim=1).item()

            state, reward, done, _, info = env.step(action)
            total_reward += reward
            step_count += 1

        rewards.append(total_reward)
        distances.append(info.get('x_pos', 0))
        steps_survived.append(step_count)

        print(f"Episode {ep+1}: Reward={total_reward}, Distance={distances[-1]}, Steps={step_count}")

    print(f"Avg Reward: {sum(rewards)/len(rewards):.2f}")
    print(f"Avg Distance: {sum(distances)/len(distances):.2f}")
    print(f"Avg Steps Survived: {sum(steps_survived)/len(steps_survived):.2f}")


In [42]:
evaluate_imitation_model(model_lstm, env, episodes=5, device=device, render=True)

Episode 1: Reward=231.0, Distance=296, Steps=40
Episode 2: Reward=231.0, Distance=296, Steps=40
Episode 3: Reward=231.0, Distance=296, Steps=40
Episode 4: Reward=231.0, Distance=296, Steps=40
Episode 5: Reward=231.0, Distance=296, Steps=40
Avg Reward: 231.00
Avg Distance: 296.00
Avg Steps Survived: 40.00


#### Q-Learning

In [None]:
class MarioDQN(nn.Module):
    def __init__(self, num_actions):
        super(MarioDQN, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

    def forward(self, x):
        return self.net(x)
num_actions = len(SIMPLE_MOVEMENT)  
policy_net = MarioDQN(num_actions).to(device)
target_net = MarioDQN(num_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

replay_buffer = deque(maxlen=10000)
batch_size = 64
gamma = 0.99
epsilon_start = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
learning_rate = 0.00025
update_interval = 4
target_update_interval = 1000  
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

def preprocess_state(state):
    state = torch.tensor(np.array(state), dtype=torch.float32).to(device)
    state = state.permute(2, 0, 1).unsqueeze(0) / 255.0 
    return state

def choose_action(state, epsilon):
    if random.random() < epsilon:
        return random.randint(0, num_actions-1) 
    state = preprocess_state(state)
    with torch.no_grad():
        return policy_net(state).argmax().item()

def train_dqn():
    if len(replay_buffer) < batch_size:
        return
    
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    
    states = torch.stack([preprocess_state(s) for s in states]).squeeze(1)
    next_states = torch.stack([preprocess_state(s) for s in next_states]).squeeze(1)
    actions = torch.tensor(actions, dtype=torch.long).to(device)
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    dones = torch.tensor(dones, dtype=torch.float32).to(device)
    
    current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze()
    
    with torch.no_grad():
        next_actions = policy_net(next_states).argmax(dim=1)
        next_q = target_net(next_states).gather(1, next_actions.unsqueeze(1)).squeeze()
        target = rewards + gamma * next_q * (1 - dones)
    
    # Compute  and update
    loss = loss_fn(current_q, target)
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
    optimizer.step()

epsilon = epsilon_start
episodes = 500
for ep in range(episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0
    total_steps = 0
    
    while not done:
        action = choose_action(state, epsilon)
        next_state, reward, done, _, info = env.step(action)
        
        reward = np.clip(reward, -1, 1)
        
        replay_buffer.append((state, action, reward, next_state, done))
        total_steps += 1
        
        if total_steps % update_interval == 0:
            train_dqn()
            
        if total_steps % target_update_interval == 0:
            target_net.load_state_dict(policy_net.state_dict())
            
        state = next_state
        total_reward += reward
    
    epsilon = max(epsilon_min, epsilon * epsilon_decay)