In [1]:
import os 
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
import gym
import csv
import re 
import pickle
import glob
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
import cv2
from hashlib import sha256
from collections import OrderedDict
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecEnvWrapper
from stable_baselines3.common.env_util import make_vec_env
from gym import spaces
from stable_baselines3.common.callbacks import CheckpointCallback

In [3]:
# Create environment
env = gym.make('MontezumaRevenge', render_mode="rgb_array")

  logger.warn(


In [4]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [5]:
env.action_space

Discrete(18)

In [6]:
env.spec

EnvSpec(id='MontezumaRevenge-v4', entry_point='ale_py.env.gym:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'montezuma_revenge', 'obs_type': 'rgb', 'repeat_action_probability': 0.0, 'full_action_space': False, 'max_num_frames_per_episode': 108000, 'frameskip': (2, 5), 'render_mode': 'rgb_array'}, namespace=None, name='MontezumaRevenge', version=4)

In [7]:
env.reward_range = (-2, 2) 

In [8]:
env1 = GrayScaleObservation(env, keep_dim=True)
#env1 = make_vec_env(lambda: env1, n_envs=4) you wouldn't use the dummyvec if you uncomment this
env1 = DummyVecEnv([lambda: env1]) #create a vectorized environment  for parallelized training using multiole envs
env1 = VecFrameStack(env1, 4, channels_order='last') #consecutive frames are stacked together as a single input to the agent's policy network to make decisions based on the temporal dynamics of the game env.



In [9]:
def convert_state(state):
    # Extract dimensions of a single frame
    state = state.squeeze()
    height, width, num_frames = state.shape

    # New dimensions for downscaling 
    new_width = 8
    new_height = 11
    depth = 12  

    # Resize each frame individually
    resized_frames = []
    for i in range(num_frames):
        resized_frame = cv2.resize(state[:, :, i], (new_width, new_height), interpolation=cv2.INTER_AREA)
        resized_frame = ((resized_frame / 255.0) * depth).astype(np.uint8)
        resized_frames.append(resized_frame)
    
    # Stack the resized frames back together
    resized_state = np.stack(resized_frames, axis=-1)

    return resized_state.astype(np.uint8)


In [10]:
def make_reference(cell):
      cell_as_string = ''.join(cell.astype(int).astype(str).flatten())
      cell_as_bytes = cell_as_string.encode()
      cell_as_hash_bytes = sha256(cell_as_bytes)
      cell_as_hash_hex = cell_as_hash_bytes.hexdigest()
      cell_as_hash_int = int(cell_as_hash_hex, 16)
      cell_as_hash_string = str(cell_as_hash_int)
      return cell_as_hash_string

In [11]:
def get_latest_checkpoint(checkpoint_dir):
    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if re.match(r'no_expl_ppo_montezuma_\d+_steps.zip', f)]
    if not checkpoint_files:
        return None
    checkpoint_files.sort(key=lambda x: int(re.findall(r'(\d+)_steps', x)[0]), reverse=True)
    return os.path.join(checkpoint_dir, checkpoint_files[0])

In [12]:
class LogCallback(BaseCallback):
    def __init__(self, verbose=1, log_file='training_log.csv', log_file2='episode_info.csv'):
        super(LogCallback, self).__init__(verbose)
        self.visited_cells = set()
        self.visited_cells_per_episode = set()
        self.current_episode_reward = 0
        self.score = 0
        self.episode_count = 0
        self.iteration_count = 0  # Added iteration count
        self.log_file = log_file
        self.log_file2 = log_file2
        self.train_keys = ['train/entropy_loss', 'train/policy_gradient_loss', 
                           'train/value_loss', 'train/approx_kl', 'train/clip_fraction', 
                           'train/loss', 'train/explained_variance']
        self.metrics = []

        # Ensure the log files have headers
        self._initialize_log_files()

    def _initialize_log_files(self):
        # Check if log_file exists, if not, write headers
        if not os.path.exists(self.log_file):
            with open(self.log_file, 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Iteration', 'Episode', 'Cells', 'Score'] + self.train_keys)

        # Check if log_file2 exists, if not, write headers
        if not os.path.exists(self.log_file2):
            with open(self.log_file2, 'w', newline='') as ep_file:
                writer = csv.writer(ep_file)
                writer.writerow(['Episode', 'Cells per ep', 'Reward per ep', 'Score'])

    def save_state(self, save_path):
        state = {
            'visited_cells': self.visited_cells,
            'visited_cells_per_episode': self.visited_cells_per_episode,
            'current_episode_reward': self.current_episode_reward,
            'score': self.score,
            'episode_count': self.episode_count,
            'iteration_count': self.iteration_count  # Save iteration count
        }
        with open(save_path, 'wb') as f:
            pickle.dump(state, f)
        print(f"LogCallback state saved to {save_path}")

    def load_state(self, load_path):
        with open(load_path, 'rb') as f:
            state = pickle.load(f)
        self.visited_cells = state['visited_cells']
        self.visited_cells_per_episode = state['visited_cells_per_episode']
        self.current_episode_reward = state['current_episode_reward']
        self.score = state['score']
        self.episode_count = state['episode_count']
        self.iteration_count = state['iteration_count']  # Load iteration count
        print(f"LogCallback state loaded from {load_path}")

    def _on_step(self) -> bool:
        obs = self.locals['new_obs']
        cell = convert_state(obs) 
        ref = make_reference(cell)
        self.visited_cells.add(ref)
        self.visited_cells_per_episode.add(ref)
        
        reward = self.locals['rewards'][0]
        self.current_episode_reward += reward
        self.score += reward

        if self.locals['dones'][0]:
            self.episode_count += 1

            # Write episode information
            with open(self.log_file2, 'a', newline='') as ep_file:
                writer = csv.writer(ep_file)
                writer.writerow([
                    self.episode_count,
                    len(self.visited_cells_per_episode),
                    self.current_episode_reward,
                    self.score
                ])
     
            self.current_episode_reward = 0
            self.visited_cells_per_episode.clear()
        return True

    def _on_rollout_end(self) -> None:
        metrics = {key: self.model.logger.name_to_value.get(key, 0.0) for key in self.train_keys}
        self.metrics.append(metrics)
        self.iteration_count += 1  # Increment iteration count

        if self.verbose > 0:
            print(f"Iteration {self.iteration_count}: {metrics}")
            print(f"Episode {self.episode_count}, Cells Discovered: {len(self.visited_cells)}, Score: {self.score}")
        
        # Write metrics to CSV
        with open(self.log_file, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                self.iteration_count,  # Use iteration count
                self.episode_count, 
                len(self.visited_cells), 
                self.score
            ] + [metrics.get(key, 0.0) for key in self.train_keys])


In [13]:
class CustomCheckpointCallback(CheckpointCallback):
    def __init__(self, start_step=0, log_callback=None, *args, **kwargs):
        super(CustomCheckpointCallback, self).__init__(*args, **kwargs)
        self.start_step = start_step
        self.log_callback = log_callback

    def _on_step(self) -> bool:
        # Calculate the total number of steps considering the start step
        total_steps = self.num_timesteps + self.start_step

        # Save the model and log callback state if the condition is met
        if total_steps % self.save_freq == 0:
            save_path = os.path.join(self.save_path, f"{self.name_prefix}_{total_steps}_steps.zip")
            self.model.save(save_path)
            print(f"Saving model checkpoint to {save_path}")

            # Save LogCallback state to a single file
            if self.log_callback is not None:
                log_state_path = os.path.join(self.save_path, 'log_callback_state.pkl')
                self.log_callback.save_state(log_state_path)

        return True


In [14]:
# Ensure the checkpoint directory exists
checkpoint_dir = './no_exploration_ppo_models/'
os.makedirs(checkpoint_dir, exist_ok=True)

# File paths
latest_checkpoint = get_latest_checkpoint(checkpoint_dir)
state_file_path = os.path.join(checkpoint_dir, 'log_callback_state.pkl')

if latest_checkpoint:
    model1 = PPO.load(latest_checkpoint, env=env1)
    last_checkpoint_step = int(re.findall(r'(\d+)_steps', latest_checkpoint)[0])
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
    
    log_callback = LogCallback()
    if os.path.exists(state_file_path):
        log_callback.load_state(state_file_path)
    else:
        print("No previous log callback state found. Starting fresh.")
else:
    model1 = PPO('CnnPolicy', env1, learning_rate=2.5e-4, gamma=0.99, verbose=1)
    last_checkpoint_step = 0
    log_callback = LogCallback()
    print("Starting new training")

# Calculate the remaining timesteps to train
total_timesteps = 10000000
remaining_timesteps = total_timesteps - last_checkpoint_step

# Create the checkpoint callback
checkpoint_callback = CustomCheckpointCallback(
    start_step=last_checkpoint_step,
    save_freq=2048,
    save_path=checkpoint_dir,
    name_prefix='no_expl_ppo_montezuma',
    log_callback=log_callback
)

# Train the model with the callbacks
model1.learn(total_timesteps=remaining_timesteps, callback=[log_callback, checkpoint_callback])


Wrapping the env in a VecTransposeImage.
Resuming training from checkpoint: ./no_exploration_ppo_models/no_expl_ppo_montezuma_438272_steps.zip
LogCallback state loaded from ./no_exploration_ppo_models/log_callback_state.pkl


  if not isinstance(terminated, (bool, np.bool8)):


Saving model checkpoint to ./no_exploration_ppo_models/no_expl_ppo_montezuma_440320_steps.zip
LogCallback state saved to ./no_exploration_ppo_models/log_callback_state.pkl
Iteration 202: {'train/entropy_loss': 0.0, 'train/policy_gradient_loss': 0.0, 'train/value_loss': 0.0, 'train/approx_kl': 0.0, 'train/clip_fraction': 0.0, 'train/loss': 0.0, 'train/explained_variance': 0.0}
Episode 511, Cells Discovered: 131241, Score: 0.0
-----------------------------
| time/              |      |
|    fps             | 41   |
|    iterations      | 1    |
|    time_elapsed    | 48   |
|    total_timesteps | 2048 |
-----------------------------
Saving model checkpoint to ./no_exploration_ppo_models/no_expl_ppo_montezuma_442368_steps.zip
LogCallback state saved to ./no_exploration_ppo_models/log_callback_state.pkl
Iteration 203: {'train/entropy_loss': -0.8827146038413047, 'train/policy_gradient_loss': -0.08554378154512961, 'train/value_loss': 6.882635249283453e-06, 'train/approx_kl': 1.1783458, 'trai

KeyboardInterrupt: 