In [None]:
import os

# Must be set before the numeric libraries are imported in the next cell.
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# seen when several packages bundle their own copy - confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
import cv2
from hashlib import sha256
from collections import OrderedDict
import matplotlib
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecEnvWrapper
from stable_baselines3.common.env_util import make_vec_env
from gym import spaces
from stable_baselines3.common.callbacks import CheckpointCallback

In [None]:
# Create the raw (unwrapped) Montezuma's Revenge environment.
# NOTE(review): render_mode="rgb_array" presumably makes render() return
# frames as arrays instead of opening a window - confirm against the gym version in use.
env = gym.make('MontezumaRevenge-v0', render_mode="rgb_array")

In [None]:
# Display the observation space of the raw environment as the cell output.
env.observation_space

In [None]:
# Display the action space of the raw environment as the cell output.
env.action_space

In [None]:
# Display the registration spec of the raw environment as the cell output.
env.spec

In [None]:
# Each wrapping stage gets its own name: the factory lambda below then closes
# over a variable that is never rebound, and `env1` only ever refers to the
# final environment handed to the agent.
gray_env = GrayScaleObservation(env, keep_dim=True)

# Single-environment vectorized wrapper.
# NOTE(review): to train on several parallel environments, replace this with
# make_vec_env and a factory that builds a *fresh* env on every call; a lambda
# returning this one shared instance would presumably be wrong - confirm.
vec_env = DummyVecEnv([lambda: gray_env])

# Stack 4 consecutive frames along the last axis so the policy network sees
# the temporal dynamics of the game rather than a single still image.
env1 = VecFrameStack(vec_env, 4, channels_order='last')

In [None]:
def convert_state(state, new_width=8, new_height=11, depth=12):
    """Downscale and quantise a stack of frames into a coarse "cell" array.

    Singleton axes are squeezed away first, so a batch of one stacked
    observation shaped ``(1, height, width, num_frames)`` is accepted directly.
    Each frame is resized on its own with area interpolation and its
    intensities are rescaled from ``[0, 255]`` to ``[0, depth]`` (truncated).

    Args:
        state: Array that squeezes to ``(height, width, num_frames)`` or, for
            a single frame, to ``(height, width)``.
        new_width: Width of each downscaled frame.
        new_height: Height of each downscaled frame.
        depth: Value that an input intensity of 255 is mapped to.

    Returns:
        ``np.uint8`` array of shape ``(new_height, new_width, num_frames)``.

    Raises:
        ValueError: If the squeezed input is neither 2-D nor 3-D.
    """
    frames = np.asarray(state).squeeze()

    if frames.ndim == 2:
        # squeeze() also drops the frame axis when only one frame is stacked;
        # restore it so the per-frame loop below still applies.
        frames = frames[:, :, np.newaxis]
    if frames.ndim != 3:
        raise ValueError(
            "convert_state expects a single observation that squeezes to "
            f"(height, width[, num_frames]); got shape {np.shape(state)}"
        )

    num_frames = frames.shape[-1]

    # Resize each frame individually, then quantise its intensities.
    resized_frames = []
    for i in range(num_frames):
        # cv2.resize takes the target size as (width, height).
        resized_frame = cv2.resize(
            frames[:, :, i], (new_width, new_height), interpolation=cv2.INTER_AREA
        )
        resized_frames.append(((resized_frame / 255.0) * depth).astype(np.uint8))

    # Stack the resized frames back together along the last axis.
    return np.stack(resized_frames, axis=-1)


In [None]:
def make_reference(cell):
    """Return a string reference (decimal SHA-256 digest) identifying a cell.

    The cell's values are cast to ``int``, flattened, serialised with a comma
    between consecutive values and hashed. The separator matters: without it
    multi-digit values are ambiguous (``[1, 12]`` and ``[11, 2]`` would both
    serialise to ``"112"``) and distinct cells would share one reference.

    Args:
        cell: NumPy array of cell values.

    Returns:
        The SHA-256 digest of the serialised cell, as a base-10 string.
    """
    values = cell.astype(int).ravel().tolist()
    cell_as_string = ','.join(str(value) for value in values)
    cell_as_hash_hex = sha256(cell_as_string.encode()).hexdigest()
    # Callers receive the digest as a decimal string rather than hex.
    return str(int(cell_as_hash_hex, 16))

In [None]:
class TrackCellsCallback(BaseCallback):
    """Training callback that counts the distinct cells the agent has reached.

    On every step the newest observation is reduced with ``convert_state`` and
    hashed with ``make_reference``. The reference is stored in two sets: one
    covering the whole run and one covering only the current episode.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # References of every cell seen since training started.
        self.visited_cells = set()
        # References of the cells seen in the episode currently running.
        self.visited_cells_per_episode = set()
        # Size of `visited_cells` after each step (one entry per step).
        self.number_explored_over_timesteps = []
        # Size of `visited_cells_per_episode` at the end of each episode.
        self.episode_cells_counts = []

    def _on_step(self) -> bool:
        """Record the cell of the latest observation; always continue training."""
        # The whole observation is converted as one state, so this assumes a
        # single environment; only index 0 of `dones` is inspected below.
        obs = self.locals['new_obs']
        ref = make_reference(convert_state(obs))

        self.visited_cells.add(ref)
        self.visited_cells_per_episode.add(ref)
        self.number_explored_over_timesteps.append(len(self.visited_cells))

        # NOTE(review): vectorized envs presumably auto-reset, in which case
        # `new_obs` on a done step already belongs to the next episode yet is
        # counted towards the one that just ended - confirm this is acceptable.
        if self.locals['dones'][0]:  # the episode has ended
            self.episode_cells_counts.append(len(self.visited_cells_per_episode))
            self.visited_cells_per_episode.clear()  # start fresh for the next episode

        return True

    def plot_cell_count_over_timesteps(self):
        """Plot the cumulative number of unique cells visited after each step."""
        plt.plot(range(len(self.number_explored_over_timesteps)), self.number_explored_over_timesteps)
        plt.xlabel('Timesteps')
        # The series holds raw counts (set sizes), not percentages.
        plt.ylabel('Number of Unique Cells Visited')
        plt.title('Exploration Over Timesteps')
        plt.show()

    def plot_cell_count_over_episodes(self):
        """Plot the number of unique cells visited within each finished episode."""
        plt.plot(range(len(self.episode_cells_counts)), self.episode_cells_counts)
        plt.xlabel('Episodes')
        # The series holds raw counts (set sizes), not percentages.
        plt.ylabel('Number of Unique Cells Visited')
        plt.title('Exploration Over Episodes')
        plt.show()

In [None]:
class TrackCumulativeRewardsCallback(BaseCallback):
    """Training callback that records episode returns and per-block reward variance.

    Rewards of the first environment are summed per episode, and additionally
    grouped into consecutive blocks of ``block_size`` steps whose sample
    variance is stored once each block is full.
    """

    def __init__(self, verbose=0, block_size=100):
        super().__init__(verbose)
        # Running sum of rewards in the episode currently in progress.
        self.current_episode_reward = 0
        # Total reward of each finished episode.
        self.cumulative_rewards_per_episode = []
        # Rewards collected so far in the current block.
        self.block_rewards = []
        # Number of timesteps per block.
        self.block_size = block_size
        # Sample variance of each completed block.
        self.block_variances = []

    def _on_step(self) -> bool:
        """Accumulate the latest reward; always continue training."""
        reward = self.locals['rewards'][0]  # one reward per env; only the first is tracked
        self.current_episode_reward += reward
        self.block_rewards.append(reward)

        # Cumulative reward per episode.
        if self.locals['dones'][0]:  # the episode has ended
            self.cumulative_rewards_per_episode.append(self.current_episode_reward)
            self.current_episode_reward = 0  # reset for the next episode

        # Reward variance per block of timesteps.
        if len(self.block_rewards) == self.block_size:
            self.calculate_block_variance()

        return True

    def plot_cumulative_rewards_over_episodes(self):
        """Plot the total reward collected in each finished episode."""
        plt.plot(range(len(self.cumulative_rewards_per_episode)), self.cumulative_rewards_per_episode)
        plt.xlabel('Episodes')
        plt.ylabel('Cumulative Reward')
        plt.title('Cumulative Reward Over Episodes')
        plt.show()

    def calculate_block_variance(self):
        """Store the sample variance of the current block and start a new one.

        An empty block is ignored. A block holding a single reward has no
        defined sample variance (the n - 1 divisor would be zero), so 0.0 is
        recorded instead of raising ZeroDivisionError.
        """
        count = len(self.block_rewards)
        if count == 0:
            return

        if count == 1:
            variance = 0.0
        else:
            mean = sum(self.block_rewards) / count
            variance = sum((x - mean) ** 2 for x in self.block_rewards) / (count - 1)

        self.block_variances.append(variance)
        self.block_rewards = []  # reset for the next block

    def plot_block_variances(self):
        """Plot each block's reward variance against the step at which it began."""
        # Block i (0-based) starts at step i * block_size, matching the axis label.
        block_starts = [i * self.block_size for i in range(len(self.block_variances))]
        plt.plot(block_starts, self.block_variances)
        plt.xlabel('Block Start Step')
        plt.ylabel('Block Variance')
        plt.title('Evolution of Variance Across Blocks of Timesteps')
        plt.show()

    

In [None]:
# Build the PPO agent on the frame-stacked, vectorized environment.
# NOTE(review): the observations are images; confirm 'MlpPolicy' (rather than
# a CNN policy) is the intended choice.
model = PPO(
    'MlpPolicy',
    env1,
    verbose=1,
)

# Exploration and reward trackers, plus a periodic checkpoint writer.
cells_callback = TrackCellsCallback()
rewards_callback = TrackCumulativeRewardsCallback()
checkpoint_callback = CheckpointCallback(
    save_freq=10000,
    save_path='./models_montezuma_revenge/',
    name_prefix='ppo_montezuma',
)

# Train with all three callbacks attached.
model.learn(
    total_timesteps=10000000,
    callback=[cells_callback, rewards_callback, checkpoint_callback],
)


In [None]:
# To interrupt and later resume training, save the model, then load it again and continue from that point.

In [None]:
# Persist the trained model to disk under the name "ppo_montezuma_model"
# so it can be loaded again later.
model.save("ppo_montezuma_model")

In [None]:
# Draw every collected diagnostic: the two exploration curves first,
# then the episode returns and the per-block reward variance.
for draw_plot in (
    cells_callback.plot_cell_count_over_timesteps,
    cells_callback.plot_cell_count_over_episodes,
    rewards_callback.plot_cumulative_rewards_over_episodes,
    rewards_callback.plot_block_variances,
):
    draw_plot()