<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/Curiosity_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Curiosity-Driven Exploration

#### setup

In [1]:
# https://github.com/Yangyangii/Curiosity-Driven-A2C
# https://colab.research.google.com/github/Yangyangii/Curiosity-Driven-A2C/blob/master/Curiosity.ipynb

%pip install -U gym
%pip install -U gym[atari,accept-rom-license]
# !pip install gym[box2d]
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim
import collections
device = "cuda" if torch.cuda.is_available() else "cpu"

!pip install gym-super-mario-bros nes-py
# https://github.com/Kautenja/gym-super-mario-bros
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT

!pip install colabgymrender


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym
  Downloading gym-0.24.1.tar.gz (696 kB)
[K     |████████████████████████████████| 696 kB 4.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.7-py3-none-any.whl (2.7 kB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.24.1-py3-none-any.whl size=793152 sha256=f9975e58674834bdb620cd048b4fe68178d64db85d86a3275dd710d7b2505455
  Stored in directory: /root/.cache/pip/wheels/18/0e/54/63d9f3d16ddf0fec1622e90d28140df5e6016bcf8ea920037d
Successfully built gym
Installing collected packages: gym-notices, gym
  Attempting uninstall: gym
    Found existing installation: gym 0.17.3
    Uninstalling gym-0.17.3:
    

  f"The environment {id} is out of date. You should consider "
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colabgymrender
  Downloading colabgymrender-1.1.0.tar.gz (3.5 kB)
Building wheels for collected packages: colabgymrender
  Building wheel for colabgymrender (setup.py) ... [?25l[?25hdone
  Created wheel for colabgymrender: filename=colabgymrender-1.1.0-py3-none-any.whl size=3132 sha256=f993fe3667702119d2813b33ac12a214bd38231000c98c91e4ff880a1f1f5b3f
  Stored in directory: /root/.cache/pip/wheels/f1/0a/2a/86955ea711b461ab7918236fed2568733f75ed677d0524b56c
Successfully built colabgymrender
Installing collected packages: colabgymrender
Successfully installed colabgymrender-1.1.0


#### gym wrappers

In [2]:
# https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/2_gym_wrappers_saving_loading.ipynb
import gym
class SparseEnv(gym.Wrapper): #https://alexandervandekleut.github.io/gym-wrappers/
    def __init__(self, env):
        super().__init__(env)
        self.env = env
        self.total_rewards = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        self.total_rewards += reward
        if done: reward = self.total_rewards
        else: reward = 0
        return observation, reward, done, info
    def reset(self):
        self.total_rewards = 0
        return self.env.reset()
# env = SparseEnv(gym.make("LunarLander-v2"))

class MarioSparse(gym.Wrapper):
    def __init__(self, env):
        # super().__init__(env)
        super(MarioSparse, self).__init__(env)
        self.env = env
        self.total_score = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        life = info['life']
        score = info['score']
        self.total_score += score
        if life<2:
            print("MarioSparse: died")
            # return observation, score, True, info # lost one life, end env
            done = True
        # else:
            # self.total_score = 0
        return observation, score, done, info
    def reset(self):
        self.total_score = 0
        return self.env.reset()
# env = MarioSparse(env)

class MarioEarlyStop(gym.Wrapper):
    def __init__(self, env):
        # super().__init__(env)
        super(MarioEarlyStop, self).__init__(env)
        self.env = env
        self.max_pos = 0
        self.count_step = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        x_pos = info['x_pos']
        if x_pos <= self.max_pos: self.count_step += 1
        else:
            self.max_pos = x_pos
            self.count_step = 0
        if self.count_step > 500:
            print("MarioEarlyStop: early stop ", self.max_pos)
            # return observation, reward, True, info # early stop
            done = True
        # else:
        return observation, reward, done, info
    def reset(self):
        self.max_pos = 0
        self.count_step = 0
        return self.env.reset()
# env = MarioEarlyStop(env)


#### model

In [3]:
# @title Conv_Encoder
class Conv_Encoder(nn.Module):
    # def __init__(self):
    def __init__(self, in_channels=1):
        super(Conv_Encoder, self).__init__()
        self.conv_encoder = nn.Sequential( # embed pi (240, 256, 3) -> 256 when flattened
            nn.Conv2d(in_channels, 8, 3, stride=2, padding=1), nn.ELU(),
            # nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(8, 16, 5, stride=2, padding=2), nn.ELU(),
            nn.AdaptiveAvgPool2d((64,64)),
            nn.Conv2d(16, 8, 7, stride=2, padding=3), nn.ELU(),
            nn.Conv2d(8, 1, 5, stride=2, padding=2), nn.ELU(),
            # # nn.Conv2d(in_channels, out_channels=1, kernel_size=3, stride=2, padding=1),
            # nn.ReLU(),
            )
    def forward(self, x): # in [4, 3, 224, 224]
        # print("conv forward shape",x.shape)
        # x=x.squeeze()
        # x = torch.transpose(x, 1,2)
        # x = torch.transpose(x, 0,1)
        x = torch.transpose(x, -2,-1)
        x = torch.transpose(x, -3,-2)
        # print("conv forward",x.shape)
        x = self.conv_encoder(x)
        x=x.flatten(start_dim=1)
        # x = x.view(-1, 16 * 5 * 5)
        return x # out [batch, 256]

conv=Conv_Encoder(3)
# x=torch.rand(1, 210, 160, 3)#.squeeze()
x=torch.rand(2, 210, 160, 3)#.squeeze()
# x = torch.transpose(x, 1,2)
# x = torch.transpose(x, 0,1)
print(x.shape)
out=conv(x)
print(out.shape)


torch.Size([2, 210, 160, 3])
torch.Size([2, 256])


In [4]:

class Actor(nn.Module):
    def __init__(self, n_actions, space_dims, hidden_dims): #cartpole 2 4 32
        super(Actor, self).__init__()
        # print("init actor",n_actions, space_dims, hidden_dims)
        space_dims=256
        self.feature_extractor = nn.Sequential(
            nn.Linear(space_dims, hidden_dims),
            nn.ReLU(True),
        )
        self.actor = nn.Sequential(
            nn.Linear(hidden_dims, n_actions),
            nn.Softmax(dim=-1),
        )
        self.conv_encoder = Conv_Encoder(3)
    
    def forward(self, x):
        # print("forward1",x.shape) # cp [1, 4] mon [1, 210, 160, 3]
        x = self.conv_encoder(x)
        # print("forward2",x.shape) # cp [1, 4]
        features = self.feature_extractor(x)
        policy = self.actor(features)
        return policy
    
class Critic(nn.Module):
    def __init__(self, space_dims, hidden_dims):
        super(Critic, self).__init__()
        space_dims=256
        self.feature_extractor = nn.Sequential(
            nn.Linear(space_dims, hidden_dims),
            nn.ReLU(True),
        )
        self.critic = nn.Linear(hidden_dims, 1)
        self.conv_encoder = Conv_Encoder(3)
    
    def forward(self, x):
        x = self.conv_encoder(x)
        features = self.feature_extractor(x)
        est_reward = self.critic(features)
        return est_reward




class InvModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(InvModel, self).__init__()
        self.inv_lstm = nn.LSTMCell(phist_size, hidden_size)
        self.fc = nn.Linear(phist_size+hidden_size, n_actions)
        self.inv_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
        # self.inv_latent = (torch.zeros(1, 512), torch.zeros(1, 512))
        
    def reset_hidden(self):
        self.inv_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
        # self.inv_latent = (torch.zeros(1, 512), torch.zeros(1, 512))
        
    def forward(self, phist1): # [1, 512]
        features = torch.cat([self.inv_latent[0],phist1], dim=1)
        features = features.view(1, -1) # (1, hidden_dims)
        # print("InverseModel", features.shape)
        athat = self.fc(features) # (1, n_actions)
        self.inv_latent = self.inv_lstm(phist,self.inv_latent)
        return athat

class FwdModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(FwdModel, self).__init__()
        self.fwd_lstm = nn.LSTMCell(phist_size, hidden_size)
        self.fc = nn.Linear(hidden_dims+n_actions, phist_size)
        self.eye = torch.eye(n_actions, device=device)
        self.fwd_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
        # self.fwd_latent = (torch.zeros(1, 512), torch.zeros(1, 512))
        
    def reset_hidden(self):
        self.fwd_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
        # self.fwd_latent = (torch.zeros(1, 512), torch.zeros(1, 512))

    def forward(self, action, phist):
        self.fwd_latent = self.fwd_lstm(phist, self.fwd_latent)
        # print("ForwardModel",action.shape, fwd_latent[0].shape)
        # print("ForwardModel2",self.eye[action], fwd_latent[0].shape)
        x = torch.cat([self.eye[action], self.fwd_latent[0]], dim=-1) # (1, n_actions+hidden_dims)
        phihat1 = self.fc(x) # (1, hidden_dims)
        return phihat1




    #     forward_pred_err = 1/2 * self.forward_loss(forward_pred, enc_state2.detach()).sum(dim=1).unsqueeze(dim=1)
    #     # calc prediction error
    #     pred_action = self.inverse_model(enc_state1, enc_state2) 
    #     inverse_pred_err = self.inverse_loss(pred_action, action.flatten().long()).unsqueeze(dim=1)    
    #     return forward_pred_err, inverse_pred_err

    # def update_ICM(self, forward_err, inverse_err):
    #     self.optimizer.zero_grad()
    #     loss = ((1. - self.beta)*inverse_err + self.beta*forward_err).mean()
    #     #print(loss)
    #     loss.backward(retain_graph=True)
    #     clip_grad_norm_(self.inverse_model.parameters(),1)
    #     clip_grad_norm_(self.forward_model.parameters(),1)
    #     self.optimizer.step()
    #     return loss.detach().cpu().numpy()


def to_tensor(x, dtype=None):
    # return torch.tensor(x, dtype=dtype).unsqueeze(0)
    return torch.tensor(x.copy(), dtype=dtype).unsqueeze(0)



#### wwwwwwwwwwwwwwwwww

In [5]:
beta = 0.2
lamda = 0.1
eta = 100.0 # scale factor for intrinsic reward
gamma = 0.99
lr_critic = 0.005
lr_actor = 0.001
lr_icm = 0.001
# max_eps = 1000
sparse_mode = True


# env = gym.make('CartPole-v1')
# env = gym.make('PongDeterministic-v4')
# env = gym.make('LunarLander-v2')
# env = gym.make('MontezumaRevengeDeterministic-v4')
# env = SparseEnv(env)
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT
env = MarioSparse(env)
env = MarioEarlyStop(env)


  f"The environment {id} is out of date. You should consider "
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


In [6]:

# Actor Critic
space_dims = env.observation_space.shape[0]
# space_dims = env.observation_space.shape
# cp 2 (4,) 32 mon 18 (210, 160, 3) 32
actor = Actor(n_actions=env.action_space.n, space_dims=space_dims, hidden_dims=32).to(device)
critic = Critic(space_dims=space_dims, hidden_dims=32).to(device)

conv_encode = Conv_Encoder(3).to(device)
phist_size=256
hidden_size=512

fwd_model = FwdModel(env.action_space.n, hidden_size, phist_size).to(device) #256 phist_size(no lstm) hidden_size(lstm)
inv_model = InvModel(env.action_space.n, hidden_size, phist_size).to(device) #32

# Actor Critic
a_optim = torch.optim.Adam(actor.parameters(), lr=lr_actor)
c_optim = torch.optim.Adam(critic.parameters(), lr=lr_critic)

# ICM
# icm_params = list(feature_extractor.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
# icm_params = list(forward_model.parameters()) + list(inverse_model.parameters())
icm_params = list(fwd_model.parameters()) + list(inv_model.parameters())
icm_optim = torch.optim.Adam(icm_params, lr=lr_icm)



In [7]:


def pg_loss(action_prob, reward): return -torch.mean(torch.log(action_prob+1e-6)*reward)
mse_loss = nn.MSELoss()
xe_loss = nn.CrossEntropyLoss()

global_step = 0
reward_lst = []
mva_lst = []
mva = 0.
avg_ireward_lst = []

actor.train()

for n_eps in range(100):
    st1 = to_tensor(env.reset(), dtype=torch.float).to(device).detach()
    done = False
    score = 0
    ireward_lst = []
    loss=0
    while not done:
        st = st1
        a_optim.zero_grad()
        c_optim.zero_grad()
        icm_optim.zero_grad()
        
        # estimate action with policy network
        policy = actor(st) # [18]
        # action = select_action(policy.detach().numpy()[0]) #only for lame
        # print(policy.shape)
        prob = nn.functional.softmax(policy, dim=-1) #1
        action = prob.multinomial(1).data
        
        # interaction with environment
        # st1, reward, done, info = env.step(action)
        st1, reward, done, info = env.step(action.item())
        st1 = to_tensor(st1, dtype=torch.float).to(device).detach()
        advantages = torch.zeros_like(policy)
        # extrinsic_reward = to_tensor([0.], dtype=torch.float) if sparse_mode else to_tensor([reward], dtype=torch.float)
        extrinsic_reward = torch.tensor(reward, dtype=torch.float, device=device).view(1,1)
        
        val = critic(st).squeeze(0) #[1,1]
        val1 = critic(st1).squeeze(0)

        phist = conv_encode(st)
        phist1 = conv_encode(st1)
        # athat, inv_latent = inv_model(phist1, inv_latent)
        athat = inv_model(phist1)
        # phihat, fwd_latent = fwd_model(action.squeeze(0), phist, fwd_latent)
        phihat = fwd_model(action.squeeze(0), phist)
        # print(phihat.shape, phist1.shape)
        # forward_loss = mse_loss(phihat, phist1)
        forward_loss = mse_loss(phihat.detach(), phist1.detach())


        # print("2",athat.shape, action.view(-1).shape)
        # inverse_loss = xe_loss(athat, t_action.view(-1))
        # inverse_loss = xe_loss(athat.squeeze(1), t_action.view(-1))
        inverse_loss = xe_loss(athat.squeeze(1), action.view(-1))
        icm_loss = (1-beta)*inverse_loss + beta*forward_loss
        # icm_loss = beta*forward_loss #problem
        # icm_loss = (1-beta)*inverse_loss #prob;em
        
        # Reward
        intrinsic_reward = eta*forward_loss.detach()
        # if done:
        #     total_reward = -100 + intrinsic_reward if score < 499 else intrinsic_reward
        #     advantages[0, action] = total_reward - val
        #     c_target = total_reward
        # else:
        # print(extrinsic_reward.dtype , intrinsic_reward.dtype)
        total_reward = extrinsic_reward + intrinsic_reward
        advantages[0, action] = total_reward + gamma*val1 - val
        c_target = total_reward + gamma*val1
        
        # Loss - Actor Critic
        # print("3",policy.shape, advantages.detach().shape)
        actor_loss = pg_loss(policy, advantages.detach())
        # print("4",val.shape, c_target.detach().squeeze(0).shape)
        critic_loss = mse_loss(val, c_target.detach().squeeze(0))
        ac_loss = actor_loss + critic_loss
        # Update
        # loss = lamda*ac_loss + icm_loss
        loss += lamda*ac_loss + icm_loss
        # loss.backward()
        # loss.backward(retain_graph=True)
        # # print("passed")
        # icm_optim.step()
        # a_optim.step()
        # c_optim.step()
        if not done:
            score += reward
        ireward_lst.append(intrinsic_reward.item())
        global_step += 1
    loss.backward()
    icm_optim.step()
    a_optim.step()
    c_optim.step()
    inv_model.reset_hidden() # i added
    fwd_model.reset_hidden()
    avg_intrinsic_reward = sum(ireward_lst) / len(ireward_lst)
    mva = 0.95*mva + 0.05*score
    reward_lst.append(score)
    avg_ireward_lst.append(avg_intrinsic_reward)
    mva_lst.append(mva)
    print('Episodes: {}, AVG Score: {:.3f}, Score: {}, AVG reward i: {:.6f}'.format(n_eps, mva, score, avg_intrinsic_reward))
    del val,val1, st1, reward, done, info, st, action, prob,athat, phihat
    del intrinsic_reward, total_reward ,extrinsic_reward
    del ac_loss , actor_loss , critic_loss, icm_loss,inverse_loss ,forward_loss


MarioEarlyStop: early stop  45
Episodes: 0, AVG Score: 0.000, Score: 0, AVG reward i: 4956.118231
MarioSparse: died
Episodes: 1, AVG Score: 0.000, Score: 0, AVG reward i: 4786.830893
MarioEarlyStop: early stop  435
Episodes: 2, AVG Score: 0.000, Score: 0, AVG reward i: 4682.334605
MarioEarlyStop: early stop  82
Episodes: 3, AVG Score: 0.000, Score: 0, AVG reward i: 4886.280052
MarioSparse: died
Episodes: 4, AVG Score: 0.000, Score: 0, AVG reward i: 4864.733166
MarioEarlyStop: early stop  88
Episodes: 5, AVG Score: 0.000, Score: 0, AVG reward i: 4874.130335
MarioEarlyStop: early stop  434
Episodes: 6, AVG Score: 3580.000, Score: 71600, AVG reward i: 4748.948863
MarioEarlyStop: early stop  48
Episodes: 7, AVG Score: 3401.000, Score: 0, AVG reward i: 4951.741298
MarioEarlyStop: early stop  53
Episodes: 8, AVG Score: 3230.950, Score: 0, AVG reward i: 4951.472426
MarioEarlyStop: early stop  40
Episodes: 9, AVG Score: 3069.402, Score: 0, AVG reward i: 4955.545267
MarioSparse: died
Episodes: 

RuntimeError: ignored

#### Visualization

In [None]:
# @title plot
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(reward_lst)
plt.ylabel('Score')
plt.show()

plt.plot(mva_lst)
plt.ylabel('Moving Average Score')
plt.show()


#### save

In [None]:

from google.colab import drive
drive.mount('/content/gdrive')
PATH="/content/gdrive/MyDrive/curious/" # for saving to google drive
name='curiousity_mario.pth'
# PATH="/content/" # for saving on colab only
# name='model.pth'

model=actor
torch.save(model.state_dict(), PATH+name)

# model.load_state_dict(torch.load(PATH+name))
# actor=model


#### video

In [None]:

import gym
from colabgymrender.recorder import Recorder
env = Recorder(env, './video')

state = env.reset()
# model=actor

model.eval()
while True:
    # state = to_tensor(state, dtype=torch.float).to(device)
    state = torch.tensor(state.copy()).type(torch.float).to(device)
    policy = model(state) # [18]
    # action = select_action(policy.detach().numpy()[0])
    # print(policy.shape)
    prob = nn.functional.softmax(policy, dim=-1) #1
    action = prob.multinomial(1).data
    # # print("action",action)
    # # action = env.action_space.sample()
    # state, reward, done, info = env.step(action)
    state, reward, done, info = env.step(action.item())
    if done: break
env.play()

