<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/Curiosity_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Curiosity-Driven Exploration

#### setup

In [1]:
# https://github.com/Yangyangii/Curiosity-Driven-A2C
# https://colab.research.google.com/github/Yangyangii/Curiosity-Driven-A2C/blob/master/Curiosity.ipynb

%pip install -U gym
%pip install -U gym[atari,accept-rom-license]
# !pip install gym[box2d]
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim
import collections
device = "cuda" if torch.cuda.is_available() else "cpu"

!pip install gym-super-mario-bros nes-py
# https://github.com/Kautenja/gym-super-mario-bros
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT

!pip install colabgymrender


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  f"The environment {id} is out of date. You should consider "
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### gym wrappers

In [2]:
# https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/2_gym_wrappers_saving_loading.ipynb
import gym
class SparseEnv(gym.Wrapper): #https://alexandervandekleut.github.io/gym-wrappers/
    def __init__(self, env):
        super().__init__(env)
        self.env = env
        self.total_rewards = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        self.total_rewards += reward
        if done: reward = self.total_rewards
        else: reward = 0
        return observation, reward, done, info
    def reset(self):
        self.total_rewards = 0
        return self.env.reset()
# env = SparseEnv(gym.make("LunarLander-v2"))

class MarioSparse(gym.Wrapper):
    def __init__(self, env):
        # super().__init__(env)
        super(MarioSparse, self).__init__(env)
        self.env = env
        self.total_score = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        life = info['life']
        score = info['score']
        self.total_score += score
        if life<2:
            print("MarioSparse: died")
            # return observation, score, True, info # lost one life, end env
            done = True
        # else:
            # self.total_score = 0
        return observation, score, done, info
    def reset(self):
        self.total_score = 0
        return self.env.reset()
# env = MarioSparse(env)

class MarioEarlyStop(gym.Wrapper):
    def __init__(self, env):
        # super().__init__(env)
        super(MarioEarlyStop, self).__init__(env)
        self.env = env
        self.max_pos = 0
        self.count_step = 0
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        x_pos = info['x_pos']
        if x_pos <= self.max_pos: self.count_step += 1
        else:
            self.max_pos = x_pos
            self.count_step = 0
        if self.count_step > 500:
            print("MarioEarlyStop: early stop ", self.max_pos)
            # return observation, reward, True, info # early stop
            done = True
        # else:
        return observation, reward, done, info
    def reset(self):
        self.max_pos = 0
        self.count_step = 0
        return self.env.reset()
# env = MarioEarlyStop(env)


#### model

In [3]:
# @title Conv_Encoder
class Conv_Encoder(nn.Module):
    # def __init__(self):
    def __init__(self, in_channels=1):
        super(Conv_Encoder, self).__init__()
        self.conv_encoder = nn.Sequential( # embed pi (240, 256, 3) -> 256 when flattened
            nn.Conv2d(in_channels, 8, 3, stride=2, padding=1), nn.ELU(),
            # nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(8, 16, 5, stride=2, padding=2), nn.ELU(),
            nn.AdaptiveAvgPool2d((64,64)),
            nn.Conv2d(16, 8, 7, stride=2, padding=3), nn.ELU(),
            nn.Conv2d(8, 1, 5, stride=2, padding=2), nn.ELU(),
            # # nn.Conv2d(in_channels, out_channels=1, kernel_size=3, stride=2, padding=1),
            # nn.ReLU(),
            )
    def forward(self, x): # in [4, 3, 224, 224]
        # print("conv forward shape",x.shape)
        # x=x.squeeze()
        # x = torch.transpose(x, 1,2)
        # x = torch.transpose(x, 0,1)
        x = torch.transpose(x, -2,-1)
        x = torch.transpose(x, -3,-2)
        # print("conv forward",x.shape)
        x = self.conv_encoder(x)
        x=x.flatten(start_dim=1)
        # x = x.view(-1, 16 * 5 * 5)
        return x # out [batch, 256]

conv=Conv_Encoder(3)
# x=torch.rand(1, 210, 160, 3)#.squeeze()
x=torch.rand(2, 210, 160, 3)#.squeeze()
# x = torch.transpose(x, 1,2)
# x = torch.transpose(x, 0,1)
print(x.shape)
out=conv(x)
print(out.shape)


torch.Size([2, 210, 160, 3])
torch.Size([2, 256])


In [4]:

class Actor(nn.Module):
    def __init__(self, n_actions, space_dims, hidden_dims): #cartpole 2 4 32
        super(Actor, self).__init__()
        # print("init actor",n_actions, space_dims, hidden_dims)
        space_dims=256
        self.feature_extractor = nn.Sequential(
            nn.Linear(space_dims, hidden_dims),
            nn.ReLU(True),
        )
        self.actor = nn.Sequential(
            nn.Linear(hidden_dims, n_actions),
            nn.Softmax(dim=-1),
        )
        self.conv_encoder = Conv_Encoder(3)
    
    def forward(self, x):
        # print("forward1",x.shape) # cp [1, 4] mon [1, 210, 160, 3]
        x = self.conv_encoder(x)
        # print("forward2",x.shape) # cp [1, 4]
        features = self.feature_extractor(x)
        policy = self.actor(features)
        return policy
    
class Critic(nn.Module):
    def __init__(self, space_dims, hidden_dims):
        super(Critic, self).__init__()
        space_dims=256
        self.feature_extractor = nn.Sequential(
            nn.Linear(space_dims, hidden_dims),
            nn.ReLU(True),
        )
        self.critic = nn.Linear(hidden_dims, 1)
        self.conv_encoder = Conv_Encoder(3)
    
    def forward(self, x):
        x = self.conv_encoder(x)
        features = self.feature_extractor(x)
        est_reward = self.critic(features)
        return est_reward






class InverseModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(InverseModel, self).__init__()
        self.fc = nn.Linear(hidden_dims+phist_size, n_actions)
        
    def forward(self, features):
        features = features.view(1, -1) # (1, hidden_dims)
        action = self.fc(features) # (1, n_actions)
        return action

class ForwardModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(ForwardModel, self).__init__()
        self.fc = nn.Linear(hidden_dims+n_actions, hidden_dims)
        self.eye = torch.eye(n_actions, device=device)
        
    def forward(self, action, features):
        # print("ForwardModel",action.shape, features.shape)
        # print("ForwardModel2",self.eye[action], features.shape)
        x = torch.cat([self.eye[action], features], dim=-1) # (1, n_actions+hidden_dims)
        features = self.fc(x) # (1, hidden_dims)
        return features








class InvModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(InvModel, self).__init__()
        self.inv_lstm = nn.LSTMCell(phist_size, hidden_size)
        self.fc = nn.Linear(phist_size+hidden_size, n_actions)
        
    def forward(self, phist1, inv_latent): # [1, 512]
        features = torch.cat([inv_latent[0],phist1], dim=1)
        features = features.view(1, -1) # (1, hidden_dims)
        # print("InverseModel", features.shape)
        athat = self.fc(features) # (1, n_actions)
        inv_latent = self.inv_lstm(phist,inv_latent)
        return athat, inv_latent

class FwdModel(nn.Module):
    def __init__(self, n_actions, hidden_dims, phist_size):
        super(FwdModel, self).__init__()
        self.fwd_lstm = nn.LSTMCell(phist_size, hidden_size)
        self.fc = nn.Linear(hidden_dims+n_actions, phist_size)
        self.eye = torch.eye(n_actions, device=device)
        
    def forward(self, action, phist, fwd_latent):
        fwd_latent = self.fwd_lstm(phist,fwd_latent)
        # print("ForwardModel",action.shape, fwd_latent[0].shape)
        # print("ForwardModel2",self.eye[action], fwd_latent[0].shape)
        x = torch.cat([self.eye[action], fwd_latent[0]], dim=-1) # (1, n_actions+hidden_dims)
        phihat1 = self.fc(x) # (1, hidden_dims)
        return phihat1, fwd_latent

class FeatureExtractor(nn.Module):
    def __init__(self, space_dims, hidden_dims):
        super(FeatureExtractor, self).__init__()
        space_dims=256
        self.fc = nn.Linear(space_dims, hidden_dims)
        self.conv_encoder = Conv_Encoder(3)
        
    def forward(self, x):
        x = self.conv_encoder(x)
        y = torch.tanh(self.fc(x))
        return y

def to_tensor(x, dtype=None):
    # return torch.tensor(x, dtype=dtype).unsqueeze(0)
    return torch.tensor(x.copy(), dtype=dtype).unsqueeze(0)



#### wwwwwwwwwwwwwwwwww

In [5]:
beta = 0.2
lamda = 0.1
eta = 100.0 # scale factor for intrinsic reward
gamma = 0.99
lr_critic = 0.005
lr_actor = 0.001
lr_icm = 0.001
# max_eps = 1000
sparse_mode = True


# env = gym.make('CartPole-v1')
# env = gym.make('PongDeterministic-v4')
# env = gym.make('LunarLander-v2')
# env = gym.make('MontezumaRevengeDeterministic-v4')
# env = SparseEnv(env)
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT
env = MarioSparse(env)
env = MarioEarlyStop(env)


  f"The environment {id} is out of date. You should consider "
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


In [6]:

# Actor Critic
space_dims = env.observation_space.shape[0]
# space_dims = env.observation_space.shape
# cp 2 (4,) 32 mon 18 (210, 160, 3) 32
actor = Actor(n_actions=env.action_space.n, space_dims=space_dims, hidden_dims=32).to(device)
critic = Critic(space_dims=space_dims, hidden_dims=32).to(device)

conv_encode = Conv_Encoder(3).to(device)
phist_size=256
hidden_size=512
# inv_lstm = nn.LSTMCell(phist_size, hidden_size).to(device)
# fwd_lstm = nn.LSTMCell(phist_size, hidden_size).to(device)

# ICM
feature_extractor = FeatureExtractor(env.observation_space.shape[0], 32).to(device)
forward_model = ForwardModel(env.action_space.n, hidden_size, phist_size).to(device) #256 phist_size(no lstm) hidden_size(lstm)
inverse_model = InverseModel(env.action_space.n, hidden_size, phist_size).to(device) #32

fwd_model = FwdModel(env.action_space.n, hidden_size, phist_size).to(device) #256 phist_size(no lstm) hidden_size(lstm)
inv_model = InvModel(env.action_space.n, hidden_size, phist_size).to(device) #32



# Actor Critic
a_optim = torch.optim.Adam(actor.parameters(), lr=lr_actor)
c_optim = torch.optim.Adam(critic.parameters(), lr=lr_critic)

# ICM
# icm_params = list(feature_extractor.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
icm_params = list(forward_model.parameters()) + list(inverse_model.parameters())
icm_optim = torch.optim.Adam(icm_params, lr=lr_icm)



In [13]:


def pg_loss(action_prob, reward): return -torch.mean(torch.log(action_prob+1e-6)*reward)
mse_loss = nn.MSELoss()
xe_loss = nn.CrossEntropyLoss()

global_step = 0
reward_lst = []
mva_lst = []
mva = 0.
avg_ireward_lst = []

actor.train()

for n_eps in range(100):
    st1 = to_tensor(env.reset(), dtype=torch.float).to(device)
    done = False
    score = 0
    ireward_lst = []
    inv_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
    fwd_latent = (torch.zeros(1, 512).to(device), torch.zeros(1, 512).to(device))
    
    while not done:
        st = st1
        a_optim.zero_grad()
        c_optim.zero_grad()
        icm_optim.zero_grad()
        
        # estimate action with policy network
        policy = actor(st) # [18]
        # action = select_action(policy.detach().numpy()[0]) #only for lame
        # print(policy.shape)
        prob = nn.functional.softmax(policy, dim=-1) #1
        action = prob.multinomial(1).data
        
        # interaction with environment
        # st1, reward, done, info = env.step(action)
        st1, reward, done, info = env.step(action.item())
        st1 = to_tensor(st1, dtype=torch.float).to(device)
        advantages = torch.zeros_like(policy)
        # extrinsic_reward = to_tensor([0.], dtype=torch.float) if sparse_mode else to_tensor([reward], dtype=torch.float)
        extrinsic_reward = torch.tensor([reward].copy(), dtype=torch.float, device=device).unsqueeze(0)

        val = critic(st)[0]
        val1 = critic(st1)[0]
        

        # # ICM
        # obs_cat = torch.cat([st, st1], dim=0)
        # features = feature_extractor(obs_cat) # (2, hidden_dims) [2, 32]
        # inverse_action_prob = inverse_model(features) # (n_actions)
        # t_action=t_action.reshape((1))
        # print("t_action",t_action.shape) #[1]
        # # est_next_features = forward_model(t_action, features[0:1]) #[1] [1, 32]
        # athat = forward_model(action.squeeze(0), features[0:1]) #[1] [1, 32]
        # # Loss - ICM
        # # print("1",est_next_features.squeeze(0).shape, features[1].shape)
        # # forward_loss = mse_loss(est_next_features, features[1])
        # forward_loss = mse_loss(athat.squeeze(0), features[1])


        phist = conv_encode(st)
        phist1 = conv_encode(st1)
        athat, inv_latent = inv_model(phist1, inv_latent)
        phihat, fwd_latent = fwd_model(action.squeeze(0), phist, fwd_latent)
        # print(phihat.shape, phist1.shape)
        forward_loss = mse_loss(phihat, phist1)




        # print("2",athat.shape, t_action.view(-1).shape)
        # inverse_loss = xe_loss(athat, t_action.view(-1))
        # inverse_loss = xe_loss(athat.squeeze(1), t_action.view(-1))
        inverse_loss = xe_loss(athat.squeeze(1), action.view(-1))
        # icm_loss = (1-beta)*inverse_loss + beta*forward_loss
        icm_loss = beta*forward_loss #problem
        # icm_loss = (1-beta)*inverse_loss #prob;em
        
        # Reward
        intrinsic_reward = eta*forward_loss.detach()
        # if done:
        #     total_reward = -100 + intrinsic_reward if score < 499 else intrinsic_reward
        #     advantages[0, action] = total_reward - val
        #     c_target = total_reward
        # else:
        # print(extrinsic_reward.dtype , intrinsic_reward.dtype)
        total_reward = extrinsic_reward + intrinsic_reward
        advantages[0, action] = total_reward + gamma*val1 - val
        c_target = total_reward + gamma*val1
        
        # Loss - Actor Critic
        # print("3",policy.shape, advantages.detach().shape)
        actor_loss = pg_loss(policy, advantages.detach())
        # print("4",v.shape, c_target.detach().squeeze(0).shape)
        critic_loss = mse_loss(val, c_target.detach().squeeze(0))
        ac_loss = actor_loss + critic_loss
        # inv_latent[0].detach()
        # inv_latent[1].detach()
        # fwd_latent[0].detach()
        # fwd_latent[1].detach()
        # Update
        loss = lamda*ac_loss + icm_loss
        # loss = icm_loss
        # loss = lamda*ac_loss
        loss.backward()
        icm_optim.step()
        a_optim.step()
        c_optim.step()
        if not done:
            score += reward
        ireward_lst.append(intrinsic_reward.item())
        global_step += 1
    avg_intrinsic_reward = sum(ireward_lst) / len(ireward_lst)
    mva = 0.95*mva + 0.05*score
    reward_lst.append(score)
    avg_ireward_lst.append(avg_intrinsic_reward)
    mva_lst.append(mva)
    print('Episodes: {}, AVG Score: {:.3f}, Score: {}, AVG reward i: {:.6f}'.format(n_eps, mva, score, avg_intrinsic_reward))


RuntimeError: ignored

#### Visualization

In [None]:
# @title plot
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(reward_lst)
plt.ylabel('Score')
plt.show()

plt.plot(mva_lst)
plt.ylabel('Moving Average Score')
plt.show()


#### save

In [None]:

from google.colab import drive
drive.mount('/content/gdrive')
PATH="/content/gdrive/MyDrive/curious/" # for saving to google drive
name='curiousity_mario.pth'
# PATH="/content/" # for saving on colab only
# name='model.pth'

model=actor
torch.save(model.state_dict(), PATH+name)

# model.load_state_dict(torch.load(PATH+name))
# actor=model


#### video

In [None]:

import gym
from colabgymrender.recorder import Recorder
env = Recorder(env, './video')

state = env.reset()

model.eval()
while True:
    # state = to_tensor(state, dtype=torch.float).to(device)
    state = torch.tensor(state.copy()).type(torch.float).to(device)
    policy = model(state) # [18]
    # action = select_action(policy.detach().numpy()[0])
    # print(policy.shape)
    prob = nn.functional.softmax(policy, dim=-1) #1
    action = prob.multinomial(1).data
    # # print("action",action)
    # # action = env.action_space.sample()
    # state, reward, done, info = env.step(action)
    state, reward, done, info = env.step(action.item())
    if done: break
env.play()

