<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/reinforce_curious.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### setup

In [None]:
# https://huggingface.co/blog/deep-rl-pg
# https://colab.research.google.com/github/huggingface/deep-rl-class/blob/main/unit5/unit5.ipynb
# Reinforce (aka Monte Carlo Policy Gradient) Policy-Gradient Method
!apt install python-opengl ffmpeg xvfb
# !pip3 install pyvirtualdisplay
# Virtual display
# from pyvirtualdisplay import Display
# virtual_display = Display(visible=0, size=(500, 500))
# virtual_display.start()

!pip install gym
%pip install gym[atari,accept-rom-license]
# !pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
# !pip install git+https://github.com/qlan3/gym-games.git # Extra gym environments made with PyGame

# !pip install pyyaml==6.0 # avoid key error metadata
# !pip install pyglet # Virtual Screen

import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
# import gym_pygame

# Run on the first CUDA GPU when torch reports one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

!pip install colabgymrender

!pip install gym-super-mario-bros nes-py
# https://github.com/Kautenja/gym-super-mario-bros
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
# Build the 'SuperMarioBros-v0' environment and wrap it in JoypadSpace with the
# COMPLEX_MOVEMENT action list (SIMPLE_MOVEMENT is the imported alternative).
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT



#### gym wrappers

In [None]:
# https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/2_gym_wrappers_saving_loading.ipynb
import gym
class SparseEnv(gym.Wrapper): #https://alexandervandekleut.github.io/gym-wrappers/
    """Wrapper that hides per-step rewards and pays out their sum at episode end.

    Every non-terminal step reports a reward of 0; the step on which the
    wrapped env reports `done` reports the sum of all rewards seen since the
    last `reset()`.
    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env
        # Running sum of the wrapped env's rewards for the current episode.
        self.total_rewards = 0

    def step(self, action):
        """Step the wrapped env and return the sparsified reward."""
        obs, step_reward, finished, extra = self.env.step(action)
        self.total_rewards += step_reward
        sparse_reward = self.total_rewards if finished else 0
        return obs, sparse_reward, finished, extra

    def reset(self):
        """Clear the running sum and reset the wrapped env."""
        self.total_rewards = 0
        return self.env.reset()
# env = SparseEnv(gym.make("LunarLander-v2"))

class MarioSparse(gym.Wrapper):
    """Replace the env reward with the change in in-game score and end on a lost life.

    The reward returned by `step` is the difference between `info['score']` on
    this step and on the previous one, so it is zero on steps where the score
    does not change. Previously the raw `info['score']` was returned on every
    step and additionally summed into `total_score`, which double-counts if the
    score is cumulative (assumed here -- confirm against the env's docs).

    Attributes:
        total_score: the most recent `info['score']` seen in this episode.
    """
    def __init__(self, env):
        # super().__init__(env)
        super(MarioSparse, self).__init__(env)
        self.env = env
        self.total_score = 0

    def step(self, action):
        """Step the wrapped env; return (observation, score_delta, done, info)."""
        observation, _env_reward, done, info = self.env.step(action)
        life = info['life']
        score = info['score']
        # Reward only the points gained on this step, not the running total.
        reward = score - self.total_score
        self.total_score = score
        if life<2:
            # lost one life, end env
            print("MarioSparse: died")
            done = True
        return observation, reward, done, info

    def reset(self):
        """Forget the last seen score and reset the wrapped env."""
        # Assumes the in-game score starts from 0 again after reset -- TODO confirm.
        self.total_score = 0
        return self.env.reset()
# env = MarioSparse(env)

class MarioEarlyStop(gym.Wrapper):
    """End the episode when `info['x_pos']` has not improved for over 500 steps.

    Attributes:
        max_pos: largest `x_pos` seen since the last `reset()`.
        count_step: consecutive steps without exceeding `max_pos`.
    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env
        self.max_pos = 0
        self.count_step = 0

    def step(self, action):
        """Step the wrapped env, forcing `done` once progress has stalled."""
        obs, rew, finished, extra = self.env.step(action)
        position = extra['x_pos']
        if position > self.max_pos:
            # New furthest position: remember it and restart the stall counter.
            self.max_pos = position
            self.count_step = 0
        else:
            self.count_step += 1
        if self.count_step > 500:
            print("MarioEarlyStop: early stop ", self.max_pos)
            finished = True
        return obs, rew, finished, extra

    def reset(self):
        """Clear the progress trackers and reset the wrapped env."""
        self.max_pos = 0
        self.count_step = 0
        return self.env.reset()
# env = MarioEarlyStop(env)


#### policy

In [None]:
# https://github.com/pytorch/examples/blob/main/reinforcement_learning/reinforce.py
# https://github.com/udacity/deep-reinforcement-learning/blob/master/reinforce/REINFORCE.ipynb

class Policy(nn.Module):
    """Two-layer MLP that maps a feature vector to a categorical action distribution.

    Args:
        s_size: number of input features.
        a_size: number of discrete actions.
        h_size: width of the hidden layer.
    """

    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        layers = [
            nn.Linear(s_size, h_size),
            nn.ReLU(),
            nn.Linear(h_size, a_size),
            # Softmax over dim 1, so the input must be 2-D: (batch, s_size).
            nn.Softmax(dim=1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Sample one action for `state`.

        Returns:
            (action, log_prob): the sampled action as a Python int and the
            log-probability tensor of that action. `.item()` is used, so the
            batch must contain exactly one state.
        """
        action_probs = self.model(state).cpu()
        dist = Categorical(action_probs)
        # Sample from P(.|s) rather than taking the argmax.
        sampled = dist.sample()
        return sampled.item(), dist.log_prob(sampled)



#### ICM

In [None]:

class Conv_Encoder(nn.Module):
    """Convolutional encoder that turns channels-last images into 256 features.

    The adaptive pooling layer fixes the spatial size at 64x64 regardless of
    the input resolution; the two stride-2 convolutions after it reduce that to
    a single 16x16 map, i.e. 256 values per sample once flattened.

    Args:
        in_channels: number of channels in the input images (default 1).
    """

    def __init__(self, in_channels=1):
        super().__init__()
        stages = [
            nn.Conv2d(in_channels, 8, kernel_size=3, stride=2, padding=1),
            nn.ELU(),
            nn.Conv2d(8, 16, kernel_size=5, stride=2, padding=2),
            nn.ELU(),
            nn.AdaptiveAvgPool2d((64, 64)),
            nn.Conv2d(16, 8, kernel_size=7, stride=2, padding=3),
            nn.ELU(),
            nn.Conv2d(8, 1, kernel_size=5, stride=2, padding=2),
            nn.ELU(),
        ]
        self.conv_encoder = nn.Sequential(*stages)

    def forward(self, x):
        """Encode `x`, whose last three dims are (height, width, channels)."""
        # Two swaps move the trailing channel axis in front of height and width.
        channels_first = x.transpose(-2, -1).transpose(-3, -2)
        encoded = self.conv_encoder(channels_first)
        return encoded.flatten(start_dim=1)  # [batch, 256]

class InverseModel(nn.Module):
    """Linear inverse-dynamics head: scores actions from a pair of state features.

    Args:
        n_actions: number of discrete actions (output width).
        hidden_dims: feature width of a single state; the layer takes two of them.
    """

    def __init__(self, n_actions, hidden_dims):
        super().__init__()
        self.fc = nn.Linear(2 * hidden_dims, n_actions)

    def forward(self, features):
        """Flatten `features` into one row and return action scores of shape (1, n_actions)."""
        # All elements are merged into a single row, so `features` must hold
        # exactly 2 * hidden_dims values in total.
        flat_pair = features.view(1, -1)
        return self.fc(flat_pair)

class ForwardModel(nn.Module):
    """Linear forward-dynamics head: predicts next-state features from (action, features).

    Args:
        n_actions: number of discrete actions (size of the one-hot encoding).
        hidden_dims: width of the state feature vector (input and output).
    """

    def __init__(self, n_actions, hidden_dims):
        super(ForwardModel, self).__init__()
        self.fc = nn.Linear(hidden_dims+n_actions, hidden_dims)
        # One-hot lookup table. Registering it as a buffer makes it follow the
        # module through .to()/.cuda() instead of being pinned to a global
        # device; persistent=False keeps it out of state_dict so saved
        # checkpoints keep the same keys as before.
        self.register_buffer("eye", torch.eye(n_actions), persistent=False)

    def forward(self, action, features):
        """Return predicted next-state features.

        Args:
            action: integer index tensor used to pick rows of the one-hot table.
            features: state features whose leading dims match the selected
                one-hot rows, e.g. action of shape (1,) with features (1, hidden_dims).
        """
        x = torch.cat([self.eye[action], features], dim=-1) # (1, n_actions+hidden_dims)
        features = self.fc(x) # (1, hidden_dims)
        return features

class FeatureExtractor(nn.Module):
    """Image -> feature vector: convolutional encoder, linear layer, then tanh.

    Args:
        space_dims: accepted for interface compatibility but ignored; the
            linear layer always takes 256 inputs.
        hidden_dims: width of the returned feature vector.
    """

    def __init__(self, space_dims, hidden_dims):
        super().__init__()
        # The caller-supplied space_dims is deliberately not used here.
        encoded_width = 256
        self.fc = nn.Linear(encoded_width, hidden_dims)
        self.conv_encoder = Conv_Encoder(3)

    def forward(self, x):
        """Return tanh-squashed features for the image batch `x`."""
        return torch.tanh(self.fc(self.conv_encoder(x)))




# ICM
# The first FeatureExtractor argument is overridden inside the class (fixed to 256).
feature_extractor = FeatureExtractor(env.observation_space.shape[0], 32).to(device)
forward_model = ForwardModel(env.action_space.n, 32).to(device)
inverse_model = InverseModel(env.action_space.n, 32).to(device)
# The ICM loss is computed from all three modules, so all of their parameters
# are optimised. Passing only inverse_model.parameters() left the feature
# extractor and the forward model at their random initial weights.
icm_params = (
    list(feature_extractor.parameters())
    + list(forward_model.parameters())
    + list(inverse_model.parameters())
)
icm_optim = torch.optim.Adam(icm_params, lr=0.001)

beta = 0.2 # weight of the forward loss; the inverse loss gets (1-beta)
# lamda = 0.1
eta = 100.0 # scale factor for intrinsic reward
# gamma = 0.99
# def icm(st, st1, inverse_model, forward_model):
def icm(st, st1, action, inverse_model, forward_model):
    """Compute the curiosity-module loss and intrinsic reward for one transition.

    Uses the module-level `feature_extractor`, `beta`, `eta` and `device`.

    Args:
        st: observation before the action (stacked with `st1` along dim 0).
        st1: observation after the action.
        action: the action taken; converted to a 1-element tensor on `device`.
        inverse_model: scores actions from the features of both observations.
        forward_model: predicts the features of `st1` from `action` and the
            features of `st`.

    Returns:
        (icm_loss, intrinsic_reward): the weighted loss
        (1-beta)*inverse_loss + beta*forward_loss, and eta times the detached
        forward loss.
    """
    action_idx = torch.tensor(action).view(1).to(device)

    # Encode both observations in one pass; row 0 is `st`, row 1 is `st1`.
    features = feature_extractor(torch.cat([st, st1], dim=0))
    phi_now, phi_next = features[0:1], features[1]

    predicted_action_scores = inverse_model(features)
    predicted_phi_next = forward_model(action_idx, phi_now)

    mse = nn.MSELoss()
    cross_entropy = nn.CrossEntropyLoss()
    forward_loss = mse(predicted_phi_next.squeeze(0), phi_next)
    inverse_loss = cross_entropy(predicted_action_scores.squeeze(1), action_idx.view(-1))

    icm_loss = (1-beta)*inverse_loss + beta*forward_loss
    # Detached so the reward carries no gradient back into the ICM modules.
    intrinsic_reward = eta*forward_loss.detach()
    return icm_loss, intrinsic_reward



class phist_for_policy(nn.Module):
    """Convolutional encoder producing the 256-feature state vector for the policy.

    Accepts channels-last images. The adaptive pooling layer fixes the spatial
    size at 64x64, and the two stride-2 convolutions after it leave a single
    16x16 map, i.e. 256 values per sample once flattened.

    Args:
        in_channels: number of channels in the input images (default 3).
    """

    def __init__(self, in_channels=3):
        super().__init__()
        stages = [
            nn.Conv2d(in_channels, 8, kernel_size=3, stride=2, padding=1),
            nn.ELU(),
            nn.Conv2d(8, 16, kernel_size=5, stride=2, padding=2),
            nn.ELU(),
            nn.AdaptiveAvgPool2d((64, 64)),
            nn.Conv2d(16, 8, kernel_size=7, stride=2, padding=3),
            nn.ELU(),
            nn.Conv2d(8, 1, kernel_size=5, stride=2, padding=2),
            nn.ELU(),
        ]
        self.conv_encoder = nn.Sequential(*stages)

    def forward(self, x):
        """Encode `x`, whose last three dims are (height, width, channels)."""
        # Two swaps move the trailing channel axis in front of height and width.
        channels_first = x.transpose(-2, -1).transpose(-3, -2)
        encoded = self.conv_encoder(channels_first)
        return encoded.flatten(start_dim=1)  # [batch, 256]

# # print(state_space, action_space, h_size) 240 12 64
# # policy = Policy(state_space, action_space, h_size).to(device)
# policy = Policy(256, action_space, h_size).to(device)

# convencode = phist_for_policy(3).to(device)
# state = env.reset()
# state = torch.tensor(state.copy(), dtype=torch.float).unsqueeze(0).to(device)
# # print(state.shape) #(240, 256, 3)
# state = convencode(state)
# print(state.shape)
# action, log_prob = policy(state)


#### reinforce

In [None]:

# update policy after every episode
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma):
    """Train `policy` with REINFORCE, using the ICM intrinsic reward as the only reward.

    Relies on module-level state: `env`, `device`, `convencode`, `icm`,
    `inverse_model`, `forward_model` and `icm_optim`. The environment's own
    reward is discarded.

    The curiosity module is updated after every transition. Previously only the
    loss of the last transition of each episode reached `icm_optim`; every
    other loss was computed and thrown away.

    Args:
        policy: callable returning (action, log_prob) for an encoded state.
        optimizer: optimizer over the policy parameters.
        n_training_episodes: number of episodes to run.
        max_t: maximum number of steps per episode.
        gamma: discount factor.

    Returns:
        List with one float per episode: the undiscounted sum of intrinsic rewards.
    """
    # Rolling window of the most recent episode scores, used as the baseline.
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        state = torch.tensor(state.copy(), dtype=torch.float).unsqueeze(0).to(device)
        st = state
        # Run one full episode with the current policy.
        for t in range(max_t):
            # The policy acts on the conv-encoded observation; the ICM gets the raw one.
            state = convencode(state)

            action, log_prob = policy(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            state = torch.tensor(state.copy(), dtype=torch.float).unsqueeze(0).to(device)

            st1 = state
            icm_loss, intrinsic_reward = icm(st, st1, action, inverse_model, forward_model)
            # Train the curiosity module on this transition. Stepping here also
            # releases the autograd graph immediately instead of holding it
            # until the end of the episode.
            icm_optim.zero_grad()
            icm_loss.backward()
            icm_optim.step()
            st = st1

            # Stored as plain floats so the score statistics are ordinary numbers.
            rewards.append(intrinsic_reward.item())
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        if not saved_log_probs:
            # max_t <= 0: no action was taken, so there is nothing to learn from.
            continue

        # Discounted return of the episode: sum_t gamma^t * r_t ...
        discounted_return = sum(gamma**i * r for i, r in enumerate(rewards))
        # ... minus the mean recent score as a baseline (baseline subtraction from discord).
        R = float(discounted_return - np.mean(scores_deque))

        # Every step of the episode is weighted by the same baseline-corrected return.
        policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()
        # max πθ(a3|s;θ) = min 1−πθ(a3|s;θ)

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        if i_episode % 100 == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    return scores


#### environment and hyperparameters

In [None]:
# env_id = "CartPole-v1"
# env_id = "Pixelcopter-PLE-v0"
# env_id = "Pong-PLE-v0"

# env = gym.make(env_id)


# Training environment: Super Mario Bros with the COMPLEX_MOVEMENT action set,
# cut short by MarioEarlyStop when the agent stops making progress.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT
# env = MarioSparse(env)
env = MarioEarlyStop(env)



# eval_env = gym.make(env_id)
# Evaluation reuses the very same environment object as training.
eval_env = env
s_size = env.observation_space.shape[0]
a_size = env.action_space.n




h_size=64 # cp 16 pc 64 p 64
n_training_episodes=10000 # cp 1000 pc 50000 p 20000
n_evaluation_episodes=10
max_t=10000 # cp 1000 pc 10000 p 5000 max episode length
gamma=0.99 # cp 1.0 pc/p 0.99
lr=1e-4  # cp 1e-2 pc 1e-4 p 1e-2
# env_id=env_id
state_space=s_size
action_space=a_size

# policy = Policy(state_space, action_space, h_size).to(device)
# The policy consumes the 256 features produced by the conv encoder, not raw pixels.
policy = Policy(256, action_space, h_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=lr)

# reinforce() calls the global `convencode` to turn raw frames into those 256
# features; it was only ever defined in commented-out code, so define it here.
# Its parameters are not passed to `optimizer`, so it stays a fixed random encoder.
convencode = phist_for_policy(3).to(device)


#### Train

In [None]:

# Run training with the hyperparameters defined above; `scores` receives the list returned by reinforce().
scores = reinforce(policy, optimizer, n_training_episodes, max_t, gamma)


#### eval

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """Run `policy` for several episodes and summarise the episode returns.

    Args:
        env: environment exposing `reset()` and a 4-tuple `step(action)`.
        max_steps: maximum number of steps per episode.
        n_eval_episodes: number of episodes to run.
        policy: callable taking the state exactly as the env returns it and
            returning `(action, extra)`; only the action is used.

    Returns:
        (mean_reward, std_reward) over the per-episode total rewards.
    """
    episode_rewards = []
    for _episode in range(n_eval_episodes):
        state = env.reset()
        total_rewards_ep = 0
        for _step in range(max_steps):
            action, _ = policy(state)
            new_state, reward, done, info = env.step(action)
            total_rewards_ep += reward
            if done:
                break
            state = new_state
        # Record exactly one total per episode. This used to sit inside the step
        # loop after the `break`, which logged every intermediate running total
        # and never the final one.
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    return mean_reward, std_reward

# Evaluate the trained policy on eval_env (the same object as the training env).
# NOTE(review): evaluate_agent passes the env's raw observation straight to
# `policy`, while `policy` was built for 256 input features and reinforce()
# feeds it conv-encoded tensors -- confirm this call works without the same
# tensor conversion and encoding step.
evaluate_agent(eval_env, max_t, n_evaluation_episodes, policy)


#### save

In [None]:

# Mount Google Drive so the checkpoint can be written under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
PATH="/content/gdrive/MyDrive/curious/" # for saving to google drive
name='Curious_reinforce_cp.pth'
# PATH="/content/" # for saving on colab only
# name='model.pth'

# Only the policy's state_dict is saved here; no other module is written by this cell.
model=policy
torch.save(model.state_dict(), PATH+name)

# model.load_state_dict(torch.load(PATH+name))
# actor=model


#### video

In [None]:

import gym
from colabgymrender.recorder import Recorder
# from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

# "MontezumaRevengeDeterministic-v4"
# env = gym.make(env_id)

# env = SparseEnv(env)
# env = gym_super_mario_bros.make('SuperMarioBros-v0')
# env = JoypadSpace(env, COMPLEX_MOVEMENT) # SIMPLE_MOVEMENT COMPLEX_MOVEMENT
# env = MarioSparse(env)
# env = MarioEarlyStop(env)
# Wrap the current env in colabgymrender's Recorder, passing './video' as its second argument.
env = Recorder(env, './video')

state = env.reset()
# model.eval()
x=0 # number of steps taken in the recorded episode

# Roll out a single episode with the policy until the env reports done.
# NOTE(review): `state` is passed to `policy` exactly as the env returns it,
# while reinforce() converts it to a tensor and conv-encodes it first --
# confirm the policy accepts this input.
while True:
    # # print("action",action)
    # # action = env.action_space.sample()
    x+=1
    action, _ = policy(state)
    state, reward, done, info = env.step(action)
    if done: break
env.play()
print(x)

