In [100]:
from torch import nn
import torch.nn.functional as F
import numpy as np
import gym
import os

import torch
import json
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
import cv2

import random                 # Handling random number generation
import time                   # Handling time calculation
from skimage import transform # Help us to preprocess the frames

from collections import deque # Ordered collection with ends
import matplotlib.pyplot as plt  # Display graphs

import warnings                  # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore')


In [101]:
### MODEL HYPERPARAMETERS
learning_rate = 1e-5            # Step size handed to the optimizer

### TRAINING HYPERPARAMETERS
total_episodes = 100000         # Total episodes for training
batch_size = 128                # Number of transitions sampled from memory per gradient step

### FIXED Q TARGETS HYPERPARAMETERS
update_steps = 4                # Policy weights are copied to the target network every `update_steps` env steps
clip_norm = 0.00008             # Maximum gradient norm used when clipping gradients

### EXPLORATION HYPERPARAMETERS for epsilon greedy strategy
explore_start = 0.1             # Exploration probability at start
explore_stop = 0.001            # Minimum exploration probability
explore = 20000                 # Number of steps over which epsilon is annealed from start to stop

### OUTPUT LOCATIONS
model_path = f"./models/random_replay/CN{clip_norm}_LR{learning_rate}_B{batch_size}"
log_path = f"./logs/random_replay/CN{clip_norm}_LR{learning_rate}_B{batch_size}.txt"

# exist_ok=True keeps this cell re-runnable: without it a second run with the
# same hyperparameters raises FileExistsError. The log directory is created too,
# otherwise opening `log_path` for writing fails on a fresh checkout.
os.makedirs(model_path, exist_ok=True)
os.makedirs(os.path.dirname(log_path), exist_ok=True)

### Q LEARNING HYPERPARAMETERS
gamma = 0.99                    # Discounting rate

### MEMORY HYPERPARAMETERS
## If you have GPU change to 1million
pretrain_length = 1000          # Number of experiences stored in the Memory when initialized for the first time
memory_size = 50000  ##100000   # Number of experiences the Memory can keep

env = gym.make('CartPole-v1')



In [102]:
class DDDQNet(nn.Module):
    """Dueling Q-network built from fully connected layers.

    A shared hidden layer feeds two separate streams: a state-value stream
    producing one scalar per state, and an advantage stream producing one
    value per action. The streams are combined as
    ``Q = V + A - mean(A)`` (mean taken over the action dimension).

    Args:
        n_state: Size of the input state vector (default 4).
        n_action: Number of discrete actions, i.e. width of the output (default 2).
        hidden_size: Width of the shared hidden layer (default 64).
        stream_size: Width of the hidden layer inside each stream (default 256).

    The defaults reproduce the original fixed architecture, so ``DDDQNet()``
    builds exactly the same layers as before.
    """

    def __init__(self, n_state: int = 4, n_action: int = 2,
                 hidden_size: int = 64, stream_size: int = 256):
        super().__init__()

        # Shared trunk.
        self.fc1 = nn.Linear(n_state, hidden_size)
        self.relu = nn.ReLU()

        # State-value stream: hidden -> stream -> 1.
        self.fc_value = nn.Linear(hidden_size, stream_size)
        self.value = nn.Linear(stream_size, 1)

        # Advantage stream: hidden -> stream -> n_action.
        self.fc_adv = nn.Linear(hidden_size, stream_size)
        self.adv = nn.Linear(stream_size, n_action)

    def forward(self, x):
        """Return Q-values for a batch of states.

        Args:
            x: Float tensor whose last dimension is ``n_state`` and whose first
                dimension is the batch (the mean below is taken over ``dim=1``).

        Returns:
            Tensor with one Q-value per action for every row of ``x``.
        """
        y = self.relu(self.fc1(x))
        value = self.relu(self.fc_value(y))
        adv = self.relu(self.fc_adv(y))

        value = self.value(value)
        adv = self.adv(adv)

        # Subtracting the per-state mean advantage makes the V/A split
        # identifiable; `value` (batch, 1) broadcasts across the actions.
        output = value + adv - torch.mean(adv, dim=1, keepdim=True)

        return output

    def select_action(self, x):
        """Return the greedy action index for a single state.

        Args:
            x: Batch containing exactly one state; ``.item()`` below raises for
                batches with more than one row.

        Returns:
            The index of the action with the highest Q-value, as a Python int.
        """
        with torch.no_grad():
            Q = self.forward(x)
            action_index = torch.argmax(Q, dim=1)
        return action_index.item()
    



def update_target_params(PolicyNetwork, TargetNetwork):
    """
    Copy parameters of the Policy network to Target network.

    The whole state dict is loaded in a single call. The previous version
    called ``load_state_dict`` once per key inside the loop, redoing the full
    copy for every parameter tensor.

    Args:
        PolicyNetwork: Module whose parameters and buffers are the source.
        TargetNetwork: Module with the same state-dict keys; updated in place.

    Returns:
        The same ``(PolicyNetwork, TargetNetwork)`` objects that were passed in.
    """
    # load_state_dict copies values into the target's own tensors, so the two
    # networks do not share storage afterwards.
    TargetNetwork.load_state_dict(PolicyNetwork.state_dict())

    return PolicyNetwork, TargetNetwork


class Memory(object):
    """Bounded FIFO store of experiences with random sampling.

    Once ``memory_size`` items are held, adding another one drops the oldest.
    """

    def __init__(self, memory_size: int) -> None:
        self.memory_size = memory_size
        self.buffer = deque(maxlen=memory_size)

    def add(self, experience) -> None:
        """Append one experience, evicting the oldest when the buffer is full."""
        self.buffer.append(experience)

    def size(self):
        """Return how many experiences are currently stored."""
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        """Return a list of stored experiences.

        Args:
            batch_size: Requested number of experiences; capped at the number
                currently stored.
            continuous: When True, return one contiguous run starting at a
                random offset. When False, return distinct experiences drawn
                at random positions (no repeats).
        """
        stored = len(self.buffer)
        batch_size = min(batch_size, stored)

        if not continuous:
            picks = np.random.choice(np.arange(stored), size=batch_size, replace=False)
            return [self.buffer[idx] for idx in picks]

        # Contiguous window: pick the start so the window fits in the buffer.
        start = random.randint(0, stored - batch_size)
        return [self.buffer[start + offset] for offset in range(batch_size)]

    def clear(self):
        """Remove every stored experience."""
        self.buffer.clear()

In [103]:
import random
import numpy as np

# Instantiate memory
memory = Memory(memory_size)

env = gym.make('CartPole-v1')
n_state = env.observation_space.shape[0]  # 4
n_action = env.action_space.n

state, _ = env.reset()

# Pre-fill the replay memory with `pretrain_length` transitions gathered by a
# uniformly random policy, so the first training batches have data to sample.
for i in range(pretrain_length):

    action = random.randint(0, 1)
    next_state, reward, terminated, truncated, _ = env.step(action)

    # Only a true termination is stored as `done`. A time-limit truncation is
    # not a terminal state, so the real next observation is kept for it.
    done = terminated
    if terminated:
        # Terminal transitions store an all-zero next state.
        next_state = np.zeros(n_state)

    experience = state, action, reward, next_state, done
    memory.add(experience)

    if terminated or truncated:
        # Start a new episode. The truncated flag used to be discarded, which
        # left the environment stepping after its time limit was reached.
        state, _ = env.reset()
    else:
        state = next_state

env.close()


In [104]:
# Instantiate the DQNetwork (the online network that is trained and used to
# pick greedy actions).
PolicyNetwork = DDDQNet()

# Instantiate the target network (same architecture; it is not given to the
# optimizer, so it only changes when weights are copied into it).
TargetNetwork = DDDQNet()

# Only the policy network's parameters are optimized.
optimizer = optim.RMSprop(PolicyNetwork.parameters(), lr=learning_rate)

def select_action(epsilon, state):
    """
    Epsilon-greedy action selection.

    With probability ``epsilon`` a random action (0 or 1) is returned;
    otherwise the action with the highest Q-value according to the global
    ``PolicyNetwork`` is returned.

    ``state`` is a single observation; it is converted to a float32 tensor
    and given a leading batch dimension before being passed to the network.
    """
    # Exploration branch: one uniform draw decides, a second picks the action.
    if random.random() < epsilon:
        return random.randint(0, 1)

    # Greedy branch.
    observation = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    return PolicyNetwork.select_action(observation)

def _optimize_policy(batch):
    """Run one Double-DQN gradient step on ``batch`` and return the loss tensor.

    ``batch`` is a sequence of ``(state, action, reward, next_state, done)``
    tuples as stored in the replay memory. The online network chooses the
    next action, the target network evaluates it, and the policy network is
    regressed onto ``reward + (1 - done) * gamma * Q_target(next, argmax)``.
    """
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack into one ndarray before handing over to torch: building a tensor
    # directly from a tuple of ndarrays goes through a slow per-element path.
    states = torch.from_numpy(np.asarray(states, dtype=np.float32))
    next_states = torch.from_numpy(np.asarray(next_states, dtype=np.float32))
    actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1)
    dones = torch.from_numpy(np.asarray(dones, dtype=np.float32)).unsqueeze(1)

    with torch.no_grad():
        policy_q_next = PolicyNetwork(next_states)
        target_q_next = TargetNetwork(next_states)
        online_max_action = torch.argmax(policy_q_next, dim=1, keepdim=True)
        # (1 - dones) removes the bootstrap term for terminal transitions.
        y = rewards + (1 - dones) * gamma * target_q_next.gather(1, online_max_action)

    loss = F.mse_loss(PolicyNetwork(states).gather(1, actions), y)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(PolicyNetwork.parameters(), clip_norm)
    optimizer.step()
    return loss


epsilon = explore_start
# Linear annealing: epsilon loses this much per environment step.
epsilon_step = (explore_start - explore_stop) / explore
# Training stops once this many environment steps have been taken in total.
max_total_steps = 100000

# Set Target network params
PolicyNetwork, TargetNetwork = update_target_params(PolicyNetwork, TargetNetwork)

env = gym.make('CartPole-v1')

total_steps = 0

try:
    # The context manager guarantees the log file is closed even if training
    # is interrupted or raises.
    with open(log_path, 'w') as out_f:
        for episode in range(total_episodes):

            episode_step = 0
            episode_reward = 0
            state, _ = env.reset()

            for time_steps in range(10000):

                episode_step += 1
                total_steps += 1

                action = select_action(epsilon, state)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # `done` marks true termination only and is what gets stored for
                # bootstrapping; a time-limit truncation still ends the episode
                # but keeps its real next observation.
                done = terminated
                episode_over = terminated or truncated

                episode_reward += reward

                if terminated:
                    # Terminal transitions store an all-zero next state.
                    next_state = np.zeros(n_state)

                experience = state, action, reward, next_state, done
                memory.add(experience)
                state = next_state

                batch = memory.sample(batch_size, False)
                loss = _optimize_policy(batch)

                if epsilon > explore_stop:
                    # Clamp so epsilon never drops below the configured minimum.
                    epsilon = max(explore_stop, epsilon - epsilon_step)

                if total_steps % update_steps == 0:
                    PolicyNetwork, TargetNetwork = update_target_params(PolicyNetwork, TargetNetwork)

                if episode_over:
                    # Write episode stats to the log file (one JSON object per line).
                    loss_at_end_of_episode = loss.item()

                    out_f.write(json.dumps({
                        'episode': episode,
                        'reward': episode_reward,
                        'total_steps': total_steps,
                        'length': episode_step,
                        'loss': loss_at_end_of_episode,
                        'epsilon': epsilon,
                    }) + '\n')

                    out_f.flush()

                    print(f"Episode {episode}     |      Reward: {episode_reward}     |     length: {episode_step}      |      Loss: {loss_at_end_of_episode}    |     Total timesteps: {total_steps}")

                    if episode % 100 == 0:
                        # NOTE: this pickles the whole module object, so loading
                        # the checkpoint needs the DDDQNet class to be importable.
                        torch.save(PolicyNetwork, f'{model_path}/E{episode}.pt')
                        print(f"======Model saved.======")

                    break

            if total_steps > max_total_steps:
                break
finally:
    env.close()






Episode 0     |      Reward: 76.0     |     length: 76      |      Loss: 1.0462478399276733    |     Total timesteps: 76
Episode 1     |      Reward: 64.0     |     length: 64      |      Loss: 1.0304501056671143    |     Total timesteps: 140
Episode 2     |      Reward: 43.0     |     length: 43      |      Loss: 1.038540005683899    |     Total timesteps: 183
Episode 3     |      Reward: 49.0     |     length: 49      |      Loss: 1.0323675870895386    |     Total timesteps: 232
Episode 4     |      Reward: 32.0     |     length: 32      |      Loss: 1.0333003997802734    |     Total timesteps: 264
Episode 5     |      Reward: 52.0     |     length: 52      |      Loss: 1.0358635187149048    |     Total timesteps: 316
Episode 6     |      Reward: 92.0     |     length: 92      |      Loss: 1.02664315700531    |     Total timesteps: 408
Episode 7     |      Reward: 58.0     |     length: 58      |      Loss: 1.0401690006256104    |     Total timesteps: 466
Episode 8     |      Reward:

: 