In [1]:
#add environment to PYTHONPATH
import sys
import os
env_path = os.path.join(os.path.abspath(os.getcwd()), '..\\Environments\\ContinuousCartPole')
sys.path.append(env_path)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from continuous_cartpole import ContinuousCartPoleEnv

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque
import random
from copy import deepcopy, copy

In [2]:
#discount factor
γ = 0.99

#soft target constant
τ = 0.001

#Learning rates
α_θ = 0.0001
αw = 0.001

#episode to run
NUM_EPISODES = 1000

#steps per episode
MAX_STEPS = 5000

#batch size
BATCH_SIZE = 16

#replay buffer
BUFFER_SIZE = 1000000

In [3]:
class DeterministicPolicy(nn.Module):
    
    def __init__(self, obs_space, action_space):
        super(DeterministicPolicy, self).__init__()
        self.input_batch_norm = nn.BatchNorm1d(obs_space)
        
        self.first_layer = nn.Linear(obs_space, 400)
        self.first_batch_norm = nn.BatchNorm1d(400)
        
        self.second_layer = nn.Linear(400, 300)
        self.second_batch_norm = nn.BatchNorm1d(300)
        
        self.output_layer = nn.Linear(300, action_space)
        
    def forward(self, x):
        x = self.input_batch_norm(x)
        
        x = self.first_layer(x)
        x = F.relu(x)
        x = self.first_batch_norm(x)
        
        x = self.second_layer(x)
        x = F.relu(x)
        x = self.second_batch_norm(x)
        
        output = self.output_layer(x)
        actions = torch.tanh(output)
        
        return actions

In [4]:
class QNetwork(nn.Module):
    
    def __init__(self, obs_space, action_space):
        super(QNetwork, self).__init__()
        self.input_batch_norm = nn.BatchNorm1d(obs_space)
        
        self.first_layer = nn.Linear(obs_space, 400)
        self.first_batch_norm = nn.BatchNorm1d(400)
        
        self.second_layer = nn.Linear(400 + action_space, 300)
        
        self.output_layer = nn.Linear(300, 1)
        
    def forward(self, x, actions):
        x = self.input_batch_norm(x)
        
        x = self.first_layer(x)
        x = F.relu(x)
        x = self.first_batch_norm(x)
        
        x_with_action = torch.cat([x, actions], dim=1)
        
        x = self.second_layer(x_with_action)
        x = F.relu(x)
        
        q_val = self.output_layer(x)
        
        return q_val
        

In [5]:
class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size=1, mu=0, theta=0.05, sigma=0.25):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

In [6]:
def select_action(policy, state):
    
    state_tensor = torch.from_numpy(state).float().unsqueeze(0) 
    
    action = policy(state_tensor)
    action = action + ou.sample().item()
    
    return action

In [50]:
def update_parameters(batch, policy_network, q_network, target_policy_network, target_q_network, policy_optimizer, q_optimizer, τ):
    
    #fix this. I'm creating a new tensor here from action so it doesnt back propagate back to policy function
    action_batch = torch.stack([item[1] for item in batch]).squeeze(1)
    state_batch = torch.Tensor([item[0] for item in batch])
    new_state_batch = torch.Tensor([item[3] for item in batch])
    rewards_batch = [item[2] for item in batch]
    
    train([policy_network, q_network, target_policy_network, target_q_network])
    
    policy_action_batch = target_policy_network(new_state_batch)
    target_q_batch = target_q_network(new_state_batch, policy_action_batch)
    target_q_batch = torch.Tensor([reward + γ * q for reward, q in zip(rewards_batch, target_q_batch)]).unsqueeze(1)
    q_batch = q_network(state_batch, action_batch)
    
    critic_loss = F.mse_loss(target_q_batch, q_batch)
    actor_loss = -torch.mean(q_batch)
    
    q_optimizer.zero_grad()
    critic_loss.backward(retain_graph = True)
    
    policy_optimizer.zero_grad()
    actor_loss.backward()
    
    q_optimizer.step()
    policy_optimizer.step()
    
    update_target_net(policy_network, target_policy_network, τ)
    update_target_net(q_network, target_q_network, τ)

In [8]:
def train(models):
    for model in models:
        model.train()

In [9]:
def eval(models):
    for model in models:
        model.eval()

In [36]:
def update_target_net(net, target_net, τ):
    
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            new_param = τ * param.data + (1 - τ) * target_param.data
            target_param.data.copy_(new_param)

In [51]:
#init environment
#env = gym.make('LunarLanderContinuous-v2').env
#env = gym.make('MountainCarContinuous-v0').env
env = ContinuousCartPoleEnv()

#env parameters
obs_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

#set seeds
# np.random.seed(1)
# random.seed(1)
# env.seed(1)
# torch.manual_seed(1)

#init networks
policy_network = DeterministicPolicy(obs_space, action_space)
q_network = QNetwork(obs_space, action_space)

target_policy_network = DeterministicPolicy(obs_space, action_space)
target_q_network = QNetwork(obs_space, action_space)

#target network same weights
for param, target_param in zip(policy_network.parameters(), target_policy_network.parameters()):
    param.data.copy_(target_param.data)
    
for param, target_param in zip(q_network.parameters(), target_q_network.parameters()):
    param.data.copy_(target_param.data)


#init optimizers
policy_optimizer = optim.Adam(policy_network.parameters(), lr=α_θ)
q_optimizer = optim.Adam(q_network.parameters(), lr=αw, weight_decay=0.01)

In [52]:
scores = []
replay_buffer = deque(maxlen=BUFFER_SIZE)
total_steps = 0

for episode in tqdm_notebook(range(NUM_EPISODES)):
    state = env.reset()
    score = 0
    done = False
    ou = OUNoise()
    
    for step in range(MAX_STEPS):
        #env.render()
        total_steps += 1
        
        eval([policy_network])
        action = select_action(policy_network, state)
        
        new_state, reward, done, _ = env.step(action.squeeze(0).detach().numpy())
        score += reward
        
        #reward += 100*((np.sin(3*new_state[0]) * 0.0025 + 0.5 * new_state[1] * new_state[1]) - (np.sin(3*state[0]) * 0.0025 + 0.5 * state[1] * state[1]))
        
        replay_buffer.append([state, action, reward, new_state])
        
        if len(replay_buffer) >= BATCH_SIZE and total_steps > 2000:
            batch = random.sample(replay_buffer, BATCH_SIZE)
            update_parameters(batch, policy_network, q_network, target_policy_network, target_q_network, policy_optimizer, q_optimizer, τ)
        
        if done:
            break
        
        state = new_state
        
    scores.append(score)

env.close()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [300, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [224]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np

sns.set()

plt.plot(scores, color='grey', label='Training score')
plt.plot(policy_scores, color='blue', label='Target Policy score')
plt.ylabel('score')
plt.xlabel('episodes')
plt.title('Score history of MountainCarContinuous with DDPG')
plt.legend()

reg = LinearRegression().fit(np.arange(len(policy_scores)).reshape(-1, 1), np.array(policy_scores).reshape(-1, 1))
y_pred = reg.predict(np.arange(len(policy_scores)).reshape(-1, 1))
plt.plot(y_pred, color='orange')
plt.show()

NameError: name 'policy_scores' is not defined

In [22]:
for ep in range(10):
    state = env.reset()
    done = False
    score = 0
    for step in range(MAX_STEPS):
        env.render()

        new_state, reward, done, info = env.step(env.action_space.sample())

        state = new_state

        if done:
            break
env.close()

In [48]:
testing_scores = []

for _ in tqdm_notebook(range(5)):
    state = env.reset()
    done = False
    score = 0
    for step in range(MAX_STEPS):
        #env.render()
        eval([policy_network])
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        action = policy_network(state_tensor)
        new_state, reward, done, info = env.step([action.item()])
        
        score += reward
        
        state = new_state
        
        if done:
            break
            
    testing_scores.append(score)
env.close()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [49]:
testing_scores

[14.0, 13.0, 18.0, 18.0, 16.0]

In [50]:
np.array(testing_scores).mean()

-0.0014359440480486948

In [51]:
np.array(testing_scores).var()

1.0626758033809403e-13

In [57]:
env.close()