In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque
from tensorboardX import SummaryWriter

import json

In [106]:
with open('param.json') as f:
    config = json.load(f)

In [3]:
class OrnsteinUhlenbeckProcess:
    def __init__(self, mu=np.zeros(1), sigma=0.05, theta=.25, dimension=1e-2, x0=None, num_steps=12000):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dimension
        self.x0 = x0
        self.reset()

    def step(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

In [113]:
class Actor(nn.Module):

    def __init__(self, state_size, action_size, fc1, fc2):
        super(Actor, self).__init__()
        # fci = fully connected i
        self.layer1 = nn.Linear(state_size, fc1)
        self.layer2 = nn.Linear(fc1, fc2)
        self.layer3 = nn.Linear(fc2, action_size)
        
        self.leak = 0.01
        
        self.reset_parameteres()

    def forward(self, states):
        out_l1 = F.relu(self.layer1(states))
        out_l2 = F.relu(self.layer2(out_l1))
        out = F.tanh(self.layer3(out_l2))
        #out = F.tanh(self.layer3(out_l2))
        return out
    
    def reset_parameteres(self):
        torch.nn.init.kaiming_normal_(self.layer1.weight.data, a=self.leak, mode='fan_in')
        torch.nn.init.kaiming_normal_(self.layer2.weight.data, a=self.leak, mode='fan_in')
        torch.nn.init.uniform_(self.layer3.weight.data, -3e-3, 3e-3)
    
    
class Q_network(nn.Module):

    def __init__(self, state_size, action_size, fc1, fc2):
        super(Q_network, self).__init__()
        # fci = fully connected i
        self.layer1 = nn.Linear(state_size + action_size, fc1)
        self.layer2 = nn.Linear(fc1, fc2)
        self.layer3 = nn.Linear(fc2, 1)
        
        self.leak = 0.01
        
        self.reset_parameteres()

    def forward(self, states, actions):
        batch = torch.cat([states, actions], 1)
        out_l1 = F.relu(self.layer1(batch))
        out_l2 = F.relu(self.layer2(out_l1))
        out = self.layer3(out_l2)
        return out

    def reset_parameteres(self):
        torch.nn.init.kaiming_normal_(self.layer1.weight.data, a=self.leak, mode='fan_in')
        torch.nn.init.kaiming_normal_(self.layer2.weight.data, a=self.leak, mode='fan_in')
        torch.nn.init.uniform_(self.layer3.weight.data, -3e-3, 3e-3)    
    
class ReplayBuffer:

    def __init__(self, state_size, action_size, capacity, device):
        self.state_size = state_size
        self.action_size = action_size
        self.capacity = capacity
        self.device = device

        self.index = 0
        self.full = False

        self.state = np.empty(shape=(capacity, state_size), dtype=np.float32)
        self.action = np.empty(shape=(capacity, action_size), dtype=np.float32)
        self.next_state = np.empty(shape=(capacity, state_size), dtype=np.float32)
        self.reward = np.empty(shape=(capacity, 1), dtype=np.int8)
        self.done = np.empty(shape=(capacity, 1), dtype=np.int8)

    def add(self, state, reward, action, next_state, done):
        self.state[self.index] = state
        self.action[self.index] = action
        self.next_state[self.index] = next_state
        self.reward[self.index] = reward
        self.done[self.index] = done

        self.index = (self.index + 1) % self.capacity
        self.full = True if self.index == 0 else False

    def sample(self, batchsize):
        limit = self.index if not self.full else self.capacity - 1

        batch = np.random.randint(0, limit, size=batchsize)

        state = torch.as_tensor(self.state[batch])
        action = torch.as_tensor(self.action[batch])
        next_state = torch.as_tensor(self.next_state[batch])
        reward = torch.as_tensor(self.reward[batch])
        done = torch.as_tensor(self.done[batch])

        return state, action, next_state, reward, done

    def save_memory(self):
        pass

    def load_memory(self):
        pass

In [114]:
class Agent():

    def __init__(self, action_size, state_size, config):
        self.action_size = action_size
        self.state_size = state_size
        self.seed = config["seed"]
        self.lr_actor = config["lr_actor"]
        self.lr_critic = config["lr_critic"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        if config["device"] == "cuda":
            if torch.cuda.is_available():
                config["device"] == "cpu"
        self.device = config["device"]
        
        # initialize noise
        self.noise = OrnsteinUhlenbeckProcess(sigma=0.2, theta=0.15)
        self.noise.reset()
        
        # replay
        self.memory = ReplayBuffer(state_size, action_size, config["buffer_size"], self.device)
        
        # everything necessary for SummaryWriter
        pathname = 8
        tensorboard_name = str(config["locexp"]) + '/runs' + str(pathname)
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0

        # set seeds
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)

        # actor, optimizer of actor, target for actor, critic, optimizer of critic, target for critic
        self.actor = Actor(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
        self.optimizer_a = torch.optim.Adam(self.actor.parameters(), self.lr_actor)
        self.target_actor = Actor(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = Q_network(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
        self.optimizer_q = torch.optim.Adam(self.critic.parameters(), self.lr_critic)
        self.target_critic = Q_network(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        
    def act(self, state, greedy=False):
        state = torch.as_tensor(state, dtype=torch.float32, device=torch.device("cpu"))
        state = state.unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state).numpy()[0]#.detach().numpy()
        
        # ^ torch.argmax(q_nns) in continuous case
        noise = self.noise.step()
        #print(noise)
        action = action if greedy else np.clip(action + noise, -1, 1)#self.noise.step(), -1, 1)        
        return action
        
    def train(self, episodes, timesteps):
        #env = gym.make("MountainCarContinuous-v0")
        env = gym.make("LunarLanderContinuous-v2")
        
        mean_r = 0
        mean_episode = 0
        dq = deque(maxlen=100)
        for i in range(1, episodes+1):
            state = env.reset()
            if i % 10 == 0:
                self.noise.reset()
            
            for j in range(timesteps):
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                #next_state = np.squeeze(next_state, axis=1) bei Car
                self.memory.add(state, reward, action, next_state, done)
                state = next_state
                mean_r += reward
                
                # fill replay buffer with 10 samples before updating the policy
                if i > 10:
                    self.update()
                    
                if done:
                    print(f"timesteps until break: {j}")
                    break
            
            # append mean of episode to list and add average of last 10 means if existent, then reset mean
            #print(mean_r)
            #print(type(mean_r))
            dq.append(mean_r)
            mean_episode = np.mean(dq)
            self.writer.add_scalar("a_rew", mean_episode, i)
            
            # print every tenth episode to keep track
            print(f"Episode: {i}, mean_r: {mean_r}, mean_episode: {mean_episode}")
            #if i % 10 == 0:
            #    print(f"Episode: {i}, mean_r: {mean_r}, mean_episode: {mean_episode}")
            
            mean_r = 0

    def update(self):
        self.steps += 1
        # sample minibatch and calculate target value and q_nns
        state, action, next_state, reward, done = self.memory.sample(self.batch_size)
        #print(state.shape)
        #print(action.shape)
        #print(next_state.shape)
        #print(reward.shape)
        #print(done.shape)
        
        with torch.no_grad():
            next_action = self.target_actor(next_state)#.detach()#.numpy()
            q_target = self.target_critic(next_state, next_action)
            y_target = reward + (self.gamma * q_target * (1-done))
        
        #print(next_action.shape)
        #print(q_target.shape)
        #print(y_target.shape)
        
        # update critic
        q_samples_target = self.critic(state, action)
        loss_critic = F.mse_loss(q_samples_target, y_target)
        self.writer.add_scalar("loss_critic", loss_critic, self.steps)
        
        #print(q_samples_target.shape)
        
        # set gradients to zero and optimize q
        self.optimizer_q.zero_grad()
        loss_critic.backward()
        self.optimizer_q.step
        
        # update actor
        c_action = self.actor(state)
        q_sum_samples = self.critic(state, c_action)
        loss_actor = -q_sum_samples.mean()
        self.writer.add_scalar("loss_actor", loss_actor, self.steps)
        
        # set gradients to zero and optimize a
        self.optimizer_a.zero_grad()
        loss_actor.backward()        
        self.optimizer_a.step()
        
        # update target networks
        self.update_target(self.actor, self.target_actor)
        self.update_target(self.critic, self.target_critic)
        
    def update_target(self, online, target):
        for parameter, target in zip(online.parameters(), target.parameters()):
            target.data.copy_(self.tau * parameter.data + (1 - self.tau) * target.data)

In [115]:
#env = gym.make("MountainCarContinuous-v0")
env = gym.make("LunarLanderContinuous-v2")
#state = env.reset()
action_space = env.action_space.shape[0]
state_space = env.observation_space.shape[0]

In [116]:
agent = Agent(action_size=action_space, state_size=state_space, config=config)

In [117]:
agent.train(episodes=1000, timesteps=1000)



timesteps until break: 66
Episode: 1, mean_r: -149.34171600738364, mean_episode: -149.34171600738364
timesteps until break: 57
Episode: 2, mean_r: -126.9462417048907, mean_episode: -138.14397885613715
timesteps until break: 84
Episode: 3, mean_r: -125.18388763153442, mean_episode: -133.82394844793623
timesteps until break: 76
Episode: 4, mean_r: -33.701864812680355, mean_episode: -108.79342753912226
timesteps until break: 125
Episode: 5, mean_r: -69.55352385453632, mean_episode: -100.94544680220507
timesteps until break: 125
Episode: 6, mean_r: -302.7105510685486, mean_episode: -134.57296417992902
timesteps until break: 99
Episode: 7, mean_r: -333.4336656086961, mean_episode: -162.98163581261002
timesteps until break: 99
Episode: 8, mean_r: -399.8545171660434, mean_episode: -192.59074598178918
timesteps until break: 93
Episode: 9, mean_r: -244.98009539459846, mean_episode: -198.41178480543465
timesteps until break: 96
Episode: 10, mean_r: -328.77245242970184, mean_episode: -211.4478515

timesteps until break: 74
Episode: 83, mean_r: -304.31521178830104, mean_episode: -230.73373862219216
timesteps until break: 52
Episode: 84, mean_r: -217.7429058670583, mean_episode: -230.5790858512977
timesteps until break: 53
Episode: 85, mean_r: -152.8862732675886, mean_episode: -229.66505276207758
timesteps until break: 85
Episode: 86, mean_r: -455.0312696590985, mean_episode: -232.2855901678569
timesteps until break: 55
Episode: 87, mean_r: -302.01250051544906, mean_episode: -233.0870489074844
timesteps until break: 69
Episode: 88, mean_r: -162.6765863403617, mean_episode: -232.28693001467616
timesteps until break: 70
Episode: 89, mean_r: -400.05838867876764, mean_episode: -234.1720025839356
timesteps until break: 68
Episode: 90, mean_r: -366.00445236711, mean_episode: -235.63680758152643
timesteps until break: 51
Episode: 91, mean_r: -265.61484783039214, mean_episode: -235.96623659525022
timesteps until break: 62
Episode: 92, mean_r: -338.0044282458472, mean_episode: -237.0753473

KeyboardInterrupt: 