In [58]:
import numpy as np
import random
import torch
import torch.nn as nn
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
import matplotlib.patches as patches

class Qfunction(nn.Module):
    """Feed-forward network that maps a state vector to one value per action.

    Two hidden layers of 64 units, each followed by LeakyReLU (negative
    slope 0.1), and a linear output layer of size ``action_dim``.
    """

    def __init__(self, state_dim, action_dim):
        """Build the three linear layers and the shared activation.

        Args:
            state_dim: size of the input state vector.
            action_dim: number of outputs (one per action).
        """
        super().__init__()
        width = 64
        self.linear_1 = nn.Linear(state_dim, width)
        self.linear_2 = nn.Linear(width, width)
        self.linear_3 = nn.Linear(width, action_dim)
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, states):
        """Return the per-action outputs for ``states``.

        Args:
            states: tensor whose last dimension has size ``state_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``action_dim``.
        """
        features = states
        # Both hidden layers are followed by the same activation.
        for layer in (self.linear_1, self.linear_2):
            features = self.activation(layer(features))
        # The output layer is linear: no activation on the final values.
        return self.linear_3(features)

In [74]:
# Switch matplotlib to the interactive 'notebook' backend for the figures drawn in later cells.
%matplotlib notebook

In [65]:
class DQN:
    """Q-learning agent with a neural Q-function, epsilon-greedy exploration and replay memory.

    Transitions passed to :meth:`fit` are stored in ``self.memory`` (at most
    ``self.memory_lim`` entries; the oldest ones are overwritten once the limit
    is reached). Each call to :meth:`fit` that finds more than ``batch_size``
    stored transitions performs one Adam step on a random mini-batch and
    lowers ``epsilon`` by ``epsilon_decrease`` down to ``epsilon_min``.
    """

    def __init__(self, state_dim, action_dim, gamma=0.98, lr=1e-3, batch_size=128, epsilon_decrease=0.01, epsilon_min=0.03):
        """Create the Q-network, optimizer and empty replay memory.

        Args:
            state_dim: size of a state vector.
            action_dim: number of discrete actions.
            gamma: discount factor used in the bootstrapped target.
            lr: learning rate of the Adam optimizer.
            batch_size: number of transitions sampled per training step.
            epsilon_decrease: amount subtracted from epsilon per training step.
            epsilon_min: lower bound for epsilon.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = Qfunction(self.state_dim, self.action_dim)
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epsilon_min = epsilon_min
        self.memory = []
        self.memory_lim = 150000
        # Index of the oldest transition; it is the next one to be overwritten
        # once the memory has reached memory_lim entries.
        self._memory_pos = 0
        self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)
        # Backward-compatible alias for the original (misspelled) attribute name.
        self.optimzaer = self.optimizer

    def get_action(self, state):
        """Sample epsilon-greedy actions for a batch of states.

        Args:
            state: batch of states (e.g. a list of state vectors); must be
                2-D once converted to an array.

        Returns:
            ``np.int32`` array with one sampled action index per state.
        """
        states = torch.as_tensor(np.asarray(state), dtype=torch.float32)
        # Action selection never needs gradients.
        with torch.no_grad():
            q_values = self.q_function(states)
        greedy = torch.argmax(q_values, dim=-1).numpy()
        batch = greedy.shape[0]

        # Every action gets epsilon / action_dim; the greedy one gets the rest.
        probs = np.full((batch, self.action_dim), self.epsilon / self.action_dim, dtype=float)
        probs[np.arange(batch), greedy] += 1 - self.epsilon

        action = np.empty(batch, dtype=np.int32)
        for sample in range(batch):
            action[sample] = np.random.choice(self.action_dim, p=probs[sample])
        return action

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, if enough are stored, run one training step.

        Args:
            state: state in which the action was taken.
            action: integer index of the action taken.
            reward: reward received.
            done: truthy if the transition ended the episode (disables bootstrapping).
            next_state: state reached after the action.
        """
        transition = [state, action, reward, int(done), next_state]
        if len(self.memory) < self.memory_lim:
            self.memory.append(transition)
        else:
            # Memory is full: overwrite the oldest transition (ring buffer).
            self.memory[self._memory_pos] = transition
            self._memory_pos = (self._memory_pos + 1) % self.memory_lim

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = zip(*batch)
            # Convert through numpy with explicit dtypes: this avoids the slow
            # tuple-of-ndarrays path and keeps inputs float32 like the network
            # weights, whatever dtype the environment returns.
            states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
            next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
            actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
            rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
            dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32)

            # The target is treated as a constant, so no graph is built for it.
            with torch.no_grad():
                next_best = torch.max(self.q_function(next_states), dim=1).values
                targets = rewards + self.gamma * (1 - dones) * next_best
            q_values = self.q_function(states)[torch.arange(self.batch_size), actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Linear epsilon decay, clamped at epsilon_min.
            if self.epsilon > self.epsilon_min:
                self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decrease)

In [68]:
# Create the Gym 'LunarLander-v2' environment; the training cell reads its observation/action spaces and steps it.
env = gym.make('LunarLander-v2')

In [80]:
%%time
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

episode_n = 2000  # maximum number of training episodes
t_max = 1000      # maximum number of steps per episode

agent = DQN(state_dim, action_dim, epsilon_decrease=0.02/1000)
hist = []         # total reward of every training episode
hist_v = []       # total reward of each validation episode (run every 10th episode)
hist_v_disc = []  # same validation episodes, rewards discounted by agent.gamma
do_clear = True


def _run_validation_episode(env, agent, t_max):
    """Play one episode with exploration disabled (epsilon = 0), without training.

    Returns:
        (total_reward, discounted_total_reward), the latter discounted with
        ``agent.gamma``.
    """
    saved_epsilon = agent.epsilon
    agent.epsilon = 0
    total = 0
    total_disc = 0
    rew_mult = 1
    try:
        state = env.reset()
        for _step in range(t_max):
            action = agent.get_action([state])
            state, reward, done, _info = env.step(action[0])

            total += reward
            total_disc += rew_mult * reward
            rew_mult *= agent.gamma

            if done:
                break
    finally:
        # Restore exploration even if the episode is interrupted (e.g. a
        # KeyboardInterrupt), so a later training run does not stay greedy.
        agent.epsilon = saved_epsilon
    return total, total_disc


for episode in range(episode_n):
    total_reward = 0

    state = env.reset()
    for t in range(t_max):
        action = agent.get_action([state])
        next_state, reward, done, info = env.step(action[0])

        total_reward += reward
        # An episode cut off by the time limit is not a true terminal state, so
        # the agent should still bootstrap from next_state. Gym's TimeLimit
        # wrapper (4-tuple step API) flags this via info['TimeLimit.truncated'].
        terminal = done and not info.get('TimeLimit.truncated', False)
        # get_action returns a 1-element array; index it before converting,
        # because int() on a non-0-d array is deprecated in NumPy >= 1.25.
        agent.fit(state, int(action[0]), reward, terminal, next_state)

        state = next_state

        if done:
            break

    hist.append(total_reward)

    # validate
    if episode % 10 == 0:
        val_reward, val_reward_disc = _run_validation_episode(env, agent, t_max)
        hist_v.append(val_reward)
        hist_v_disc.append(val_reward_disc)

    # Stop early once the mean of the last 10 validation rewards exceeds 220.
    if np.mean(hist_v[-10:]) > 220:
        break

    print(
        f'episode: {episode}, '
        f'rew: {round(np.mean(hist[-100:]),2)}, '
        f'v: {round(np.mean(hist_v[-10:]),2)}, '
        f'v_disc: {round(np.mean(hist_v_disc[-10:]),2)}, '
        f'epsilon: {round(agent.epsilon,2)}, '
        f'mem: {len(agent.memory)}'
    )

episode: 0, rew: -193.4, v: -376.05, v_disc: -158.4, epsilon: 1, mem: 70
episode: 1, rew: -231.21, v: -376.05, v_disc: -158.4, epsilon: 1.0, mem: 153
episode: 2, rew: -184.56, v: -376.05, v_disc: -158.4, epsilon: 1.0, mem: 247
episode: 3, rew: -184.54, v: -376.05, v_disc: -158.4, epsilon: 1.0, mem: 329
episode: 4, rew: -172.24, v: -376.05, v_disc: -158.4, epsilon: 0.99, mem: 401
episode: 5, rew: -180.59, v: -376.05, v_disc: -158.4, epsilon: 0.99, mem: 488
episode: 6, rew: -178.39, v: -376.05, v_disc: -158.4, epsilon: 0.99, mem: 579
episode: 7, rew: -197.18, v: -376.05, v_disc: -158.4, epsilon: 0.99, mem: 698
episode: 8, rew: -200.46, v: -376.05, v_disc: -158.4, epsilon: 0.99, mem: 792
episode: 9, rew: -191.68, v: -376.05, v_disc: -158.4, epsilon: 0.98, mem: 922
episode: 10, rew: -179.22, v: -266.23, v_disc: -89.91, epsilon: 0.98, mem: 987
episode: 11, rew: -185.53, v: -266.23, v_disc: -89.91, epsilon: 0.98, mem: 1108
episode: 12, rew: -183.23, v: -266.23, v_disc: -89.91, epsilon: 0.98,

KeyboardInterrupt: 

In [75]:
def _moving_average(values, window=10):
    """Centered moving average that averages only over samples inside the series.

    ``np.convolve(..., mode='same')`` zero-pads the ends (pulling the first and
    last points towards 0) and returns ``window`` points when the series is
    shorter than the window. Here each point is divided by the number of real
    samples in its window, and the window is shrunk for short series, so the
    result always has the same length as ``values``.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    kernel = np.ones(min(window, values.size))
    window_sums = np.convolve(values, kernel, mode='same')
    window_counts = np.convolve(np.ones_like(values), kernel, mode='same')
    return window_sums / window_counts


# A fresh figure per cell, so curves are not drawn onto an earlier interactive figure.
fig, ax = plt.subplots()
ax.grid(True)
# Every series holds one value per 10 training episodes: hist is subsampled,
# and validation runs every 10th episode.
for series, label in ((hist[::10], 'hist'), (hist_v, 'hist_v'), (hist_v_disc, 'hist_v_disc')):
    smoothed = _moving_average(series)
    # The x-axis is sized from the series itself so x and y always match.
    ax.plot(np.arange(len(smoothed)) * 10, smoothed, label=label)
ax.set_xlabel('episodes')
ax.set_ylabel('mean reward (10 episodes)')
ax.legend();

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x2470ffc3580>

In [76]:
# Write the current figure to 'dqn_final.png' (300 dpi, tight bounding box, white background).
plt.savefig(
    'dqn_final.png',
    dpi=300,
    bbox_inches='tight',
    facecolor='w',
)

In [77]:
# Load reference validation results from 'ce_v.txt'; the comparison plot labels them 'CE validation'
# and uses column 0 (multiplied by 20) as the x-axis and column 1 as the y-axis.
ce_val = np.loadtxt('ce_v.txt')

In [78]:
# A fresh figure per cell, so curves are not drawn onto an earlier interactive figure.
fig, ax = plt.subplots()
ax.grid(True)

# Smooth the DQN validation rewards with a centered window of up to 10 samples.
# Dividing by the number of real samples in each window avoids the zero-padding
# bias of np.convolve(mode='same') at both ends; shrinking the window for short
# series keeps the output the same length as hist_v.
dqn_val = np.asarray(hist_v, dtype=float)
if dqn_val.size:
    kernel = np.ones(min(10, dqn_val.size))
    window_sums = np.convolve(dqn_val, kernel, mode='same')
    window_counts = np.convolve(np.ones_like(dqn_val), kernel, mode='same')
    # Validation runs every 10th episode, hence the x-axis step of 10.
    ax.plot(np.arange(dqn_val.size) * 10, window_sums / window_counts, label='DQN validation')

ax.plot(ce_val[:, 0] * 20, ce_val[:, 1], label='CE validation')
ax.set_xlabel('episodes')
ax.set_ylabel('mean reward (10 episodes)')
ax.legend();

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x2470a13fc70>

In [79]:
# Write the current figure to 'dqn_vs_ce_final.png' (300 dpi, tight bounding box, white background).
plt.savefig(
    'dqn_vs_ce_final.png',
    dpi=300,
    bbox_inches='tight',
    facecolor='w',
)