In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, namedtuple
import random

In [None]:
# One replay-memory record. The field order below is the positional order
# used when a Transition is constructed.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'done'],
)

In [None]:
# Replay memory: maximum number of stored transitions, and how many of them
# are sampled for each gradient step.
CAPACITY = 10_000
BATCH_SIZE = 32

# Optimisation: discount factor applied to the bootstrapped next-state value,
# Adam step size, and how often (in episodes) the target network is synced.
GAMMA = 0.99
LEARNING_RATE = 1e-3
UPDATE_TARGET_MODEL_INTERVAL = 1

In [None]:
class DDQNAgent:
    """Double DQN agent with epsilon-greedy exploration and a replay memory.

    Two identically shaped MLPs are kept: ``main_model`` is optimised on every
    call to :meth:`train`, while ``target_model`` is a periodically refreshed
    copy used to evaluate bootstrapped targets.

    Relies on the module-level names ``Transition``, ``CAPACITY``,
    ``BATCH_SIZE``, ``GAMMA``, ``LEARNING_RATE`` and
    ``UPDATE_TARGET_MODEL_INTERVAL``.

    Args:
        num_states: Length of the flat observation vector (network input width).
        num_actions: Number of discrete actions (network output width).
    """

    def __init__(self, num_states, num_actions):
        # Exploration schedule: epsilon is multiplied by `epsilon_decay` on
        # every call to `decay_epsilon` and never drops below `min_epsilon`.
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.epsilon_decay = 0.5

        self.num_states = num_states
        self.num_actions = num_actions
        self.memory = deque(maxlen=CAPACITY)

        # Resolve the device once instead of re-querying CUDA on every call.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.main_model = self._get_model().to(self.device)
        self.target_model = self._get_model().to(self.device)
        # Start the target network from the same weights as the main network;
        # otherwise the first targets come from an unrelated random network.
        self._update_target_model()

        self.optimizer = torch.optim.Adam(self.main_model.parameters(), lr=LEARNING_RATE)

    def train(self, state, action, next_state, reward, done, episode=None):
        """Store one transition and run a single optimisation step.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            next_state: Observation after the action.
            reward: Scalar reward for the transition.
            done: Whether the episode ended with this transition.
            episode: Episode index used to schedule target-network syncs.
                When ``None`` the target network is synced at the end of
                every episode.

        Returns:
            The loss as a Python float, or ``None`` while the memory holds
            fewer than ``BATCH_SIZE`` transitions (no update is performed).
        """
        self.memory.append(Transition(state, action, next_state, reward, done))

        if len(self.memory) < BATCH_SIZE:
            return None

        transitions = random.sample(self.memory, BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = self._to_tensor(batch.state, torch.float32).reshape(-1, self.num_states)
        action_batch = self._to_tensor(batch.action, torch.int64).reshape(-1, 1)
        reward_batch = self._to_tensor(batch.reward, torch.float32).reshape(-1, 1)
        next_state_batch = self._to_tensor(batch.next_state, torch.float32).reshape(-1, self.num_states)
        # Float mask (1.0 = terminal) so the arithmetic below never depends on
        # integer/float type-promotion rules.
        done_batch = self._to_tensor(batch.done, torch.float32).reshape(-1, 1)

        state_action_values = self.main_model(state_batch).gather(1, action_batch)

        # Double DQN target: the main network picks the next action, the
        # target network scores it. No gradients are needed for the target.
        with torch.no_grad():
            best_next_actions = self.main_model(next_state_batch).argmax(dim=1, keepdim=True)
            next_state_values = self.target_model(next_state_batch).gather(1, best_next_actions)
            td_targets = reward_batch + GAMMA * (1.0 - done_batch) * next_state_values

        loss = F.smooth_l1_loss(state_action_values, td_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if done:
            # `episode` is optional; without it there is no interval to check.
            sync_due = episode is None or episode % UPDATE_TARGET_MODEL_INTERVAL == 0
            if sync_due:
                self._update_target_model()

        return loss.item()

    def get_action(self, state, train=True, episode=None):
        """Choose an action for ``state``.

        Args:
            state: Current observation.
            train: If true, act epsilon-greedily; otherwise always greedily.
            episode: Unused; accepted for interface compatibility.

        Returns:
            The selected action index as a Python int.
        """
        if train and np.random.uniform(0, 1) < self.epsilon:
            return random.randrange(self.num_actions)
        return self._greedy_action(state)

    def decay_epsilon(self, episode):
        """Multiply epsilon by ``epsilon_decay``, clamped at ``min_epsilon``.

        Args:
            episode: Unused; accepted for interface compatibility.
        """
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def _greedy_action(self, state):
        """Return the action with the highest main-network Q-value."""
        with torch.no_grad():
            state_tensor = self._to_tensor(state, torch.float32).reshape(1, -1)
            return self.main_model(state_tensor).argmax(dim=1).item()

    def _to_tensor(self, values, dtype):
        """Convert a sequence/array to a tensor of ``dtype`` on the agent's device."""
        # Going through one NumPy array avoids the slow element-by-element
        # conversion of a tuple of arrays.
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def _get_model(self):
        """Build the Q-network: two hidden layers of 32 ReLU units."""
        model = nn.Sequential()
        model.add_module('fc1', nn.Linear(self.num_states, 32))
        model.add_module('relu1', nn.ReLU())
        model.add_module('fc2', nn.Linear(32, 32))
        model.add_module('relu2', nn.ReLU())
        model.add_module('fc3', nn.Linear(32, self.num_actions))
        return model

    def _update_target_model(self):
        """Copy the main network's weights into the target network."""
        self.target_model.load_state_dict(self.main_model.state_dict())

In [None]:
# Create the CartPole-v0 environment through gym's registry.
env = gym.make('CartPole-v0')

In [None]:
# Size the agent from the environment: the observation vector length is passed
# as num_states and the number of discrete actions as num_actions.
agent = DDQNAgent(env.observation_space.shape[0], env.action_space.n)

In [None]:
# Upper bound on the number of training episodes; the training loop iterates
# over range(N_EPOCH) and may stop earlier.
N_EPOCH = 500

In [None]:
# Number of consecutive episodes that ended successfully; training stops once
# this reaches 10.
continues_sucess = 0

for episode in range(N_EPOCH):
    done = False
    state = env.reset()
    step = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Replace the environment reward with a sparse signal: 0 while the
        # episode is running, -1 if it ends before 195 steps were completed,
        # +1 otherwise.
        if done:
            if step < 195:
                reward = -1.0
                continues_sucess = 0
            else:
                reward = 1.0
                continues_sucess += 1
        else:
            reward = 0.0

        agent.train(state, action, next_state, reward, done, episode)

        state = next_state
        step += 1

    agent.decay_epsilon(episode)

    # Log before checking the stop criterion so the final (successful)
    # episode is reported as well.
    print("Episode {} Step {}".format(episode, step))

    if continues_sucess >= 10:
        break