In [46]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib as plt
import torch.nn as nn 
import torch.nn.functional as F

In [47]:
# Run on the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [48]:
class DQN(nn.Module):
    """Fully connected Q-network: two 64-unit ReLU hidden layers and a linear output layer."""

    def __init__(self, in_dim, out_dim) -> None:
        """Build the layer stack.

        Args:
            in_dim: size of each input feature vector.
            out_dim: number of values produced per input row.
        """
        super().__init__()

        hidden = 64
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        # Stored under the attribute name `model`, so parameters are named `model.<index>.*`.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        network = self.model
        return network(x)

    

In [49]:
import random
from collections import namedtuple
class RelpayBuffer:
    """Bounded FIFO store of transitions that hands out random minibatches as tensors."""

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of transitions kept; the deque drops the
                oldest entries once this limit is reached.
            batch_size: number of transitions returned by each ``sample()`` call.
        """
        self.example = namedtuple(
            "example",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)

    def push(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.example(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` stored transitions at random and stack them per field.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            moved to the module-level ``device``. ``actions`` is cast with
            ``.long()``; every other tensor is cast with ``.float()``.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        def column(field):
            # One stacked row per sampled transition for the requested field.
            return np.vstack([getattr(item, field) for item in batch if item is not None])

        states = torch.from_numpy(column("state")).float().to(device)  # gpu
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Flags go through uint8 first because the stacked values may be booleans.
        done_flags = column("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)
        

In [50]:
import torch.optim as optim

# Training hyperparameters, read as module-level globals by the agent code.
lr = 0.0005            # learning rate handed to the optimizer
buffer_size = 100_000  # capacity passed to the replay buffer
batch_size = 64        # transitions per sampled minibatch
gamma = 0.99           # discount factor applied to the next-state value
up = 4                 # a training update is attempted once every `up` agent steps


In [51]:
class DQNAgent():
    """Q-learning agent with an online network, a target network and a replay buffer.

    The online network (``model``) picks the best next action and the target
    network (``targetmodel``) evaluates it (the Double-DQN target). After
    every optimisation step the target network is moved a fraction ``tau``
    towards the online network.

    Relies on the module-level names ``DQN``, ``RelpayBuffer``, ``device``,
    ``lr``, ``buffer_size``, ``batch_size``, ``gamma`` and ``up``.
    """

    def __init__(self, in_dim=8, out_dim=4) -> None:
        """Create both networks, the optimizer and the replay buffer.

        Args:
            in_dim: length of a state vector (defaults to the original 8).
            out_dim: number of discrete actions (defaults to the original 4).
        """
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.model = DQN(self.in_dim, self.out_dim).to(device)
        self.targetmodel = DQN(self.in_dim, self.out_dim).to(device)
        # Start the target as an exact copy of the online network. Otherwise the
        # first bootstrapped targets come from an unrelated random network, and
        # with a small tau the two only converge after many soft updates.
        self.targetmodel.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.memory = RelpayBuffer(buffer_size, batch_size)
        self.t_step = 0  # counts agent steps modulo `up`

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``up`` calls, train on a sampled minibatch.

        No training happens until the buffer holds at least ``batch_size``
        transitions.
        """
        self.memory.push(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % up
        if self.t_step != 0:
            return
        if len(self.memory) < batch_size:
            return
        self.train(self.memory.sample())

    def train(self, train_set, tau=1e-3):
        """Run one gradient step on a minibatch, then soft-update the target network.

        Args:
            train_set: tuple ``(state, action, reward, next_state, done)`` of
                tensors as produced by ``RelpayBuffer.sample()``.
            tau: interpolation factor for the target-network update.
        """
        state, action, reward, next_state, done = train_set

        self.model.eval()
        with torch.no_grad():
            # Online network selects the next action; target network scores it.
            best_next = torch.argmax(self.model(next_state), dim=1, keepdim=True)
            q_next = self.targetmodel(next_state).gather(1, best_next)
            # (1 - done) removes the bootstrap term for terminal transitions.
            q_targets = reward + (gamma * q_next * (1.0 - done))
        self.model.train()

        q_expect = self.model(state).gather(1, action)
        loss = F.mse_loss(q_expect, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update(self.model, self.targetmodel, tau)

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: NumPy array holding a single state (converted with
                ``torch.from_numpy``).
            epsilon: exploration threshold; a uniform action is returned when
                ``random.random() <= epsilon``.

        Returns:
            Action index as a NumPy integer in ``[0, out_dim)``.
        """
        # Decide first, so the forward pass is skipped when exploring.
        if random.random() <= epsilon:
            # Explore over the network's own action count, not a fixed 4.
            return random.choice(np.arange(self.out_dim))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.model.eval()
        with torch.no_grad():
            action_value = self.model(state)
        self.model.train()
        return np.argmax(action_value.cpu().data.numpy())

    def update(self, local_model, target_model, tau):
        """Soft update: ``target <- tau * local + (1 - tau) * target``, parameter by parameter."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
              
        
        

In [52]:
# Module-level agent and environment; train() reads both as globals.
agent = DQNAgent()
env = gym.make("LunarLander-v2")
# Initial reset. train() calls env.reset() itself at the start of every episode.
state = env.reset()

In [53]:
def train(epoch=2000, maxt=1000, eps_start=1, eps_end=0.01, eps_decay=0.995):
    """Run epsilon-greedy training episodes with the module-level ``agent`` and ``env``.

    Args:
        epoch: number of episodes to run.
        maxt: maximum number of environment steps per episode.
        eps_start: epsilon used for the first episode.
        eps_end: lower bound for epsilon.
        eps_decay: factor epsilon is multiplied by after each episode.

    Returns:
        Tuple ``(rewards, avg_rewards)``: the per-episode scores and, for each
        episode, the mean of the most recent (up to) 100 scores. Returned so the
        history can be inspected or plotted after training.
    """
    rewards = []
    avg_rewards = []

    epsilon = eps_start

    for i in range(1, epoch + 1):
        state = env.reset()
        score = 0

        for _ in range(maxt):
            action = agent.choose_action(state, epsilon)
            # Expects the 4-tuple step API: (next_state, reward, done, info).
            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        rewards.append(score)
        # Computed once and reused for both the history and the log line.
        avg_reward = np.mean(rewards[-100:])
        avg_rewards.append(avg_reward)
        epsilon = max(eps_end, epsilon * eps_decay)

        print(f"epoch{i} reward:{score} avg_reward:{avg_reward}")

    return rewards, avg_rewards
    

In [54]:
# Bind whatever train() returns to a name instead of leaving the call as a bare
# expression, so the result is neither discarded nor echoed as the cell's output.
training_history = train()

epoch1 reward:-144.65051229053697 avg_reward:-144.65051229053697
epoch2 reward:-114.77787760544294 avg_reward:-129.71419494798994
epoch3 reward:-94.9100991712675 avg_reward:-118.11282968908246
epoch4 reward:-104.23606384667073 avg_reward:-114.64363822847953
epoch5 reward:-251.81578258192516 avg_reward:-142.07806709916866
epoch6 reward:-218.6146720827718 avg_reward:-154.83416792976917
epoch7 reward:-142.40113905165788 avg_reward:-153.05802094718183
epoch8 reward:-131.01507612068752 avg_reward:-150.30265284387005
epoch9 reward:-258.0452345859146 avg_reward:-162.27405081520834
epoch10 reward:-232.7437988733298 avg_reward:-169.32102562102048
epoch11 reward:-95.12226148975591 avg_reward:-162.57568342726915
epoch12 reward:-208.92405304804714 avg_reward:-166.438047562334
epoch13 reward:-158.47900501529756 avg_reward:-165.82581352025426
epoch14 reward:-99.04272521449111 avg_reward:-161.05559292698544
epoch15 reward:-206.12034904142013 avg_reward:-164.05991000128108
epoch16 reward:28.7881650944

KeyboardInterrupt: 