In [97]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib as plt
import torch.nn as nn 
import torch.nn.functional as F

In [98]:
# Use the first CUDA GPU when PyTorch reports one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [99]:
class DQN(nn.Module):
    """Fully connected network with two ReLU hidden layers.

    The network maps an input of size ``in_dim`` to ``out_dim`` outputs
    (one value per action when used by the agent in this notebook).

    Args:
        in_dim: Number of input features.
        out_dim: Number of outputs.
        hidden_dim: Width of both hidden layers. Defaults to 64, the value
            that used to be hard-coded.
    """

    def __init__(self, in_dim, out_dim, hidden_dim=64) -> None:
        super().__init__()

        # Keep the attribute name and layer order stable: state_dict keys
        # ("model.0.weight", ...) depend on them.
        self.model = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        """Return the network output for input ``x``."""
        return self.model(x)

    

In [100]:
import random
from collections import namedtuple
class RelpayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of transitions kept; once full, the
                oldest entry is dropped on each push (deque ``maxlen``).
            batch_size: Number of transitions returned by ``sample``.
        """
        self.example = namedtuple(
            "example",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.example(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on ``device``; actions are int64, the rest float32. Each tensor is
            built with ``np.vstack``, so it has one row per sampled transition.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        kept = [entry for entry in drawn if entry is not None]

        def column(field):
            # Stack one field of every kept transition row-wise.
            return np.vstack([getattr(entry, field) for entry in kept])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Booleans go through uint8 before becoming a float 0/1 mask.
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)
        

In [101]:
import torch.optim as optim

# Hyperparameters read as module-level globals by the agent and replay buffer.
lr = 0.0005            # learning rate handed to the Adam optimizer
buffer_size = 100_000  # replay buffer capacity (deque maxlen)
batch_size = 64        # transitions drawn per learning step
gamma = 0.99           # discount applied to the next-state value in the TD target
up = 4                 # the agent attempts a learning step every `up` calls to step()


In [102]:
class DQNAgent():
    """Double-DQN agent with experience replay and a softly updated target network.

    The online network (``self.model``) picks the best next action and the
    target network (``self.targetmodel``) evaluates it when building TD targets.
    """

    def __init__(self, in_dim=8, out_dim=4) -> None:
        """Build networks, optimizer and replay buffer.

        Args:
            in_dim: Size of the state vector. Defaults to 8, the value this
                notebook uses with LunarLander-v2.
            out_dim: Number of discrete actions. Defaults to 4.
        """
        self.in_dim = in_dim
        self.out_dim = out_dim
        # Replay batches and action-selection states are moved to `device`,
        # so both networks must live there too (otherwise CUDA runs crash
        # with a device mismatch).
        self.model = DQN(self.in_dim, self.out_dim).to(device)
        self.targetmodel = DQN(self.in_dim, self.out_dim).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.memory = RelpayBuffer(buffer_size, batch_size)
        self.t_step = 0  # counts calls to step(), modulo `up`

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``up`` calls, learn from a sampled batch."""
        self.memory.push(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % up
        if self.t_step != 0:
            return
        # Not enough data yet to draw a full batch without replacement.
        if len(self.memory) < self.memory.batch_size:
            return
        self.train(self.memory.sample())

    def train(self, train_set, tau=1e-3):
        """Run one gradient step on a batch, then soft-update the target network.

        Args:
            train_set: Tuple ``(state, action, reward, next_state, done)`` of
                tensors as returned by the replay buffer's ``sample``.
            tau: Interpolation factor for the target-network soft update.
        """
        state, action, reward, next_state, done = train_set

        self.model.eval()
        with torch.no_grad():
            # Double DQN: the online net selects the action, the target net scores it.
            best_next_action = self.model(next_state).argmax(dim=1, keepdim=True)
            q_next = self.targetmodel(next_state).gather(1, best_next_action)
        self.model.train()

        # (1 - done) removes the bootstrap term for terminal transitions.
        q_targets = reward + (gamma * q_next * (1.0 - done))
        q_expect = self.model(state).gather(1, action)
        loss = F.mse_loss(q_expect, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update(self.model, self.targetmodel, tau)

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: NumPy array holding the current observation.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            Index of the chosen action (a NumPy integer).
        """
        if random.random() <= epsilon:
            # Explore over the agent's real action count, not a fixed 4;
            # no forward pass is needed on this branch.
            return random.choice(np.arange(self.out_dim))

        # Send the state to wherever the online network currently lives.
        model_device = next(self.model.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
        self.model.eval()
        with torch.no_grad():
            action_value = self.model(state)
        self.model.train()
        return np.argmax(action_value.cpu().numpy())

    def update(self, local_model, target_model, tau):
        """Soft-update: target <- tau * local + (1 - tau) * target, parameter by parameter."""
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)
              
        
        

In [103]:
# Create the environment, then the agent that will learn in it.
env = gym.make("LunarLander-v2")
agent = DQNAgent()
# Initial observation; train() calls env.reset() again at the start of every episode.
state = env.reset()

In [104]:
def train(epoch = 2000,maxt = 1000,eps_start = 1, eps_end = 0.25, eps_decay = 0.995,
          environment = None, learner = None):
    """Run epsilon-greedy training episodes and report per-episode scores.

    Args:
        epoch: Number of episodes to run.
        maxt: Maximum number of environment steps per episode.
        eps_start: Initial exploration rate.
        eps_end: Lower bound on the exploration rate.
        eps_decay: Multiplicative decay applied to epsilon after each episode.
        environment: Object providing ``reset()`` (returning the initial
            observation) and ``step(action)`` (returning a 4-tuple
            ``(next_state, reward, done, info)``). Defaults to the
            notebook-level ``env``.
        learner: Object providing ``choose_action(state, epsilon)`` and
            ``step(state, action, reward, next_state, done)``. Defaults to
            the notebook-level ``agent``.

    Returns:
        Tuple ``(rewards, avg_rewards)``: the score of every episode and the
        running mean over the last (up to) 100 episodes after each one.
    """
    # Fall back to the notebook globals only when nothing was passed in;
    # the globals are not touched otherwise.
    environment = env if environment is None else environment
    learner = agent if learner is None else learner

    rewards = []
    avg_rewards = []
    epsilon = eps_start

    for i in range(1, epoch + 1):
        state = environment.reset()
        score = 0

        for _ in range(maxt):
            action = learner.choose_action(state, epsilon)
            next_state, reward, done, _info = environment.step(action)
            learner.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        rewards.append(score)
        # Computed once and reused for both the history and the log line.
        avg_reward = np.mean(rewards[-100:])
        avg_rewards.append(avg_reward)
        epsilon = max(eps_end, epsilon * eps_decay)

        print(f"epoch{i} reward:{score} avg_reward:{avg_reward}")

    return rewards, avg_rewards
    

In [105]:
# Launch training with the default arguments from train()'s signature.
train()

epoch1 reward:-183.30163653802373 avg_reward:-183.30163653802373
epoch2 reward:-273.07648876434996 avg_reward:-228.18906265118684
epoch3 reward:-199.6962797203148 avg_reward:-218.6914683408962
epoch4 reward:-416.0307169780808 avg_reward:-268.02628050019234
epoch5 reward:-156.51534742458296 avg_reward:-245.7240938850705
epoch6 reward:-261.89975539706677 avg_reward:-248.42003747040317
epoch7 reward:-103.71063867228264 avg_reward:-227.7472662135288
epoch8 reward:-459.59722937714855 avg_reward:-256.7285116089813
epoch9 reward:-121.6721916374494 avg_reward:-241.72225383436665
epoch10 reward:-340.9586304442572 avg_reward:-251.64589149535573
epoch11 reward:-281.0030713116256 avg_reward:-254.31472602410756
epoch12 reward:-113.61653997331067 avg_reward:-242.58987718654114
epoch13 reward:-237.67400155466623 avg_reward:-242.21173290716615
epoch14 reward:-275.5325869359283 avg_reward:-244.5917939092206
epoch15 reward:-185.2022657868107 avg_reward:-240.63249203439327
epoch16 reward:-301.57567232768

KeyboardInterrupt: 