In [65]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib as plt
import torch.nn as nn 
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from matplotlib import pyplot as plt

In [66]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [67]:
class DQN(nn.Module):
    """Fully connected Q-network with two hidden ReLU layers of width 64.

    Maps a batch of ``in_dim``-feature inputs to ``out_dim`` outputs.
    """

    def __init__(self, in_dim, out_dim) -> None:
        """Build the layer stack.

        Args:
            in_dim: number of input features per sample.
            out_dim: number of output values per sample.
        """
        super().__init__()

        hidden = 64
        # Layers are created input-to-output so parameter initialisation draws
        # from the torch RNG in the same order, and the Linear layers keep
        # Sequential indices 0, 2 and 4 (the keys used in saved state dicts).
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input tensor ``x``."""
        return self.model(x)

    

In [68]:
import random
from collections import namedtuple
class RelpayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of transitions kept; once full, the
                deque discards the oldest entry on each append.
            batch_size: number of transitions drawn by each ``sample`` call.
        """
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.example = namedtuple(
            "example",
            field_names=["state", "action", "reward", "next_state", "done"],
        )

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.example(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` stored transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            moved to the module-level ``device``. Each tensor is one field
            stacked vertically across the batch. ``actions`` is int64; the
            others are float32 (``done`` flags pass through uint8 first, so
            they come out as 0.0 / 1.0).

        Raises:
            ValueError: from ``random.sample`` when fewer than ``batch_size``
                transitions are stored.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [item for item in drawn if item is not None]

        def column(field):
            # One row per transition, in the order the batch was drawn.
            return np.vstack([getattr(item, field) for item in batch])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        done_flags = column("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)
        

In [69]:
import torch.optim as optim

# Training hyperparameters, read as module-level globals by DQNAgent.
lr = 0.0005  # learning rate passed to the Adam optimiser
buffer_size = 100_000  # replay buffer capacity, in transitions
batch_size = 64  # transitions per sampled mini-batch
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
up = 4  # DQNAgent.step attempts a learning update every `up` calls


In [70]:
class DQNAgent():
    """Double-DQN agent with a replay buffer and a soft-updated target network.

    Reads the module-level settings ``device``, ``lr``, ``buffer_size``,
    ``batch_size``, ``gamma`` and ``up``.
    """

    def __init__(self, in_dim=8, out_dim=4) -> None:
        """Build the online/target networks, optimiser, scheduler and buffer.

        Args:
            in_dim: length of the observation vector fed to the network.
            out_dim: number of discrete actions the network scores.
        """
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.model = DQN(self.in_dim,self.out_dim).to(device)
        self.targetmodel = DQN(self.in_dim,self.out_dim).to(device)
        # Start the target network as an exact copy of the online network.
        # Otherwise the first bootstrapped targets come from an unrelated
        # random initialisation that the tau=1e-3 soft update only slowly erases.
        self.targetmodel.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(),lr=lr)
        self.memory = RelpayBuffer(buffer_size,batch_size)
        self.t_step = 0
        # gamma=1 keeps the learning rate constant; the scheduler is a hook
        # for experimenting with decay.
        self.scheduler = StepLR(self.optimizer, step_size=80, gamma=1)

    def step(self, state, action, reward, next_state,done):
        """Store one transition and, every ``up`` calls, learn from a batch.

        No learning happens until the buffer holds at least ``batch_size``
        transitions.
        """
        self.memory.push(state,action,reward,next_state,done)
        self.t_step = (self.t_step + 1) % up
        if self.t_step == 0 and len(self.memory) >= batch_size:
            self.train(self.memory.sample())

    def train(self,train_set):
        """Run one gradient step on a sampled batch, then soft-update the target.

        Args:
            train_set: tuple ``(state, action, reward, next_state, done)`` of
                tensors as returned by ``RelpayBuffer.sample``.
        """
        tau = 1e-3
        state, action, reward, next_state, done = train_set

        self.model.eval()
        with torch.no_grad():
            # Double DQN: the online network picks the next action and the
            # target network evaluates it.
            q_pred = self.model(next_state)
            max_action = torch.argmax(q_pred,dim=1).long().unsqueeze(1)
            q_next = self.targetmodel(next_state)
        self.model.train()
        # (1 - done) removes the bootstrap term for terminal transitions.
        q_targets = reward + (gamma * q_next.gather(1,max_action) * (1.0 - done))
        q_expect = self.model(state).gather(1,action)
        loss = F.mse_loss(q_expect,q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # The scheduler must advance after the optimiser step; the reverse
        # order skips the first value of the schedule and triggers a warning.
        self.scheduler.step()
        self.update(self.model,self.targetmodel,tau)

    def choose_action(self,state,epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: numpy observation vector (only read on the greedy branch).
            epsilon: probability of taking a uniformly random action.

        Returns:
            Index of the chosen action, in ``range(self.out_dim)``.
        """
        # Decide first so exploration steps skip the network forward pass.
        # The forward pass draws no random numbers, so the `random` stream is
        # consumed exactly as before.
        if random.random() <= epsilon:
            return random.choice(np.arange(self.out_dim))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.model.eval()
        with torch.no_grad():
            action_value = self.model(state)
        self.model.train()
        return np.argmax(action_value.cpu().data.numpy())

    def update(self, local_model, target_model, tau):
        """Soft-update: ``target <- tau * local + (1 - tau) * target``.

        Args:
            local_model: network whose parameters are blended in.
            target_model: network updated in place.
            tau: interpolation weight given to ``local_model``.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)
              
        
        

In [71]:
# Create the agent; with no arguments it uses 8 network inputs and 4 actions
# (presumably matching LunarLander-v2's observation/action sizes — confirm).
agent = DQNAgent()
# Environment shared by the training loop below.
env = gym.make("LunarLander-v2")
# Initial reset. Note that train() calls env.reset() itself at the start of
# every episode, so this value is not what train() starts from.
state = env.reset()

In [72]:
def train(epoch = 2000,maxt = 1000,eps_start = 1, eps_end = 0.01, eps_decay = 0.995, save_every = 1000):
    """Train the module-level ``agent`` on the module-level ``env``.

    Each episode runs epsilon-greedy for at most ``maxt`` steps, and epsilon
    is decayed multiplicatively after every episode. Every ``save_every``
    episodes the model is saved and the reward curves are written to a PNG.

    Args:
        epoch: number of episodes to run.
        maxt: maximum number of environment steps per episode.
        eps_start: initial exploration rate.
        eps_end: lower bound for the exploration rate.
        eps_decay: factor applied to epsilon after each episode.
        save_every: checkpoint/plot interval in episodes; a falsy value
            disables checkpointing.

    Notes:
        ``env.step`` is unpacked as the 4-tuple
        ``(next_state, reward, done, info)``.
        ``torch.save(agent.model, ...)`` pickles the whole module, so loading
        a checkpoint needs the ``DQN`` class to be importable.
    """
    rewards = []
    avg_rewards = []
    epochs = []
    epsilon = eps_start

    for i in range(1, epoch + 1):
        epochs.append(i)
        state = env.reset()
        score = 0

        for _step in range(maxt):
            action = agent.choose_action(state, epsilon)
            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        rewards.append(score)
        # Rolling mean over the most recent 100 episodes (fewer early on).
        avg_reward = np.mean(rewards[-100:])
        avg_rewards.append(avg_reward)
        epsilon = max(eps_end, epsilon * eps_decay)

        # `i` is 1-based, so test it directly. The previous `(i + 1) % 1000`
        # fired at episodes 999 and 1999 and never saved the final episode.
        if save_every and i % save_every == 0:
            torch.save(agent.model, f'good_model{i}.pth')
            plt.figure(figsize=(24,6))
            plt.plot(epochs,rewards,label="reward")
            plt.plot(epochs,avg_rewards,label = "avg_reward")
            plt.xlabel("epoch")
            plt.legend()
            plt.savefig(f"model{i}reward.png", dpi = 400)
            # Close the figure so repeated checkpoints do not accumulate memory.
            plt.close()

        print(f"epoch{i} reward:{score} avg_reward:{avg_reward} lr:{agent.optimizer.param_groups[0]['lr']}")
    

In [73]:
# Run the training loop with its default arguments (2000 episodes, up to 1000
# steps each); per-episode progress is printed below.
train()

epoch1 reward:-285.76458040849184 avg_reward:-285.76458040849184 lr:0.0005
epoch2 reward:-132.00717267999326 avg_reward:-208.88587654424254 lr:0.0005
epoch3 reward:-284.127703985861 avg_reward:-233.9664856914487 lr:0.0005
epoch4 reward:-402.4293587423192 avg_reward:-276.0822039541663 lr:0.0005
epoch5 reward:-167.35131745987906 avg_reward:-254.33602665530884 lr:0.0005
epoch6 reward:-368.1569736106394 avg_reward:-273.30618448119725 lr:0.0005
epoch7 reward:-148.38509048350087 avg_reward:-255.46031391009777 lr:0.0005
epoch8 reward:-140.76788096228205 avg_reward:-241.1237597916208 lr:0.0005
epoch9 reward:-433.9893860713821 avg_reward:-262.5532738227054 lr:0.0005
epoch10 reward:-123.21836976392524 avg_reward:-248.61978341682737 lr:0.0005
epoch11 reward:-73.88078102774965 avg_reward:-232.73441956327486 lr:0.0005
epoch12 reward:-216.62685233885946 avg_reward:-231.39212229457357 lr:0.0005
epoch13 reward:-283.6356429737348 avg_reward:-235.41085465450902 lr:0.0005
epoch14 reward:-107.297326795138