In [2]:
import numpy as np
import pandas as pd
import random
from collections import deque
import torch
import torch.nn as nn

In [None]:
from neural import Net

class Agent:
    """Double-DQN trading agent built on two PyTorch ``Net`` instances.

    ``model`` is the network that gets trained; ``model_target`` is a copy
    that is only used to evaluate bootstrap targets and is refreshed by
    ``update_target_from_model``.

    Replay entries in ``memory`` are expected to be tuples of
    ``(state, action, reward, next_state, done)`` where ``state`` and
    ``next_state`` carry a leading batch axis of length one (``exp_replay``
    reads ``state[0]``) and ``action`` is an integer index.

    Args:
        state_size: Number of input features per state.
        is_eval: When true, weights are loaded from ``model_name`` and
            ``act`` never explores.
        model_name: Path of a ``state_dict`` checkpoint; only read when
            ``is_eval`` is true.
    """

    def __init__(self, state_size, is_eval=False, model_name=""):
        self.state_size = state_size  # normalized previous days
        self.action_size = 3  # hold, buy, sell
        self.memory = deque(maxlen=5000)
        self.model_name = model_name
        self.is_eval = is_eval

        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Online network (trained) and target network (bootstrap targets).
        self.model = Net(self.state_size, self.action_size).float().to(self.device)
        self.model_target = Net(self.state_size, self.action_size).float().to(self.device)
        if is_eval:
            self.load(model_name)
        # Start with both networks holding identical weights.
        self.update_target_from_model()

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.loss_fn = nn.MSELoss()
        self.loss = []  # one training-loss value per exp_replay() call

    def load(self, model_name):
        """Load online-network weights from a ``state_dict`` checkpoint.

        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.
        """
        state_dict = torch.load(model_name, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def update_target_from_model(self):
        """Copy the online network's weights into the target network."""
        self.model_target.load_state_dict(self.model.state_dict())

    def act(self, state):
        """Return action weights of shape ``(n, 3)`` that sum to one per row.

        While training, with probability ``epsilon`` a random weight vector
        is returned; otherwise the softmax of the online network's outputs.
        """
        if not self.is_eval and np.random.rand() <= self.epsilon:
            a1 = random.random()
            a2 = random.uniform(0, 1 - a1)
            a3 = 1 - a1 - a2
            return np.array([[a1, a2, a3]])

        with torch.no_grad():
            state_t = torch.as_tensor(
                np.asarray(state), dtype=torch.float32, device=self.device
            ).reshape(-1, self.state_size)
            options = self.model(state_t)
            return torch.softmax(options, dim=1).cpu().numpy()

    def exp_replay(self, batch_size):
        """Run one double-DQN update on a random minibatch from ``memory``.

        The online network picks the best next action and the target
        network evaluates it. The resulting loss is appended to
        ``self.loss`` and ``epsilon`` is decayed towards ``epsilon_min``.

        Raises:
            ValueError: If ``memory`` holds fewer than ``batch_size`` entries
                (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)

        def as_float(values):
            return torch.as_tensor(
                np.array(values), dtype=torch.float32, device=self.device
            )

        states = as_float([tup[0][0] for tup in minibatch])
        actions = torch.as_tensor(
            np.array([tup[1] for tup in minibatch]), dtype=torch.long, device=self.device
        )
        rewards = as_float([tup[2] for tup in minibatch])
        next_states = as_float([tup[3][0] for tup in minibatch])
        done = torch.as_tensor(
            np.array([tup[4] for tup in minibatch]), dtype=torch.bool, device=self.device
        )

        q_pred = self.model(states)

        with torch.no_grad():
            # Action with the largest Q-value according to the learning agent.
            best_next = self.model(next_states).argmax(dim=1, keepdim=True)
            # gather() picks the target network's value for that action; a
            # one-hot mask followed by max() would return 0 for negative values.
            next_q = self.model_target(next_states).gather(1, best_next).squeeze(1)
            # Terminal transitions use the reward itself (no lookahead).
            target = torch.where(done, rewards, rewards + self.gamma * next_q)

            # Only the taken action's entry differs from the prediction, so
            # the other actions contribute zero error.
            target_f = q_pred.detach().clone()
            target_f[torch.arange(batch_size, device=self.device), actions] = target

        # Fit the current state to the computed future Q-value.
        loss = self.loss_fn(q_pred, target_f)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.loss.append(loss.item())

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


## neural

In [None]:
from torch import nn

class Net(nn.Module):
    """Fully connected network: input_dim -> 300 -> 200 -> 100 -> output_dim.

    ReLU activations follow every hidden layer; the output layer is linear.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output values.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Each layer must be a separate, comma-delimited argument.
        self.model = nn.Sequential(
            nn.Linear(input_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 200),
            nn.ReLU(),
            nn.Linear(200, 100),
            nn.ReLU(),
            nn.Linear(100, output_dim),
        )

    def forward(self, input):
        """Apply the layer stack to ``input`` and return the result."""
        return self.model(input)



## 통째로 바꿔보기

In [None]:
import torch
import random, numpy as np
from pathlib import Path

from neural import Net
from collections import deque


class Agent:
    """Double-DQN agent with an online network and a separate target network.

    ``net`` is trained on every learning step; ``target_net`` has the same
    architecture, is never optimized directly, and is overwritten with the
    online weights every ``sync_every`` steps.

    Args:
        state_size: Number of input features per state.
        save_dir: Directory (``str`` or ``Path``) that receives checkpoints.
        checkpoint: Optional checkpoint path to restore weights and the
            exploration rate from.
    """

    def __init__(self, state_size, save_dir, checkpoint=None):
        self.state_dim = state_size
        self.action_dim = 3
        self.memory = deque(maxlen=10000)
        self.batch_size = 32

        self.exploration_rate = 1                  # epsilon
        self.exploration_rate_decay = 0.9995       # epsilon decay
        self.exploration_rate_min = 0.1            # epsilon minimum
        self.gamma = 0.95

        self.curr_step = 0
        self.burnin = 100  # min. experiences before training
        self.learn_every = 3   # no. of experiences between updates to the online net
        self.sync_every = 1000   # no. of experiences between target/online sync

        self.save_every = 500   # no. of experiences between saving Agent
        self.save_dir = Path(save_dir)

        self.use_cuda = torch.cuda.is_available()

        # Online network (trained) and target network (bootstrap targets).
        self.net = self._on_device(Net(self.state_dim, self.action_dim).float())
        self.target_net = self._on_device(Net(self.state_dim, self.action_dim).float())
        if checkpoint:
            self.load(checkpoint)
        # Both networks start from the same weights.
        self.sync_Q_target()
        # The target network is only ever updated by copying.
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.loss_fn = torch.nn.MSELoss()

    def _on_device(self, obj):
        """Move a tensor or module to the GPU when CUDA is in use."""
        return obj.cuda() if self.use_cuda else obj

    def act(self, state):
        """Choose an action index for ``state`` and advance the step counter.

        With probability ``exploration_rate`` a random weight vector is drawn
        and its largest entry decides the action; otherwise the online
        network's highest-valued action is taken. The exploration rate is
        decayed (down to ``exploration_rate_min``) on every call.

        Returns:
            int: Index of the chosen action.
        """
        # EXPLORE
        if np.random.rand() < self.exploration_rate:
            a1 = random.random()
            a2 = random.uniform(0, 1 - a1)
            a3 = 1 - a1 - a2
            action_values = np.array([[a1, a2, a3]])
            # NOTE(review): a1 is drawn first and exceeds 0.5 half of the
            # time, so this favours action 0 -- confirm that is intended.
            action_idx = int(np.argmax(action_values, axis=1)[0])

        # EXPLOIT
        else:
            state = self._on_device(torch.FloatTensor(state))
            with torch.no_grad():
                action_values = self.net(state).reshape(-1, self.action_dim)
            action_idx = torch.argmax(action_values, axis=1).item()

        # decrease exploration_rate
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        # increment step
        self.curr_step += 1
        return action_idx

    def cache(self, state, next_state, action, reward, done):
        """
        Store the experience to self.memory (replay buffer)
        Inputs:
        state (array-like of floats),
        next_state (array-like of floats),
        action (int),
        reward (float),
        done (bool)
        """
        state = self._on_device(torch.FloatTensor(state))
        next_state = self._on_device(torch.FloatTensor(next_state))
        action = self._on_device(torch.LongTensor([action]))
        reward = self._on_device(torch.DoubleTensor([reward]))
        done = self._on_device(torch.BoolTensor([done]))

        self.memory.append((state, next_state, action, reward, done))

    def recall(self):
        """
        Retrieve a batch of experiences from memory

        Returns ``(state, next_state, action, reward, done)`` where the state
        tensors have shape ``(batch_size, features)`` and the rest have shape
        ``(batch_size,)``.
        """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(torch.stack, zip(*batch))
        # States cached with a leading singleton axis would stack to 3-D and
        # break per-action indexing of the Q-values; flatten per sample.
        state = state.reshape(self.batch_size, -1)
        next_state = next_state.reshape(self.batch_size, -1)
        return state, next_state, action.squeeze(-1), reward.squeeze(-1), done.squeeze(-1)

    def td_estimate(self, state, action):
        """Return the online network's Q-value of ``action`` for each state."""
        current_Q = self.net(state).gather(1, action.reshape(-1, 1)).squeeze(1)
        return current_Q

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        """Return the double-DQN target ``r + gamma * Q_target(s', a*)``.

        ``a*`` is the online network's best action in ``next_state``; the
        bootstrap term is dropped for terminal transitions.
        """
        next_state_Q = self.net(next_state)
        best_action = torch.argmax(next_state_Q, axis=1)
        next_Q = self.target_net(next_state).gather(1, best_action.reshape(-1, 1)).squeeze(1)
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()

    def update_Q_online(self, td_estimate, td_target):
        """Take one optimizer step on the TD error and return the loss value."""
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def sync_Q_target(self):
        """Overwrite the target network's weights with the online network's."""
        self.target_net.load_state_dict(self.net.state_dict())

    def learn(self):
        """Sync/save on schedule and, when due, run one training update.

        Returns:
            ``(mean_q, loss)`` after an update, or ``(None, None)`` when no
            update was performed on this step.
        """
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()

        if self.curr_step % self.save_every == 0:
            self.save()

        if self.curr_step < self.burnin:
            return None, None

        if self.curr_step % self.learn_every != 0:
            return None, None

        # Not enough experiences to draw a full batch yet.
        if len(self.memory) < self.batch_size:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.recall()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through the online network
        loss = self.update_Q_online(td_est, td_tgt)

        return (td_est.mean().item(), loss)

    def save(self):
        """Write the online weights and exploration rate to ``save_dir``."""
        self.save_dir.mkdir(parents=True, exist_ok=True)
        save_path = self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt"
        torch.save(
            dict(
                model=self.net.state_dict(),
                exploration_rate=self.exploration_rate
            ),
            save_path
        )
        print(f"MarioNet saved to {save_path} at step {self.curr_step}")

    def load(self, load_path):
        """Restore weights and exploration rate from a checkpoint file.

        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.

        Raises:
            ValueError: If ``load_path`` does not exist.
        """
        load_path = Path(load_path)
        if not load_path.exists():
            raise ValueError(f"{load_path} does not exist")

        ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
        exploration_rate = ckp.get('exploration_rate')
        state_dict = ckp.get('model')

        print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
        self.net.load_state_dict(state_dict)
        self.target_net.load_state_dict(state_dict)
        # Keep the current rate when the checkpoint does not carry one.
        if exploration_rate is not None:
            self.exploration_rate = exploration_rate