In [7]:
import gymnasium as gym
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers.legacy import Adam

In [8]:
class DQLAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    The agent owns a small fully connected Keras network that maps a state to
    one Q-value per discrete action, a bounded replay memory of transitions,
    and an exploration rate (`epsilon`) that is decayed over time.
    """

    def __init__(self, env):
        """Read the problem dimensions from `env` and build the Q-network.

        Args:
            env: environment exposing `observation_space.shape` and a discrete
                `action_space` (with `.n` and `.sample()`).
        """
        # keep our own handle on the environment so `act` never depends on a
        # notebook-level global that happens to be called `env`
        self.env = env

        # parameter / hyperparameter
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        self.gamma = 0.95            # discount applied to the bootstrapped next-state value
        self.learning_rate = 0.001   # step size handed to the Adam optimizer

        self.epsilon = 1             # probability of taking a random action
        self.epsilon_decay = 0.995   # multiplicative decay applied by adaptiveGreedy()
        self.epsilon_min = 0.01      # floor for epsilon

        # replay memory; the oldest transitions are dropped once 1000 are stored
        self.memory = deque(maxlen=1000)

        self.model = self.build_model()

    def build_model(self):
        """Create and compile the Q-network.

        Returns:
            A compiled Sequential model: `state_size` inputs, one hidden layer of
            48 tanh units, and a linear output with one unit per action.
        """
        # NN for Deep Q
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation='tanh'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        # storage
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: a batch holding one observation (the training loop passes an
                array of shape (1, state_size)); only the first row of the
                network output is used.

        Returns:
            A random action with probability `epsilon`, otherwise the action with
            the highest predicted Q-value.
        """
        # acting
        if random.uniform(0, 1) < self.epsilon:
            # explore: sample from the environment this agent was built for
            return self.env.action_space.sample()

        # exploit
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the Q-network on a random minibatch drawn from the replay memory.

        Does nothing until at least `batch_size` transitions have been stored.
        The targets for the whole minibatch are computed with two batched forward
        passes (instead of two `predict` calls per transition), then the network
        is fitted with one call that still performs one gradient update per
        sampled transition.

        Args:
            batch_size: number of transitions to sample.
        """
        # training
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        # stack the stored (1, state_size) rows into (batch, state_size) arrays
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        # start from the current predictions so only the taken action's Q-value
        # contributes to the loss
        train_targets = self.model.predict(states, verbose=0)
        best_next_q = np.amax(self.model.predict(next_states, verbose=0), axis=1)

        # finished transitions get the bare reward, the rest bootstrap from next_state
        targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)
        train_targets[np.arange(len(minibatch)), actions] = targets

        # batch_size=1 keeps the one-update-per-transition schedule while paying
        # the fit() call overhead only once
        self.model.fit(states, train_targets, batch_size=1, epochs=1, verbose=0)

    def adaptiveGreedy(self):
        """Decay epsilon by `epsilon_decay`, never letting it drop below `epsilon_min`."""
        # epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [9]:
# initialize env and agent
env = gym.make('CartPole-v1')
agent = DQLAgent(env)

batch_size = 16
episodes = 100
for e in range(episodes):
    # init env; reset() returns (observation, info)
    state, _ = env.reset()
    # the agent works on batches of one observation: shape (1, state_size)
    state = np.reshape(state, [1, agent.state_size])

    while True:
        #act
        action = agent.act(state)
        #step: Gymnasium reports failure (terminated) and time limit (truncated) separately
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = np.reshape(next_state, [1, agent.state_size])
        #remember/storage: only a real terminal state stops bootstrapping in replay();
        #a time-limit cut-off still has a meaningful next_state value
        agent.remember(state, action, reward, next_state, terminated)
        #update state
        state = next_state
        #replay
        agent.replay(batch_size)
        #adjust epsilon
        agent.adaptiveGreedy()

        # the episode is over on failure OR when the time limit is hit
        if terminated or truncated:
            print(f'Episode: {e}')
            break

Episode: 0
Episode: 1
Episode: 2



KeyboardInterrupt

