### ***Random Agent***

In [1]:
import gym
# Baseline: play a few episodes with uniformly random actions and print the
# undiscounted return of each one.
env = gym.make('CartPole-v1')

episodes = 10

for episode in range(1, episodes + 1):
    # reset() hands back (observation, info); only the observation is kept.
    state, _ = env.reset()
    done = False
    score = 0

    while not done:
        action = env.action_space.sample()
        # step() yields five values: (obs, reward, terminated, truncated, info).
        n_state, reward, terminated, truncated, info = env.step(action)
        # The episode is over when the task fails OR the step limit is reached;
        # checking only the third value would ignore time-limit truncation.
        done = terminated or truncated
        score += reward
    print(f"Episode: {episode}, Score: {score}")
env.close()

Episode: 1, Score: 62.0
Episode: 2, Score: 20.0
Episode: 3, Score: 11.0
Episode: 4, Score: 38.0
Episode: 5, Score: 23.0
Episode: 6, Score: 14.0
Episode: 7, Score: 18.0
Episode: 8, Score: 28.0
Episode: 9, Score: 37.0
Episode: 10, Score: 12.0


  if not isinstance(terminated, (bool, np.bool8)):


### *Model*

In [2]:
import numpy as np
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
#from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam

from collections import deque
import random




# ***Deep Q Networks (DQN)***

In [6]:
class DQNAgent:
    """DQN agent with an experience-replay buffer and a separate target network.

    The online network (``self.model``) is trained on minibatches sampled from
    the replay buffer; the target network (``self.target_model``) supplies the
    bootstrap values and is synchronised through ``update_target_model``.
    """

    def __init__(self, state_size, action_size):
        """Create the agent.

        Args:
            state_size: Number of features in one observation vector.
            action_size: Number of discrete actions available.
        """
        # To see Cartpole learning, change to True
        self.render = False
        self.load_model = False

        # Define size of state and actions.
        # Use the constructor arguments rather than notebook-level globals so
        # the class works regardless of which cells have been executed.
        self.states = state_size
        self.actions = action_size

        # Hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.train_start = 1000  # minimum buffer fill before training begins
        self.batch_size = 64

        self.buffer_size = 2000
        self.memory = deque(maxlen=self.buffer_size)

        self.model = self.build_model()
        self.target_model = self.build_model()

        # Start with both networks holding identical weights.
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model: two 24-unit ReLU hidden
            layers followed by a linear layer with one output per action,
            trained with Adam on mean-squared error.
        """
        model = Sequential()
        model.add(Dense(24, input_shape=(self.states,), activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.actions, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(optimizer=Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer and decay epsilon.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward received for the action.
            next_state: Observation after the action.
            done: True when ``next_state`` must not be bootstrapped from.
        """
        # Store the experience in the replay buffer
        self.memory.append((state, action, reward, next_state, done))

        # Decay epsilon for exploration-exploitation trade-off
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: Observation batch of shape ``(1, state_size)`` as expected
                by ``self.model.predict``.

        Returns:
            The index of the selected action.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.actions)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def training_loop(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``train_start``
        transitions.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        minibatch = random.sample(self.memory, batch_size)

        # Split the sampled transitions into per-field arrays. The reshape
        # accepts states stored either as (state_size,) or (1, state_size).
        states, actions, rewards, next_states, dones = zip(*minibatch)
        state_batch = np.asarray(states, dtype=float).reshape(
            batch_size, self.states)
        next_state_batch = np.asarray(next_states, dtype=float).reshape(
            batch_size, self.states)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        # Current estimates from the online net; bootstrap values from the
        # target net.
        q_targets = self.model.predict(state_batch, verbose=0)
        next_q = self.target_model.predict(next_state_batch, verbose=0)

        # Q-learning target: r for terminal transitions, otherwise
        # r + gamma * max_a' Q_target(s', a'). Only the entry for the action
        # actually taken is overwritten, so the other outputs contribute zero
        # error to the MSE loss.
        bootstrapped = rewards + self.discount_factor * np.amax(next_q, axis=1)
        q_targets[np.arange(batch_size), actions] = np.where(
            dones, rewards, bootstrapped)

        # and do the model fit!
        self.model.fit(state_batch, q_targets, batch_size=batch_size,
                       epochs=1, verbose=0)




In [None]:
# Train the DQN agent on CartPole-v1 and stop early once it is reliably solved.
if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    # Define size of state and actions
    states = env.observation_space.shape[0]
    actions = env.action_space.n

    # Define number of episodes
    EPISODES = 150

    # Define Agent
    agent = DQNAgent(states, actions)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        failed = False
        score = 0
        state, _ = env.reset()
        state = np.reshape(state, [1, states])

        while not done:
            if agent.render:
                env.render()

            # Obtain action for the current state and go one step in environment
            action = agent.choose_action(state)
            # step() yields (obs, reward, terminated, truncated, info); the
            # episode ends on either flag, otherwise the loop would run past
            # the environment's step limit.
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            next_state = np.reshape(next_state, [1, states])

            # If an action makes the episode end in failure (not by reaching
            # the step limit), give a penalty of -100.
            failed = terminated and not truncated
            if failed:
                reward = -100

            # Store the sample to memory. Only a true termination is flagged
            # as terminal so that a time-limit cut-off still bootstraps from
            # the next state.
            agent.store_transition(state, action, reward, next_state,
                                   terminated)
            # Train the model
            agent.training_loop()
            # Update reward and state
            score += reward
            state = next_state

        # Update the target model with the network weights
        agent.update_target_model()
        # Every episode, record the play time with the failure penalty removed
        score = score + 100 if failed else score
        scores.append(score)
        episodes.append(e)

        print("episode:", e, "  score:", score, "  memory length:",
              len(agent.memory), "  epsilon:", agent.epsilon)

        # If the mean of scores of the last 10 episodes is bigger than 490,
        # stop training. `break` is used instead of sys.exit() so the
        # remaining notebook cells can still run.
        if np.mean(scores[-10:]) > 490:
            break

 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Test the trained model: act greedily (no exploration) with the agent's
# online network and report the undiscounted return of each episode.
test_episodes = 10
for episode in range(test_episodes):
    # reset() returns (observation, info); only the observation is reshaped.
    state, _ = env.reset()
    state = np.reshape(state, [1, states])

    total_reward = 0
    done = False

    while not done:
        # Choose action using the trained model held by the agent
        # (verbose=0 suppresses the per-call Keras progress bar).
        action = np.argmax(agent.model.predict(state, verbose=0)[0])

        # Take the chosen action and observe the next state and reward.
        # step() yields (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = np.reshape(next_state, [1, states])

        state = next_state
        total_reward += reward

    print(f"Test Episode: {episode + 1}, Total Reward: {total_reward}")

# Close the environment
env.close()