In [1]:
import os
import collections
import random
import gym
import numpy as np
from typing import Deque

In [2]:
from cartPoleDqn import DQN

In [3]:
PROJECT_PATH = os.path.abspath("C:/Selbststudium/Udemy/Udemy_AI_")
MODELS_PATH = os.path.join(PROJECT_PATH, "models")
MODEL_PATH = os.path.join(MODELS_PATH, "dqn_cartpole.h5")

In [4]:
class Agent:
    def __init__(self, env: gym.Env):
        # DQN Env Variables
        self.env = env
        self.observations = self.env.observation_space.shape
        self.actions = self.env.action_space.n
        # DQN Agent Variables
        self.replay_buffer_size = 50_000
        self.train_start = 1_000 
            # Ab wie vielen gemachten Spielzügen im Replay-Buffer soll mit dem Training begonnen werden?
            # Das Netzwerk wird so nach 1_000 Spielzügen eingeschaltet
        self.memory = collections.deque(maxlen=self.replay_buffer_size)
            # Eine Liste an der an beiden Seiten etwas ändern kann
            # Ist der Speicher erstmal voll, werden die Daten von rechts nach links verschoben, 
            # bzw. die ältesten Daten werden zugunsten der neuen Daten gelöscht
        self.gamma = 0.95
        self.epsilon = 1.0
            # Wie viel Prozent der der Aktionen sollen schon zu Beginn zufällig gewählt sein?
            # Dieser Wert wird im Laufe des Trainings reduziert bis epsilon_min
        self.epsilon_min = 0.01
            # Minimaler Prozentsatz um zufällige Aktionen auszuführen
            # In einem Prozent der Fälle wollen wir noch eine zufällige Aktion haben
        self.epsilon_decay = 0.999 
            # Je näher an der 1, desto mehr Spielzüge benötigt man,
            # um mit epsilon an epsilon_min anzukommen
        # DQN Network Variables
        self.state_shape = self.observations
        self.learning_rate = 1e-3
        self.dqn = DQN(
            self.state_shape,
            self.actions,
            self.learning_rate
        )
        self.target_dqn = DQN(
            self.state_shape,
            self.actions,
            self.learning_rate
        )
        self.target_dqn.update_model(self.dqn)
        self.batch_size = 32

    def get_action(self, state):
        if np.random.rand() <= self.epsilon: 
            return np.random.randint(self.actions)
        else:
            return np.argmax(self.dqn(state)) # Die Aktion mit dem höchsten q-Value

    def train(self, num_episodes):
        last_rewards: Deque = collections.deque(maxlen=10)
        print(f"last rewards: {last_rewards}")
        best_reward_mean = 0.0
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32) # Wieder für TF
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action) # ausführen des steps
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                if done and total_reward < 500: # reward = 500 --> Gewonnen
                    reward = -100 # Verloren "böse bestrafen"
                self.remember(state, action, reward, next_state, done)
                self.replay()
                total_reward += reward
                state = next_state
                if done:
                    if total_reward < 500:
                        total_reward += 100
                    self.target_dqn.update_model(self.dqn)
                    print(f"Episode: {episode} --- Reward: {reward} --- Epsilon: {self.epsilon}")
                    last_rewards.append(total_reward)
                    current_reward_mean = np.mean(last_rewards)
                    print(f"current reward mean: {current_reward_mean} --- best_reward_mean: {best_reward_mean}")
                    if current_reward_mean >= best_reward_mean:
                        best_reward_mean = current_reward_mean
                        self.dqn.save_model(MODEL_PATH)
                    break

    def remember(self, state, action, reward, next_state, done ):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def replay(self):
        if len(self.memory) < self.train_start:
            return
        
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, states_next, dones = zip(*minibatch)
        
        states = np.concatenate(states).astype(np.float32)
        states_next = np.concatenate(states_next).astype(np.float32)

        q_values = self.dqn(states)
        q_values_next = self.target_dqn(states_next)

        # Nun folgt die Umsetzung der theoretischen Formel:
        for i in range(self.batch_size):
            a = actions[i]
            done = dones[i]
            if done:
                q_values[i][a] = rewards[i]
            else: 
                q_values[i][a] = rewards[i] + self.gamma * np.max(q_values_next[i])

        self.dqn.fit(states, q_values) 
            # Training des Netzwerks auf den aktualisierten q_values, 
            # basierend auf den Aktionen, welche ausgeführt wurden

    def play(self, num_episodes, render=True):
        self.dqn.load_model(MODEL_PATH)
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32) # Wieder für TF
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action) # ausführen des steps
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                total_reward += reward
                state = next_state
                if done:
                    print(f"Episode: {episode} --- Reward: {reward} --- Epsilon: {self.epsilon}")
                    break

In [5]:
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    agent = Agent(env)
    agent.train(num_episodes=200)
    input("Play?")
    agent.play(num_episodes=10, render=True)

last rewards: deque([], maxlen=10)
Episode: 1 --- Reward: -100 --- Epsilon: 0.9714023696327185
current reward mean: 28.0 --- best_reward_mean: 0.0
Episode: 2 --- Reward: -100 --- Epsilon: 0.9455126435024219
current reward mean: 27.0 --- best_reward_mean: 28.0
Episode: 3 --- Reward: -100 --- Epsilon: 0.9323611649219127
current reward mean: 22.333333333333332 --- best_reward_mean: 28.0
Episode: 4 --- Reward: -100 --- Epsilon: 0.9203129279589385
current reward mean: 19.75 --- best_reward_mean: 28.0
Episode: 5 --- Reward: -100 --- Epsilon: 0.8868671875860644
current reward mean: 23.0 --- best_reward_mean: 28.0
Episode: 6 --- Reward: -100 --- Epsilon: 0.8780383184956015
current reward mean: 20.666666666666668 --- best_reward_mean: 28.0
Episode: 7 --- Reward: -100 --- Epsilon: 0.8597827393003539
current reward mean: 20.571428571428573 --- best_reward_mean: 28.0
Episode: 8 --- Reward: -100 --- Epsilon: 0.838544138970058
current reward mean: 21.0 --- best_reward_mean: 28.0
Episode: 9 --- Rewar