# Einfache Einführung in Reinforcement Learning

In diesem Beispiel implementieren wir das Q-Learning-Verfahren, um einem Agenten beizubringen, sich durch eine einfache Umgebung zu bewegen.

In [5]:
import numpy as np

# Environment: a minimal 4x4 grid world
class GridWorld:
    """Deterministic 4x4 grid world with two terminal cells.

    States are ``(row, col)`` tuples. Every episode starts at ``(3, 0)``.
    Reaching ``(3, 3)`` ends the episode with reward ``1``; reaching
    ``(0, 0)`` ends it with reward ``-1``. Every other step yields
    ``-0.01``.

    Actions: ``0`` = up, ``1`` = right, ``2`` = down, ``3`` = left.
    Moves that would leave the grid are clamped to the border.
    """

    def __init__(self):
        # 4x4 array of zeros; kept as a public attribute, not read by step().
        self.grid = np.zeros((4, 4))
        # Cells that end an episode.
        self.terminal_states = [(0, 0), (3, 3)]
        # Put the agent on the start cell.
        self.reset()

    def reset(self):
        """Move the agent back to the start cell ``(3, 0)`` and return it."""
        self.state = (3, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Any action other than 0-3 leaves the position unchanged but is
        still scored like a regular step.
        """
        row, col = self.state

        if action in (0, 2):
            # Vertical move: 0 goes up (row - 1), 2 goes down (row + 1).
            row = max(0, row - 1) if action == 0 else min(3, row + 1)
        elif action in (1, 3):
            # Horizontal move: 1 goes right (col + 1), 3 goes left (col - 1).
            col = min(3, col + 1) if action == 1 else max(0, col - 1)

        self.state = (row, col)

        # Ordinary cell: small step penalty, episode continues.
        if self.state not in self.terminal_states:
            return self.state, -0.01, False

        # Terminal cell: +1 for (3, 3), -1 for the other terminal cell.
        reward = 1 if self.state == (3, 3) else -1
        return self.state, reward, True

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Tabular Q-learning: hyperparameters and value table
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration probability (exploration vs. exploitation)

# One Q-value per (row, col, action): 4x4 states, 4 actions, initialised to zero.
q_table = np.zeros(shape=(4, 4, 4))

def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action in ``0..3`` is
    returned; otherwise the action with the highest Q-value for ``state``
    (ties resolve to the lowest action index, as ``np.argmax`` does).
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Exploit: greedy action according to the current Q-table.
        return np.argmax(q_table[state])
    # Explore: uniformly random action.
    return np.random.randint(4)

def learn(episodes=500, log_every=100):
    """Train the module-level ``q_table`` with one-step Q-learning.

    A fresh ``GridWorld`` is created and ``episodes`` episodes are played
    with the epsilon-greedy policy from ``choose_action``. ``q_table`` is
    updated in place after every step.

    Parameters
    ----------
    episodes : int, optional
        Number of training episodes (default 500).
    log_every : int, optional
        Print the episode's total reward every ``log_every`` episodes
        (default 100). Pass ``0`` to disable progress output.

    Returns
    -------
    None
    """
    env = GridWorld()

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = choose_action(state)
            next_state, reward, done = env.step(action)
            total_reward += reward

            # Q-learning update: move Q(s, a) towards the bootstrapped target
            # r + gamma * max_a' Q(s', a'). Entries for terminal states are
            # never updated (the loop exits on reaching them), so they stay
            # at their initial zeros and the bootstrap term vanishes there.
            td_target = reward + gamma * np.max(q_table[next_state])
            q_table[state][action] += alpha * (td_target - q_table[state][action])

            state = next_state

        # `log_every` of 0 (or None) switches logging off and avoids a
        # modulo-by-zero.
        if log_every and (episode + 1) % log_every == 0:
            print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

# Train the agent: runs the Q-learning loop and updates the module-level q_table in place.
learn()

Episode: 100, Total Reward: 0.98
Episode: 200, Total Reward: 0.98
Episode: 300, Total Reward: 0.98
Episode: 400, Total Reward: 0.97
Episode: 500, Total Reward: 0.98


In [None]:
# Evaluate the agent
def test_agent(max_steps=100):
    """Roll out the greedy policy from the start state and print each step.

    The action with the highest Q-value in the module-level ``q_table`` is
    taken at every step (no exploration). Each transition, the final
    state and the number of steps are printed.

    Parameters
    ----------
    max_steps : int, optional
        Upper bound on the number of steps (default 100). The greedy
        policy is deterministic, so if the table has not converged the
        agent can cycle or keep pushing against a wall forever; the bound
        guarantees the rollout terminates.

    Returns
    -------
    None
    """
    env = GridWorld()
    state = env.reset()
    done = False
    steps = 0

    while not done and steps < max_steps:
        action = np.argmax(q_table[state])
        next_state, reward, done = env.step(action)
        print(f"Schritt {steps}: Zustand: {state} -> Aktion: {action} -> Nächster Zustand: {next_state} -> Reward: {reward}")
        state = next_state
        steps += 1

    if not done:
        # The step budget ran out before a terminal state was reached.
        print(f"Abbruch nach {max_steps} Schritten: kein Endzustand erreicht.")

    print("Endgültiger Zustand: ", state)
    print("Anzahl der Schritte: ", steps)

# Roll out the greedy policy from the start state and print every transition.
test_agent()

Schritt 0: Zustand: (3, 0) -> Aktion: 1 -> Nächster Zustand: (3, 1) -> Reward: -0.01
Schritt 1: Zustand: (3, 1) -> Aktion: 1 -> Nächster Zustand: (3, 2) -> Reward: -0.01
Schritt 2: Zustand: (3, 2) -> Aktion: 1 -> Nächster Zustand: (3, 3) -> Reward: 1
Endgültiger Zustand:  (3, 3)
Anzahl der Schritte:  3
