In [1]:
!pip install gym



In [1]:
from gym.spaces import Discrete, Box
from gym import Env
from gym import spaces
import numpy as np
import gym
import random
import time

In [2]:
class See(gym.Env):
    """Windy grid world ("See" = lake) on a 7x10 grid.

    The agent starts at (3, 0) and must reach (3, 7). States are (row, col)
    tuples with row 0 at the top. Every step costs 1 except the step that
    lands on the goal, which costs 0 and ends the episode.
    """

    def __init__(self):
        """Set up grid geometry, start/goal cells, spaces and the wind profile."""
        super().__init__()

        # Grid layout: (number of rows, number of columns)
        self.grid_size = (7, 10)
        self.start_state = (3, 0)  # S = start
        self.goal_state = (3, 7)  # G = goal
        self.state = self.start_state  # the agent begins on the start cell

        # Four actions: 0 = up, 1 = down, 2 = left, 3 = right
        self.action_space = spaces.Discrete(4)
        n_rows, n_cols = self.grid_size
        self.observation_space = spaces.Tuple((
            spaces.Discrete(n_rows),
            spaces.Discrete(n_cols),
        ))

        # Upward wind strength, one entry per column
        self.wind_strength = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]

    def step(self, action):
        """Apply ``action`` and the wind, then report the outcome.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right. Any other value
                leaves the position unchanged before the wind is applied.

        Returns:
            Tuple ``(state, cost, done, info)``: the new (row, col) state,
            the step cost (0 on the goal, otherwise 1), whether the goal was
            reached, and an empty info dict.
        """
        row, col = self.state
        last_row = self.grid_size[0] - 1
        last_col = self.grid_size[1] - 1

        # Move one cell, clamped to the grid borders.
        if action == 0:  # up
            row = max(row - 1, 0)
        elif action == 1:  # down
            row = min(row + 1, last_row)
        elif action == 2:  # left
            col = max(col - 1, 0)
        elif action == 3:  # right
            col = min(col + 1, last_col)

        # Wind pushes the agent upwards (towards row 0), clamped at the top.
        # NOTE(review): the strength is read from the column the agent lands
        # in, not the one it left - confirm this is the intended dynamics.
        row = max(row - self.wind_strength[col], 0)

        self.state = (row, col)

        # Reaching the goal is free and terminal; every other step costs 1.
        done = self.state == self.goal_state
        cost = 0 if done else 1

        return self.state, cost, done, {}

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def render(self):
        """Print the grid: 'S' start, 'G' goal, 'A' agent, '.' everything else.

        The agent marker is drawn last, so it replaces 'G' on the goal cell;
        on the start cell no 'A' is drawn and the 'S' stays visible.
        """
        grid = np.full(self.grid_size, '.', dtype=str)
        grid[self.start_state] = 'S'
        grid[self.goal_state] = 'G'

        row, col = self.state
        if self.state != self.start_state:
            grid[row, col] = 'A'  # current agent position

        lines = [" ".join(cells) for cells in grid]
        print("\n".join(lines))
        print()


# Try out the environment: create it, reset to the start state and print the grid
env = See()
env.reset()
env.render()

. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .



In [3]:
def encode_state(state_tupel, n_cols=10):
    """Flatten a (row, col) grid position into a single integer index.

    The index is ``row * n_cols + col`` (row-major order), e.g. (3, 0) -> 30
    for the default 10-column grid.

    Args:
        state_tupel: Indexable pair ``(row, col)``.
        n_cols: Number of columns of the grid. Defaults to 10.

    Returns:
        The integer index of the cell.
    """
    row, col = state_tupel[0], state_tupel[1]
    return row * n_cols + col

def decode_state(state, n_cols=10):
    """Turn a flat integer state index back into a (row, col) tuple.

    Inverse of the row-major encoding ``row * n_cols + col``, e.g.
    30 -> (3, 0) for the default 10-column grid.

    Args:
        state: Integer state index.
        n_cols: Number of columns of the grid. Defaults to 10.

    Returns:
        Tuple ``(row, col)``.
    """
    return (state // n_cols, state % n_cols)

## SARSA Algorithmus

In [4]:
# Configuration
DEBUG = False  # when True, the training loop prints and renders every step
time_delay = 0  # seconds to pause after each rendered frame in DEBUG mode (e.g. 0.5 to watch the agent)

In [5]:
# Hyperparameters for the epsilon-greedy policy and the Q-value update

epsilon = 0.1  # exploration parameter read by epsilon_greedy
alpha = 0.1  # step size (learning rate) used in the Q-value update
m = 4  # number of actions; divisor in the epsilon-greedy probability

In [6]:
def epsilon_greedy(a, eps=None, n_actions=None):
    """Pick an action epsilon-greedily around the greedy action ``a``.

    With probability ``1 - eps + eps / n_actions`` the greedy action is
    returned; otherwise one of the remaining actions is drawn uniformly, so
    every non-greedy action has probability ``eps / n_actions``.

    Args:
        a: Index of the greedy action.
        eps: Exploration parameter. Defaults to the module-level ``epsilon``.
        n_actions: Number of available actions (indices ``0..n_actions-1``).
            Defaults to the module-level ``m``.

    Returns:
        ``a`` itself or the index of one of the other actions.
    """
    if eps is None:
        eps = epsilon
    if n_actions is None:
        n_actions = m

    if np.random.rand() < 1 - eps + eps / n_actions:
        return a

    # Explore: uniform choice among all actions except the greedy one.
    return random.choice([other for other in range(n_actions) if other != a])
    

In [17]:
# SARSA training (cost-minimising, undiscounted): learns the table Q[state, action].

count_done = 0  # number of episodes that ended by reaching the goal

env = See()

# Q-table: one row per encoded grid cell, one column per action, initialised to 0.
Q = np.zeros((env.grid_size[0] * env.grid_size[1], env.action_space.n))

# Maximum number of steps per episode (inner counter)
t_end = 1000
# Number of training episodes (outer counter)
n_episodes = 400000

# a[t] is the action chosen at time step t of the current episode.
a = np.zeros(t_end + 1, dtype=np.int16)

for i in range(n_episodes):
    # Every episode starts from the start state.
    env.reset()

    # First action: epsilon-greedy around the action with the lowest Q-value.
    a[0] = epsilon_greedy(np.argmin(Q[encode_state(env.state), :]))

    # Trajectory of this episode: visited states and incurred costs.
    s = [env.state]
    c = np.zeros(t_end + 1)

    # Time step of the transition about to be observed.
    t = 1

    if DEBUG:
        print("i: ", i)
        env.render()
        time.sleep(time_delay)

    while True:
        next_state, cost, done, info = env.step(a[t-1])

        if DEBUG:
            print(f"Führe {a[t-1]} aus")
            env.render()
            time.sleep(time_delay)

        # Record the observed cost and successor state s_t.
        c[t] = cost
        s.append(next_state)

        # Choose the next action in the successor state (on-policy).
        a[t] = epsilon_greedy(np.argmin(Q[encode_state(next_state), :]))

        # SARSA update: move Q(s_{t-1}, a_{t-1}) towards c_t + Q(s_t, a_t).
        prev = encode_state(s[t-1])
        Q[prev, a[t-1]] = (1-alpha) * Q[prev, a[t-1]] + alpha * (c[t] + Q[encode_state(s[t]), a[t]])

        t += 1

        # Stop on reaching the goal or when the step budget is used up.
        if done:
            count_done += 1
        if done or t > t_end:
            break

In [18]:
# Action index -> one-letter label for printing the policy.
# Actions are 0 = up ("hoch"), 1 = down, 2 = left ("links"), 3 = right ("rechts");
# "u" presumably stands for "unten" (down).
nbr_to_letter = {0:"h", 1:"u", 2:"l", 3:"r"}

In [19]:
# Greedy policy: for every encoded state take the action with the lowest Q-value.
pi = np.argmin(Q, axis=1)

# Print the policy as 7 rows of 10 action letters, one list per grid row.
for i in range(7):
    row_actions = pi[i * 10:(i + 1) * 10]
    print([nbr_to_letter.get(action) for action in row_actions])

['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'u']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'u']
['r', 'r', 'r', 'r', 'r', 'r', 'h', 'r', 'r', 'u']
['r', 'r', 'r', 'r', 'r', 'r', 'h', 'h', 'r', 'u']
['r', 'r', 'r', 'r', 'r', 'h', 'h', 'h', 'r', 'u']
['r', 'r', 'r', 'r', 'h', 'h', 'h', 'h', 'l', 'u']
['r', 'r', 'r', 'h', 'h', 'h', 'h', 'h', 'h', 'l']


In [23]:
# Test: follow the learned greedy policy `pi` from the start state and count the steps.

env = See()
env.reset()
s_0 = env.state  # start state of the rollout

# Safety cap: a policy that never reaches the goal would otherwise loop forever.
max_steps = 1000

count_steps = 0
done = False

while count_steps < max_steps:

    state, cost, done, info = env.step(pi[encode_state(env.state)])

    count_steps += 1

    env.render()
    time.sleep(0.1)

    if done:
        break

if not done:
    print(f"Abbruch nach {max_steps} Schritten: Ziel nicht erreicht")

print(f"Anzahl Schritte: {count_steps}")

. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .
S A . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .
S . A . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . . . . .
. . . . . . . . . .
. . . A . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . . . . .
. . . . A . . . . .
. . . . . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . A . . . .
. . . . . . . . . .
. . . . . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . A . . .
. . . . . . . . . .
. . . . . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . . A . .
. . . . . . . . . .
. . . . . . . . . .
S . . . . . . G . .
. . . . . . . . . .
. . . . . . . . . .
. . . . . . . . . .

. . . . . . .

In [None]:
# Empty list; nothing in the visible part of the notebook fills or reads it.
# NOTE(review): presumably meant to hold an optimal policy (pi*) - confirm or remove.
pi_star = []