---
# Programmation dynamique


Fabrice Mulotti<br>

v2 2023

---

In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import time

---
## Frozen Lake

Découvrons notre environnement <br>
<br>
![ForzenLake](images/frozen_lake.gif)

https://gymnasium.farama.org/environments/toy_text/frozen_lake/

In [3]:
env = gym.make('FrozenLake8x8-v1',is_slippery = False,map_name="4x4", render_mode="ansi") # ,render_mode="human")

In [4]:
# affichage
env.reset()
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [5]:
# nombre d'états
env.observation_space.n

np.int64(16)

In [6]:
# nombre d'actions possibles
env.action_space.n

np.int64(4)

In [7]:
LEFT=0
DOWN=1
RIGHT=2
UP=3

In [8]:
# tirage aléatoire de fonction
env.action_space.sample()

np.int64(1)

In [9]:
print(env.reset())

(0, {'prob': 1})


---
## action

https://gymnasium.farama.org/api/env/#gymnasium.Env.step
<br>
env.step retourne les infos suivantes :<br>
- observation (s')<br>
- reward (r)<br>
- termination (bool)<br>
- truncated (bool)<br>
- info <br>

In [10]:
r=env.step(2)

In [11]:
print(r)

(1, 0.0, False, False, {'prob': 1.0})


In [12]:
print(f"Récompense {r[1]}")

Récompense 0.0


## Matrice de transition

__env.P[etat][action] retourne :__<br>
n fois :<br>
Probabilité<br>
s'<br>
r<br>
état final ? <br>


In [13]:
# Matrice de transition, exemple s=4
env.unwrapped.P[4]

{0: [(1.0, 4, 0.0, False)],
 1: [(1.0, 8, 0.0, False)],
 2: [(1.0, 5, 0.0, True)],
 3: [(1.0, 0, 0.0, False)]}

Si le sol n'est pas glissant : <br>
1 action => 1 état suivant <br>

Si le sol est glissant : <br>
3 destinations possibles (33% de prob), dont une en terminaison <br>

In [14]:
# récompense
env.unwrapped.P[4][RIGHT][0][2]

0.0

In [15]:
# prochain état 
env.unwrapped.P[4][RIGHT][0][1]

5

---
## Test complet

In [16]:
# S: initial state
# F: frozen lake
# H: hole
# G: the goal

env.reset()
fin=False
print(env.render())
c=0
while not fin:
    action=env.action_space.sample()
    r=env.step(action)
    print(f"Action={action}, {r}")
    fin = r[2] or r[3]
    time.sleep(0.5)
    print(env.render())
    c+=1
    if c==10:
        fin=True


[41mS[0mFFF
FHFH
FFFH
HFFG

Action=1, (4, 0.0, False, False, {'prob': 1.0})
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

Action=0, (4, 0.0, False, False, {'prob': 1.0})
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG

Action=0, (4, 0.0, False, False, {'prob': 1.0})
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG

Action=2, (5, 0.0, True, False, {'prob': 1.0})
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG



---
# Itération sur politique

![Politique](images/politique.png)

In [17]:
theta = 0.005 # Notre limite de convergence
gamma = 0.8 # dépréciation du futur

V = np.zeros((env.observation_space.n)) # initialisation fonction de valeur
Policy = np.zeros((env.observation_space.n)) # initialisation d'une politique

loopCounter=0
while True:
    # Policy evaluation -----------------------------------
    while True:
        delta = 0
        loopCounter+=1
        for s in range(env.observation_space.n):
            v = V[s]
            action = Policy[s]
            q=0
            for destination in env.P[s][action]:
                probabilite=destination[0]
                s_prime=destination[1]
                recompense=destination[2]
                q+=probabilite*(recompense+gamma*V[s_prime])
            V[s]=q
            delta = max(delta,np.abs(v-V[s]))

        if delta < theta:
            break;

    # Policy improvement --------------------------------
    policy_stable=True
    for s in range(env.observation_space.n):
        old_action=Policy[s]
        Q=[]
        for a in range(env.action_space.n):
            q=0
            for destination in env.P[s][a]:
                probabilite=destination[0]
                s_prime=destination[1]
                recompense=destination[2]
                q+=probabilite*(recompense+gamma*V[s_prime])
            Q.append(q)
 
        new_action=np.argmax(Q)
        if new_action!=old_action:
            policy_stable=False
            Policy[s]=new_action
    if policy_stable==True:
        break

  logger.warn(


In [18]:
loopCounter

13

In [19]:
env_test = gym.make('FrozenLake8x8-v1',is_slippery = False,map_name="4x4", render_mode="human") # ,render_mode="human")

In [20]:
s,_ = env_test.reset()
fin=False
print(env_test.render())
c=0
while not fin:
    action=Policy[s]
    print(action)
    s,r,end,trunc,_=env_test.step(int(action))
    # print(f"Action={action}, {r}")
    fin = end or trunc
    time.sleep(0.5)
    env_test.render()
    c+=1
    if c==20:
        fin=True

None
1.0
1.0
2.0
1.0
2.0
2.0


In [24]:
V.reshape(4,4)

array([[0.32768, 0.4096 , 0.512  , 0.4096 ],
       [0.4096 , 0.     , 0.64   , 0.     ],
       [0.512  , 0.64   , 0.8    , 0.     ],
       [0.     , 0.8    , 1.     , 0.     ]])

In [25]:
Policy.reshape(4,4)

array([[1., 2., 1., 0.],
       [1., 0., 1., 0.],
       [2., 1., 1., 0.],
       [0., 2., 2., 0.]])

(0, {'prob': 1})

# Conclusion
slippery = False , deterministe<br>
slippery = True , stocastique, choix des actions évitant le risque<br>

---
# Itération sur valeurs

![valeur](images/iteration_valeur.png)

In [None]:
theta = 0.005 # Notre limite de convergence
gamma = 0.8 # dépréciation du futur

V = np.zeros((env.observation_space.n)) # initialisation fonction de valeur
Policy = np.zeros((env.observation_space.n)) # initialisation d'une politique

# Update value function -----------------------------------
while True:
        delta = 0
        loopCounter+=1
        # pour chaque etat
            # pour chaque action possible
                # pour toutes les destinations possibles
                    # cumuler la recompense
            # mettre a jour V

            delta = max(delta,np.abs(v-V[s]))
        print(delta)
        if delta < theta:
            break;

            
# Policy  --------------------------------
for s in range(env.observation_space.n):
    Q=[]
    for a in range(env.action_space.n):
        q=0
        for destination in env.P[s][a]:
            probabilite=destination[0]
            s_prime=destination[1]
            recompense=destination[2]
            q+=probabilite*(recompense+gamma*V[s_prime])
        Q.append(q)
    Policy[s]=# votre code glouton 
