# Tp2

In [42]:
import gym
import gridworld

In [43]:
env = gym.make("gridworld-v0")

In [59]:
env.setPlan("gridworldPlans\\plan1.txt",{0:-0.001,3:1,4:1,5:-1,6:-1})

In [60]:
states, P = env.getMDP()

# Value iteration

In [70]:
def val_iter(env = env, eps=.0001, gamma = .9, max_iter = 500):
    states, P = env.getMDP()
    ps = [p for p in P]
    v1, v0 = np.zeros(len(states)), np.ones(len(states))
    t = 0
    pol = [0]*len(states)
    while np.sum(np.abs(v1-v0)) > eps and t < max_iter:
        v0=v1.copy()
        t+=1
        for i in ps:
            best_reward = -2
            for act in range(env.nA):
                reward = 0
                for state_probability, next_state, next_reward, terminated  in P[i][act]:
                    reward = reward + state_probability*(next_reward + gamma*v0[next_state])
                if reward > best_reward:
                    best_reward=reward
                    pol[i] = act
                
            v1[i] = best_reward
            
    print("Number of iterations:", t)
    return v1, pol


epsilon = [0.001, 0.01, 0.1]
gamma = [1 - e for e in epsilon]
    
for e in epsilon:
    for g in gamma:
        print("Epsilon:", e, "Gamma:", g)
        v1, pol = val_iter(eps=e, gamma=g)
        print('')

Epsilon: 0.001 Gamma: 0.999
Number of iterations: 118

Epsilon: 0.001 Gamma: 0.99
Number of iterations: 40

Epsilon: 0.001 Gamma: 0.9
Number of iterations: 28

Epsilon: 0.01 Gamma: 0.999
Number of iterations: 66

Epsilon: 0.01 Gamma: 0.99
Number of iterations: 35

Epsilon: 0.01 Gamma: 0.9
Number of iterations: 23

Epsilon: 0.1 Gamma: 0.999
Number of iterations: 31

Epsilon: 0.1 Gamma: 0.99
Number of iterations: 29

Epsilon: 0.1 Gamma: 0.9
Number of iterations: 17



# Police evaluation

In [88]:
env = gym.make("gridworld-v0")
env.setPlan("gridworldPlans\\plan0.txt",{0:-0.001,3:1,4:1,5:-1,6:-1})
states, P = env.getMDP()


def pol_eval(pol, env, max_iter=500, eps=.01, gamma = .99):
    states, P = env.getMDP()
    ps = [p for p in P]
    v1, v0 = np.zeros(len(states)), np.ones(len(states))
    t = 0
    while np.sum(np.abs(v1-v0)) > eps and t < max_iter:
        v0=v1.copy()
        t+=1
        for i in ps:
            r=0
            for action, action_probability in enumerate(pol[i]):
                for state_probability, next_state, reward, terminated in P[i][action]:
                     r+= action_probability * state_probability * (reward + gamma * v0[next_state])
            v1[i] = r
                       
            
    return v1



def one_step_lookahead(env, state, V, gamma = .9):
    P = env.P
    act_val = np.zeros(env.nA)
    for act in range(env.nA):
        r=0
        for state_probability, next_state, reward, terminated in P[state][act]:
            r+= state_probability * (reward + gamma * V[next_state])
        act_val[act] = r
            
    return act_val


def pol_iter(env = env, eps=.01, gamma = .9, max_iter=500):
    states, P = env.getMDP()
    ps = [p for p in P]
    pol0, pol1 = np.zeros([env.nS, env.nA]) / env.nA, np.ones([env.nS, env.nA]) / env.nA
    # Initialize counter of evaluated policies
    t = 0
    while (pol0 != pol1).any() and t < max_iter:
        pol0 = pol1.copy()
        V = pol_eval(pol1, env, max_iter, eps, gamma)
        
        for i in ps:
            current_action = np.argmax(pol1[i])
            action_value = one_step_lookahead(env, i, V, gamma)
            best_action = np.argmax(action_value)
            #print(action_value)
            if current_action != best_action:
                pol1[i] = np.eye(env.nA)[best_action]
        t += 1
        
    print("Number of iterations:", t)
    
    return V, np.argmax(pol1, axis=1)

#V, pol = pol_iter()
epsilon = [0.001, 0.01, 0.1]
gamma = [1 - e for e in epsilon]
#epsilon = [0.001, 0.01]
for e in epsilon:
    for g in gamma:
        print("Epsilon:", e, "Gamma:", g)
        v1, pol = pol_iter(eps=e, gamma=g)
        print('')
#V, pol

Epsilon: 0.001 Gamma: 0.999
Number of iterations: 4

Epsilon: 0.001 Gamma: 0.99
Number of iterations: 3

Epsilon: 0.001 Gamma: 0.9
Number of iterations: 3

Epsilon: 0.01 Gamma: 0.999
Number of iterations: 4

Epsilon: 0.01 Gamma: 0.99
Number of iterations: 3

Epsilon: 0.01 Gamma: 0.9
Number of iterations: 3

Epsilon: 0.1 Gamma: 0.999
Number of iterations: 500

Epsilon: 0.1 Gamma: 0.99
Number of iterations: 500

Epsilon: 0.1 Gamma: 0.9
Number of iterations: 3



In [82]:
env = gym.make("gridworld-v0")


env.setPlan("gridworldPlans/plan0.txt", {0: -0.001, 3: 1, 4: 1, 5: -1, 6: -1})

env.seed(0)  # Initialise le seed du pseudo-random
env.render(mode="human") #visualisation sur la console
states, mdp = env.getMDP()  # recupere le mdp et la liste d'etats
print("Nombre d'etats : ",len(states))
print("done")
state, transitions = list(mdp.items())[0]

episode_count = 1000
reward = 0
done = False
rsum = 0

re = []
actions = []
epsilon = [0.001, 0.01, 0.1]
gamma = [1 - e for e in epsilon]

for e in epsilon:
    for g in gamma:
        re = []
        actions = []
        
        print("Epsilon:", e, "Gamma:", g)
        v1, pol = val_iter(env=env, eps=e, gamma=g)
        #v1, pol = pol_iter(env=env, eps=e, gamma=g)
        print('')

        for i in range(episode_count):
            obs = env.reset()
            env.verbose = 0
            if env.verbose:
                env.render()

            j = 0
            rsum = 0
            while True:
                action = pol[env.getStateFromObs(obs)]

                obs, reward, done, _ = env.step(action)
                rsum += reward
                j += 1
                if env.verbose:
                    env.render()
                if done:
                    #print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " + str(j) + " actions")
                    re.append(rsum)
                    actions.append(j)
                    env.reset()
                    break

        print("Average reward:", np.mean(re))
        print("Average nb actions:", np.mean(actions))
        print('')
        env.close();


[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m
[40m [0m[47m [0m[47m [0m[47m [0m[42m [0m[40m [0m
[40m [0m[47m [0m[40m [0m[47m [0m[41m [0m[40m [0m
[40m [0m[47m [0m[47m [0m[47m [0m[44m [0m[40m [0m
[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m
[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m
Nombre d'etats :  11
done
Epsilon: 0.001 Gamma: 0.999
Number of iterations: 51

Average reward: 0.980483
Average nb actions: 20.517

Epsilon: 0.001 Gamma: 0.99
Number of iterations: 46

Average reward: 0.980839
Average nb actions: 20.161

Epsilon: 0.001 Gamma: 0.9
Number of iterations: 16

Average reward: 0.542124
Average nb actions: 4.876

Epsilon: 0.01 Gamma: 0.999
Number of iterations: 31

Average reward: 0.981018
Average nb actions: 19.982

Epsilon: 0.01 Gamma: 0.99
Number of iterations: 27

Average reward: 0.980223
Average nb actions: 20.777

Epsilon: 0.01 Gamma: 0.9
Number of iterations: 12

Average reward: 0.566209
Ave

In [90]:
env = gym.make("gridworld-v0")


env.setPlan("gridworldPlans/plan2.txt", {0: -0.001, 3: 1, 4: 1, 5: -1, 6: -1})

env.seed(0)  # Initialise le seed du pseudo-random
env.render(mode="human") #visualisation sur la console
states, mdp = env.getMDP()  # recupere le mdp et la liste d'etats
state, transitions = list(mdp.items())[0]

episode_count = 1000
reward = 0
done = False
rsum = 0

re = []
actions = []
epsilon = 0.001
gamma = 0.999



for plan in range(7,8):
    for methode in ['val', 'pol']:
        if plan != 9:
            jeu = 'gridworldPlans/plan' + str(plan) + '.txt'
            env.setPlan(jeu, {0: -0.001, 3: 1, 4: 1, 5: -1, 6: -1})
            re = []
            actions = []
            #print("Epsilon:", e, "Gamma:", g)
            print("Jeu:", jeu)
            print("Methode:", methode)
            if methode == 'val':
                v1, pol = val_iter(env=env, eps=epsilon, gamma=gamma)
            else:
                v1, pol = pol_iter(env=env, eps=epsilon, gamma=gamma)

            for i in range(episode_count):
                obs = env.reset()
                env.verbose = 0
                if env.verbose:
                    env.render()

                j = 0
                rsum = 0
                while True:
                    action = pol[env.getStateFromObs(obs)]

                    obs, reward, done, _ = env.step(action)
                    rsum += reward
                    j += 1
                    if env.verbose:
                        env.render()
                    if done:
                        #print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " + str(j) + " actions")
                        re.append(rsum)
                        actions.append(j)
                        env.reset()
                        break

            print("Average reward:", np.mean(re))
            print("Average nb actions:", np.mean(actions))
            print('')
            env.close();


[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m
[40m [0m[47m [0m[47m [0m[47m [0m[40m [0m[47m [0m[47m [0m[40m [0m
[40m [0m[47m [0m[47m [0m[47m [0m[40m [0m[47m [0m[47m [0m[40m [0m
[40m [0m[47m [0m[44m [0m[47m [0m[47m [0m[47m [0m[47m [0m[40m [0m
[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[47m [0m[40m [0m
[40m [0m[47m [0m[47m [0m[47m [0m[40m [0m[47m [0m[47m [0m[40m [0m
[40m [0m[47m [0m[46m [0m[47m [0m[47m [0m[47m [0m[42m [0m[40m [0m
[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m[40m [0m
Jeu: gridworldPlans/plan7.txt
Methode: val
Number of iterations: 298
Average reward: 1.2855890000000736
Average nb actions: 696.721

Jeu: gridworldPlans/plan7.txt
Methode: pol
Number of iterations: 10
Average reward: 0.7960590000000698
Average nb actions: 647.79

