In [60]:
import gym
import numpy as np
import operator
from IPython.display import clear_output
from time import sleep
import random
import itertools
import tqdm
tqdm.monitor_interval = 0

In [84]:
def create_random_policy(env):
    policy = {}
    for key in range(0,env.observation_space.n):
        current_end = 0
        p = {}
        for action in range(0,env.action_space.n):
            p[action] = 1/env.action_space.n
            policy[key] = p
    return policy

def create_state_action_dictionary(env,policy):
    Q = {}
    for key in policy.keys():
        Q[key] = {a: 0.0 for a in range(0,env.action_space.n)}
    return Q

def run_game(env,policy,display = True):
    env.reset()
    episode = []
    finished = False

    while not finished:
        s = env.env.s
        if display:
            clear_output(True)
            env.render()
            sleep(1)
        timestep = [s]
        n = random.uniform(0,sum(policy[s].values()))
        top_range = 0
        for prob in policy[s].items():
            top_range +=prob[1]
            if n<top_range:
                action =  prob[0]

                break

        state, reward, finished, info, abc = env.step(action)
        timestep.append(action)
        timestep.append(reward)
        episode.append(timestep)
        if display:
          clear_output(True)
          env.render()
          sleep(1)

    return episode
def test_policy(policy,env):
    wins = 0
    r = 100
    for i in range(r):
        print(f'doing {i}')
        w = run_game(env,policy,display=False)[-1][-1]
        if w == 1:
            wins+=1
    return wins / r

In [85]:
def monte_carlo_e_soft(env,episodes=10000,policy=None,epsilon=0.01):
    if not policy:
        policy = create_random_policy(env)
    Q = create_state_action_dictionary(env,policy)
    returns = {}
    for _ in range(episodes):

        G = 0
        episode = run_game(env = env,policy=policy, display=False)
        for i in reversed(range(0,len(episode))):
            s_t,a_t,r_t = episode[i]
            state_action = (s_t, a_t)
            G += r_t
            if not state_action in [(x[0], x[1]) for x in episode[0:i]]:
                if returns.get(state_action):
                    returns[state_action].append(G)
                else:
                    returns[state_action]=[G]

                Q[s_t][a_t] = sum(returns[state_action])/len(returns[state_action])

                Q_list = list(map(lambda x: x[1], Q[s_t].items()))
                indices = [i for i, x in enumerate(Q_list) if x== max(Q_list)]
                max_Q = random.choice(indices)
                A_star = max_Q
                for a in policy[s_t].items():
                    if a[0]==A_star:
                        policy[s_t][a[0]] = 1
                    else:
                        policy[s_t][a[0]] = 0

    return policy

In [86]:
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", render_mode = 'rgb_array')
policy = monte_carlo_e_soft(env)
a = test_policy(policy,env)

doing 0
doing 1
doing 2
doing 3
doing 4
doing 5
doing 6
doing 7
doing 8
doing 9
doing 10
doing 11
doing 12
doing 13
doing 14
doing 15
doing 16
doing 17
doing 18
doing 19
doing 20
doing 21
doing 22
doing 23
doing 24
doing 25
doing 26
doing 27
doing 28
doing 29
doing 30
doing 31
doing 32
doing 33
doing 34
doing 35
doing 36
doing 37
doing 38
doing 39
doing 40
doing 41
doing 42
doing 43
doing 44
doing 45
doing 46
doing 47
doing 48
doing 49
doing 50
doing 51
doing 52
doing 53
doing 54
doing 55
doing 56
doing 57
doing 58
doing 59
doing 60
doing 61
doing 62
doing 63
doing 64
doing 65
doing 66
doing 67
doing 68
doing 69
doing 70
doing 71
doing 72
doing 73
doing 74
doing 75
doing 76
doing 77
doing 78
doing 79
doing 80
doing 81
doing 82
doing 83
doing 84
doing 85
doing 86
doing 87
doing 88
doing 89
doing 90
doing 91
doing 92
doing 93
doing 94
doing 95
doing 96
doing 97
doing 98
doing 99


In [87]:
print(a)

0.02


In [88]:
print(policy)


{0: {0: 0, 1: 1, 2: 0, 3: 0}, 1: {0: 0, 1: 0, 2: 1, 3: 0}, 2: {0: 1, 1: 0, 2: 0, 3: 0}, 3: {0: 0, 1: 1, 2: 0, 3: 0}, 4: {0: 0, 1: 0, 2: 1, 3: 0}, 5: {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, 6: {0: 0, 1: 0, 2: 1, 3: 0}, 7: {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, 8: {0: 1, 1: 0, 2: 0, 3: 0}, 9: {0: 1, 1: 0, 2: 0, 3: 0}, 10: {0: 0, 1: 1, 2: 0, 3: 0}, 11: {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, 12: {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, 13: {0: 0, 1: 0, 2: 1, 3: 0}, 14: {0: 0, 1: 0, 2: 1, 3: 0}, 15: {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}}


In [42]:
!pip install gym[toy_text]

Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-win_amd64.whl (4.8 MB)
     ---------------------------------------- 4.8/4.8 MB 2.5 MB/s eta 0:00:00
Installing collected packages: pygame
Successfully installed pygame-2.1.0


In [13]:
import gym
import numpy as np
import operator
from IPython.display import clear_output
from time import sleep
import random
import itertools
import tqdm

tqdm.monitor_interval = 0

In [14]:
def create_random_policy(env):
     policy = {}
     for key in range(0, env.observation_space.n):
          current_end = 0
          p = {}
          for action in range(0, env.action_space.n):
               p[action] = 1 / env.action_space.n
          policy[key] = p
     return policy

In [15]:
def create_state_action_dictionary(env, policy):
    Q = {}
    for key in policy.keys():
         Q[key] = {a: 0.0 for a in range(0, env.action_space.n)}
    return Q