In [1]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from gym.envs.registration import register, spec
from gym import envs
from tqdm import tqdm
from datetime import datetime
from json_tricks import dumps, loads
%matplotlib inline

In [2]:
import random
def generate(size=8, p=None):
    """Generate a random square FrozenLake map whose goal is reachable.

    Every tile is drawn independently as frozen ('F') with probability ``p``
    and as a hole ('H') otherwise; the top-left tile is then overwritten with
    the start 'S' and the bottom-right tile with the goal 'G'.  The map is
    redrawn until 'G' can be reached from 'S' through 4-neighbour moves that
    avoid holes.  After each draw ``p`` is multiplied by 1.05 (and capped at
    1 before the next draw), so rejected maps are followed by easier ones.

    Args:
        size: Side length of the map.
        p: Probability that a tile is frozen.  ``None`` or ``0`` selects a
            random value via ``random.uniform(0.1, 1)``.

    Returns:
        A list of ``size`` strings of ``size`` characters each.
    """
    # ``None`` and 0 both mean "choose the frozen-tile probability at random".
    if p is None or p == 0:
        p = random.uniform(0.1, 1)

    def is_valid(arr):
        """Return True if 'G' is reachable from (0, 0) without entering 'H'."""
        # Iterative depth-first search with a persistent visited set: every
        # tile is expanded at most once (linear in the number of tiles), the
        # recursion limit is never an issue, and ``arr`` is left untouched.
        seen = {(0, 0)}
        stack = [(0, 0)]
        while stack:
            r, c = stack.pop()
            if arr[r][c] == 'G':
                return True
            for nr, nc in ((r + 1, c), (r, c + 1), (r - 1, c), (r, c - 1)):
                if not (0 <= nr < size and 0 <= nc < size):
                    continue
                if (nr, nc) in seen or arr[nr][nc] == 'H':
                    continue
                seen.add((nr, nc))
                stack.append((nr, nc))
        return False

    valid = False
    while not valid:
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1 - p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
        # Make the next draw (if one is needed) more likely to be solvable.
        p *= 1.05
    return ["".join(x) for x in res]

In [3]:
def verify(env, Q, num_episodes=10000, max_steps=200):
    """Evaluate the greedy policy of a Q-table and report its mean reward.

    No learning and no exploration happen here: in every step the action
    with the largest value in ``Q[s, :]`` is taken.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.  This
            assumes ``reset()`` returns the state and ``step()`` returns a
            ``(state, reward, done, info)`` 4-tuple, as the code unpacks it.
        Q: 2-D array indexed as ``Q[state, action]``.
        num_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the steps taken in a single episode.

    Returns:
        The total reward summed over all episodes divided by
        ``num_episodes`` (the same value that is printed).
    """
    rList = []
    for _ in tqdm(range(num_episodes)):
        # Reset the environment and get the first observation.
        s = env.reset()
        rAll = 0
        for _step in range(max_steps):
            # Purely greedy action selection from the Q-table.
            a = np.argmax(Q[s, :])
            s, r, d, _info = env.step(a)
            rAll += r
            if d:
                break
        rList.append(rAll)
    score = sum(rList) / num_episodes
    print("Score over time: " + str(score))
    return score

In [4]:
def new_env(env_map, slippery=True, MY_ENV_NAME='FrozenLakeNonskid-v0'):
    """Register a FrozenLake environment for ``env_map`` and instantiate it.

    Any spec already stored under ``MY_ENV_NAME`` is removed first, so the
    function can be called repeatedly with different maps under the same id.

    Args:
        env_map: Map description forwarded to the environment as ``desc``.
        slippery: Forwarded to the environment as ``is_slippery``.
        MY_ENV_NAME: Environment id to (re-)register and then build.

    Returns:
        The object returned by ``gym.make(MY_ENV_NAME)``.
    """
    known_specs = envs.registry.env_specs
    # Forget a previous registration under this id, if there is one.
    known_specs.pop(MY_ENV_NAME, None)

    constructor_kwargs = {'is_slippery': slippery, 'desc': env_map}
    register(
        id=MY_ENV_NAME,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs=constructor_kwargs,
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
    return gym.make(MY_ENV_NAME)

In [5]:
class Experiment(object):
    """One tabular Q-learning run on a single discrete environment.

    The Q-table starts at zero with shape
    ``(env.observation_space.n, env.action_space.n)``.  ``run()`` trains it
    with epsilon-greedy action selection and records the start/end time and
    the mean total reward per episode.

    Args:
        env: Environment with ``reset()``, ``step(action)``,
            ``observation_space.n`` and ``action_space.n`` / ``.sample()``.
            This assumes ``step()`` returns a ``(state, reward, done, info)``
            4-tuple, as the code unpacks it.
        num_episodes: Number of training episodes.
        lr: Learning rate of the Q update.
        epsilon: Probability of taking a random action instead of the
            greedy one.
        gamma: Discount factor applied to the best next-state value.
        max_steps: Upper bound on the steps taken in a single episode.
    """

    def __init__(self, env, num_episodes=10000, lr=0.8, epsilon=0.1,
                 gamma=0.95, max_steps=200):
        self.env = env
        self.Q = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        self.num_episodes = num_episodes
        self.lr = lr
        self.epsilon = epsilon
        self.gamma = gamma
        self.max_steps = max_steps
        self.done = False   # becomes True once run() has completed
        self.score = None   # mean total reward per training episode
        self.start = None   # datetime at which run() started
        self.end = None     # datetime at which run() finished

    def print_score(self):
        """Print the training score, or a hint if run() has not completed."""
        if not self.done:
            print("Run first.")
            return
        print("Score over time: " + str(self.score))

    def run(self):
        """Train the Q-table for ``num_episodes`` episodes (only once).

        A second call after a completed run prints a message and returns
        without touching the Q-table.
        """
        if self.done:
            print("Already done running")
            return

        self.start = datetime.now()
        rList = []
        for _ in tqdm(range(self.num_episodes)):
            # Reset the environment and get the first observation.
            s = self.env.reset()
            rAll = 0
            for _step in range(self.max_steps):
                # Epsilon-greedy: explore with probability epsilon, otherwise
                # exploit the current Q-table.  Epsilon stays constant; the
                # per-episode decay e = 1./((i/50) + 10) is not applied.
                if random.uniform(0, 1) < self.epsilon:
                    a = self.env.action_space.sample()
                else:
                    a = np.argmax(self.Q[s, :])
                s1, r, d, _info = self.env.step(a)
                if d and r != 1:
                    # Extra penalty when an episode ends without a reward of
                    # 1 -- presumably to discourage falling into holes.
                    # NOTE(review): this also fires when the environment ends
                    # the episode for another reason (e.g. a step limit);
                    # confirm that is intended.
                    self.Q[s, a] -= 0.01
                # Standard Q-learning update towards r + gamma * max_a' Q[s1, a'].
                self.Q[s, a] = self.Q[s, a] + self.lr * (
                    r + self.gamma * np.max(self.Q[s1, :]) - self.Q[s, a]
                )
                rAll += r
                s = s1
                if d:
                    break
            rList.append(rAll)
        self.done = True
        self.end = datetime.now()
        self.score = sum(rList) / self.num_episodes

    def dumps(self):
        """Serialise the results with the module-level ``dumps`` function.

        Returns:
            The serialised Q-table, start/end times, score and episode
            count, or ``None`` (after printing a hint) if run() has not
            completed yet.
        """
        if not self.done:
            print("Run first.")
            return

        return dumps({'Q': self.Q, 'start': self.start, 'end': self.end, 'score': self.score, 'num_episodes': self.num_episodes})
        

In [179]:
# Train one agent on a single random 5x5 slippery map; render() shows the map.
env = new_env(generate(size=5), slippery=True)
env.render()
exp = Experiment(env, num_episodes=10000)
exp.run()



  0%|          | 0/10000 [00:00<?, ?it/s][A[A

  2%|▏         | 160/10000 [00:00<00:06, 1577.33it/s][A[A




[41mS[0mFHFF
FHFHF
FFHFF
FHHFH
FFFFG


  4%|▎         | 361/10000 [00:00<00:05, 1793.84it/s][A[A

  6%|▌         | 555/10000 [00:00<00:05, 1841.00it/s][A[A

  7%|▋         | 741/10000 [00:00<00:05, 1844.38it/s][A[A

  9%|▉         | 927/10000 [00:00<00:04, 1846.76it/s][A[A

 11%|█         | 1091/10000 [00:00<00:04, 1811.11it/s][A[A

 13%|█▎        | 1285/10000 [00:00<00:04, 1828.62it/s][A[A

 15%|█▍        | 1482/10000 [00:00<00:04, 1845.34it/s][A[A

 17%|█▋        | 1690/10000 [00:00<00:04, 1870.74it/s][A[A

 19%|█▉        | 1876/10000 [00:01<00:04, 1862.42it/s][A[A

 21%|██        | 2060/10000 [00:01<00:04, 1859.90it/s][A[A

 23%|██▎       | 2278/10000 [00:01<00:04, 1885.33it/s][A[A

 25%|██▍       | 2476/10000 [00:01<00:03, 1891.85it/s][A[A

 27%|██▋       | 2671/10000 [00:01<00:03, 1895.12it/s][A[A

 29%|██▉       | 2895/10000 [00:01<00:03, 1917.90it/s][A[A

[A[A

KeyboardInterrupt: 

In [93]:
# Display the Q-table currently held by `exp`.
exp.Q

array([[ 1.13403430e-01,  1.24157253e-01,  1.15170664e-01,
         1.14980244e-01],
       [ 1.32869444e-01,  1.03705228e-01,  1.04004984e-01,
         1.20668472e-01],
       [ 1.02805442e-01,  2.76177404e-02,  2.39866922e-03,
         2.07540066e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 5.75372869e-02, -7.34761724e-05,  3.26733945e-01,
         2.09972176e-01],
       [ 1.07181716e-01,  1.05431539e-01,  1.35149233e-01,
         1.08948056e-01],
       [ 1.28915708e-01,  1.03603595e-01,  1.34644319e-01,
         1.32220850e-01],
       [ 1.51431349e-01,  2.09230496e-01,  1.28551901e-01,
         1.43137583e-01],
       [ 5.07087424e-03,  2.20045751e-01,  2.77831319e-01,
         1.70728835e-01],
       [ 2.55323427e-01,  3.81878538e-01,  3.21695172e-01,
         2.39223105e-01],
       [ 1.21439276e-01,  1.30520828e-01,  1.19365222e-01,
         1.50997860e-01],
       [ 1.00805675e-01,  7.19266533e-02,  9.80214084e-02,
      

In [94]:
# Evaluate the greedy policy of exp.Q on the environment stored in `exp`.
verify(exp.env, exp.Q)

100%|██████████| 10000/10000 [00:04<00:00, 2219.28it/s]

Score over time: 0.3522





In [10]:
# Maps a concatenated map string to the Experiment trained on that map.
experiments = {}

In [11]:
# Train one agent per distinct random 5x5 map until 20 experiments are stored.
while len(experiments) < 20:
    map_str = generate(size=5)
    # The rows joined into one string serve as the dictionary key.
    joined_map_str = "".join(map_str)
    # Skip maps that already have an experiment.
    if joined_map_str in experiments:
        continue
        
    env = new_env(map_str, slippery=True)
    # env.render()
    exp = Experiment(env, num_episodes=10000)
    exp.run()
    experiments[joined_map_str] = exp

100%|██████████| 10000/10000 [00:04<00:00, 2146.10it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1829.96it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2074.35it/s]
100%|██████████| 10000/10000 [00:07<00:00, 1378.86it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2103.69it/s]
100%|██████████| 10000/10000 [00:09<00:00, 1010.82it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1793.54it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1790.02it/s]
100%|██████████| 10000/10000 [00:09<00:00, 1095.81it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3454.02it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1245.92it/s]
100%|██████████| 10000/10000 [00:01<00:00, 5019.58it/s]
100%|██████████| 10000/10000 [00:02<00:00, 4044.23it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2135.58it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2262.01it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3985.36it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1206.96it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2380.

In [114]:
import sys
# List (index, training score) for every stored experiment.
# NOTE(review): the saved output below has more than 20 entries although the
# collecting loop stops at 20 -- presumably stale or out-of-order kernel
# state; confirm with Restart Kernel -> Run All.
[(i, exp.score) for (i, exp) in enumerate(experiments.values())]

[(0, 0.0061),
 (1, 0.0656),
 (2, 0.0102),
 (3, 0.083),
 (4, 0.91),
 (5, 0.0534),
 (6, 0.421),
 (7, 0.0402),
 (8, 0.03),
 (9, 0.0053),
 (10, 0.4673),
 (11, 0.1148),
 (12, 0.0842),
 (13, 0.4291),
 (14, 0.0277),
 (15, 0.2895),
 (16, 0.0168),
 (17, 0.0713),
 (18, 0.0243),
 (19, 0.045),
 (20, 0.0466),
 (21, 0.2706),
 (22, 0.6676),
 (23, 0.4577),
 (24, 0.3788),
 (25, 0.0013),
 (26, 0.3242),
 (27, 0.028),
 (28, 0.0266),
 (29, 0.0851),
 (30, 0.0306),
 (31, 0.0041),
 (32, 0.0128),
 (33, 0.0211),
 (34, 0.319),
 (35, 0.632),
 (36, 0.0832),
 (37, 0.1791),
 (38, 0.0231),
 (39, 0.0106),
 (40, 0.0427),
 (41, 0.1555),
 (42, 0.1205),
 (43, 0.0294),
 (44, 0.4255),
 (45, 0.0021),
 (46, 0.0614),
 (47, 0.6085),
 (48, 0.6736),
 (49, 0.0163),
 (50, 0.1488),
 (51, 0.0043),
 (52, 0.5059),
 (53, 0.2764),
 (54, 0.4988),
 (55, 0.2611),
 (56, 0.0161),
 (57, 0.0152),
 (58, 0.0321),
 (59, 0.0192),
 (60, 0.1253),
 (61, 0.2066),
 (62, 0.1137),
 (63, 0.0021),
 (64, 0.7011),
 (65, 0.0666),
 (66, 0.6482),
 (67, 0.3386),


In [None]:
# Collect the Q-table of every stored experiment.
[experiment.Q for experiment in experiments.values()]

In [13]:
# Rebind `experiments` to an empty set: from here on it holds map strings
# (added by the sampling cell below), not Experiment objects.
experiments = set()

In [75]:
%%time
# Draw 50000*80 random 5x5 maps with p=0.5; the set keeps only distinct ones.
# Disabled alternative stopping rule: while len(experiments) < 4000:
for i in range(50000*80):
    map_str = generate(size=5,p=0.5)
    joined_map_str = "".join(map_str)
    experiments.add(joined_map_str)

CPU times: user 24min 57s, sys: 6.11 s, total: 25min 3s
Wall time: 25min 10s


In [86]:
# Number of distinct map strings collected so far.
print(len(experiments))
# experiments

1120610


1120610

In [87]:
# Write the collected map strings to dump.txt, one per line.  The context
# manager closes the file even if the write raises.
with open('dump.txt', 'w') as f:
    f.write("\n".join(experiments))

In [6]:
# Start again from an empty set before sampling the 4x4 maps below.
experiments = set()

In [18]:
%%time
# Draw 10000 random 4x4 maps with p=0.2; the set keeps only distinct ones.
# Disabled alternative stopping rule: while len(experiments) < 3828:
for i in range(10000):
    map_str = generate(size=4,p=0.2)
    joined_map_str = "".join(map_str)
    experiments.add(joined_map_str)

CPU times: user 7.59 s, sys: 85 ms, total: 7.67 s
Wall time: 7.76 s


In [19]:
# Number of distinct 4x4 map strings collected so far.
len(experiments)

3828