In [51]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from gym.envs.registration import register, spec
from gym import envs
from tqdm import tqdm
from datetime import datetime
from json_tricks import dumps, loads
import socket
from bitarray import bitarray
%matplotlib inline

In [57]:
import random
def generate(size=8, p=None):
    """Generate a random, solvable square FrozenLake map.

    Every tile is drawn independently as frozen ('F') with probability ``p``
    and as a hole ('H') otherwise; the top-left tile is then overwritten with
    'S' (start) and the bottom-right tile with 'G' (goal). Draws are repeated
    until the goal can be reached from the start by up/down/left/right moves
    that never enter a hole. After every rejected draw ``p`` is raised by 5%
    (and clamped to 1), so the loop eventually produces a hole-free map.

    Args:
        size: Side length of the square map.
        p: Probability that a tile is frozen. ``None`` or ``0`` selects a
            value uniformly at random from [0.1, 1].

    Returns:
        A list of ``size`` strings, each ``size`` characters long.
    """
    if p is None or p == 0:
        p = random.uniform(0.1, 1)

    def is_valid(arr, r=0, c=0):
        """Return True if a 'G' tile is reachable from (r, c) avoiding 'H'."""
        # Iterative depth-first search with a visited set. Each tile is
        # expanded at most once, so the check is linear in the number of
        # tiles and cannot hit the recursion limit. The grid is not mutated.
        seen = {(r, c)}
        stack = [(r, c)]
        while stack:
            row, col = stack.pop()
            if arr[row][col] == 'G':
                return True
            neighbours = (
                (row + 1, col),
                (row, col + 1),
                (row - 1, col),
                (row, col - 1),
            )
            for nr, nc in neighbours:
                if not (0 <= nr < size and 0 <= nc < size):
                    continue
                if (nr, nc) in seen or arr[nr][nc] == 'H':
                    continue
                seen.add((nr, nc))
                stack.append((nr, nc))
        return False

    valid = False
    while not valid:
        # p grows by 5% per rejected draw; clamp so [p, 1 - p] stays a valid
        # probability vector.
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1 - p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
        p *= 1.05
    return ["".join(x) for x in res]

In [77]:
def verify(env, Q, num_episodes = 10000, max_steps=200):
    """Evaluate a Q-table by acting greedily (no exploration) in ``env``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` where
            ``step`` returns ``(state, reward, done, info)``.
        Q: Array indexed as ``Q[state, :]`` giving one value per action.
        num_episodes: Number of evaluation episodes to play.
        max_steps: Upper bound on the number of steps taken per episode.

    Returns:
        ``(valid_score, avg_steps)``: the mean total reward per episode, and
        the mean number of steps over episodes that ended with a positive
        reward. ``avg_steps`` is 0 when no episode succeeded, and both values
        are 0 when ``num_episodes`` is 0.
    """
    print("Running validation...")
    rList = []
    successes = 0
    jTot = 0
    for _episode in tqdm(range(num_episodes)):
        # Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        for j in range(1, max_steps + 1):
            # Purely greedy action choice from the Q-table
            a = np.argmax(Q[s, :])
            s, r, d, _info = env.step(a)
            rAll += r
            if d:
                # Only episodes that finish with a positive reward count
                # towards the success statistics.
                if r > 0:
                    jTot += j
                    successes += 1
                break
        rList.append(rAll)
    # Guard the divisions explicitly instead of swallowing every exception.
    valid_score = sum(rList) / num_episodes if num_episodes else 0
    print("Score over time: " + str(valid_score))
    avg_steps = jTot / successes if successes else 0
    return valid_score, avg_steps

In [59]:
def new_env(env_map, slippery=True, MY_ENV_NAME='FrozenLakeNonskid-v0'):
    """Register a FrozenLake environment for ``env_map`` and instantiate it.

    Any existing registration under ``MY_ENV_NAME`` is removed first so the
    same id can be registered again with a different map.

    Args:
        env_map: Map description passed to the environment as ``desc``.
        slippery: Value passed to the environment as ``is_slippery``.
        MY_ENV_NAME: Id under which the environment is registered and made.

    Returns:
        The environment created by ``gym.make(MY_ENV_NAME)``.
    """
    known_specs = envs.registry.env_specs
    if MY_ENV_NAME in known_specs:
        known_specs.pop(MY_ENV_NAME)

    lake_kwargs = {'is_slippery': slippery, 'desc': env_map}
    register(
        id=MY_ENV_NAME,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs=lake_kwargs,
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
    return gym.make(MY_ENV_NAME)

In [103]:
class Experiment(object):
    """One tabular Q-learning run on a single discrete environment.

    The instance owns the Q-table and the bookkeeping for one training run
    (timestamps, training score, successful episodes) plus the results of an
    optional greedy validation pass.
    """

    def __init__(self, env, num_episodes=10000):
        """Create an untrained experiment.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n``, ``action_space.sample()``, ``reset()``
                and ``step(action)``.
            num_episodes: Number of training episodes ``run`` will play.
        """
        self.env = env
        # One row per state, one column per action, all values start at 0.
        self.Q = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        self.machine = socket.gethostname()
        self.num_episodes = num_episodes
        self.done = False
        self.score = None
        self.start = None
        self.end = None
        # (episode index, step count) for every episode that ended with r > 0
        self.train_successes = []
        self.valid_score = None
        self.valid_avg_steps = None

    def print_score(self):
        """Print the training score, or a hint if ``run`` has not been called."""
        if not self.done:
            print("Run first.")
            return
        print("Score over time: " +  str(self.score))

    def run(self):
        """Train the Q-table with epsilon-greedy Q-learning.

        Does nothing (apart from printing a message) when the experiment has
        already been run. On completion sets ``done``, ``start``, ``end`` and
        ``score`` (mean total reward per training episode).
        """
        print("Running experiment...")
        if self.done:
            print("Already done running")
            return

        self.start = datetime.now()
        lr = .8   # learning rate
        e = 0.1   # probability of taking a random action (kept constant)
        y = .95   # discount factor
        rList = []
        for i in tqdm(range(self.num_episodes)):
            # Reset environment and get first new observation
            s = self.env.reset()
            rAll = 0
            # At most 200 steps per episode
            for j in range(1, 201):
                # Epsilon-greedy action selection
                if random.uniform(0, 1) < e:
                    a = self.env.action_space.sample()
                else:
                    a = np.argmax(self.Q[s, :])
                # Get new state and reward from environment
                s1, r, d, _ = self.env.step(a)
                if d and r != 1:
                    # Small extra penalty for ending an episode without
                    # the full reward.
                    self.Q[s, a] -= 0.01
                # Update Q-Table with new knowledge
                self.Q[s, a] = self.Q[s, a] + lr * (r + y * np.max(self.Q[s1, :]) - self.Q[s, a])
                rAll += r
                s = s1
                if d:
                    if r > 0:
                        self.train_successes.append((i, j))
                    break
            rList.append(rAll)
        self.done = True
        self.end = datetime.now()
        # Avoid a ZeroDivisionError when no episodes were requested.
        self.score = sum(rList) / self.num_episodes if self.num_episodes else 0

    def validate(self):
        """Run ``verify`` on the current Q-table and store its two results."""
        valid_score, avg_steps = verify(self.env, self.Q)
        self.valid_score = valid_score
        self.valid_avg_steps = avg_steps

    def dumps(self):
        """Serialize this experiment's results with the module-level ``dumps``.

        Returns:
            The serialized payload, or ``None`` (after printing a hint) when
            ``run`` has not been called yet.
        """
        if not self.done:
            print("Run first.")
            return

        return dumps({'Q': self.Q,
                      'start': self.start,
                      'end': self.end,
                      'train_score': self.score,
                      'num_episodes': self.num_episodes,
                      # Must come from this instance, not from a notebook
                      # global that happens to be called ``exp``.
                      'train_successes': self.train_successes,
                      'train_machine' : self.machine,
                      'valid_score' : self.valid_score,
                      'valid_avg_steps' : self.valid_avg_steps
                      })

In [104]:
# Draw one random 4x4 map with frozen-tile probability 0.5 and show its rows.
em = generate(size=4, p=0.5)
print(em)

['SFHF', 'FFHF', 'FHHH', 'FFFG']


In [105]:
# Build a slippery environment from the map above, render its initial state,
# then train a Q-table on it for 10000 episodes.
env = new_env(em, slippery=True)
env.render()
exp = Experiment(env, num_episodes=10000)
exp.run()

  2%|▏         | 155/10000 [00:00<00:06, 1548.36it/s]


[41mS[0mFHF
FFHF
FHHH
FFFG
Running experiment...


100%|██████████| 10000/10000 [00:04<00:00, 2008.82it/s]


In [106]:
# Inspect the learned Q-table (rows = states, columns = actions) and the
# mean total reward per training episode.
print(exp.Q)
print("Score", exp.score)

[[ 0.16930421  0.13634573  0.13408723  0.13719187]
 [ 0.12964985  0.03009128  0.10746029  0.02474984]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.26420083  0.06384523  0.16738046  0.04252948]
 [-0.00227414 -0.00237161  0.00329473  0.02497257]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.2976811   0.12560493  0.05770543  0.01408678]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.33679097  0.57237579  0.34895331  0.3466228 ]
 [ 0.01604722  0.85303694  0.01749893  0.10499536]
 [ 0.15398178  0.99002903  0.02835183  0.19175539]
 [ 0.          0.          0.          0.        ]]
Score 0.2549


In [107]:
# Evaluate the learned Q-table greedily; verify() is called with its default
# of 10000 episodes and the results are stored on exp.
exp.validate()

  2%|▏         | 189/10000 [00:00<00:05, 1887.69it/s]

Running validation...


100%|██████████| 10000/10000 [00:04<00:00, 2304.71it/s]

Score over time: 0.9883





In [108]:
# Serialize the run (Q-table, timestamps, scores) with json_tricks.dumps.
exp.dumps()

'{"Q": {"__ndarray__": [[0.1693042110874361, 0.13634572531080155, 0.1340872322898653, 0.13719187475650646], [0.12964985425546977, 0.030091281886784567, 0.10746028965260682, 0.02474983704781368], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.26420083069591616, 0.06384522961650005, 0.16738045550376857, 0.042529482963132526], [-0.002274144553489118, -0.00237160787820849, 0.0032947264301400134, 0.024972572173639226], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.29768110408754883, 0.1256049263636717, 0.05770543379965945, 0.014086776213174793], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.3367909658553223, 0.5723757875834844, 0.34895331074969815, 0.3466227960913822], [0.016047219430365925, 0.8530369415422374, 0.017498933053627166, 0.10499536227653422], [0.1539817753879006, 0.9900290341426957, 0.028351833059896456, 0.19175538690784977], [0.0, 0.0, 0.0, 0.0]], "dtype": "float64", "shape": [16, 4], "Corder": true}, "start": {"__datetime__": null, "year": 2018, "month"

In [79]:
# Maps a joined map string to the Experiment trained on that map.
experiments = {}

In [11]:
# Train one Experiment per distinct random 5x5 map until 20 are collected.
# No p is passed, so generate() picks the frozen-tile probability at random.
while len(experiments) < 20:
    map_str = generate(size=5)
    # The rows joined into one string serve as the dictionary key.
    joined_map_str = "".join(map_str)
    if joined_map_str in experiments:
        # Map already trained on; draw another one.
        continue
        
    env = new_env(map_str, slippery=True)
    # env.render()
    exp = Experiment(env, num_episodes=10000)
    exp.run()
    experiments[joined_map_str] = exp

100%|██████████| 10000/10000 [00:04<00:00, 2146.10it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1829.96it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2074.35it/s]
100%|██████████| 10000/10000 [00:07<00:00, 1378.86it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2103.69it/s]
100%|██████████| 10000/10000 [00:09<00:00, 1010.82it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1793.54it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1790.02it/s]
100%|██████████| 10000/10000 [00:09<00:00, 1095.81it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3454.02it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1245.92it/s]
100%|██████████| 10000/10000 [00:01<00:00, 5019.58it/s]
100%|██████████| 10000/10000 [00:02<00:00, 4044.23it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2135.58it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2262.01it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3985.36it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1206.96it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2380.

In [114]:
# NOTE(review): sys is not used by any cell visible here — confirm before
# removing the import.
import sys
# Training score of every stored experiment, numbered in iteration order.
[(i, exp.score) for (i, exp) in enumerate(experiments.values())]

[(0, 0.0061),
 (1, 0.0656),
 (2, 0.0102),
 (3, 0.083),
 (4, 0.91),
 (5, 0.0534),
 (6, 0.421),
 (7, 0.0402),
 (8, 0.03),
 (9, 0.0053),
 (10, 0.4673),
 (11, 0.1148),
 (12, 0.0842),
 (13, 0.4291),
 (14, 0.0277),
 (15, 0.2895),
 (16, 0.0168),
 (17, 0.0713),
 (18, 0.0243),
 (19, 0.045),
 (20, 0.0466),
 (21, 0.2706),
 (22, 0.6676),
 (23, 0.4577),
 (24, 0.3788),
 (25, 0.0013),
 (26, 0.3242),
 (27, 0.028),
 (28, 0.0266),
 (29, 0.0851),
 (30, 0.0306),
 (31, 0.0041),
 (32, 0.0128),
 (33, 0.0211),
 (34, 0.319),
 (35, 0.632),
 (36, 0.0832),
 (37, 0.1791),
 (38, 0.0231),
 (39, 0.0106),
 (40, 0.0427),
 (41, 0.1555),
 (42, 0.1205),
 (43, 0.0294),
 (44, 0.4255),
 (45, 0.0021),
 (46, 0.0614),
 (47, 0.6085),
 (48, 0.6736),
 (49, 0.0163),
 (50, 0.1488),
 (51, 0.0043),
 (52, 0.5059),
 (53, 0.2764),
 (54, 0.4988),
 (55, 0.2611),
 (56, 0.0161),
 (57, 0.0152),
 (58, 0.0321),
 (59, 0.0192),
 (60, 0.1253),
 (61, 0.2066),
 (62, 0.1137),
 (63, 0.0021),
 (64, 0.7011),
 (65, 0.0666),
 (66, 0.6482),
 (67, 0.3386),


In [None]:
# Q-table of every stored experiment, in iteration order.
[trained.Q for trained in experiments.values()]

In [13]:
# Rebinds experiments to an empty set; the cells below only collect unique
# joined map strings in it (the earlier cells used a dict under this name).
experiments = set()

In [75]:
%%time
# Sample 50000*80 = 4,000,000 random 5x5 maps with p=0.5 and keep the
# distinct ones; the set drops duplicates.
# while len(experiments) < 4000:
for i in range(50000*80):
    map_str = generate(size=5,p=0.5)
    joined_map_str = "".join(map_str)
    experiments.add(joined_map_str)

CPU times: user 24min 57s, sys: 6.11 s, total: 25min 3s
Wall time: 25min 10s


In [86]:
# Number of distinct maps collected so far.
print(len(experiments))
# experiments

1120610


1120610

In [87]:
# Persist the distinct maps, one joined map string per line.
# The context manager closes the file even if the write raises.
with open('dump.txt','w') as f:
    f.write("\n".join(experiments))

In [6]:
# Start a fresh, empty collection of distinct map strings.
experiments = set()

In [18]:
%%time
# Sample 10000 random 4x4 maps with p=0.2 and keep the distinct ones.
# while len(experiments) < 3828:
for i in range(10000):
    map_str = generate(size=4,p=0.2)
    joined_map_str = "".join(map_str)
    experiments.add(joined_map_str)

CPU times: user 7.59 s, sys: 85 ms, total: 7.67 s
Wall time: 7.76 s


In [1]:
# Number of distinct maps collected. The recorded output is a NameError, so
# this cell was last executed in a kernel where experiments had not been
# defined; run the cells that create and fill the set first.
len(experiments)

NameError: name 'experiments' is not defined