In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.backend import categorical_crossentropy
from ludus.policies import BaseTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari, reshape_train_var
from ludus.memory import MemoryBuffer
import gym
import cv2
import copy
import threading
import time
# Super Mario stuff
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [2]:
def make_env():
    """Build a fresh Super Mario Bros environment.

    The base 'SuperMarioBros-v0' environment is wrapped so that its action
    space is the discrete SIMPLE_MOVEMENT action set.

    Returns:
        The wrapped environment.
    """
    return BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make('SuperMarioBros-v0'),
        SIMPLE_MOVEMENT,
    )

In [3]:
class CellMemory():
    """Archive of exploration "cells", keyed by a filtered view of the state.

    Every state is first passed through ``state_filter``; the raw bytes of the
    filtered array are used as the dictionary key, so two observations that
    filter to the same array share one cell. For each key the archive keeps
    the best known way of reaching it (highest total reward, ties broken by
    fewer steps).

    Attributes:
        n_cells: number of distinct cells currently stored.
        cells: dict mapping state key (bytes) -> cell dict.
        filter: callable mapping a raw state to a numpy array.
    """

    def __init__(self, state_filter):
        self.n_cells = 0
        self.cells = {}
        self.filter = state_filter

    def _filter_and_key(self, state):
        """Return ``(filtered_state, dict_key)`` for a raw state."""
        nstate = self.filter(state)
        return nstate, nstate.data.tobytes()

    def new_cell(self, state, act_seq=None, total_reward=0, total_steps=0, **attribs):
        """Build (but do not store) a cell dict.

        Args:
            state: the (already filtered) state stored in the cell.
            act_seq: actions that lead to this state; copied into a new list.
                ``None`` means an empty sequence.
            total_reward: reward accumulated along ``act_seq``.
            total_steps: number of steps taken to reach the state.
            **attribs: extra key/value pairs merged into the cell (they may
                override the standard keys).

        Returns:
            The new cell dict, with its 'access' counter starting at 1.
        """
        # A fresh list per cell: a mutable default argument would be shared
        # between every cell created without an explicit act_seq.
        cell = {'state': state,
                'act_seq': [] if act_seq is None else list(act_seq),
                'total_reward': total_reward,
                'total_steps': total_steps,
                'access': 1}
        cell.update(attribs)
        return cell

    def add_cell(self, state, act_seq=None, total_reward=0, total_steps=0, **attribs):
        """Insert a cell for ``state``, or replace the stored one if this is better.

        An existing cell is replaced only when the new trajectory has a
        strictly higher total reward, or the same reward in strictly fewer
        steps. Replacing a cell resets its 'access' counter to 1.
        """
        nstate, state_key = self._filter_and_key(state)
        existing = self.cells.get(state_key)
        if existing is None:
            self.n_cells += 1
        else:
            higher_reward = total_reward > existing['total_reward']
            same_reward_fewer_steps = (total_reward == existing['total_reward']
                                       and total_steps < existing['total_steps'])
            if not (higher_reward or same_reward_fewer_steps):
                return
        self.cells[state_key] = self.new_cell(nstate, act_seq, total_reward,
                                              total_steps, **attribs)

    def get_cell(self, state):
        """Return the cell matching ``state`` (bumping its access count), or None."""
        _, state_key = self._filter_and_key(state)
        cell = self.cells.get(state_key)
        if cell is None:
            return None
        cell['access'] += 1
        return cell

    def get_random_cell(self):
        """Return a uniformly random cell (bumping its access count), or None if empty."""
        if not self.cells:
            return None
        keys = list(self.cells.keys())
        # Sample an index rather than the key itself: np.random.choice would
        # turn the bytes keys into a fixed-width 'S' array, which drops
        # trailing NUL bytes and can yield a key that is not in the dict.
        cell = self.cells[keys[np.random.randint(len(keys))]]
        cell['access'] += 1
        return cell

    def heuristic_choose(self):
        """Sample a cell with probability proportional to a heuristic score.

        The score of a cell is
        ``10 * reward + steps + 2 * reward_per_step + access`` where each term
        is normalised over the archive: reward and reward-per-step are divided
        by their maximum (at least 1), while the steps and access terms are
        ``(max - value) / max`` so that shorter and less-visited cells score
        higher.

        Returns:
            The chosen cell (its access count is bumped), or None if the
            archive is empty.
        """
        if not self.cells:
            return None

        cells = list(self.cells.values())
        rewards = np.array([c['total_reward'] for c in cells], dtype=float)
        steps = np.array([c['total_steps'] + 1 for c in cells], dtype=float)
        access = np.array([c['access'] for c in cells], dtype=float)
        rew_step_ratio = rewards / steps

        max_reward = max(1, rewards.max())
        max_steps = steps.max()
        max_rew_step_ratio = max(1, rew_step_ratio.max())
        max_access = access.max()

        heuristics = (10 * rewards / max_reward
                      + (max_steps - steps) / max_steps
                      + 2 * rew_step_ratio / max_rew_step_ratio
                      + (max_access - access) / max_access)

        # Probabilities must be non-negative; negative rewards could otherwise
        # push a score below zero.
        heuristics = np.clip(heuristics, 0, None)
        total = heuristics.sum()
        # With a single zero-reward cell (the usual starting archive) every
        # score is 0; fall back to uniform sampling instead of dividing by 0.
        probs = heuristics / total if total > 0 else None

        chosen = cells[np.random.choice(len(cells), p=probs)]
        # Keep the access statistic meaningful, as the other getters do.
        chosen['access'] += 1
        return chosen

    def get_best_cell(self):
        """Return the cell with the highest total reward, or None if empty.

        On ties the cell inserted first wins.
        """
        if not self.cells:
            return None
        return max(self.cells.values(), key=lambda c: c['total_reward'])

    def clear(self):
        """Remove every cell from the archive."""
        self.cells = {}
        self.n_cells = 0

In [4]:
def state_filter(obs, shape=(8, 12)):
    """Reduce one observation to a small grayscale image scaled by 1/255.

    Args:
        obs: image array indexed as obs[row, col, channel]; only the first
            three channels are used and they are weighted as R, G, B.
        shape: (rows, cols) of the returned array.

    Returns:
        A 2-D float array of shape ``shape``: the resized grayscale image
        divided by 255.
    """
    rows, cols = shape
    gray = np.dot(obs[:, :, :3], [0.299, 0.587, 0.114])
    # cv2.resize expects dsize as (width, height) and returns an array of
    # shape (height, width). Passing (cols, rows) yields `shape` directly;
    # resizing to the transposed size and reshaping afterwards would keep the
    # element count but scramble the pixel layout.
    resized = cv2.resize(gray, dsize=(cols, rows), interpolation=cv2.INTER_LANCZOS4)
    return resized / 255.

In [5]:
# Global cell archive; read and written by worker_func and by the monitoring / replay cells.
archive = CellMemory(state_filter)

In [6]:
def worker_func(lock, stop_signal, wait_signal):
    """Exploration worker: repeatedly restore a cell and explore randomly from it.

    Each episode picks a cell from the global ``archive``, replays its action
    sequence in a private environment to get back to that state, then takes up
    to ``exp_steps`` random actions. States reached with at least 5 more
    reward than the starting cell are added to the archive with probability
    1/6.

    Relies on the module-level names ``archive``, ``n_episodes`` and
    ``exp_steps``.

    Args:
        lock: threading.Lock guarding every access to ``archive``.
        stop_signal: threading.Event; when set, the worker exits.
        wait_signal: threading.Event; while set, the worker pauses between
            episodes.
    """
    env = make_env()
    try:
        obs = env.reset()
        # Other workers may already be iterating over the archive.
        with lock:
            archive.add_cell(obs)

        for episode in range(n_episodes):
            if stop_signal.is_set():
                break

            # Pause while requested, but stay responsive to a stop request.
            while wait_signal.is_set() and not stop_signal.is_set():
                time.sleep(0.01)
            if stop_signal.is_set():
                break

            with lock:
                cell = archive.heuristic_choose()

            # Return to the cell's state by replaying its action sequence.
            obs = env.reset()
            early_terminate = False
            all_acts = []
            for act in cell['act_seq']:
                obs, _, d, _ = env.step(act)
                all_acts.append(act)

                if d:
                    early_terminate = True
                    break

            # The replay should land exactly on the stored state; report when
            # it does not.
            replayed = state_filter(obs)
            if not np.array_equal(replayed, cell['state']):
                print('stochastic')
                print(replayed - cell['state'])

            if early_terminate:
                # The replay ended the episode, so there is nothing to explore
                # from; skip this episode rather than stopping the worker.
                continue

            total_reward = cell['total_reward']
            for step in range(exp_steps):
                new_act = env.action_space.sample()
                obs, r, d, _ = env.step(new_act)

                if d:
                    break

                all_acts.append(new_act)
                total_reward += r
                # Decide outside the lock; only take it when actually storing.
                if total_reward >= cell['total_reward'] + 5 and np.random.randint(0, 6) == 0:
                    with lock:
                        archive.add_cell(obs, act_seq=all_acts[:], total_reward=total_reward,
                                         total_steps=len(all_acts))
    finally:
        # Release the emulator even if the worker stops on an exception.
        env.close()

In [7]:
# Run configuration (read as globals by worker_func and the launch cell).
n_episodes = 50000  # episodes each worker runs before exiting
exp_steps = 100     # maximum random exploration steps per episode
n_threads = 4       # number of worker threads started

In [8]:
# Synchronisation primitives shared with every worker.
lock = threading.Lock()
stop_signal = threading.Event()
wait_signal = threading.Event()

# Start the exploration workers.
threads = []
for i in range(n_threads):
    worker = threading.Thread(target=worker_func, args=(lock, stop_signal, wait_signal,))
    worker.start()
    threads.append(worker)

# Report progress once a minute until every worker has finished.
while len(threads) > 0:
    time.sleep(60)

    # Read both values under the lock so they describe the same archive state.
    with lock:
        n_cells = archive.n_cells
        best_cell = archive.get_best_cell()

    best_reward = best_cell['total_reward'] if best_cell is not None else None
    print('Cell Count: {}, Highest Reward: {}'.format(n_cells, best_reward))

    # Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
    threads = [t for t in threads if t.is_alive()]

  result = entry_point.load(False)


Cell Count: 2700, Highest Reward: 798
Cell Count: 4641, Highest Reward: 798
Cell Count: 6603, Highest Reward: 905
Cell Count: 8064, Highest Reward: 905
Cell Count: 9555, Highest Reward: 953
Cell Count: 10742, Highest Reward: 1030
Cell Count: 12194, Highest Reward: 1082
Cell Count: 13613, Highest Reward: 1117
Cell Count: 14782, Highest Reward: 1117
Cell Count: 16027, Highest Reward: 1117
Cell Count: 17228, Highest Reward: 1117
Cell Count: 18468, Highest Reward: 1117
Cell Count: 19669, Highest Reward: 1254
Cell Count: 20899, Highest Reward: 1254
Cell Count: 22043, Highest Reward: 1254
Cell Count: 23183, Highest Reward: 1254
Cell Count: 24134, Highest Reward: 1271
Cell Count: 25223, Highest Reward: 1413
Cell Count: 26397, Highest Reward: 1413
Cell Count: 27399, Highest Reward: 1413
Cell Count: 28386, Highest Reward: 1413
Cell Count: 29494, Highest Reward: 1413
Cell Count: 30431, Highest Reward: 1413
Cell Count: 31426, Highest Reward: 1413
Cell Count: 32329, Highest Reward: 1413
Cell Count

Cell Count: 127261, Highest Reward: 1783
Cell Count: 127513, Highest Reward: 1783
Cell Count: 127840, Highest Reward: 1783
Cell Count: 128164, Highest Reward: 1783
Cell Count: 128456, Highest Reward: 1783
Cell Count: 128814, Highest Reward: 1783
Cell Count: 129209, Highest Reward: 1783
Cell Count: 129597, Highest Reward: 1783
Cell Count: 129897, Highest Reward: 1783
Cell Count: 130296, Highest Reward: 1783
Cell Count: 130631, Highest Reward: 1783
Cell Count: 130915, Highest Reward: 1783
Cell Count: 131256, Highest Reward: 1783
Cell Count: 131666, Highest Reward: 1783
Cell Count: 131935, Highest Reward: 1783
Cell Count: 132277, Highest Reward: 1783
Cell Count: 132714, Highest Reward: 1783
Cell Count: 133060, Highest Reward: 1783
Cell Count: 133304, Highest Reward: 1783
Cell Count: 133714, Highest Reward: 1783
Cell Count: 134056, Highest Reward: 1783
Cell Count: 134413, Highest Reward: 1783
Cell Count: 134809, Highest Reward: 1783
Cell Count: 135089, Highest Reward: 1783
Cell Count: 1355

KeyboardInterrupt: 

In [9]:
# Ask the workers to stop; each one checks this event at the start of an episode.
stop_signal.set()

In [10]:
# Replay the highest-reward trajectory found so far, rendering every frame.
cell = archive.get_best_cell()
# get_best_cell returns None for an empty archive; then there is nothing to replay.
best_actions = cell['act_seq'] if cell is not None else []

env = make_env()
try:
    obs = env.reset()
    for act in best_actions:
        obs, _, d, _ = env.step(act)
        env.render()
        time.sleep(0.02)
        if d:
            break
finally:
    # Release the emulator and its render window even if the replay is interrupted.
    env.close()

  result = entry_point.load(False)


In [None]:
#### archive.get_best_cell()['total_reward']

In [None]:
# env = make_env()
# obs = env.reset()
# archive.add_cell(obs)

# for episode in range(n_episodes):
#     cell = archive.get_random_cell()
    
#     all_acts = []
#     for act in cell['act_seq']:
#         env.step(act)
#         all_acts.append(act)
    
#     total_reward = cell['total_reward']
#     for step in range(exp_steps):
#         new_act = env.action_space.sample()
#         obs, r, d, _ = env.step(new_act)
        
#         if d:
#             break
        
#         all_acts.append(new_act)
#         total_reward += r
#         if total_reward >= cell['total_reward']:
#             archive.add_cell(obs, act_seq=all_acts[:], total_reward=total_reward)