In [389]:
import numpy as np

In [748]:
# toy example: you left the house and realise you don't have your keys. where are they?
# this is the most naive and inefficient text adventure, on purpose.

#world = [('description', reward, [nested_world])]
class find_the_keys():
    """Tiny text adventure: you are locked out and have to find your keys.

    The world is a tree of ``(description, reward, children)`` tuples, where
    ``children`` is a list of nodes of the same shape. The current position is
    stored as the list of child indices taken from the root
    (``self.previous_actions``); ``description``, ``reward`` and ``actions``
    always mirror the node that path leads to.
    """

    def __init__(self):
        self.world = (
            'look around', 0, [
                ('go left', 0, [
                    ('climb tree', 0, []),
                    ('search floor', 0, [
                        ('lift stone', 0,[]),
                        ('lift leaf', 0,[])
                    ])
                ]),
                ('go straight', 0, [
                    ('enter house', 0, [
                        ('check cupboard', 1, []),
                        ('check wardrobe', 0, [])
                    ])
                ]),
                ('go right', 0, [
                    ('check bike', 0, []), # bike is locked
                    ('check mailbox', 0, [
                        ('open first letter', 0, []),
                        ('open second letter', 0, []),
                        ('open third letter', 0, []),
                    ])
                ])
            ]
        )
        self.reset()

    def _get_state(self):
        """Follow ``self.previous_actions`` from the root.

        Returns:
            The ``(description, reward, children)`` triple of the node reached.
        """
        description, reward, actions = self.world
        for a in self.previous_actions:
            description, reward, actions = actions[a]
        return description, reward, actions

    def _set_state(self, previous_actions):
        """Jump to the node addressed by ``previous_actions`` (child indices)."""
        # Copy so that later changes to the caller's list cannot move us.
        self.previous_actions = list(previous_actions)
        self.description, self.reward, self.actions = self._get_state()

    def reset(self):
        """Return to the root of the world."""
        self._set_state([])

    def get_description(self):
        """Description string of the current node."""
        return self.description

    def get_reward(self):
        """Reward attached to the current node."""
        return self.reward

    def get_actions(self):
        """List the moves available from here as ``(description, reward)`` pairs.

        The children of the current node come first, in world order, followed
        by the always-available ``('go back', 0)``.
        """
        return [(a[0], a[1]) for a in self.actions] + [('go back', 0)]

    def do(self, action):
        """Perform ``action`` (a description string) from the current node.

        ``'go back'`` drops the last step of the path; at the root the path is
        already empty, so the position does not change.

        Returns:
            ``"done"`` if the move was made, ``"impossible"`` if no child of
            the current node has that description.
        """
        if action == 'go back':
            self._set_state(self.previous_actions[:-1])
            return "done"
        # Look at this instance's own children. The original consulted the
        # notebook-global `game`, which broke for any other instance.
        for i, a in enumerate(self.actions):
            if a[0] == action:
                self._set_state(self.previous_actions + [i])
                return "done"
        return "impossible"

In [749]:
# Start a fresh game; the constructor calls reset(), so we begin at the root node.
game = find_the_keys()

In [750]:
# Description of the node we are currently standing on.
game.get_description()

'look around'

In [751]:
# Reward attached to the current node.
game.get_reward()

0

In [752]:
# Moves on offer as (description, reward) pairs; 'go back' is always appended last.
game.get_actions()

[('go left', 0), ('go straight', 0), ('go right', 0), ('go back', 0)]

In [753]:
# Step into the 'go left' branch; do() answers 'done' when the move exists, 'impossible' otherwise.
game.do('go left')

'done'

In [754]:
# The offered moves are now the children of 'go left'.
game.get_actions()

[('climb tree', 0), ('search floor', 0), ('go back', 0)]

In [755]:
# 'go back' drops the last step of the path, i.e. returns to the parent node.
game.do('go back')

'done'

In [756]:
# Same list as at the start: we are back at the root.
game.get_actions()

[('go left', 0), ('go straight', 0), ('go right', 0), ('go back', 0)]

In [757]:
# how to solve: walk the single path that ends on the node with reward 1
for step in ('go straight', 'enter house', 'check cupboard'):
    game.do(step)
print(game.reward)

1


In [995]:
# random exploration
# Baseline agent: at every step pick uniformly among the offered actions
# (including 'go back') and record the reward of the node we stand on.
game = find_the_keys()
steps = 9000
rewards = []
rng = np.random.default_rng()  # unseeded, so the total differs from run to run
for i in range(steps):
    description = game.get_description()
    actions = game.get_actions()
    reward = game.get_reward()  # bound to a name so the debug print below works
    rewards.append(reward)
    #print(description, reward)
    if reward:
        # this only makes sense because of the specific game here
        #print("You won! Resetting game.")
        game.reset()  # public API; same effect as the private _set_state([])
    else:
        j = rng.integers(0, len(actions))
        #print("do", actions[j][0])
        game.do(actions[j][0])

In [996]:
# Total reward collected by the random agent; each win contributes 1.
sum(rewards)

102

In [997]:
# q-learning
# again the most naive way I could think of, see https://en.wikipedia.org/wiki/Q-learning
game = find_the_keys()
rewards = []
# Q-table: state description -> {action description -> estimated value}
q = {}
# value assumed for every (state, action) pair that has not been updated yet
q_init = 1
learning_rate = .9 # "alpha": weight of the new estimate against the old one
discount_factor = .1 # "gamma": weight of the best value reachable from the next state
rng = np.random.default_rng()

# a state is identified by the description of the current node
state = game.get_description()
actions = game.get_actions()

# probability of picking a random action instead of the greedy one
p_explore = 0.01

# number of single actions to take in total (not the number of games)
steps = 9000
# set to True to print a trace of every step
verbose=False

def get_q_s(q, state, actions, q_init):
    """Build the action-value row for ``state`` without touching ``q``.

    Every action in ``actions`` (its first element is used as the key) starts
    at ``q_init``; values already stored under ``q[state]`` then take
    precedence. Keys follow the order of ``actions``, with any stored-only
    keys appended afterwards.

    Returns:
        A new dict mapping action key -> value.
    """
    row = {}
    for action in actions:
        row[action[0]] = q_init  # default for actions not seen yet
    row.update(q.get(state, {}))  # known values win over the defaults
    return row

# Main loop: tabular Q-learning with an epsilon-greedy policy, one game action per iteration.
for i in range(steps):
    if verbose: print()
    if verbose: print("step", i, "state", state)
    # Build this state's row (unseen actions start at q_init) and store it in
    # the table, so the update further down writes straight into q.
    q_s = get_q_s(q, state, actions, q_init)
    q[state] = q_s
    if verbose: print("q", q)
    
    # With probability p_explore take a uniformly random action ...
    if p_explore > rng.uniform():
        if verbose: print("explore")
        j = rng.integers(0, len(actions))
        action = actions[j][0]
    else:
        # ... otherwise the action with the highest current estimate; max()
        # keeps the first maximal key, so ties go to the earliest-listed action.
        if verbose: print("exploit")
        if verbose: print(q_s)
        action = max(q_s, key=q_s.get)
        
    if verbose: print("do", action)
    game.do(action)

    # Observe where the action took us and the best value available from there.
    new_state = game.get_description()
    new_reward = game.get_reward()
    new_actions = game.get_actions()
    max_new_q = max(get_q_s(q, new_state, new_actions, q_init).values())
    
    if verbose: print("new reward", new_reward, "max new q", max_new_q)

    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    # NOTE(review): the rewarded state is never acted from (the game is reset
    # below), so its row stays at q_init and the winning move's value tends to
    # 1 + discount_factor * q_init rather than 1 -- confirm this is intended.
    q_sa = q_s.get(action, q_init)
    if verbose: print("q_sa", q_sa)
    q_s[action] = (1 - learning_rate) * q_sa + learning_rate * (new_reward + discount_factor * max_new_q)
    if verbose: print(f"new q", q)
    
    rewards.append(new_reward)
    
    # Any non-zero reward ends the game: start over from the root.
    if new_reward:
        game.reset()
        new_state = game.get_description()
        new_actions = game.get_actions()

    state = new_state
    actions = new_actions  

In [998]:
# Total reward collected by the Q-learning agent over the same number of steps.
sum(rewards)

2921

In [999]:
# Learned table: state description -> {action description -> value estimate}.
q

{'look around': {'go left': 0.00011000006650270004,
  'go straight': 0.011000000000000003,
  'go right': 0.00011000061319600003,
  'go back': 0.0011000057590000003},
 'go left': {'climb tree': 4.704588099999999e-05,
  'search floor': 0.00024760989999999996,
  'go back': 0.0011000000057590003},
 'climb tree': {'go back': 0.00012376099518310004},
 'search floor': {'lift stone': 0.00024760989999999996,
  'lift leaf': 0.00024760989999999996,
  'go back': 0.0002345041},
 'lift stone': {'go back': 0.00024760989999999996},
 'lift leaf': {'go back': 0.00024760989999999996},
 'go straight': {'enter house': 0.11000000000000001,
  'go back': 0.0011000000000000003},
 'enter house': {'check cupboard': 1.1,
  'check wardrobe': 0.011000008198000002,
  'go back': 0.011000000000989003},
 'go right': {'check bike': 0.00024760989999999996,
  'check mailbox': 0.00024760989999999996,
  'go back': 0.0011000000575900003},
 'check bike': {'go back': 0.00024760989999999996},
 'check mailbox': {'open first lett