In [None]:
from htm_rl.agent.agent import Agent, AgentRunner
from htm_rl.agent.memory import Memory, TemporalMemory
from htm_rl.agent.planner import Planner
from htm_rl.common.sa_sdr_encoder import SaSdrEncoder, format_sa_superposition
from htm_rl.common.base_sa import SaRelatedComposition, Sa, SaSuperposition
from htm_rl.common.int_sdr_encoder import IntSdrEncoder, IntRangeEncoder
from htm_rl.common.int_sdr_encoder import SequenceSdrEncoder
from htm_rl.envs.gridworld_pomdp import GridWorld

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint

In [None]:
def init_memory(pars, sa_encoder, start_indicator=None, output_file=None):
    """Create a new ``Memory`` wrapped around a freshly built ``TemporalMemory``.

    Args:
        pars: mapping of keyword arguments, unpacked into ``TemporalMemory``.
        sa_encoder: state-action encoder; both the encoder itself and its
            ``format`` attribute are handed to ``Memory``.
        start_indicator: forwarded to ``Memory`` unchanged.
        output_file: forwarded to ``Memory`` unchanged.

    Returns:
        The newly constructed ``Memory`` instance.
    """
    temporal_memory = TemporalMemory(**pars)
    return Memory(
        temporal_memory,
        sa_encoder,
        sa_encoder.format,
        format_sa_superposition,
        start_indicator=start_indicator,
        output_file=output_file,
    )

def learn_way(way, memory, environment, verbosity=1):
    """Replay a fixed action sequence in the environment and train the memory on it.

    The memory and the environment are both reset first. Then, for every action
    in ``way``, the pair ``Sa(current state, action)`` is passed to
    ``memory.train`` and the action is executed in the environment.

    Args:
        way: iterable of actions to execute in order.
        memory: object exposing ``reset()`` and ``train(sa, verbosity)``.
        environment: object exposing ``reset()``, ``step(action)`` and ``render()``.
        verbosity: forwarded to ``memory.train``; values above 1 additionally
            render the environment and print each state/action pair.
    """
    memory.reset()
    observation = environment.reset()
    for chosen_action in way:
        if verbosity > 1:
            environment.render()
            print(f'Action {chosen_action} State: {observation}')
        memory.train(Sa(observation, chosen_action), verbosity)
        # Only the next state is kept; the other three values returned by
        # step() are not used here.
        observation, _, _, _ = environment.step(chosen_action)

def check_agent(memory, environment, goal_state, planning_horizon=10, max_steps=12, verbosity=1):
    """Run an agent built on ``memory`` once and report whether it was rewarded.

    A ``Planner`` and an ``Agent`` are created around ``memory``, wrapped in an
    ``AgentRunner``, given ``goal_state`` as a planner goal and then run.

    Args:
        memory: trained memory shared by the planner and the agent.
        environment: environment to run in; its ``n_actions`` is passed to ``Agent``.
        goal_state: goal registered on the planner via ``add_goal``.
        planning_horizon: passed to ``Planner`` and to ``set_planning_horizon``.
        max_steps: passed to ``AgentRunner``.
        verbosity: passed to ``AgentRunner``.

    Returns:
        ``True`` if the last entry of ``train_stats.rewards`` is greater than
        zero, ``False`` otherwise.
    """
    goal_planner = Planner(memory, planning_horizon, 1)
    goal_agent = Agent(memory, goal_planner, environment.n_actions)
    runner = AgentRunner(goal_agent, environment, 1, max_steps, 0, verbosity)

    runner.agent.planner.add_goal(goal_state)
    runner.agent.set_planning_horizon(planning_horizon)
    runner.run()

    last_reward = runner.train_stats.rewards[-1]
    return bool(last_reward > 0)

In [None]:
def random_way(max_steps, n_actions):
    """Build a random action sequence.

    Args:
        max_steps: number of actions to generate.
        n_actions: number of available actions; each generated action is an
            integer drawn with ``randint`` from ``0`` to ``n_actions - 1`` inclusive.

    Returns:
        A list of ``max_steps`` integers.
    """
    sequence = []
    for _ in range(max_steps):
        sequence.append(randint(0, n_actions - 1))
    return sequence

def run_way_after_experiments(pars, sa_encoder, goal_state,
                              start_indicator=None,
                              n_experiments=3,
                              verbosity=0,
                              learning_true_count=1,
                              max_noise_steps=1000,
                              step_size=1,
                              save_bad_way=False,
                              save_ways=False,
                              path=None,
                              actions=None,
                              noise_ways=None,
                              max_steps=12,
                              planning_horizon=10,
                              environment=None):
    """Measure how many noise sequences the memory tolerates before the agent fails.

    For every experiment the number of noise sequences ``n_steps`` starts at 0.
    Each trial builds a brand-new memory, trains it on ``n_steps`` noise
    sequences followed by ``learning_true_count`` repetitions of ``actions``,
    and then calls ``check_agent``. While the check succeeds, ``n_steps`` grows
    by ``step_size``; the experiment ends at the first failed check or once
    ``n_steps`` exceeds ``max_noise_steps``.

    Args:
        pars: keyword arguments for ``TemporalMemory`` (via ``init_memory``).
        sa_encoder: state-action encoder passed to ``init_memory``.
        goal_state: goal passed to ``check_agent``.
        start_indicator: passed to ``init_memory``.
        n_experiments: number of independent experiments.
        verbosity: passed to ``check_agent`` only.
        learning_true_count: how many times ``actions`` is trained per trial.
        max_noise_steps: an experiment stops once ``n_steps`` exceeds this value.
        step_size: increment applied to ``n_steps`` after a successful check.
        save_bad_way: if true, record the last random noise sequence of the
            failing trial in ``bad_ways_history``.
        save_ways: if true, collect the sequences trained in a trial.
        path: optional prefix; when given, each trial's memory receives
            ``path + '_exp_<experiment>_n_steps_<n_steps>.pkl'`` as output file.
        actions: reference action sequence; ``None`` means
            ``[2, 2, 1, 2, 2, 1, 2]``.
        noise_ways: optional predefined noise sequences; when given, the first
            ``n_steps`` of them (at most ``len(noise_ways)``) are used instead
            of random ones.
        max_steps: length of each random noise sequence and step limit passed
            to ``check_agent``.
        planning_horizon: passed to ``check_agent``.
        environment: environment to train and evaluate in; ``None`` falls back
            to the notebook-level ``gw``.

    Returns:
        A tuple ``(results, bad_ways_history, ways_history)``: ``results`` holds
        the final ``n_steps`` of every experiment, ``bad_ways_history`` is a dict
        of parallel lists (``'way'``, ``'steps'``, ``'experiment'``), and
        ``ways_history`` is the list collected during the very last trial (or
        ``None`` when no trial ran).
    """
    # A None sentinel replaces the former mutable list default so that the
    # default sequence can never be altered through a previously returned result.
    if actions is None:
        actions = [2, 2, 1, 2, 2, 1, 2]
    if environment is None:
        environment = gw  # backward-compatible fallback to the notebook global

    results = list()
    bad_ways_history = {'way': [], 'steps': [], 'experiment': []}
    ways_history = None
    for experiment in tqdm(range(n_experiments)):
        n_steps = 0  # number of noise action sequences
        while True:
            if path is not None:
                output_file = path + f'_exp_{experiment}_n_steps_{n_steps}.pkl'
            else:
                output_file = None
            # Every trial starts from an untrained memory.
            memory = init_memory(pars, sa_encoder, start_indicator=start_indicator, output_file=output_file)
            way = None
            ways_history = list()
            if noise_ways is None:
                for _ in range(n_steps):
                    way = random_way(max_steps, environment.n_actions)
                    learn_way(way, memory, environment)
                    if save_ways:
                        ways_history.append(way)
            else:
                # NOTE(review): predefined noise sequences are neither added to
                # ways_history nor reported as bad ways; kept as in the original.
                for step in range(min(n_steps, len(noise_ways))):
                    learn_way(noise_ways[step], memory, environment)

            for _ in range(learning_true_count):
                learn_way(actions, memory, environment)
                if save_ways:
                    # Store a copy so the caller's list is not aliased in the result.
                    ways_history.append(list(actions))

            if not check_agent(memory, environment, goal_state,
                               max_steps=max_steps,
                               planning_horizon=planning_horizon,
                               verbosity=verbosity):
                if way is not None and save_bad_way:
                    bad_ways_history['way'].append(str(way))
                    bad_ways_history['steps'].append(n_steps)
                    bad_ways_history['experiment'].append(experiment)
                break
            else:
                n_steps += step_size
                if n_steps > max_noise_steps:
                    print('Max steps are reached')
                    break
        results.append(n_steps)
    return results, bad_ways_history, ways_history

In [None]:
# Step budget used for the runs in this notebook.
max_steps = 12

# Hand-written action sequence; the action ids are interpreted by the environment.
actions = [2, 2, 1, 2, 2, 1, 2, 2]

# 3x3 grid layout handed to GridWorld; cell codes are defined by the environment.
world_description = [
    [2, 0, 0],
    [1, 1, 0],
    [0, 0, 0],
]

In [None]:
# Build the grid-world environment from the layout above, with the agent's
# initial position at row 2, column 0 and 'distance'/'surface' as observable variables.
gw = GridWorld(world_description, (3, 3), agent_initial_position={'row': 2, 'column': 0},
               observable_vars=['distance', 'surface'], one_value_state=True)

In [None]:
# Render the environment as a visual sanity check of the layout.
gw.render()

In [None]:
# Show the environment's observable_state and filtered_observation attributes.
gw.observable_state, gw.filtered_observation

In [None]:
# Bit widths used when building the state and action encoders.
state_bits = 10
action_bits = 10

# The encoder covers n_obs_states + 1 values: the extra value n_obs_states is
# the state used for the start indicator when the memory is created.
state_encoder = IntSdrEncoder('state', gw.n_obs_states+1, state_bits, state_bits-3)

In [None]:
# Action encoder covering n_actions + 1 values, with an activation threshold
# three bits below the full value width.
action_encoder = IntSdrEncoder('action', gw.n_actions+1,
                              value_bits=action_bits, activation_threshold=action_bits-3)

In [None]:
# Combine the state and action encoders into a single state-action encoder.
sa_encoder = SaSdrEncoder(state_encoder, action_encoder)

In [None]:
# Inspect the combined encoder's sizes; these values feed the TemporalMemory parameters.
sa_encoder.total_bits, sa_encoder.value_bits, sa_encoder.activation_threshold

In [None]:
# State value later registered as the planner's goal; echoed for display.
goal_state = 6
goal_state

In [None]:
# Keyword arguments for TemporalMemory (unpacked as TemporalMemory(**pars) in init_memory).
# Column count and the threshold/synapse limits are taken from the state-action encoder.
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 32,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.51,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.05,
    'predictedSegmentDecrement': 0.025,
}

In [None]:
# Create the memory with a start indicator built from the extra state value n_obs_states.
# NOTE(review): the action id 3 is presumably the extra action slot (gw.n_actions) — confirm.
memory = init_memory(pars, sa_encoder, start_indicator=Sa(gw.n_obs_states, 3))

In [None]:
# Planner over the memory; 10 sits in the position check_agent uses for planning_horizon.
planner = Planner(memory, 10, 1)

In [None]:
# Agent sharing the memory with the planner and sized to the environment's action count.
agent = Agent(memory, planner, gw.n_actions)

In [None]:
# Runner tying the agent to the environment.
# NOTE(review): by analogy with check_agent, the fourth positional argument is the step
# limit and the last one is verbosity; the meaning of 1000 and 10 should be confirmed
# against AgentRunner's signature.
run = AgentRunner(agent, gw, 1000, max_steps, 10, 0)

In [None]:
# First run: no goal has been added to the planner by this notebook yet.
run.run()

In [None]:
# Register the goal state on the planner, set the planning horizon to 10
# and run again with the goal in place.
run.agent.planner.add_goal(
    goal_state)
run.agent.set_planning_horizon(10)
run.run()

In [None]:
# Last reward recorded in the runner's statistics (check_agent treats a positive value as success).
run.train_stats.rewards[-1]