In [None]:
import sys
sys.path.append('../htm_rl/htm_rl/')

from agent.agent import Agent, AgentRunner
from agent.memory import Memory, TemporalMemory
from agent.planner import Planner
from common.sa_sdr_encoder import SaSdrEncoder, format_sa_superposition
from common.base_sa import SaRelatedComposition, Sa, SaSuperposition
from common.int_sdr_encoder import IntSdrEncoder, IntRangeEncoder
from common.int_sdr_encoder import SequenceSdrEncoder
from envs.gridworld_pomdp import GridWorld

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from random import randint

#%
from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    """Context manager that silences everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at a handle opened on
    ``os.devnull``. The stream that was active on entry is put back on exit,
    also when the body raises.
    """
    with open(os.devnull, "w") as sink:
        # Remember the active stream only after the sink opened successfully.
        saved_stream = sys.stdout
        sys.stdout = sink
        try:
            yield
        finally:
            sys.stdout = saved_stream

In [None]:
# Grid layout handed to GridWorld in the next cell: three rows of three integer cell codes.
# NOTE(review): the meaning of the codes 0/1/2 is defined by GridWorld and is not visible here — confirm.
world_description = [[2,0,0],
                     [1,1,0],
                     [0,0,0]]

In [None]:
# Build the environment from the layout; the agent's initial position is row 2, column 0.
# NOTE(review): (3, 3) is presumably the world size — confirm its axis order against GridWorld's signature.
gw = GridWorld(world_description, (3, 3), agent_initial_position={'row': 2, 'column': 0})

In [None]:
# Display the size reported by the environment (used below to dimension the state encoder).
gw.world_size

In [None]:
# Call the environment's render method to inspect the freshly built world.
gw.render()

In [None]:
# Step limit: passed to AgentRunner and used as the length of random action sequences.
max_steps = 12

# Composite state encoder made of five named component encoders.
# Every component receives 5 and 5 as its two trailing positional arguments —
# presumably value_bits and activation_threshold, as in the keyword form used
# for the action encoder — TODO confirm.
# NOTE(review): both the 'row' and the 'column' range take their lower bound
# from world_size[0] and their upper bound from world_size[1]. The world built
# in this notebook is given (3, 3), so the two agree here; verify the indices
# before using a non-square world.
state_encoder = SequenceSdrEncoder(
    'state',
    encoders=[
        IntSdrEncoder('distance', gw.world_size[0], 5, 5),
        IntSdrEncoder('surface', 3, 5, 5),
        IntRangeEncoder(
            'row',
            (-(gw.world_size[0] - 1), gw.world_size[1] - 1),
            5,
            5,
        ),
        IntRangeEncoder(
            'column',
            (-(gw.world_size[0] - 1), gw.world_size[1] - 1),
            5,
            5,
        ),
        IntSdrEncoder('direction', 4, 5, 5),
    ],
    size=5,
)

In [None]:
# Print the state encoder's value_bits, then display its total_bits as the cell result.
print(state_encoder.value_bits)
state_encoder.total_bits

In [None]:
# Encoder for actions, sized from the environment's n_actions.
# NOTE(review): the second positional argument is presumably the number of distinct values — confirm.
action_encoder = IntSdrEncoder('action', gw.n_actions,
                              value_bits=5, activation_threshold=5)

In [None]:
# Combine the state encoder and the action encoder into one state-action encoder.
sa_encoder = SaSdrEncoder(state_encoder, action_encoder)

In [None]:
# Display the combined encoder's total_bits, value_bits and activation_threshold.
sa_encoder.total_bits, sa_encoder.value_bits, sa_encoder.activation_threshold

In [None]:
# Display the action encoder's activation threshold for comparison with the combined encoder.
action_encoder.activation_threshold

In [None]:
# Temporal memory sized from the state-action encoder: one column per encoder bit,
# with both thresholds and both synapse limits tied to sa_encoder.value_bits.
# The keyword names mix snake_case and camelCase exactly as written here.
# NOTE(review): this `tm` feeds the `memory` built in the next cell; `memory` is
# re-bound later by the experiment cell — confirm this instance is still needed.
tm = TemporalMemory(n_columns=sa_encoder.total_bits,
                    cells_per_column=16,
                    activation_threshold=sa_encoder.value_bits,
                    learning_threshold=sa_encoder.value_bits,
                    initial_permanence=0.5,
                    connected_permanence=0.5,
                    maxNewSynapseCount=sa_encoder.value_bits,
                    maxSynapsesPerSegment=sa_encoder.value_bits,
                    permanenceIncrement=0.1,
                    permanenceDecrement=0.005,
                    predictedSegmentDecrement=0.0001
                    )

In [None]:
# Wrap the temporal memory together with the encoder and the two formatter callables.
memory = Memory(tm, sa_encoder, sa_encoder.format, format_sa_superposition)

In [None]:
# Hand-written action sequences ("ways") that a later cell replays for training.
# NOTE(review): the meaning of the action codes 0/1/2 is defined by GridWorld — confirm.
ways = [
    [2, 2, 1, 2, 2, 1, 2, 2],
    [1, 0, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2],
    [2, 2, 1, 1, 2, 2, 0, 0],
    [2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1],
    [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
    [2, 2, 1, 2, 2, 1, 0, 2, 2, 2, 2, 2],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        ]
# Reference sequence taught first in the experiments below; identical to ways[0].
actions = [2, 2, 1, 2, 2, 1, 2, 2]

Сгенерируем случайные пути и добавим один правильный путь. Измерим, сколько случайных путей нужно добавить,
чтобы планирование сломалось.

In [None]:
def init_memory(pars):
    """Create a fresh Memory around a new TemporalMemory.

    Args:
        pars: mapping of keyword arguments forwarded unchanged to TemporalMemory.

    Returns:
        A Memory built from the new temporal memory, the notebook-level
        ``sa_encoder``, its ``format`` attribute and ``format_sa_superposition``.
    """
    return Memory(
        TemporalMemory(**pars),
        sa_encoder,
        sa_encoder.format,
        format_sa_superposition,
    )

def learn_way(way, memory, environment, verbosity=1):
    """Replay one action sequence in the environment and train the memory on it.

    The memory and the environment are both reset first. For every action the
    current (state, action) pair is wrapped in ``Sa`` and passed to
    ``memory.train``; the environment is then stepped to obtain the next state.

    Args:
        way: iterable of actions to replay in order.
        memory: object exposing ``reset()`` and ``train(sa, verbosity)``.
        environment: object exposing ``reset()``, ``step(action)`` and ``render()``.
        verbosity: forwarded to ``memory.train``; values above 1 also render the
            environment and print each action/state pair.
    """
    memory.reset()
    state = environment.reset()
    for action in way:
        if verbosity > 1:
            environment.render()
            print(f'Action {action} State: {state}')
        memory.train(Sa(state, action), verbosity)
        # step() returns a 4-tuple; only the next state is used. The replay
        # continues through the whole sequence regardless of the other fields.
        state, _, _, _ = environment.step(action)

def check_agent(memory, environment, verbosity=1, goal_state=(0, 2, -2, 1, 2)):
    """Run one planning episode and report whether it ended with a positive reward.

    A new Planner, Agent and AgentRunner are built around ``memory`` on every
    call. The goal is registered with the planner, the planning horizon is set
    to 10 and the runner is executed once. The step limit comes from the
    notebook-level ``max_steps``.

    Args:
        memory: memory object shared by the planner and the agent.
        environment: environment handed to AgentRunner; must expose ``n_actions``.
        verbosity: forwarded to AgentRunner.
        goal_state: state tuple registered through ``planner.add_goal``. Defaults
            to the goal that used to be hard-coded in this function.

    Returns:
        bool: True when the last entry of ``run.train_stats.rewards`` is greater
        than zero, False otherwise. (Previously the unsuccessful case fell off
        the end of the function and produced None.)
    """
    planner = Planner(memory, 10, 1)
    agent = Agent(memory, planner, environment.n_actions)
    # NOTE(review): the positional 1 and 0 look like n_episodes and pretrain
    # (a later cell assigns attributes with those names) — confirm.
    run = AgentRunner(agent, environment, 1, max_steps, 0, verbosity)
    run.agent.planner.add_goal(goal_state)
    run.agent.set_planning_horizon(10)
    run.run()
    # bool() normalises numpy/other comparison results to a plain Python bool.
    return bool(run.train_stats.rewards[-1] > 0)

In [None]:
def random_way(max_steps, n_actions):
    """Return a list of ``max_steps`` random actions.

    Each action is drawn with ``randint`` from the inclusive range
    ``0 .. n_actions - 1``.
    """
    highest_action = n_actions - 1
    way = []
    for _ in range(max_steps):
        way.append(randint(0, highest_action))
    return way

Учим агента оптимальному пути, потом постепенно добавляем случайные пути.

In [None]:
# Fresh memory for the single-run experiment (8 cells per column).
memory = init_memory(pars={
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 8,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
})

# Teach the reference sequence first and start the history with it.
learn_way(actions, memory, gw)
ways_history = [actions]

# Keep teaching random ways for as long as check_agent still reports success.
# NOTE(review): the loop has no upper bound and no random seed is set, so the
# number of iterations differs between runs.
while check_agent(memory, gw):
    way = random_way(max_steps, gw.n_actions)
    ways_history.append(way)
    learn_way(way, memory, gw)

# Number of ways taught in total, including the reference sequence.
print('Steps:', len(ways_history))

In [None]:
# Check the agent once more with verbosity=-1 and print the outcome
# (True on success, a falsy value otherwise).
done = check_agent(memory, gw, verbosity=-1)
print(done)

In [None]:
def run_way_before_experiments(pars, n_experiments=3, verbosity=0):
    """Repeat the "teach random ways until planning fails" experiment.

    Every repetition starts from a fresh memory built with ``pars``, teaches the
    notebook-level ``actions`` sequence in the ``gw`` environment, and then keeps
    teaching random ways while ``check_agent`` reports success.

    Args:
        pars: keyword arguments for the temporal memory, passed to ``init_memory``.
        n_experiments: number of independent repetitions.
        verbosity: forwarded to ``check_agent`` only.

    Returns:
        list of dicts, one per repetition, each ``{'steps': <ways taught>}`` where
        the count includes the initial reference sequence.
    """
    def single_experiment():
        # One repetition: returns the record for the results list.
        fresh_memory = init_memory(pars)
        learn_way(actions, fresh_memory, gw)
        taught = [actions]
        while check_agent(fresh_memory, gw, verbosity=verbosity):
            extra_way = random_way(max_steps, gw.n_actions)
            taught.append(extra_way)
            learn_way(extra_way, fresh_memory, gw)
        return {'steps': len(taught)}

    return [single_experiment() for _ in tqdm(range(n_experiments))]

In [None]:
# Temporal-memory parameters for the repeated experiment; they match the
# single-run experiment above except for cells_per_column (32 instead of 8).
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 32,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
}
# Ten independent repetitions with the default verbosity.
results = run_way_before_experiments(pars, n_experiments=10)

In [None]:
# One row per experiment; display the mean and standard deviation of the 'steps' column.
df_results = pd.DataFrame(results)
df_results.steps.mean(), df_results.steps.std()

In [None]:

# Train `memory` on every predefined way, replaying each way twice in a row.
# NOTE(review): `memory` is whatever the name is bound to when this cell runs.
# In top-to-bottom order that is the object created for the single-run
# experiment, which has already been trained on random ways — confirm that
# continuing with it (rather than a fresh memory) is intended.
verbosity = 1
for way_index, way in enumerate(ways):
    if verbosity > 1:
        print()
        print(f"### Way {way_index+1} ###")
        print()
    # Distinct name for the repetition counter so it does not shadow the way index.
    for cycle in range(2):
        if verbosity > 1:
            print()
            print(f'*** cycle {cycle+1} ***')
            print()
        # learn_way resets memory and environment, then replays and trains the
        # sequence — the same steps this cell used to spell out inline.
        learn_way(way, memory, gw, verbosity)

In [None]:
# Goal registered with the planner below; the same tuple check_agent uses as its goal.
# NOTE(review): the five entries presumably follow the state encoder's component
# order (distance, surface, row, column, direction) — confirm.
goal_state = (0, 2, -2, 1, 2)

In [None]:
# Planner working on the trained memory, built with the same arguments as in check_agent.
# NOTE(review): the meaning of the positional 10 and 1 is not visible here — confirm against Planner.
planner = Planner(memory, 10, 1)

In [None]:
# Agent that shares the memory with the planner and knows the environment's number of actions.
agent = Agent(memory, planner, gw.n_actions)

In [None]:
# Same argument pattern as in check_agent, but with the last argument (verbosity there) set to 3.
run = AgentRunner(agent, gw, 1, max_steps, 0, 3)

In [None]:
# Register the goal, set the planning horizon to 10 and execute the runner.
run.agent.planner.add_goal(goal_state)
run.agent.set_planning_horizon(10)
run.run()

In [None]:
# Display the last recorded reward; check_agent treats a value above zero as success.
run.train_stats.rewards[-1]

In [None]:
# Second run on the same runner object with changed settings.
# NOTE(review): add_goal is called again with the same goal_state as in the
# previous run cell — confirm the planner tolerates a repeated goal.
# NOTE(review): n_episodes, pretrain and verbosity are assumed to be the
# attributes AgentRunner reads inside run() — confirm against AgentRunner.
run.agent.planner.add_goal(goal_state)
run.agent.set_planning_horizon(10)
run.n_episodes = 50
run.pretrain = 25
run.verbosity = 1
run.run()

In [None]:
# Plot the values of run.train_stats.steps as dots against their index.
# NOTE(review): presumably one entry per episode — confirm; the figure has no
# axis labels or title, so it does not stand alone when skimmed.
fig = plt.figure(figsize=(10, 7))
steps = np.array(run.train_stats.steps[:])
plt.plot(np.arange(steps.size), steps, '.')
