In [None]:
from htm_rl.agent.agent import Agent, AgentRunner
from htm_rl.agent.memory import Memory, TemporalMemory
from htm_rl.agent.planner import Planner
from htm_rl.common.sa_sdr_encoder import SaSdrEncoder, format_sa_superposition
from htm_rl.common.base_sa import SaRelatedComposition, Sa, SaSuperposition
from htm_rl.common.int_sdr_encoder import IntSdrEncoder, IntRangeEncoder
from htm_rl.common.int_sdr_encoder import SequenceSdrEncoder
from htm_rl.envs.gridworld_pomdp import GridWorld

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from random import randint

In [None]:
# 3x3 grid layout given as nested lists of integer cell codes; passed to GridWorld below.
# NOTE(review): the meaning of the codes 0/1/2 is defined by GridWorld — confirm there.
world_description = [[2,0,0],
                     [1,1,0],
                     [0,0,0]]

In [None]:
# Build the environment from the layout; the agent starts at row 2, column 0.
# NOTE(review): (3, 3) is presumably the world size — confirm against GridWorld's signature.
gw = GridWorld(world_description, (3, 3), agent_initial_position={'row': 2, 'column': 0})

In [None]:
# Inspect the world size reported by the environment (used below to size the encoders).
gw.world_size

In [None]:
# Render the environment in its current state.
gw.render()

In [None]:
# Length of the generated random paths; also passed positionally to AgentRunner in check_agent.
max_steps = 12

# State encoder composed of five sub-encoders, one per observed variable.
# NOTE(review): both the 'row' and the 'column' range take their lower bound from
# world_size[0] and their upper bound from world_size[1]; this is harmless for a square
# world but looks unintended for a non-square one — confirm.
state_encoder = SequenceSdrEncoder(
    'state',
    encoders=[
        IntSdrEncoder('distance', gw.world_size[0], 5, 5),
        IntSdrEncoder('surface', 3, 5, 5),
        IntRangeEncoder('row', (-(gw.world_size[0]-1), gw.world_size[1]-1), 5, 5),
        IntRangeEncoder('column', (-(gw.world_size[0]-1), gw.world_size[1]-1), 5, 5),
        IntSdrEncoder('direction', 4, 5, 5),
    ],
    size=5,
)

In [None]:
# Print the state encoder's value_bits and display its total_bits as the cell output.
print(state_encoder.value_bits)
state_encoder.total_bits

In [None]:
# Action encoder with one value per environment action; 5 bits per value, threshold 5.
action_encoder = IntSdrEncoder('action', gw.n_actions,
                              value_bits=5, activation_threshold=5)

In [None]:
# Combined state-action encoder built from the two encoders above.
sa_encoder = SaSdrEncoder(state_encoder, action_encoder)

In [None]:
# Inspect the combined encoder's sizes; total_bits and value_bits parameterise the memory below.
sa_encoder.total_bits, sa_encoder.value_bits, sa_encoder.activation_threshold

In [None]:
# Inspect the action encoder's activation threshold.
action_encoder.activation_threshold

In [None]:
# Temporal memory sized to the state-action encoding: one column per encoder bit, with the
# activation/learning thresholds and synapse limits all tied to sa_encoder.value_bits.
tm = TemporalMemory(n_columns=sa_encoder.total_bits,
                    cells_per_column=16,
                    activation_threshold=sa_encoder.value_bits,
                    learning_threshold=sa_encoder.value_bits,
                    initial_permanence=0.5,
                    connected_permanence=0.5,
                    maxNewSynapseCount=sa_encoder.value_bits,
                    maxSynapsesPerSegment=sa_encoder.value_bits,
                    permanenceIncrement=0.1,
                    permanenceDecrement=0.005,
                    predictedSegmentDecrement=0.0001
                    )

In [None]:
# Wrap the temporal memory and the encoder into the agent's Memory.
# This instance is replaced by init_memory(...) in the experiment cells further down.
memory = Memory(tm, sa_encoder, sa_encoder.format, format_sa_superposition)

Оптимальный путь (ахах, самоуверенный человек, думает, что он нашёл оптимальный путь)

In [None]:
# Hand-picked reference action sequence that every experiment teaches to the memory.
actions = [2, 2, 1, 2, 2, 1, 2, 2]
# Goal handed to the planner.
# NOTE(review): presumably one value per state variable in encoder order
# (distance, surface, row, column, direction) — confirm.
goal_state = (0, 2, -2, 1, 2)

Сгенерируем случайные пути и добавим один правильный путь. Проверим, сколько случайных путей нужно добавить,
чтобы планирование сломалось.

In [None]:
def init_memory(pars, sa_encoder, start_indicator=None):
    """Create a fresh Memory backed by a newly constructed TemporalMemory.

    Args:
        pars: mapping of keyword arguments forwarded verbatim to ``TemporalMemory``.
        sa_encoder: state-action encoder; its ``format`` attribute is also handed to ``Memory``.
        start_indicator: optional value forwarded to ``Memory`` under the same keyword.

    Returns:
        The new ``Memory`` instance.
    """
    return Memory(
        TemporalMemory(**pars),
        sa_encoder,
        sa_encoder.format,
        format_sa_superposition,
        start_indicator=start_indicator,
    )

def learn_way(way, memory, environment, verbosity=1):
    """Replay one action sequence in the environment and train the memory on it.

    Both the memory and the environment are reset first. For every action the current
    state is paired with the action, fed to ``memory.train``, and the environment is
    then stepped. Rewards and termination flags returned by ``step`` are ignored.

    Args:
        way: iterable of actions to replay.
        memory: object exposing ``reset()`` and ``train(sa, verbosity)``.
        environment: object exposing ``reset()``, ``step(action)`` and ``render()``.
        verbosity: forwarded to ``memory.train``; values above 1 also render the
            environment and print each action/state pair.
    """
    memory.reset()
    state = environment.reset()
    for action in way:
        if verbosity > 1:
            environment.render()
            print(f'Action {action} State: {state}')
        memory.train(Sa(state, action), verbosity)
        # Only the next state is needed; step() returns a 4-tuple.
        state, _, _, _ = environment.step(action)

def check_agent(memory, environment, goal_state, verbosity=1):
    """Run one planning episode and report whether the last recorded reward is positive.

    A new Planner, Agent and AgentRunner are built around ``memory``; ``goal_state`` is
    registered with the planner and the planning horizon is set to 10 before running.
    Relies on the notebook-level ``max_steps``.

    NOTE(review): the positional arguments of ``Planner(memory, 10, 1)`` and
    ``AgentRunner(..., 1, max_steps, 0, verbosity)`` are defined by those classes —
    confirm their meaning there.

    Returns:
        True if ``train_stats.rewards[-1] > 0`` after the run, otherwise False.
    """
    agent = Agent(memory, Planner(memory, 10, 1), environment.n_actions)
    runner = AgentRunner(agent, environment, 1, max_steps, 0, verbosity)
    runner.agent.planner.add_goal(goal_state)
    runner.agent.set_planning_horizon(10)
    runner.run()
    return bool(runner.train_stats.rewards[-1] > 0)

In [None]:
def random_way(max_steps, n_actions):
    """Return a list of ``max_steps`` actions drawn uniformly from ``0 .. n_actions - 1``."""
    way = []
    for _ in range(max_steps):
        way.append(randint(0, n_actions - 1))
    return way

Учим агента оптимальному пути, затем постепенно добавляем случайные пути.

In [None]:
# Fresh memory with 8 cells per column, taught the reference path first.
memory = init_memory(
    pars={
        'n_columns': sa_encoder.total_bits,
        'cells_per_column': 8,
        'activation_threshold': sa_encoder.value_bits,
        'learning_threshold': sa_encoder.value_bits,
        'initial_permanence': 0.5,
        'connected_permanence': 0.5,
        'maxNewSynapseCount': sa_encoder.value_bits,
        'maxSynapsesPerSegment': sa_encoder.value_bits,
        'permanenceIncrement': 0.1,
        'permanenceDecrement': 0.005,
        'predictedSegmentDecrement': 0.0001,
    },
    sa_encoder=sa_encoder,
)
learn_way(actions, memory, gw)
ways_history = [actions]

# Keep adding random paths for as long as the agent still earns a positive reward.
# There is no upper bound: the loop only ends once check_agent returns False.
while check_agent(memory, gw, goal_state):
    way = random_way(max_steps, gw.n_actions)
    ways_history.append(way)
    learn_way(way, memory, gw)

# The count includes the reference path stored as the first history entry.
print('Steps:', len(ways_history))

In [None]:
# Re-run the check on the memory left by the loop above (verbosity=-1) and print the outcome.
done = check_agent(memory, gw, goal_state, verbosity=-1)
print(done)

In [None]:
def run_way_before_experiments(memory_pars, sa_encoder, goal_state, n_experiments=3, verbosity=0):
    """Teach the reference path first, then add random paths until planning fails.

    Each experiment starts from a fresh memory, learns the notebook-level ``actions``
    once, and keeps learning random paths while ``check_agent`` still succeeds.
    Relies on the notebook-level ``actions``, ``gw`` and ``max_steps``.

    Args:
        memory_pars: keyword arguments for ``TemporalMemory`` (see ``init_memory``).
        sa_encoder: state-action encoder used to build the memory.
        goal_state: goal passed to ``check_agent``.
        n_experiments: number of independent repetitions.
        verbosity: forwarded to ``check_agent``.

    Returns:
        A list with one ``{'steps': n}`` dict per experiment, where ``n`` is the number
        of learned paths including the reference one.
    """
    results = []
    for _ in tqdm(range(n_experiments)):
        memory = init_memory(memory_pars, sa_encoder)
        learn_way(actions, memory, gw)
        n_ways = 1  # the reference path counts as the first learned way
        while check_agent(memory, gw, goal_state, verbosity=verbosity):
            learn_way(random_way(max_steps, gw.n_actions), memory, gw)
            n_ways += 1
        results.append({'steps': n_ways})
    return results

In [None]:
# Baseline run: 32 cells per column, repeated over 10 experiments.
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 32,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
}
# sa_encoder and goal_state are required positional parameters of
# run_way_before_experiments; leaving them out raises TypeError.
results = run_way_before_experiments(pars, sa_encoder, goal_state, n_experiments=10)

In [None]:
# Mean and standard deviation of the per-experiment 'steps' counts.
df_results = pd.DataFrame(results)
df_results.steps.mean(), df_results.steps.std()

Построим зависимость устойчивости трекера от числа клеток в колонке.

In [None]:
import time

In [None]:
# Sweep over cells_per_column.
# NOTE: the loop variable (and the result key) is called n_cols, but it holds the number
# of cells per column, not the number of columns.
results = {'steps_av': [], 'steps_err': [], 'n_experiments': [], 'n_cols': [], 'elapsed_time': []}
for n_cols in [8, 16, 32, 50, 70]:
    n_experiments = 10
    start = time.time()
    pars = {
        'n_columns': sa_encoder.total_bits,
        'cells_per_column': n_cols,
        'activation_threshold': sa_encoder.value_bits,
        'learning_threshold': sa_encoder.value_bits,
        'initial_permanence': 0.5,
        'connected_permanence': 0.5,
        'maxNewSynapseCount': sa_encoder.value_bits,
        'maxSynapsesPerSegment': sa_encoder.value_bits,
        'permanenceIncrement': 0.1,
        'permanenceDecrement': 0.005,
        'predictedSegmentDecrement': 0.0001,
    }
    result = run_way_before_experiments(pars, sa_encoder, goal_state, n_experiments=n_experiments)
    end = time.time()

    elapsed_time = end - start

    # Aggregate the per-experiment counts for this setting.
    df_results = pd.DataFrame(result)
    steps_mean, steps_std = df_results['steps'].mean(), df_results['steps'].std()

    results['steps_av'].append(steps_mean)
    results['steps_err'].append(steps_std)
    results['elapsed_time'].append(elapsed_time)
    results['n_cols'].append(n_cols)
    results['n_experiments'].append(n_experiments)
    # Progress line: swept value and elapsed minutes.
    print(f'{n_cols:2d} --> done {elapsed_time/60:.2f}')

In [None]:
# Tabulate the cells-per-column sweep (one row per swept value) and display it.
n_cols_test_results = pd.DataFrame(results)
n_cols_test_results

In [None]:
# Mean number of learned paths vs. cells per column, with the standard deviation as error bars.
fig, ax = plt.subplots(figsize=(8, 5))
ax.errorbar(
    n_cols_test_results['n_cols'],
    n_cols_test_results['steps_av'],
    yerr=n_cols_test_results['steps_err'],
    fmt='*',
    capsize=5,
)
ax.set_ylabel('number of noise ways')
ax.set_xlabel('number of cells in column')

Проверим влияние числа синапсов на сегмент на устойчивость

In [None]:
# Sweep over maxSynapsesPerSegment: each setting is value_bits plus an extra n_synapses.
results = {'steps_av': [], 'steps_err': [], 'n_experiments': [], 'n_synapses': [], 'elapsed_time': []}
for n_synapses in [5, 10, 20, 25, 50, 100]:
    n_experiments = 10
    start = time.time()
    pars = {
        'n_columns': sa_encoder.total_bits,
        'cells_per_column': 50,
        'activation_threshold': sa_encoder.value_bits,
        'learning_threshold': sa_encoder.value_bits,
        'initial_permanence': 0.5,
        'connected_permanence': 0.5,
        'maxNewSynapseCount': sa_encoder.value_bits,
        'maxSynapsesPerSegment': (sa_encoder.value_bits + n_synapses),
        'permanenceIncrement': 0.1,
        'permanenceDecrement': 0.005,
        'predictedSegmentDecrement': 0.0001,
    }
    result = run_way_before_experiments(pars, sa_encoder, goal_state, n_experiments=n_experiments)
    end = time.time()

    elapsed_time = end - start

    # Aggregate the per-experiment counts for this setting.
    df_results = pd.DataFrame(result)
    steps_mean, steps_std = df_results.steps.mean(), df_results.steps.std()

    results['steps_av'].append(steps_mean)
    results['steps_err'].append(steps_std)
    results['elapsed_time'].append(elapsed_time)
    results['n_synapses'].append(n_synapses + sa_encoder.value_bits)
    results['n_experiments'].append(n_experiments)
    # Report the value recorded for this sweep (the original printed the stale n_cols
    # left over from the cells-per-column loop).
    print(f'{n_synapses + sa_encoder.value_bits:2d} --> done {elapsed_time/60:.2f}')

In [None]:
# Tabulate the synapses-per-segment sweep and display it.
n_synapses_test_results = pd.DataFrame(results)
n_synapses_test_results

Проверим влияние learning_threshold на устойчивость

In [None]:
# Sweep over learning_threshold, expressed as value_bits minus an offset.
# The last offset equals value_bits, i.e. a learning threshold of 0.
results = {'steps_av': [], 'steps_err': [], 'n_experiments': [], 'learning_threshold': [], 'elapsed_time': []}
for threshold in [1, 2, 3, 4, 5, 6, 10, 15, 25, sa_encoder.value_bits]:
    n_experiments = 10
    start = time.time()
    pars = {
        'n_columns': sa_encoder.total_bits,
        'cells_per_column': 50,
        'activation_threshold': sa_encoder.value_bits,
        'learning_threshold': (sa_encoder.value_bits - threshold),
        'initial_permanence': 0.5,
        'connected_permanence': 0.5,
        'maxNewSynapseCount': sa_encoder.value_bits,
        'maxSynapsesPerSegment': sa_encoder.value_bits,
        'permanenceIncrement': 0.1,
        'permanenceDecrement': 0.005,
        'predictedSegmentDecrement': 0.0001,
    }
    result = run_way_before_experiments(pars, sa_encoder, goal_state, n_experiments=n_experiments)
    end = time.time()

    elapsed_time = end - start

    # Aggregate the per-experiment counts for this setting.
    df_results = pd.DataFrame(result)
    steps_mean, steps_std = df_results['steps'].mean(), df_results['steps'].std()

    results['steps_av'].append(steps_mean)
    results['steps_err'].append(steps_std)
    results['elapsed_time'].append(elapsed_time)
    results['learning_threshold'].append(sa_encoder.value_bits - threshold)
    results['n_experiments'].append(n_experiments)
    # Progress line: effective learning threshold and elapsed minutes.
    print(f'{sa_encoder.value_bits - threshold:2d} --> done {elapsed_time/60:.2f}')

In [None]:
# Tabulate the learning-threshold sweep and display it.
learning_threshold_test_results = pd.DataFrame(results)
learning_threshold_test_results


Теперь сначала кормим случайные последовательности и в конце даём правильную.

In [None]:
def way_to_string(way):
    """Concatenate the string form of every action in ``way`` with no separator."""
    return ''.join(str(action) for action in way)

In [None]:
def run_way_after_experiments(pars, sa_encoder, goal_state,
                              start_indicator=None,
                              n_experiments=3,
                              verbosity=0,
                              learning_true_count=1):
    """Feed random paths first, then the reference path, and find when planning breaks.

    In each experiment the number of noise paths starts at 0 and grows by one per trial.
    Every trial builds a fresh memory, learns that many random paths, then learns the
    notebook-level ``actions`` ``learning_true_count`` times and calls ``check_agent``.
    The experiment stops at the first trial in which ``check_agent`` fails.
    Relies on the notebook-level ``actions``, ``gw`` and ``max_steps``.

    Args:
        pars: keyword arguments for ``TemporalMemory`` (see ``init_memory``).
        sa_encoder: state-action encoder used to build the memory.
        goal_state: goal passed to ``check_agent``.
        start_indicator: forwarded to ``init_memory``.
        n_experiments: number of independent repetitions.
        verbosity: forwarded to ``check_agent``.
        learning_true_count: how many times the reference path is learned per trial.

    Returns:
        Tuple ``(results, ways_history)``. ``results`` lists, per experiment, the number
        of noise paths in the failing trial. ``ways_history`` is a dict of parallel lists
        ('way', 'steps', 'experiment') holding the LAST random path of the failing trial;
        nothing is recorded when the failure happens with zero noise paths.
    """
    noise_counts = []
    ways_history = {'way': [], 'steps': [], 'experiment': []}
    for experiment in tqdm(range(n_experiments)):
        n_steps = 0  # number of noise action sequences
        planning_ok = True
        while planning_ok:
            memory = init_memory(pars, sa_encoder, start_indicator=start_indicator)
            last_noise_way = None
            for _ in range(n_steps):
                last_noise_way = random_way(max_steps, gw.n_actions)
                learn_way(last_noise_way, memory, gw)
            for _ in range(learning_true_count):
                learn_way(actions, memory, gw)
            planning_ok = check_agent(memory, gw, goal_state, verbosity)
            if planning_ok:
                n_steps += 1
            elif last_noise_way is not None:
                ways_history['way'].append(str(last_noise_way))
                ways_history['steps'].append(n_steps)
                ways_history['experiment'].append(experiment)
        noise_counts.append(n_steps)
    return noise_counts, ways_history

In [None]:
goal_state = (0, 2, -2, 1, 2)
# Noise-first experiment with 50 cells per column, repeated 100 times.
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 50,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
}
# The function returns a pair; unpack the counts and the recorded ways.
results, ways_history = run_way_after_experiments(pars, sa_encoder, goal_state, n_experiments=100)

In [None]:
# Mean and standard deviation of the noise-path counts.
# Despite its name, df_results is a NumPy array here, not a DataFrame.
df_results = np.array(results)
df_results.mean(), df_results.std()

In [None]:
# One row per experiment that failed with at least one noise path:
# the last random path learned, the noise-path count and the experiment index.
df_ways = pd.DataFrame(ways_history)
df_ways

In [None]:
# Rows with the largest recorded noise-path count.
df_ways[df_ways['steps'] == df_ways.steps.max()]

In [None]:
# How often each recorded path (stored as its string form) occurs.
ways_counts = df_ways.way.value_counts()

In [None]:
# Most frequent paths first (value_counts already sorts in descending order by default).
ways_counts.sort_values(ascending=False)

Теперь оставим в кодировке только координаты и снова проверим устойчивость.

In [None]:
# Rebuild the environment so that it exposes only the three listed observable variables.
# This replaces the notebook-level gw used by all helper functions.
gw = GridWorld(world_description, (3, 3), agent_initial_position={'row': 2, 'column': 0},
               observable_vars=['relative_row', 'relative_column', 'relative_direction'])

In [None]:
# Render the rebuilt environment.
gw.render()

In [None]:
# Inspect the environment's observable_state and filtered_observation attributes.
gw.observable_state, gw.filtered_observation

In [None]:
# State encoder restricted to three variables: row, column and direction.
# NOTE(review): both ranges take their lower bound from world_size[0] and their upper
# bound from world_size[1]; harmless for a square world — confirm for non-square ones.
state_encoder = SequenceSdrEncoder(
    'state',
    encoders=[
        IntRangeEncoder('row', (-(gw.world_size[0]-1), gw.world_size[1]-1), 5, 5),
        IntRangeEncoder('column', (-(gw.world_size[0]-1), gw.world_size[1]-1), 5, 5),
        IntSdrEncoder('direction', 4, 5, 5),
    ],
    size=3,
)

In [None]:
# Action encoder for the rebuilt environment (same settings as before).
action_encoder = IntSdrEncoder('action', gw.n_actions,
                              value_bits=5, activation_threshold=5)

In [None]:
# Rebuild the combined state-action encoder from the reduced state encoder.
sa_encoder = SaSdrEncoder(state_encoder, action_encoder)

In [None]:
# Inspect the sizes of the reduced encoder.
sa_encoder.total_bits, sa_encoder.value_bits, sa_encoder.activation_threshold

In [None]:
# Goal with three values, matching the three-variable state encoder.
goal_state = (-2, 1, 2)
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 50,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
}
# Not unpacked: `results` holds the whole (counts, ways_history) pair returned by the function.
results = run_way_after_experiments(pars, sa_encoder, goal_state, n_experiments=100)

In [None]:
# `results` is the (counts, ways_history) pair returned by run_way_after_experiments in the
# preceding cell; only the first element holds the per-experiment counts. Passing the whole
# pair to np.array would mix a list with a dict and break the statistics below.
df_results = np.array(results[0])
df_results.mean(), df_results.std()

Теперь оставим только расстояние и тип поверхности.

In [None]:
# Rebuild the environment so that it exposes only 'distance' and 'surface'.
# This replaces the notebook-level gw used by all helper functions.
gw = GridWorld(world_description, (3, 3), agent_initial_position={'row': 2, 'column': 0},
               observable_vars=['distance', 'surface'])

In [None]:
# Render the rebuilt environment.
gw.render()

In [None]:
# Inspect the environment's observable_state and filtered_observation attributes.
gw.observable_state, gw.filtered_observation

In [None]:
# State encoder restricted to two variables: distance and surface.
# Each sub-encoder gets one value more than in the first setup.
# NOTE(review): the extra value is presumably reserved for the start indicator
# Sa((3, 3), 3) used in later cells — confirm.
state_encoder = SequenceSdrEncoder(
    'state',
    encoders=[
        IntSdrEncoder('distance', gw.world_size[0] + 1, 5, 5),
        IntSdrEncoder('surface', 3 + 1, 5, 5),
    ],
    size=2,
)

In [None]:
# Action encoder with one value more than the number of environment actions.
# NOTE(review): the extra value is presumably the action part of the start indicator — confirm.
action_encoder = IntSdrEncoder('action', gw.n_actions + 1,
                              value_bits=5, activation_threshold=5)

In [None]:
# Rebuild the combined state-action encoder from the two-variable state encoder.
sa_encoder = SaSdrEncoder(state_encoder, action_encoder)

In [None]:
# Inspect the sizes of the two-variable encoder.
sa_encoder.total_bits, sa_encoder.value_bits, sa_encoder.activation_threshold

In [None]:
# Goal with two values, matching the two-variable (distance, surface) state encoder.
goal_state = (0, 2)

In [None]:
# Noise-first experiment on the two-variable encoding, 50 cells per column, 100 repetitions.
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 50,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.005,
    'predictedSegmentDecrement': 0.0001,
}
# Not unpacked: `results` holds the whole (counts, ways_history) pair returned by the function.
results = run_way_after_experiments(pars, sa_encoder, goal_state, n_experiments=100)

In [None]:
# results[0] is the list of per-experiment noise-path counts; show its mean and std.
df_results = np.array(results[0])
df_results.mean(), df_results.std()

In [None]:
# Largest noise-path count observed across the experiments.
df_results.max()

In [None]:
# Parameter set with 8 cells per column and much larger decrements than in the earlier cells
# (permanenceDecrement 0.05 instead of 0.005, predictedSegmentDecrement 0.025 instead of 0.0001).
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 8,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.05,
    'predictedSegmentDecrement': 0.025,
}

In [None]:
# 100 independent trials: fresh memory, one pass over the reference path, then a check.
# NOTE: check_agent returns True on success, so despite its name `fails` collects
# success flags.
verbosity=0
fails = list()
for i in tqdm(range(100)):
    memory = init_memory(pars, sa_encoder)
    learn_way(actions, memory, gw, verbosity)
    fails.append(check_agent(memory, gw, goal_state, verbosity))

In [None]:
# Convert the boolean outcomes to floats so that their mean is a rate.
# The builtin float replaces the np.float alias, which was removed from NumPy (1.24+).
arr = np.array(fails, dtype=float)

In [None]:
# Fraction of trials in which check_agent returned True.
arr.mean()

Ввели состояние-индикатор начала последовательности: теперь сразу после сброса (reset) TM обрабатывает индикаторное состояние.
Также пришлось увеличить predictedSegmentDecrement, чтобы контекст последовательности выучивался достаточно быстро;
однако пока неизвестно, как это скажется на работе агента в полевых условиях. Теперь, по крайней мере, агент может находить
путь к награде при отсутствии шумовых последовательностей.

In [None]:
# Parameters for the start-indicator experiments: 8 cells per column,
# permanenceDecrement 0.05 and predictedSegmentDecrement 0.025.
pars = {
    'n_columns': sa_encoder.total_bits,
    'cells_per_column': 8,
    'activation_threshold': sa_encoder.value_bits,
    'learning_threshold': sa_encoder.value_bits,
    'initial_permanence': 0.5,
    'connected_permanence': 0.5,
    'maxNewSynapseCount': sa_encoder.value_bits,
    'maxSynapsesPerSegment': sa_encoder.value_bits,
    'permanenceIncrement': 0.1,
    'permanenceDecrement': 0.05,
    'predictedSegmentDecrement': 0.025,
}

In [None]:
max_steps = 12
verbosity = 3
# Fresh memory that is given Sa((3, 3), 3) as its start indicator.
memory = init_memory(pars, sa_encoder, start_indicator=Sa((3, 3), 3))
# Nine passes over the reference path at verbosity 1, then a tenth at verbosity 3.
for _ in range(9):
    learn_way(actions, memory, gw, verbosity=1)
learn_way(actions, memory, gw, verbosity)
# Verbose planning check; the boolean result is the cell output.
check_agent(memory, gw, goal_state, verbosity)

In [None]:
# Noise-first experiment with the start indicator; the reference path is learned 10 times per trial.
# `results` holds the whole (counts, ways_history) pair returned by the function.
results = run_way_after_experiments(pars, sa_encoder, goal_state,
                                    start_indicator=Sa((3, 3), 3),
                                    n_experiments=100,
                                    learning_true_count=10)

In [None]:
# results[0] is the list of per-experiment noise-path counts; show its mean and std.
df_results = np.array(results[0])
df_results.mean(), df_results.std()

In [None]:
# Same experiment, but the reference path is learned only once per trial.
# `results` holds the whole (counts, ways_history) pair returned by the function.
results = run_way_after_experiments(pars, sa_encoder, goal_state,
                                    start_indicator=Sa((3, 3), 3),
                                    n_experiments=100,
                                    learning_true_count=1)

In [None]:
# results[0] is the list of per-experiment noise-path counts; show its mean and std.
df_results = np.array(results[0])
df_results.mean(), df_results.std()

TODO: описать пример, на основе которого я решил увеличить predictedSegmentDecrement.

In [None]:
def init_encoders(value_bits, thresholds):
    """Build the five-variable state encoder and the action encoder with per-variable settings.

    Relies on the notebook-level ``gw`` for the world size and the number of actions.

    Args:
        value_bits: indexable with six entries; entries 0-4 go to the distance, surface,
            row, column and direction encoders, entry 5 to the action encoder.
        thresholds: indexable with six entries, paired position-wise with ``value_bits``.

    Returns:
        Tuple ``(state_encoder, action_encoder)``.
    """
    # NOTE(review): the same range is used for 'row' and 'column'; its lower bound comes
    # from world_size[0] and its upper bound from world_size[1] — confirm for non-square worlds.
    coord_range = (-(gw.world_size[0]-1), gw.world_size[1]-1)
    state_parts = [
        IntSdrEncoder('distance', gw.world_size[0], value_bits[0], thresholds[0]),
        IntSdrEncoder('surface', 3, value_bits[1], thresholds[1]),
        IntRangeEncoder('row', coord_range, value_bits[2], thresholds[2]),
        IntRangeEncoder('column', coord_range, value_bits[3], thresholds[3]),
        IntSdrEncoder('direction', 4, value_bits[4], thresholds[4]),
    ]
    state_encoder = SequenceSdrEncoder('state', encoders=state_parts, size=5)
    action_encoder = IntSdrEncoder(
        'action',
        gw.n_actions,
        value_bits=value_bits[5],
        activation_threshold=thresholds[5],
    )
    return state_encoder, action_encoder

def stability_test(pars):
    """Placeholder: not implemented yet; ignores ``pars`` and returns None."""
    return None













Ручной вариант (изначальный)

In [None]:
# Hand-written action sequences used for the manual training below.
# The first entry equals the reference path stored in `actions`.
ways = [
    [2, 2, 1, 2, 2, 1, 2, 2],
    [1, 0, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2],
    [2, 2, 1, 1, 2, 2, 0, 0],
    [2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1],
    [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
    [2, 2, 1, 2, 2, 1, 0, 2, 2, 2, 2, 2],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
       ]

# Reference path (re-assigned with the same value as earlier in the notebook).
actions = [2, 2, 1, 2, 2, 1, 2, 2]

In [None]:
verbosity = 1
# Train the notebook-level `memory` on every hand-written path, two cycles per path.
# The per-cycle work (reset memory and environment, train on each state/action pair,
# step the environment) is exactly what learn_way does, so the helper is reused.
# Distinct loop names keep the inner cycle counter from shadowing the path index.
# NOTE(review): this cell uses whatever `memory` and `gw` the preceding cells left behind;
# on a top-to-bottom run those come from the two-variable (distance, surface) setup — confirm
# that this is the intended configuration for the manual variant.
for way_index, way in enumerate(ways):
    if verbosity > 1:
        print()
        print(f"### Way {way_index+1} ###")
        print()
    for cycle in range(2):
        if verbosity > 1:
            print()
            print(f'*** cycle {cycle+1} ***')
            print()
        learn_way(way, memory, gw, verbosity)

In [None]:
# Goal state with five values.
# NOTE(review): gw and sa_encoder were last rebuilt above with only two observed variables
# (distance, surface); a five-value goal presumably needs the original five-variable
# setup — confirm before running the notebook top to bottom.
goal_state = (0, 2, -2, 1, 2)

In [None]:
# Planner over the trained memory.
# NOTE(review): the meaning of the positional 10 and 1 is defined by Planner — confirm there.
planner = Planner(memory, 10, 1)

In [None]:
# Agent built from the memory and the planner; it is told how many actions the environment has.
agent = Agent(memory, planner, gw.n_actions)

In [None]:
# Runner for the agent in gw.
# NOTE(review): the positional arguments are presumably (n_episodes=1, max_steps,
# pretrain=0, verbosity=3) — confirm against AgentRunner's signature.
run = AgentRunner(agent, gw, 1, max_steps, 0, 3)

In [None]:
# Register the goal with the planner, set the planning horizon to 10 and run.
run.agent.planner.add_goal(goal_state)
run.agent.set_planning_horizon(10)
run.run()

In [None]:
# Last recorded reward of the run (check_agent treats a value > 0 as success).
run.train_stats.rewards[-1]

Проверка стабильности обратного трекинга

In [None]:
# Longer run on the same runner: 50 episodes, pretrain set to 25, verbosity 1.
# NOTE(review): add_goal is called again on a planner that already received this goal
# in an earlier cell — confirm that adding it twice is harmless.
run.agent.planner.add_goal(goal_state)
run.agent.set_planning_horizon(10)
run.n_episodes = 50
run.pretrain = 25
run.verbosity = 1
run.run()

In [None]:
# Scatter plot of the recorded train_stats.steps values against their index.
fig = plt.figure(figsize=(10, 7))
steps = np.array(run.train_stats.steps[:])
plt.plot(np.arange(steps.size), steps, '.')