In [11]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import itertools as it
from collections import Counter, defaultdict, deque
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook', font_scale=1.3)

from toolz.curried import get, curry

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Define environment

In [8]:
from distributions import Normal
from mouselab import MouselabEnv
from exact import solve

def make_env():
    """Construct one Mouselab environment with this notebook's settings.

    The reward distribution is ``Normal(3, 6)`` passed through
    ``to_discrete(6)`` and ``apply(int)``. The environment itself comes from
    ``MouselabEnv.new_symmetric`` called with ``[2, 2]``, that reward
    distribution, and ``cost=1``.

    NOTE(review): presumably Normal's arguments are (mean, sd), to_discrete(6)
    produces a six-outcome distribution, and [2, 2] is the tree's branching
    structure -- confirm against the `distributions` and `mouselab` modules.
    """
    return MouselabEnv.new_symmetric(
        [2, 2],
        Normal(3, 6).to_discrete(6).apply(int),
        cost=1,
    )

# Pool of 200 environments, held in a Series so that each one carries an
# integer id (the Series index).
envs = Series([make_env() for _ in range(200)])
# The first environment in the pool serves as the reference environment for
# the cells that follow.
env = envs.iloc[0]

8.9628970551810276

# Optimal policy

In [None]:
# solve(env) returns an iterable; its first two items are bound to Q and V and
# any remaining items are discarded. Q is later wrapped by SoftmaxPolicy(Q).
# NOTE(review): presumably Q is the optimal action-value function and V the
# optimal state-value function for `env` -- confirm against the `exact` module.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# later cells depend on Q; make sure it runs in order on a fresh kernel.
Q, V, *_ = solve(env)
# Evaluate V at env.init; as the cell's last expression the value is displayed.
V(env.init)

## Simulate rollouts

In [83]:
from policies import SoftmaxPolicy
from agents import run_episode

def simulate(policy, envs, seed=None, repeat=1):
    """Yield one record per step from episodes of `policy` on each environment.

    Parameters
    ----------
    policy
        Passed unchanged to `run_episode`.
    envs
        Any object whose `.items()` yields `(env_id, env)` pairs (in this
        notebook, a pandas Series of environments). Each `env` must expose a
        `cost` attribute.
    seed : int, optional
        Seed for NumPy's global RNG. When omitted, a seed in [0, 10**5) is
        drawn from the current global RNG state; either way the seed used is
        stored in every record so a run can be reproduced.
    repeat : int
        Number of episodes to run on each environment.

    Yields
    ------
    dict
        Keys 'seed', 'cost' (absolute value of `env.cost`), 'env_id',
        'state', 'action' and 'reward', one dict per (state, action, reward)
        triple of the episode trace.

    Notes
    -----
    This is a generator: nothing happens -- including the seeding -- until it
    is iterated. Iterating it reseeds NumPy's *global* RNG as a side effect.
    """
    if seed is None:
        # randint is documented for integer bounds, so use an int rather than
        # the float literal 1e5.
        seed = np.random.randint(10**5)
    np.random.seed(seed)
    for env_id, env in envs.items():
        for _ in range(repeat):
            trace = run_episode(policy, env)
            # The trace holds parallel sequences; zip them into per-step triples.
            steps = zip(*get(['states', 'actions', 'rewards'], trace))
            for s, a, r in steps:
                yield {'seed': seed, 'cost': abs(env.cost), 'env_id': env_id,
                       'state': s, 'action': a, 'reward': r}

# Wrap the solved Q in a softmax policy, roll it out on every environment in
# the pool, and collect the per-step records into one DataFrame (one row per
# step). No seed is passed, so simulate() draws one and stores it in the
# 'seed' column.
optimal_policy = SoftmaxPolicy(Q)
df_optimal = DataFrame(list(simulate(optimal_policy, envs)))

## Model

In [89]:
@curry
def policy_model(policy, data):
    """Probability that `policy` assigns to each recorded action.

    Parameters
    ----------
    policy
        Object with an `action_distribution(state)` method whose result can be
        indexed by an action.
    data : DataFrame
        Must contain 'state' and 'action' columns (e.g. the output of
        `simulate`).

    Returns
    -------
    Series
        Float Series aligned with `data.index`; entry i is
        `policy.action_distribution(state_i)[action_i]`.

    Because of `@curry`, `policy_model(policy)` returns a function of `data`.
    """
    # Iterate over the two needed columns directly instead of a row-wise
    # DataFrame.apply: this avoids building a Series per row, and it returns an
    # empty Series (rather than an empty DataFrame) when `data` has no rows.
    probs = [policy.action_distribution(state)[action]
             for state, action in zip(data['state'], data['action'])]
    return Series(probs, index=data.index, dtype=float)


# Log-probability of every simulated action under the policy that produced it.
logp_optimal = policy_model(optimal_policy, df_optimal).pipe(np.log)

# How often each probability value occurs, ordered by probability.
print('Probabilities of optimal policy actions under the optimal model')
print(np.exp(logp_optimal).value_counts().sort_index())

# exp(mean log p) is the geometric mean of the per-action probabilities.
print(f'\nPredictive power: {np.exp(logp_optimal.mean()):.3f}')

Probabilities of optimal policy actions under the optimal model
0.333333     1
0.500000    46
1.000000    45
dtype: int64

Predictive power: 0.699


# Random Policy

In [64]:
# NOTE(review): MouselabPolicy is not imported in any cell visible in this
# notebook; import it from its defining module before this cell, otherwise a
# fresh-kernel run raises NameError here.
# The policy is constructed from the dict {'term_reward': 0}.
# NOTE(review): presumably this makes every available action equally likely
# (the recorded output below is uniform over its non-zero entries) -- confirm.
rand_policy = MouselabPolicy({'term_reward': 0})
# NOTE: one episode must be run before action_distribution can be called.
run_episode(rand_policy, env)
# `env` is the first member of `envs`, so the reset/step below act on an
# environment that belongs to the simulation pool.
env.reset()
# Take action 2, then show the resulting state and the policy's action
# distribution in that state as a sanity check.
env.step(2)
print(env._state)
print(rand_policy.action_distribution(env._state))

(0, Cat, -4, Cat, Cat, Cat, Cat)
[ 0.     0.167  0.     0.167  0.167  0.167  0.167  0.167]


In [88]:
# Roll out the random policy on every environment in the pool; one row per step.
df = DataFrame(list(simulate(rand_policy, envs)))

# Log-probability of every simulated action under the random policy itself.
logp_rand = policy_model(rand_policy, df).pipe(np.log)

# How often each probability value occurs, ordered by probability.
print('Probabilities of random policy actions under the random model')
print(np.exp(logp_rand).value_counts().sort_index())

# exp(mean log p) is the geometric mean of the per-action probabilities.
print(f'\nPredictive power: {np.exp(logp_rand.mean()):.3f}')

Probabilities of random policy actions under the random model
0.142857    20
0.166667    18
0.200000    15
0.250000    14
0.333333     8
0.500000     6
1.000000     3
dtype: int64

Predictive power: 0.219
