In [None]:
import numpy as np

class TwoAssetMarket:
    """Simulate per-step returns with an AR(1) mean and a GARCH(1,1)-style variance.

    Each call to :meth:`step` draws one return

        r_t = mu + phi * r_{t-1} + sqrt(sigma2_t) * eps_t,   eps_t ~ N(0, 1)

    where the conditional variance follows

        sigma2_t = (1 - vol_persist) * vol_base**2
                   + arch * r_{t-1}**2
                   + (vol_persist - arch) * sigma2_{t-1}

    The intercept is scaled by ``1 - vol_persist`` and the shock/lag weights
    sum to ``vol_persist``, so for ``vol_persist < 1`` the variance mean-reverts
    to roughly ``vol_base**2`` instead of drifting upward without bound.

    Parameters
    ----------
    mu : float
        Constant term of the return equation.
    phi : float
        Coefficient on the previous return (AR(1) term).
    vol_base : float
        Target long-run standard deviation of the return shock.
    vol_persist : float
        Total persistence of the variance process (shock weight plus lagged
        variance weight). Values below 1 give a mean-reverting variance.
    seed : int
        Seed for the internal ``numpy`` random generator.
    arch : float
        Share of ``vol_persist`` that reacts to the previous squared return;
        capped at ``vol_persist`` so the lagged-variance weight is never negative.
    """

    def __init__(self, mu=0.0005, phi=0.10, vol_base=0.01, vol_persist=0.95, seed=0, arch=0.05):
        self.mu, self.phi = mu, phi
        self.vol_base, self.vol_persist = vol_base, vol_persist
        self.arch = arch
        # Start from zero previous return and the long-run variance level.
        self.r_prev, self.sigma2 = 0.0, vol_base**2
        self.rng = np.random.default_rng(seed)

    def step(self):
        """Advance the market by one period and return the new return as a float."""
        eps = self.rng.normal()
        # Split the total persistence between the squared-return term and the
        # lagged variance so that their weights add up to vol_persist.
        shock_w = min(self.arch, self.vol_persist)
        lag_w = self.vol_persist - shock_w
        intercept = (1.0 - self.vol_persist) * self.vol_base**2
        sigma2 = intercept + shock_w * (self.r_prev**2) + lag_w * self.sigma2
        # Guard against a negative variance when vol_persist > 1 makes the
        # intercept negative.
        self.sigma2 = max(sigma2, 0.0)
        r = self.mu + self.phi * self.r_prev + np.sqrt(self.sigma2) * eps
        self.r_prev = r
        return r

class Discretizer:
    """Map a scalar onto the index of the interval it falls into.

    ``edges`` holds the interval boundaries; ``len(edges) - 1`` intervals are
    defined, indexed from 0. Values outside the outermost edges are assigned
    to the first or last interval.
    """

    def __init__(self, edges):
        self.edges = np.asarray(edges)

    def bin(self, x):
        """Return the interval index of ``x`` as a plain ``int``."""
        last_bin = len(self.edges) - 2
        # side="right" puts a value equal to an edge into the interval that
        # starts at that edge.
        raw = int(np.searchsorted(self.edges, x, side="right")) - 1
        # Clamp below first, then above (same order as np.clip).
        return min(max(raw, 0), last_bin)

class Neuneier96Tabular:
    """Tabular Q-learning agent that picks a portfolio weight each step.

    State  = (bin of the latest market return, index of the previous weight),
             flattened to a single integer by :meth:`_state_index`.
    Action = index into ``self.grid``, an evenly spaced set of weights in
             [0, 1] applied to the simulated market return.
    Reward = weight * market return for the step.

    Exploration is epsilon-greedy with epsilon decaying exponentially from
    ``eps_init`` towards ``eps_final`` on a time scale of ``eps_decay`` steps.
    """

    def __init__(self, grid_n=21, ret_edges=None, gamma=0.99, alpha=0.1, eps_init=1.0, eps_final=0.05, eps_decay=20000, seed=0):
        if ret_edges is None:
            ret_edges = np.linspace(-0.05, 0.05, 51)

        # Environment and discretisation.
        self.market = TwoAssetMarket(seed=seed)
        self.grid = np.linspace(0.0, 1.0, grid_n)
        self.disc = Discretizer(ret_edges)

        # Learning hyper-parameters.
        self.gamma = gamma
        self.alpha = alpha
        self.eps_init = eps_init
        self.eps_final = eps_final
        self.eps_decay = eps_decay
        self.rng = np.random.default_rng(seed)

        # Table dimensions: one row per (return bin, previous weight) pair,
        # one column per candidate weight.
        self.n_ret_bins = len(ret_edges) - 1
        self.n_w = grid_n
        self.n_states = self.n_ret_bins * self.n_w
        self.n_actions = self.n_w
        self.Q = np.zeros((self.n_states, self.n_actions), dtype=np.float64)

        self._clear_episode()

    def _clear_episode(self):
        """Put the per-episode variables back to their starting values."""
        self.ret_bin = self.disc.bin(0.0)
        self.prev_idx = 0
        self.wealth = 1.0

    def _state_index(self, ret_bin, prev_idx):
        """Flatten (return bin, previous weight index) into a row of ``Q``."""
        return ret_bin * self.n_w + prev_idx

    def _epsilon(self, t):
        """Exploration probability at training step ``t``."""
        return self.eps_final + (self.eps_init - self.eps_final) * np.exp(-t / self.eps_decay)

    def _choose(self, s, eps):
        """Epsilon-greedy choice: random action with probability ``eps``."""
        if self.rng.random() < eps:
            return self.rng.integers(0, self.n_actions)
        return int(np.argmax(self.Q[s]))

    def reset(self):
        """Start a fresh episode on a newly seeded market; return the start state."""
        fresh_seed = self.rng.integers(1, 1_000_000)
        self.market = TwoAssetMarket(seed=fresh_seed)
        self._clear_episode()
        return self._state_index(self.ret_bin, self.prev_idx)

    def step(self, a_idx):
        """Apply weight ``grid[a_idx]`` for one period.

        Returns ``(next_state, portfolio_return)`` and updates the running
        wealth, the latest return bin and the remembered previous action.
        """
        weight = float(self.grid[a_idx])
        market_ret = self.market.step()
        reward = weight * market_ret
        self.wealth *= (1.0 + reward)
        self.ret_bin = self.disc.bin(market_ret)
        self.prev_idx = a_idx
        return self._state_index(self.ret_bin, self.prev_idx), reward

    def train(self, steps=100_000, verbose=10_000):
        """Run ``steps`` Q-learning updates in one continuous episode.

        Every ``verbose`` steps (if truthy) a progress line is printed with
        the current epsilon, the squared TD error of the latest update and
        the running wealth. Returns ``self`` so calls can be chained.
        """
        s = self.reset()
        last_loss = 0.0
        for t in range(1, steps + 1):
            eps = self._epsilon(t)
            a = self._choose(s, eps)
            s_next, reward = self.step(a)

            # One-step Q-learning: bootstrap from the best next-state value.
            best_next = np.max(self.Q[s_next])
            td_error = (reward + self.gamma * best_next) - self.Q[s, a]
            self.Q[s, a] += self.alpha * td_error
            last_loss = td_error**2

            s = s_next
            if verbose and (t % verbose == 0):
                print(f"t={t} eps={eps:.3f} loss={last_loss:.6e} wealth={self.wealth:.4f}")
        return self

    def greedy_eval(self, steps=10_000):
        """Follow the greedy policy for ``steps`` periods and print final wealth."""
        s = self.reset()
        for _ in range(steps):
            best_action = int(np.argmax(self.Q[s]))
            s, _reward = self.step(best_action)
        print(f"eval_wealth={self.wealth:.4f}")

if __name__ == "__main__":
    # Train with default hyper-parameters (train() returns the agent itself),
    # then report the wealth reached by the greedy policy on a fresh episode.
    agent = Neuneier96Tabular().train(steps=200_000, verbose=20_000)
    agent.greedy_eval(steps=20_000)



t=20000 eps=0.399 loss=1.333995e-04 wealth=0.0000
t=40000 eps=0.179 loss=1.125935e-03 wealth=-0.0000
t=60000 eps=0.097 loss=7.497966e-05 wealth=0.0000
t=80000 eps=0.067 loss=3.328577e-07 wealth=0.0000
t=100000 eps=0.056 loss=1.891742e-02 wealth=0.0000
t=120000 eps=0.052 loss=6.341113e-05 wealth=0.0000
t=140000 eps=0.051 loss=1.094459e-07 wealth=-0.0000
