# In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class TwoAssetMarket:
    """Simulate a stream of single-period returns with time-varying variance.

    Each call to :meth:`step` draws one return ``r`` from

        sigma2 <- vol_base**2 + vol_shock * r_prev**2 + vol_persist * sigma2
        r       = mu + phi * r_prev + sqrt(|sigma2|) * eps,   eps ~ N(0, 1)

    i.e. an autoregressive mean (``phi``) combined with a GARCH-style
    variance recursion. Only one return series is generated here.
    NOTE(review): the "two asset" name presumably refers to this risky
    asset plus a zero-return alternative held by the caller -- confirm.

    Args:
        mu: Constant term of the return equation.
        phi: Weight of the previous return in the return equation.
        vol_base: Its square is the constant term of the variance recursion
            and also the initial variance.
        vol_persist: Weight of the previous variance in the recursion.
        seed: Seed for the private ``numpy`` random generator.
        vol_shock: Weight of the previous squared return in the recursion.
            Defaults to 0.05, the value that used to be hard-coded.
    """

    def __init__(self, mu=0.0005, phi=0.1, vol_base=0.01, vol_persist=0.95, seed=0,
                 vol_shock=0.05):
        self.rng = np.random.default_rng(seed)
        self.mu, self.phi, self.vol_base, self.vol_persist = mu, phi, vol_base, vol_persist
        self.vol_shock = vol_shock
        # Start with no return history and the base variance level.
        self.r_prev, self.sigma2 = 0.0, vol_base**2

    def step(self):
        """Advance the simulation one period and return the new return."""
        eps = self.rng.normal()
        self.sigma2 = (
            self.vol_base**2
            + self.vol_shock*(self.r_prev**2)
            + self.vol_persist*self.sigma2
        )
        # abs() keeps sqrt real-valued if caller-supplied coefficients ever
        # push the variance recursion below zero.
        r = self.mu + self.phi*self.r_prev + np.sqrt(abs(self.sigma2))*eps
        self.r_prev = r
        return r

class Neuneier96Env:
    """Allocation environment driven by a ``TwoAssetMarket`` return stream.

    The observation is a float32 vector holding the last ``k`` market
    returns followed by the previously chosen weight. An action is an index
    into ``grid``, a set of ``grid_n`` evenly spaced weights in [0, 1].
    The reward of a step is the chosen weight times the new market return.

    Args:
        k: Number of past returns kept in the observation.
        grid_n: Number of discrete weights on the action grid.
        seed: Seed forwarded to the underlying market simulator.
    """

    def __init__(self, k=10, grid_n=21, seed=0):
        self.k = k
        self.grid = np.linspace(0, 1, grid_n)
        self.market = TwoAssetMarket(seed=seed)
        self.reset()

    def reset(self):
        """Clear the return window, weight and wealth; return the state.

        The market simulator itself is left untouched, so its internal
        state carries over across resets.
        """
        self.buf = [0.0 for _ in range(self.k)]
        self.w_prev = 0.0
        self.wealth = 1.0
        return self._state()

    def _state(self):
        """Build the observation: return window plus previous weight."""
        features = [*self.buf, self.w_prev]
        return np.array(features, dtype=np.float32)

    def step(self, a_idx):
        """Apply the weight at ``grid[a_idx]`` to the next market return.

        Returns:
            ``(state, reward, done, info)`` where ``done`` is always
            ``False`` and ``info`` is an empty dict.
        """
        weight = float(self.grid[a_idx])
        market_return = self.market.step()
        reward = weight * market_return
        self.wealth = self.wealth * (1 + reward)
        # Slide the window: drop the oldest return, append the newest.
        self.buf.pop(0)
        self.buf.append(market_return)
        self.w_prev = weight
        return self._state(), reward, False, {}

class QNet(nn.Module):
    """Fully connected network producing one Q-value per discrete action.

    Two hidden layers of 128 ReLU units sit between the input of width
    ``state_dim`` and the output of width ``n_actions``.
    """

    def __init__(self, state_dim, n_actions):
        super().__init__()
        hidden = 128
        # Layer order fixes the ``net.<index>`` keys of the state dict.
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values computed by the stacked layers for ``x``."""
        q_values = self.net(x)
        return q_values

def train_neuneier96(steps=10000, gamma=0.99, lr=1e-3, batch=256, verbose=1000,
                     target_sync=500):
    """Train a Double-DQN agent on ``Neuneier96Env`` and return the online net.

    Actions are chosen epsilon-greedily, with epsilon decaying linearly from
    1.0 and floored at 0.05. Every transition is appended to an unbounded
    replay buffer; once it holds at least ``batch`` transitions, one
    gradient step is taken per environment step on a uniformly sampled
    (with replacement) minibatch.

    Args:
        steps: Number of environment steps to run.
        gamma: Discount factor used in the bootstrap target.
        lr: Adam learning rate.
        batch: Minibatch size, and the minimum buffer size before updates.
        verbose: Print progress every ``verbose`` steps; falsy disables it.
        target_sync: Copy the online weights into the target network every
            ``target_sync`` gradient updates. ``0`` or ``None`` never syncs,
            leaving the target frozen at its initial weights.

    Returns:
        The trained online ``QNet``.
    """
    env = Neuneier96Env()
    state_dim = len(env._state())
    n_actions = len(env.grid)
    q = QNet(state_dim, n_actions)
    qt = QNet(state_dim, n_actions)
    qt.load_state_dict(q.state_dict())
    opt = optim.Adam(q.parameters(), lr=lr)
    bufS, bufA, bufR, bufNS, bufD = [], [], [], [], []
    s = env.reset()
    rng = np.random.default_rng(0)
    last_loss = None
    n_updates = 0

    for t in range(steps):
        eps = max(0.05, 1.0 - t/steps)
        if rng.random() < eps:
            a = int(rng.integers(0, n_actions))
        else:
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                a = q(torch.tensor(s).unsqueeze(0)).argmax(1).item()
        ns, r, d, _ = env.step(a)
        bufS.append(s); bufA.append(a); bufR.append(r); bufNS.append(ns); bufD.append(d)
        s = ns

        if len(bufS) >= batch:
            idx = rng.integers(0, len(bufS), size=batch)
            # Stack the per-step arrays first: building a tensor straight
            # from a list of ndarrays is slow and triggers a PyTorch warning.
            S = torch.as_tensor(np.stack([bufS[i] for i in idx]), dtype=torch.float32)
            A = torch.tensor([bufA[i] for i in idx], dtype=torch.long).unsqueeze(1)
            R = torch.tensor([bufR[i] for i in idx], dtype=torch.float32).unsqueeze(1)
            NS = torch.as_tensor(np.stack([bufNS[i] for i in idx]), dtype=torch.float32)
            D = torch.tensor([bufD[i] for i in idx], dtype=torch.float32).unsqueeze(1)
            with torch.no_grad():
                # Double DQN: the online net picks the next action, the
                # target net evaluates it.
                an = q(NS).argmax(1, keepdim=True)
                y = R + gamma*(1-D)*qt(NS).gather(1, an)
            qvals = q(S).gather(1, A)
            loss = (qvals-y).pow(2).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()
            last_loss = float(loss.item())

            n_updates += 1
            # Without a periodic sync the target net would stay at its
            # random initial weights for the whole run.
            if target_sync and n_updates % target_sync == 0:
                qt.load_state_dict(q.state_dict())

        if verbose and t > 0 and t % verbose == 0:
            # No loss exists until the buffer first reaches ``batch`` items.
            loss_txt = "n/a" if last_loss is None else f"{last_loss:.6f}"
            print(f"step {t}/{steps}  eps={eps:.2f}  loss={loss_txt}")

    return q

def eval_policy(q, env=None, steps=1000):
    """Run the greedy policy of ``q`` and report the compounded wealth.

    Args:
        q: Callable mapping a ``(1, state_dim)`` tensor to per-action
            Q-values; the arg-max action is taken at every step.
        env: Environment exposing ``reset()`` and ``step(action)``. When
            ``None``, a default-constructed ``Neuneier96Env`` is used.
        steps: Number of environment steps to simulate.

    Returns:
        Final wealth as a float, starting from 1.0 and compounding each
        step's reward. The same value is also printed.
    """
    if env is None:
        env = Neuneier96Env()
    s = env.reset()
    wealth = 1.0
    for _ in range(steps):
        with torch.no_grad():
            a = q(torch.tensor(s).unsqueeze(0)).argmax(1).item()
        s, r, _, _ = env.step(a)
        wealth *= (1.0 + r)
    print(f"Wealth final ~ {wealth:.4f}")
    # Hand the number back so callers can use it, not just read the log.
    return float(wealth)
