# Moody & Saffell (1998) — Direct Reinforcement para Trading
Baseline sin redes. Política continua s_t = tanh(w^T x_t). Entrenamiento **online** para maximizar un proxy diferenciable del Sharpe.

**Resumen técnico**
- Señal: s_t ∈ [-1,1]
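- PnL: p_t = s_{t-1} r_t - c |Δs_t|, con Δs_t = s_t - s_{t-1} (la posición se fija antes de observar r_t; c es el coste por unidad de cambio de posición).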
- Objetivo: maximizar SR ≈ m/√(v+β) con momentos EMA.
- Gradiente: ascenso de gradiente online con la regla de la cadena, ∂SR/∂w ≈ (∂SR/∂p_t)·(∂p_t/∂w).


## Imports

In [None]:

import numpy as np
import matplotlib.pyplot as plt
# Module-level generator seeded with 1998. synthetic_returns() creates its own
# generator from its `seed` argument, so none of the code visible in this
# notebook draws from this one.
rng = np.random.default_rng(1998)


## Datos sintéticos: AR(1) + saltos ocasionales

In [None]:

def synthetic_returns(n=6000, ar=0.06, sigma=0.012, jump_p=0.0007, jump_s=0.06, seed=1998):
    """Simulate an AR(1) return series with occasional Gaussian jumps.

    Args:
        n: Length of the series; element 0 is always 0.0.
        ar: Autoregressive coefficient applied to the previous return.
        sigma: Standard deviation of the per-step Gaussian noise.
        jump_p: Per-step probability of adding a jump.
        jump_s: Standard deviation of the Gaussian jump size.
        seed: Seed for the private random generator.

    Returns:
        A 1-D float array of length ``n``.
    """
    gen = np.random.default_rng(seed)
    series = np.zeros(n)
    for idx in range(1, n):
        # The draw order (noise, jump test, optional jump size) determines the
        # random stream, so it must stay exactly like this.
        value = ar * series[idx - 1] + gen.normal(0, sigma)
        jump_hit = gen.random() < jump_p
        if jump_hit:
            value += gen.normal(0, jump_s)
        series[idx] = value
    return series

# Generate the default synthetic series and plot its first 800 observations.
r = synthetic_returns()

plt.figure()
plt.plot(r[:800])
plt.title('Rendimientos sintéticos')
plt.show()


## Features, señal y entrenamiento DR (Sharpe proxy)

In [None]:

def features(r, k=12):
    """Build the lagged-return design matrix and the aligned targets.

    Row ``i`` of ``X`` holds the ``k`` returns that precede ``r[i + k]``,
    most recent first, and ``y[i]`` is ``r[i + k]``.

    Args:
        r: 1-D sequence of returns.
        k: Number of lagged returns per row.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(max(len(r) - k, 0), k)`` and
        ``y`` is ``r[k:]``.
    """
    rows = [r[t - k:t][::-1] for t in range(k, len(r))]
    if not rows:
        # np.array([]) would have shape (0,), which breaks a later `X @ w`;
        # keep the column dimension even when there are no samples.
        return np.empty((0, k)), r[k:]
    return np.array(rows), r[k:]

def pnl(signal, ret, cost=0.0002):
    """Return the per-step PnL of a signal net of proportional trading costs.

    Args:
        signal: Position held at each step.
        ret: Return earned by each position, aligned with ``signal``.
        cost: Cost charged per unit of absolute change in the signal.

    Returns:
        ``signal * ret - cost * |change in signal|``, where the first change
        is measured from a flat (0.0) starting position.
    """
    turnover = np.abs(np.diff(signal, prepend=0.0))
    gross = signal * ret
    return gross - cost * turnover

def train_DR_1998(r, k=12, lr=0.03, ema=0.02, beta=1e-3, cost=0.0002, epochs=3):
    """Train a linear direct-reinforcement trader online on a Sharpe proxy.

    The policy is ``s = tanh(w @ x)``, where ``x`` is a row of the matrix
    returned by ``features(r, k)``. After each observation the exponential
    moving mean ``m`` and variance ``v`` of the per-step PnL are updated and
    ``w`` takes one gradient-ascent step on ``SR = m / sqrt(v + beta)``.

    Approximations made by the gradient:
      * the previous signal is recomputed with the current weights and is
        treated as a constant when differentiating the PnL;
      * ``|ds|`` is differentiated through a smooth sign, ``tanh(1000 * ds)``;
      * only the dependence of SR on the current PnL is followed, not the
        dependence on past PnLs stored in ``m`` and ``v``.

    Args:
        r: 1-D sequence of returns.
        k: Number of lagged returns fed to the policy.
        lr: Gradient-ascent step size.
        ema: Update rate of the moving mean and variance.
        beta: Constant added to the variance inside the square root.
        cost: Cost per unit of absolute change in the signal.
        epochs: Number of passes over the data. ``m`` and ``v`` carry over
            between passes.

    Returns:
        ``(w, SR_hist, X, y)``: the trained weights, the proxy value recorded
        at every update (across all epochs), and the feature matrix and
        targets produced by ``features``.
    """
    X, y = features(r, k=k)
    w = np.zeros(k)
    m, v = 0.0, 1e-4
    SR_hist = []
    for _ in range(epochs):
        for t, (x, ret) in enumerate(zip(X, y)):
            s = np.tanh(x @ w)
            # Each pass starts from a flat position.
            s_prev = np.tanh(X[t - 1] @ w) if t > 0 else 0.0
            ds = s - s_prev
            p = s * ret - cost * np.abs(ds)

            # Order matters: v is updated with the already-updated m.
            m = (1 - ema) * m + ema * p
            v = (1 - ema) * v + ema * (p - m) ** 2
            SR_hist.append(m / np.sqrt(v + beta))

            # Approximate gradient of p w.r.t. w (s_prev held fixed).
            ds_sign = np.tanh(1000 * ds)
            ds_dw = (1 - s ** 2) * x
            dp_dw = (ret - cost * ds_sign) * ds_dw

            # Chain rule for SR through the moment updates above.
            d_sr_dm = 1 / np.sqrt(v + beta)
            d_sr_dv = -0.5 * m * (v + beta) ** (-1.5)
            dm_dp = ema
            # (p - m) uses the updated m, so d(p - m)/dp = 1 - ema.
            dv_dp = 2 * ema * (p - m) * (1 - ema)
            d_sr_dp = d_sr_dm * dm_dp + d_sr_dv * dv_dp

            w += lr * d_sr_dp * dp_dw
    return w, np.array(SR_hist), X, y

# Fit the trader, then score the final weights over the whole sample.
w, SR_hist, X, y = train_DR_1998(r, epochs=4)
s = np.tanh(X @ w)
p = pnl(s, y)
# The small constant guards against division by zero for a constant PnL.
print("Sharpe ex-post:", p.mean()/(p.std()+1e-9))

# Cumulative PnL of the trained policy.
plt.figure()
plt.plot(np.cumsum(p))
plt.title('Equity curve (DR 1998)')
plt.show()

# Sharpe proxy recorded at every online update.
plt.figure()
plt.plot(SR_hist)
plt.title('Proxy SR durante entrenamiento')
plt.show()
