# Moody & Saffell (2001) — Learning to Trade via Direct Reinforcement (DSR)
Implementación educativa del **Differential Sharpe Ratio (DSR)** para actualización online. Política continua s_t = tanh(w^T x_t).

**Claves**
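- Momentos EMA del PnL p_t con tasa η: m_t = (1-η)·m_{t-1} + η·p_t y v_t = (1-η)·v_{t-1} + η·(p_t - m_{t-1})²
- DSR_t = (m_t - m_{t-1}) / √(v_t + β) (forma simplificada; β > 0 estabiliza el denominador)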
- Actualización: gradiente sobre DSR_t vía regla de la cadena


## Imports

In [None]:

import numpy as np
import matplotlib.pyplot as plt
rng = np.random.default_rng(2001)


## Datos sintéticos AR con regímenes

In [None]:

def regime_returns(n=7000, sigma=0.012, seed=2001):
    """Simulate an AR(1) return series whose coefficient switches regime.

    Parameters
    ----------
    n : int
        Number of observations; the first one is always 0.
    sigma : float
        Standard deviation of the Gaussian shock added at every step.
    seed : int
        Seed for the private random generator, so the series is reproducible.

    Returns
    -------
    numpy.ndarray
        Array of length ``n`` with the simulated returns.
    """
    gen = np.random.default_rng(seed)
    series = np.zeros(n)
    sign = 1
    for idx in range(1, n):
        # Draw order is fixed (switch test first, then the shock) so that a
        # given seed always yields the same path.
        if gen.random() < 0.001:
            sign = -sign  # flip to the other regime
        # Positive regime: mild momentum; negative regime: mild reversal.
        phi = -0.02 if sign < 0 else 0.07
        shock = gen.normal(0, sigma)
        series[idx] = phi * series[idx - 1] + shock
    return series

# Simulate the default series and preview its first 900 observations.
r = regime_returns()
plt.figure()
plt.plot(r[:900])
plt.title('Rendimientos con regímenes')
plt.show()


## DSR training

In [None]:

def features(r, k=16):
    """Build the lagged-return design matrix and the aligned targets.

    Row ``i`` of the matrix corresponds to time ``t = i + k`` and holds the
    ``k`` preceding returns, most recent first:
    ``[r[t-1], r[t-2], ..., r[t-k]]``. The target for that row is ``r[t]``.

    Parameters
    ----------
    r : array_like
        Return series (converted with ``np.asarray``).
    k : int
        Number of lags per row; must be non-negative.

    Returns
    -------
    X : numpy.ndarray
        Lag matrix with ``max(len(r) - k, 0)`` rows and ``k`` columns. When
        the series is too short the result is an empty ``(0, k)`` array, so
        ``X @ w`` with a length-``k`` weight vector still works.
    y : numpy.ndarray
        ``r[k:]``, the return realised after each row's lags.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    r = np.asarray(r)
    # One index row per target time t = k .. len(r)-1, one column per lag
    # j = 0 .. k-1, pointing at r[t - 1 - j]. A single gather replaces the
    # Python loop and keeps the (0, k) shape when there are no rows.
    t_idx = np.arange(k, r.shape[0])[:, None]
    lag_idx = np.arange(k)[None, :]
    X = r[t_idx - 1 - lag_idx]
    return X, r[k:]

def pnl(signal, ret, cost=0.00025):
    """Return the per-step profit of a position series net of trading costs.

    Parameters
    ----------
    signal : numpy.ndarray
        Position held at each step.
    ret : numpy.ndarray
        Return earned per unit of position at each step.
    cost : float
        Cost charged per unit of absolute position change.

    Returns
    -------
    numpy.ndarray
        ``signal * ret`` minus ``cost`` times the absolute position change.
    """
    # The series is treated as starting flat (prepend=0.0), so opening the
    # first position is charged like any other trade.
    turnover = np.abs(np.diff(signal, prepend=0.0))
    gross = signal * ret
    return gross - cost * turnover

def train_DSR_2001(r, k=16, lr=0.02, ema=0.02, beta=1e-3, cost=0.00025, epochs=3):
    """Train a tanh trading policy online by ascending a differential Sharpe signal.

    The position is ``s_t = tanh(w . x_t)``, where ``x_t`` are the ``k``
    lagged returns produced by ``features``. After each observation the
    weights take one gradient-ascent step on
    ``(m_t - m_{t-1}) / sqrt(v_t + beta)``, where ``m`` and ``v`` are
    exponential moving averages of the step profit and of its squared
    deviation from the previous mean.

    Parameters
    ----------
    r : array_like
        Return series.
    k : int
        Number of lagged returns used as inputs (length of ``w``).
    lr : float
        Step size of the weight update.
    ema : float
        Smoothing rate of the two moving moments.
    beta : float
        Constant added to ``v`` inside the square root.
    cost : float
        Cost per unit of absolute position change.
    epochs : int
        Number of passes over the series.

    Returns
    -------
    w : numpy.ndarray
        Trained weights.
    dsr_history : numpy.ndarray
        The differential Sharpe value at every update, across all epochs.
    X, y : numpy.ndarray
        The feature matrix and targets returned by ``features``.
    """
    X, y = features(r, k=k)
    w = np.zeros(k)
    # The moving moments persist across epochs; only the position is reset.
    mean_old, var_old = 0.0, 1e-4
    history = []
    for _ in range(epochs):
        last_row = None  # each pass starts with no previous position
        for row, target in zip(X, y):
            pos = np.tanh(row @ w)
            # The previous position is re-evaluated with the current weights.
            if last_row is None:
                pos_before = np.tanh(0.0)
            else:
                pos_before = np.tanh(last_row @ w)
            delta_pos = pos - pos_before

            profit = pos*target - cost*np.abs(delta_pos)
            centered = profit - mean_old

            # Update the moving moments.
            mean_new = (1-ema)*mean_old + ema*profit
            var_new = (1-ema)*var_old + ema*centered**2

            delta_mean = mean_new - mean_old
            scaled_var = var_new + beta
            denom = np.sqrt(scaled_var)
            history.append(delta_mean / denom)

            # Derivatives. tanh(1000*x) is a smooth stand-in for sign(x);
            # only the current position's dependence on w is differentiated.
            smooth_sign = np.tanh(1000*delta_pos)
            gate = 1 - pos**2
            dprofit_dw = gate * row * target - cost * smooth_sign * gate * row

            # Chain rule through the moments:
            # dm/dp = ema and dv/dp = 2*ema*(p - m_prev).
            ddsr_dmean = 1/denom
            ddsr_dvar = -0.5*delta_mean*scaled_var**(-1.5)
            ddsr_dprofit = ddsr_dmean*ema + ddsr_dvar*(2*ema*centered)

            w += lr * (ddsr_dprofit * dprofit_dw)

            mean_old, var_old = mean_new, var_new
            last_row = row
    return w, np.array(history), X, y

# Fit the weights with four passes, then score the fixed policy on the same series.
w, DSR_hist, X, y = train_DSR_2001(r, epochs=4)
s = np.tanh(X @ w)
p = pnl(s, y)
# Per-step mean/std ratio; the 1e-9 keeps the division defined for a zero std.
print("Sharpe ex-post:", p.mean()/(p.std()+1e-9))

# Cumulative profit of the trained policy.
plt.figure()
plt.plot(np.cumsum(p))
plt.title('Equity curve (DSR 2001)')
plt.show()

# Differential Sharpe value recorded at every training update.
plt.figure()
plt.plot(DSR_hist)
plt.title('DSR durante entrenamiento')
plt.show()
